Replace use of API wrapper stream and event with plain CUDA, part 1 (#389)

Replace cuda::stream_t<> with cudaStream_t in client code
Replace cuda::event_t with cudaEvent_t in the client code
Clean up BuildFiles
makortel authored and fwyzard committed Nov 6, 2020
1 parent 1d5f516 commit 957d2de
Showing 12 changed files with 153 additions and 166 deletions.
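The change is mechanical throughout: every cuda::stream_t<> parameter becomes a plain cudaStream_t handle, and every stream.id() unwrapping at a launch or API call site becomes the handle itself. A minimal sketch of the pattern, with a hypothetical kernel and a simplified stand-in for the cudaCheck helper this code base already uses:

#include <cassert>
#include <cuda_runtime.h>

// Simplified stand-in for the cudaCheck error helper used in this code base.
#define cudaCheck(call) assert((call) == cudaSuccess)

__global__ void myKernel(int *out) { *out = 42; }  // hypothetical kernel

// Before: wrapper object passed by reference, unwrapped at every use:
//   void launch(int *out, cuda::stream_t<> &stream) {
//     myKernel<<<1, 1, 0, stream.id()>>>(out);
//   }

// After: the raw runtime handle is passed by value and used directly.
void launch(int *out, cudaStream_t stream) {
  myKernel<<<1, 1, 0, stream>>>(out);
  cudaCheck(cudaGetLastError());
}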
2 changes: 1 addition & 1 deletion CUDADataFormats/Track/BuildFile.xml
@@ -1,4 +1,4 @@
-<use name="cuda-api-wrappers"/>
+<use name="cuda"/>
 <use name="rootcore"/>
 <use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
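Dropping the cuda-api-wrappers dependency from a BuildFile means the package's translation units now rely only on the plain CUDA runtime. Presumably the matching source-level change (not shown in this diff) is an include swap along these lines:

// Hypothetical include change accompanying the BuildFile edit:
// #include <cuda/api_wrappers.h>   // before: cuda-api-wrappers types
#include <cuda_runtime.h>           // after: cudaStream_t, cudaEvent_t, ...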
3 changes: 0 additions & 3 deletions RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml
@@ -36,23 +36,20 @@
 <bin file="testEigenGPU.cu" name="testRiemannFitGPU_t">
   <use name="eigen"/>
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
   <flags CXXFLAGS="-g"/>
 </bin>

 <bin file="testEigenGPU.cu" name="testBrokenLineFitGPU_t">
   <use name="eigen"/>
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
   <flags CXXFLAGS="-g -DUSE_BL"/>
 </bin>

 <bin file="testEigenGPUNoFit.cu" name="testEigenGPUNoFit_t">
   <use name="eigen"/>
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
   <flags CXXFLAGS="-g"/>
 </bin>
74 changes: 37 additions & 37 deletions RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
@@ -4,7 +4,7 @@
 void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
                                             uint32_t hitsInFit,
                                             uint32_t maxNumberOfTuples,
-                                            cuda::stream_t<> &stream) {
+                                            cudaStream_t stream) {
   assert(tuples_d);

   auto blockSize = 64;
@@ -20,64 +20,64 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,

   for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
     // fit triplets
-    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
+    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
         tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset);
     cudaCheck(cudaGetLastError());

-    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                  bField_,
-                                                                  outputSoa_d,
-                                                                  hitsGPU_.get(),
-                                                                  hits_geGPU_.get(),
-                                                                  fast_fit_resultsGPU_.get(),
-                                                                  3,
-                                                                  offset);
+    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                             bField_,
+                                                             outputSoa_d,
+                                                             hitsGPU_.get(),
+                                                             hits_geGPU_.get(),
+                                                             fast_fit_resultsGPU_.get(),
+                                                             3,
+                                                             offset);
     cudaCheck(cudaGetLastError());

     // fit quads
-    kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(
+    kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
         tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset);
     cudaCheck(cudaGetLastError());

-    kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                      bField_,
-                                                                      outputSoa_d,
-                                                                      hitsGPU_.get(),
-                                                                      hits_geGPU_.get(),
-                                                                      fast_fit_resultsGPU_.get(),
-                                                                      4,
-                                                                      offset);
+    kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                                 bField_,
+                                                                 outputSoa_d,
+                                                                 hitsGPU_.get(),
+                                                                 hits_geGPU_.get(),
+                                                                 fast_fit_resultsGPU_.get(),
+                                                                 4,
+                                                                 offset);
     cudaCheck(cudaGetLastError());

     if (fit5as4_) {
       // fit penta (only first 4)
-      kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(
+      kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
           tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
       cudaCheck(cudaGetLastError());

-      kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                        bField_,
-                                                                        outputSoa_d,
-                                                                        hitsGPU_.get(),
-                                                                        hits_geGPU_.get(),
-                                                                        fast_fit_resultsGPU_.get(),
-                                                                        5,
-                                                                        offset);
+      kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                                   bField_,
+                                                                   outputSoa_d,
+                                                                   hitsGPU_.get(),
+                                                                   hits_geGPU_.get(),
+                                                                   fast_fit_resultsGPU_.get(),
+                                                                   5,
+                                                                   offset);
       cudaCheck(cudaGetLastError());
     } else {
       // fit penta (all 5)
-      kernelBLFastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(
+      kernelBLFastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
           tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
       cudaCheck(cudaGetLastError());

-      kernelBLFit<5><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                        bField_,
-                                                                        outputSoa_d,
-                                                                        hitsGPU_.get(),
-                                                                        hits_geGPU_.get(),
-                                                                        fast_fit_resultsGPU_.get(),
-                                                                        5,
-                                                                        offset);
+      kernelBLFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                                   bField_,
+                                                                   outputSoa_d,
+                                                                   hitsGPU_.get(),
+                                                                   hits_geGPU_.get(),
+                                                                   fast_fit_resultsGPU_.get(),
+                                                                   5,
+                                                                   offset);
       cudaCheck(cudaGetLastError());
     }

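The fourth slot of the <<<grid, block, sharedMem, stream>>> launch configuration is already a cudaStream_t, which is exactly what the wrapper's .id() accessor returned, so each launch site changes mechanically. A sketch of the chunked-launch pattern used above, with a hypothetical kernel:

#include <cstdint>
#include <cuda_runtime.h>

__global__ void fitChunk(float *data, uint32_t offset) { /* ... */ }

// Launch work in fixed-size chunks on the given stream, as
// launchBrokenLineKernels does with maxNumberOfConcurrentFits_.
void launchInChunks(float *data, uint32_t total, uint32_t chunk, cudaStream_t stream) {
  uint32_t blockSize = 64;
  for (uint32_t offset = 0; offset < total; offset += chunk) {
    uint32_t numberOfBlocks = (chunk + blockSize - 1) / blockSize;
    fitChunk<<<numberOfBlocks, blockSize, 0, stream>>>(data, offset);
  }
}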
1 change: 0 additions & 1 deletion RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -1,5 +1,4 @@
 <use name="cuda"/>
-<use name="cuda-api-wrappers"/>
 <use name="ofast-flag"/>
 <use name="CommonTools/RecoAlgos"/>
 <use name="FWCore/Framework"/>
@@ -11,7 +11,7 @@ void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSo
 }

 template <>
-void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) {
+void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
   auto nhits = hh.nHits();

 #ifdef NTUPLE_DEBUG
@@ -147,7 +147,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 }

 template <>
-void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) {
+void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
   auto nhits = hh.nHits();

 #ifdef NTUPLE_DEBUG
@@ -166,12 +166,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s
     int threadsPerBlock = 128;
     // at least one block!
     int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock;
-    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream.id()>>>(device_isOuterHitOfCell_.get(),
-                                                                                nhits,
-                                                                                device_theCellNeighbors_,
-                                                                                device_theCellNeighborsContainer_.get(),
-                                                                                device_theCellTracks_,
-                                                                                device_theCellTracksContainer_.get());
+    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
+                                                                           nhits,
+                                                                           device_theCellNeighbors_,
+                                                                           device_theCellNeighborsContainer_.get(),
+                                                                           device_theCellTracks_,
+                                                                           device_theCellTracksContainer_.get());
     cudaCheck(cudaGetLastError());
   }

@@ -199,18 +199,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s
   int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock;
   dim3 blks(1, blocks, 1);
   dim3 thrs(stride, threadsPerBlock, 1);
-  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream.id()>>>(device_theCells_.get(),
-                                                                         device_nCells_,
-                                                                         device_theCellNeighbors_,
-                                                                         device_theCellTracks_,
-                                                                         hh.view(),
-                                                                         device_isOuterHitOfCell_.get(),
-                                                                         nActualPairs,
-                                                                         m_params.idealConditions_,
-                                                                         m_params.doClusterCut_,
-                                                                         m_params.doZCut_,
-                                                                         m_params.doPhiCut_,
-                                                                         m_params.maxNumberOfDoublets_);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
+                                                                    device_nCells_,
+                                                                    device_theCellNeighbors_,
+                                                                    device_theCellTracks_,
+                                                                    hh.view(),
+                                                                    device_isOuterHitOfCell_.get(),
+                                                                    nActualPairs,
+                                                                    m_params.idealConditions_,
+                                                                    m_params.doClusterCut_,
+                                                                    m_params.doZCut_,
+                                                                    m_params.doPhiCut_,
+                                                                    m_params.maxNumberOfDoublets_);
   cudaCheck(cudaGetLastError());

 #ifdef GPU_DEBUG
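Both launches above size their grids with the usual round-up division, and initDoublets additionally clamps to at least one block so the kernel still runs when nhits is zero. A sketch of that sizing rule (not the repository's own helper):

#include <algorithm>
#include <cstdint>

// Round-up grid sizing as used above; clamping to 1U keeps at least one
// block when work == 0.
inline uint32_t gridSize(uint32_t work, uint32_t threadsPerBlock) {
  return (std::max(1U, work) + threadsPerBlock - 1) / threadsPerBlock;
}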
@@ -170,8 +170,8 @@ class CAHitNtupletGeneratorKernels {

   void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream);

-  void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream);
-  void allocateOnGPU(cuda::stream_t<>& stream);
+  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
+  void allocateOnGPU(cudaStream_t stream);
   void cleanup(cudaStream_t cudaStream);

   static void printCounters(Counters const* counters);
@@ -4,9 +4,9 @@

 template <>
 #ifdef __CUDACC__
-void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cuda::stream_t<>& stream) {
+void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) {
 #else
-void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) {
+void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
 #endif
   //////////////////////////////////////////////////////////
   // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
@@ -42,10 +42,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) {
       constexpr
 #endif
       (std::is_same<Traits, cudaCompat::GPUTraits>::value) {
-    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id()));
+    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream));
   } else {
     *device_nCells_ = 0;
   }
-  cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id());
-  cudautils::launchZero(device_hitToTuple_.get(), stream.id());  // we may wish to keep it in the edm...
+  cudautils::launchZero(device_tupleMultiplicity_.get(), stream);
+  cudautils::launchZero(device_hitToTuple_.get(), stream);  // we may wish to keep it in the edm...
 }
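No wrapper is needed here either: cudaMemsetAsync takes the raw cudaStream_t as its last argument, so the old stream.id() unwrapping simply disappears. A minimal sketch (error handling elided):

#include <cstdint>
#include <cuda_runtime.h>

// cudaMemsetAsync accepts the plain stream handle directly.
void zeroCounterAsync(uint32_t *device_counter, cudaStream_t stream) {
  cudaMemsetAsync(device_counter, 0, sizeof(uint32_t), stream);
}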
@@ -161,7 +161,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription&

 PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
                                                                     float bfield,
-                                                                    cuda::stream_t<>& stream) const {
+                                                                    cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cudautils::make_device_unique<pixelTrack::TrackSoA>(stream));

   auto* soa = tracks.get();
@@ -174,32 +174,31 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);

   kernels.buildDoublets(hits_d, stream);
-  kernels.launchKernels(hits_d, soa, stream.id());
-  kernels.fillHitDetIndices(hits_d.view(), soa, stream.id());  // in principle needed only if Hits not "available"
+  kernels.launchKernels(hits_d, soa, stream);
+  kernels.fillHitDetIndices(hits_d.view(), soa, stream);  // in principle needed only if Hits not "available"
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
   } else {
     fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
   }
-  kernels.classifyTuples(hits_d, soa, stream.id());
+  kernels.classifyTuples(hits_d, soa, stream);

   return tracks;
 }

 PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
   PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
-  auto dummyStream = cuda::stream::wrap(0, 0, false);

   auto* soa = tracks.get();
   assert(soa);

   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.counters_ = m_counters;
-  kernels.allocateOnGPU(dummyStream);
+  kernels.allocateOnGPU(nullptr);

-  kernels.buildDoublets(hits_d, dummyStream);
-  kernels.launchKernels(hits_d, soa, dummyStream.id());
-  kernels.fillHitDetIndices(hits_d.view(), soa, dummyStream.id());  // in principle needed only if Hits not "available"
+  kernels.buildDoublets(hits_d, nullptr);
+  kernels.launchKernels(hits_d, soa, nullptr);
+  kernels.fillHitDetIndices(hits_d.view(), soa, nullptr);  // in principle needed only if Hits not "available"

   if (0 == hits_d.nHits())
     return tracks;
@@ -214,7 +213,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
     fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets());
   }

-  kernels.classifyTuples(hits_d, soa, dummyStream.id());
+  kernels.classifyTuples(hits_d, soa, nullptr);

   return tracks;
 }
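On the CPU path, the non-owning wrapper around stream 0 (cuda::stream::wrap(0, 0, false)) is replaced by passing a null cudaStream_t: a null handle denotes CUDA's default (legacy) stream, and the CPU specializations presumably never enqueue real device work on it anyway. A sketch of the equivalence:

#include <cstddef>
#include <cuda_runtime.h>

// A null cudaStream_t is the default (legacy) stream -- the same stream the
// old cuda::stream::wrap(0, 0, false) referred to without owning it.
void enqueueOnDefaultStream(float *ptr, size_t n) {
  cudaStream_t stream = nullptr;  // default stream
  cudaMemsetAsync(ptr, 0, n, stream);
}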
@@ -47,18 +47,16 @@ class CAHitNtupletGeneratorOnGPU {
   static void fillDescriptions(edm::ParameterSetDescription& desc);
   static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; }

-  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                          float bfield,
-                                          cuda::stream_t<>& stream) const;
+  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;

   PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;

 private:
-  void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const;
+  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const;

-  void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cuda::stream_t<>& cudaStream);
+  void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream);

-  void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const;
+  void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const;

   Params m_params;
10 changes: 2 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -45,14 +45,8 @@ class HelixFitOnGPU {
   ~HelixFitOnGPU() { deallocateOnGPU(); }

   void setBField(double bField) { bField_ = bField; }
-  void launchRiemannKernels(HitsView const *hv,
-                            uint32_t nhits,
-                            uint32_t maxNumberOfTuples,
-                            cuda::stream_t<> &cudaStream);
-  void launchBrokenLineKernels(HitsView const *hv,
-                               uint32_t nhits,
-                               uint32_t maxNumberOfTuples,
-                               cuda::stream_t<> &cudaStream);
+  void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
+  void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);

   void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
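Note that the plain handle is passed by value rather than by reference: cudaStream_t is an opaque pointer type, so copying it costs the same as copying a pointer, which is also why the four-line wrapped parameter lists collapse to a single line each. A tiny compile-time check of that assumption:

#include <cuda_runtime.h>
#include <type_traits>

// cudaStream_t is an opaque pointer (to CUstream_st), so pass-by-value is
// as cheap as passing any pointer; no reference parameter is needed.
static_assert(std::is_pointer<cudaStream_t>::value,
              "cudaStream_t is a raw pointer-like handle");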