Replace use of API wrapper stream and event with plain CUDA, part 1 (#389)

Replace cuda::stream_t<> with cudaStream_t in client code
Replace cuda::event_t with cudaEvent_t in the client code
Clean up BuildFiles
makortel authored and fwyzard committed Nov 6, 2020
1 parent 1d5f516 commit 957d2de
Showing 12 changed files with 153 additions and 166 deletions.
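The change is mechanical throughout: every cuda::stream_t<> parameter becomes a plain cudaStream_t handle, and every stream.id() unwrapping at a launch or API call site becomes the handle itself. A minimal sketch of the pattern, with a hypothetical kernel and a simplified stand-in for the cudaCheck helper this code base already uses:

#include <cassert>
#include <cuda_runtime.h>

// Simplified stand-in for the cudaCheck error helper used in this code base.
#define cudaCheck(call) assert((call) == cudaSuccess)

__global__ void myKernel(int *out) { *out = 42; }  // hypothetical kernel

// Before: wrapper object passed by reference, unwrapped at every use:
//   void launch(int *out, cuda::stream_t<> &stream) {
//     myKernel<<<1, 1, 0, stream.id()>>>(out);
//   }

// After: the raw runtime handle is passed by value and used directly.
void launch(int *out, cudaStream_t stream) {
  myKernel<<<1, 1, 0, stream>>>(out);
  cudaCheck(cudaGetLastError());
}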
2 changes: 1 addition & 1 deletion CUDADataFormats/Track/BuildFile.xml
@@ -1,4 +1,4 @@
-<use name="cuda-api-wrappers"/>
+<use name="cuda"/>
 <use name="rootcore"/>
 <use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
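Dropping the cuda-api-wrappers dependency from a BuildFile means the package's translation units now rely only on the plain CUDA runtime. Presumably the matching source-level change (not shown in this diff) is an include swap along these lines:

// Hypothetical include change accompanying the BuildFile edit:
// #include <cuda/api_wrappers.h>   // before: cuda-api-wrappers types
#include <cuda_runtime.h>           // after: cudaStream_t, cudaEvent_t, ...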
3 changes: 0 additions & 3 deletions RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml
@@ -36,23 +36,20 @@
 <bin file="testEigenGPU.cu" name="testRiemannFitGPU_t">
   <use name="eigen"/>
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
   <flags CXXFLAGS="-g"/>
 </bin>

 <bin file="testEigenGPU.cu" name="testBrokenLineFitGPU_t">
   <use name="eigen"/>
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
   <flags CXXFLAGS="-g -DUSE_BL"/>
 </bin>

 <bin file="testEigenGPUNoFit.cu" name="testEigenGPUNoFit_t">
   <use name="eigen"/>
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
   <flags CXXFLAGS="-g"/>
 </bin>
74 changes: 37 additions & 37 deletions RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
@@ -4,7 +4,7 @@
 void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
                                             uint32_t hitsInFit,
                                             uint32_t maxNumberOfTuples,
-                                            cuda::stream_t<> &stream) {
+                                            cudaStream_t stream) {
   assert(tuples_d);

   auto blockSize = 64;
@@ -20,64 +20,64 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,

   for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
     // fit triplets
-    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
+    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
         tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset);
     cudaCheck(cudaGetLastError());

-    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                  bField_,
-                                                                  outputSoa_d,
-                                                                  hitsGPU_.get(),
-                                                                  hits_geGPU_.get(),
-                                                                  fast_fit_resultsGPU_.get(),
-                                                                  3,
-                                                                  offset);
+    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                             bField_,
+                                                             outputSoa_d,
+                                                             hitsGPU_.get(),
+                                                             hits_geGPU_.get(),
+                                                             fast_fit_resultsGPU_.get(),
+                                                             3,
+                                                             offset);
     cudaCheck(cudaGetLastError());

     // fit quads
-    kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(
+    kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
         tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset);
     cudaCheck(cudaGetLastError());

-    kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                      bField_,
-                                                                      outputSoa_d,
-                                                                      hitsGPU_.get(),
-                                                                      hits_geGPU_.get(),
-                                                                      fast_fit_resultsGPU_.get(),
-                                                                      4,
-                                                                      offset);
+    kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                                 bField_,
+                                                                 outputSoa_d,
+                                                                 hitsGPU_.get(),
+                                                                 hits_geGPU_.get(),
+                                                                 fast_fit_resultsGPU_.get(),
+                                                                 4,
+                                                                 offset);
     cudaCheck(cudaGetLastError());

     if (fit5as4_) {
       // fit penta (only first 4)
-      kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(
+      kernelBLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
           tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
       cudaCheck(cudaGetLastError());

-      kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                        bField_,
-                                                                        outputSoa_d,
-                                                                        hitsGPU_.get(),
-                                                                        hits_geGPU_.get(),
-                                                                        fast_fit_resultsGPU_.get(),
-                                                                        5,
-                                                                        offset);
+      kernelBLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                                   bField_,
+                                                                   outputSoa_d,
+                                                                   hitsGPU_.get(),
+                                                                   hits_geGPU_.get(),
+                                                                   fast_fit_resultsGPU_.get(),
+                                                                   5,
+                                                                   offset);
       cudaCheck(cudaGetLastError());
     } else {
       // fit penta (all 5)
-      kernelBLFastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(
+      kernelBLFastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
           tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
       cudaCheck(cudaGetLastError());

-      kernelBLFit<5><<<numberOfBlocks / 4, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
-                                                                        bField_,
-                                                                        outputSoa_d,
-                                                                        hitsGPU_.get(),
-                                                                        hits_geGPU_.get(),
-                                                                        fast_fit_resultsGPU_.get(),
-                                                                        5,
-                                                                        offset);
+      kernelBLFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_d,
+                                                                   bField_,
+                                                                   outputSoa_d,
+                                                                   hitsGPU_.get(),
+                                                                   hits_geGPU_.get(),
+                                                                   fast_fit_resultsGPU_.get(),
+                                                                   5,
+                                                                   offset);
       cudaCheck(cudaGetLastError());
     }

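The fourth slot of the <<<grid, block, sharedMem, stream>>> launch configuration is already a cudaStream_t, which is exactly what the wrapper's .id() accessor returned, so each launch site changes mechanically. A sketch of the chunked-launch pattern used above, with a hypothetical kernel:

#include <cstdint>
#include <cuda_runtime.h>

__global__ void fitChunk(float *data, uint32_t offset) { /* ... */ }

// Launch work in fixed-size chunks on the given stream, as
// launchBrokenLineKernels does with maxNumberOfConcurrentFits_.
void launchInChunks(float *data, uint32_t total, uint32_t chunk, cudaStream_t stream) {
  uint32_t blockSize = 64;
  for (uint32_t offset = 0; offset < total; offset += chunk) {
    uint32_t numberOfBlocks = (chunk + blockSize - 1) / blockSize;
    fitChunk<<<numberOfBlocks, blockSize, 0, stream>>>(data, offset);
  }
}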
1 change: 0 additions & 1 deletion RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -1,5 +1,4 @@
 <use name="cuda"/>
-<use name="cuda-api-wrappers"/>
 <use name="ofast-flag"/>
 <use name="CommonTools/RecoAlgos"/>
 <use name="FWCore/Framework"/>
@@ -11,7 +11,7 @@ void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSo
 }

 template <>
-void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) {
+void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
   auto nhits = hh.nHits();

 #ifdef NTUPLE_DEBUG
@@ -147,7 +147,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 }

 template <>
-void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) {
+void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
   auto nhits = hh.nHits();

 #ifdef NTUPLE_DEBUG
@@ -166,12 +166,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s
     int threadsPerBlock = 128;
     // at least one block!
     int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock;
-    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream.id()>>>(device_isOuterHitOfCell_.get(),
-                                                                                nhits,
-                                                                                device_theCellNeighbors_,
-                                                                                device_theCellNeighborsContainer_.get(),
-                                                                                device_theCellTracks_,
-                                                                                device_theCellTracksContainer_.get());
+    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
+                                                                           nhits,
+                                                                           device_theCellNeighbors_,
+                                                                           device_theCellNeighborsContainer_.get(),
+                                                                           device_theCellTracks_,
+                                                                           device_theCellTracksContainer_.get());
     cudaCheck(cudaGetLastError());
   }

@@ -199,18 +199,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s
   int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock;
   dim3 blks(1, blocks, 1);
   dim3 thrs(stride, threadsPerBlock, 1);
-  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream.id()>>>(device_theCells_.get(),
-                                                                         device_nCells_,
-                                                                         device_theCellNeighbors_,
-                                                                         device_theCellTracks_,
-                                                                         hh.view(),
-                                                                         device_isOuterHitOfCell_.get(),
-                                                                         nActualPairs,
-                                                                         m_params.idealConditions_,
-                                                                         m_params.doClusterCut_,
-                                                                         m_params.doZCut_,
-                                                                         m_params.doPhiCut_,
-                                                                         m_params.maxNumberOfDoublets_);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
+                                                                    device_nCells_,
+                                                                    device_theCellNeighbors_,
+                                                                    device_theCellTracks_,
+                                                                    hh.view(),
+                                                                    device_isOuterHitOfCell_.get(),
+                                                                    nActualPairs,
+                                                                    m_params.idealConditions_,
+                                                                    m_params.doClusterCut_,
+                                                                    m_params.doZCut_,
+                                                                    m_params.doPhiCut_,
+                                                                    m_params.maxNumberOfDoublets_);
   cudaCheck(cudaGetLastError());

 #ifdef GPU_DEBUG
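Both launches above size their grids with the usual round-up division, and initDoublets additionally clamps to at least one block so the kernel still runs when nhits is zero. A sketch of that sizing rule (not the repository's own helper):

#include <algorithm>
#include <cstdint>

// Round-up grid sizing as used above; clamping to 1U keeps at least one
// block when work == 0.
inline uint32_t gridSize(uint32_t work, uint32_t threadsPerBlock) {
  return (std::max(1U, work) + threadsPerBlock - 1) / threadsPerBlock;
}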
@@ -170,8 +170,8 @@ class CAHitNtupletGeneratorKernels {

   void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream);

-  void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream);
-  void allocateOnGPU(cuda::stream_t<>& stream);
+  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
+  void allocateOnGPU(cudaStream_t stream);
   void cleanup(cudaStream_t cudaStream);

   static void printCounters(Counters const* counters);
@@ -4,9 +4,9 @@

 template <>
 #ifdef __CUDACC__
-void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cuda::stream_t<>& stream) {
+void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) {
 #else
-void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) {
+void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
 #endif
   //////////////////////////////////////////////////////////
   // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
@@ -42,10 +42,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) {
       constexpr
 #endif
       (std::is_same<Traits, cudaCompat::GPUTraits>::value) {
-    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id()));
+    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream));
   } else {
     *device_nCells_ = 0;
   }
-  cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id());
-  cudautils::launchZero(device_hitToTuple_.get(), stream.id());  // we may wish to keep it in the edm...
+  cudautils::launchZero(device_tupleMultiplicity_.get(), stream);
+  cudautils::launchZero(device_hitToTuple_.get(), stream);  // we may wish to keep it in the edm...
 }
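No wrapper is needed here either: cudaMemsetAsync takes the raw cudaStream_t as its last argument, so the old stream.id() unwrapping simply disappears. A minimal sketch (error handling elided):

#include <cstdint>
#include <cuda_runtime.h>

// cudaMemsetAsync accepts the plain stream handle directly.
void zeroCounterAsync(uint32_t *device_counter, cudaStream_t stream) {
  cudaMemsetAsync(device_counter, 0, sizeof(uint32_t), stream);
}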
@@ -161,7 +161,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription&

 PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
                                                                     float bfield,
-                                                                    cuda::stream_t<>& stream) const {
+                                                                    cudaStream_t stream) const {
   PixelTrackHeterogeneous tracks(cudautils::make_device_unique<pixelTrack::TrackSoA>(stream));

   auto* soa = tracks.get();
@@ -174,32 +174,31 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);

   kernels.buildDoublets(hits_d, stream);
-  kernels.launchKernels(hits_d, soa, stream.id());
-  kernels.fillHitDetIndices(hits_d.view(), soa, stream.id());  // in principle needed only if Hits not "available"
+  kernels.launchKernels(hits_d, soa, stream);
+  kernels.fillHitDetIndices(hits_d.view(), soa, stream);  // in principle needed only if Hits not "available"
   if (m_params.useRiemannFit_) {
     fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
   } else {
     fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
   }
-  kernels.classifyTuples(hits_d, soa, stream.id());
+  kernels.classifyTuples(hits_d, soa, stream);

   return tracks;
 }

 PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
   PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
-  auto dummyStream = cuda::stream::wrap(0, 0, false);

   auto* soa = tracks.get();
   assert(soa);

   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.counters_ = m_counters;
-  kernels.allocateOnGPU(dummyStream);
+  kernels.allocateOnGPU(nullptr);

-  kernels.buildDoublets(hits_d, dummyStream);
-  kernels.launchKernels(hits_d, soa, dummyStream.id());
-  kernels.fillHitDetIndices(hits_d.view(), soa, dummyStream.id());  // in principle needed only if Hits not "available"
+  kernels.buildDoublets(hits_d, nullptr);
+  kernels.launchKernels(hits_d, soa, nullptr);
+  kernels.fillHitDetIndices(hits_d.view(), soa, nullptr);  // in principle needed only if Hits not "available"

   if (0 == hits_d.nHits())
     return tracks;
@@ -214,7 +213,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
     fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets());
   }

-  kernels.classifyTuples(hits_d, soa, dummyStream.id());
+  kernels.classifyTuples(hits_d, soa, nullptr);

   return tracks;
 }
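On the CPU path, the non-owning wrapper around stream 0 (cuda::stream::wrap(0, 0, false)) is replaced by passing a null cudaStream_t: a null handle denotes CUDA's default (legacy) stream, and the CPU specializations presumably never enqueue real device work on it anyway. A sketch of the equivalence:

#include <cstddef>
#include <cuda_runtime.h>

// A null cudaStream_t is the default (legacy) stream -- the same stream the
// old cuda::stream::wrap(0, 0, false) referred to without owning it.
void enqueueOnDefaultStream(float *ptr, size_t n) {
  cudaStream_t stream = nullptr;  // default stream
  cudaMemsetAsync(ptr, 0, n, stream);
}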
@@ -47,18 +47,16 @@ class CAHitNtupletGeneratorOnGPU {
   static void fillDescriptions(edm::ParameterSetDescription& desc);
   static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; }

-  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
-                                          float bfield,
-                                          cuda::stream_t<>& stream) const;
+  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;

   PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;

 private:
-  void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const;
+  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const;

-  void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cuda::stream_t<>& cudaStream);
+  void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream);

-  void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const;
+  void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const;

   Params m_params;
10 changes: 2 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -45,14 +45,8 @@ class HelixFitOnGPU {
   ~HelixFitOnGPU() { deallocateOnGPU(); }

   void setBField(double bField) { bField_ = bField; }
-  void launchRiemannKernels(HitsView const *hv,
-                            uint32_t nhits,
-                            uint32_t maxNumberOfTuples,
-                            cuda::stream_t<> &cudaStream);
-  void launchBrokenLineKernels(HitsView const *hv,
-                               uint32_t nhits,
-                               uint32_t maxNumberOfTuples,
-                               cuda::stream_t<> &cudaStream);
+  void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
+  void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);

   void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
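Note that the plain handle is passed by value rather than by reference: cudaStream_t is an opaque pointer type, so copying it costs the same as copying a pointer, which is also why the four-line wrapped parameter lists collapse to a single line each. A tiny compile-time check of that assumption:

#include <cuda_runtime.h>
#include <type_traits>

// cudaStream_t is an opaque pointer (to CUstream_st), so pass-by-value is
// as cheap as passing any pointer; no reference parameter is needed.
static_assert(std::is_pointer<cudaStream_t>::value,
              "cudaStream_t is a raw pointer-like handle");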