Move BeamSpot transfer to GPU to its own producer (#318)

Implement a non-caching host allocator, useful for host-to-device copy buffers: - not bound to any CUDA stream to allow use in EDM beginStream(); - with the possibility to pass flags to cudaHostAlloc(), e.g. cudaHostAllocWriteCombined. Add perfect forwarding overload for CUDAProduct constructor, enabling the use of CUDAScopedContext::emplace() in BeamSpotToCUDA::produce(). Move the BeamSpot host-to-device transfer to its own EDProducer, making use of beginStream()-allocated write-combined memory for the transfer.
cms-patatrack · Dec 29, 2020 · 6658678 · 6658678
1 parent 18ccfc7
commit 6658678
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 14 deletions.
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -24,7 +24,9 @@
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
 #include "RecoTracker/Record/interface/CkfComponentsRecord.h"
 
@@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
   std::unique_ptr<PixelUnpackingRegions> regions_;
 
   pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
+  std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
   PixelDataFormatter::Errors errors_;
 
   const bool includeErrors_;
@@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
   }
 
   if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster")  << " Use pilot blade data (FED 40)";
+
+  edm::Service<CUDAService> cs;
+  if(cs->enabled()) {
+    wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
+  }
 }
 
 void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
 
   // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
   ErrorChecker errorcheck;
-  auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
   for(int fedId: fedIds_) {
     if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
     if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
@@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
     const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);
 
     assert(0 == (ew-bw)%2);
-    wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
+    wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
     wordCounterGPU+=(ew-bw);
 
   } // end of for loop
 
   gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
-                             wordFedAppender,
+                             *wordFedAppender_,
                              std::move(errors_),
                              wordCounterGPU, fedCounter, 
                              useQuality_, includeErrors_,

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -45,10 +45,9 @@ namespace pixelgpudetails {
   // number of words for all the FEDs
   constexpr uint32_t MAX_FED_WORDS   = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
 
-  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
-    edm::Service<CUDAService> cs;
-    word_ = cs->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
-    fedId_ = cs->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
+  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() {
+    word_ = cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
+    fedId_ = cudautils::make_host_noncached_unique<unsigned char[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
   }
 
   void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -11,6 +11,7 @@
 #include "FWCore/Utilities/interface/typedefs.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
 #include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
 
 struct SiPixelFedCablingMapGPU;
@@ -159,7 +160,7 @@ namespace pixelgpudetails {
   public:
     class WordFedAppender {
     public:
-      WordFedAppender(cuda::stream_t<>& cudaStream);
+      WordFedAppender();
       ~WordFedAppender() = default;
 
       void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
@@ -168,8 +169,8 @@ namespace pixelgpudetails {
       const unsigned char *fedId() const { return fedId_.get(); }
 
     private:
-      cudautils::host::unique_ptr<unsigned int[]> word_;
-      cudautils::host::unique_ptr<unsigned char[]> fedId_;
+      cudautils::host::noncached::unique_ptr<unsigned int[]> word_;
+      cudautils::host::noncached::unique_ptr<unsigned char[]> fedId_;
     };
 
     SiPixelRawToClusterGPUKernel() = default;

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+<use name="CUDADataFormats/BeamSpot"/>
 <use name="DataFormats/TrackerCommon"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/Producer"/>

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <limits>
 
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
@@ -15,7 +16,7 @@ namespace gpuPixelRecHits {
 
 
   __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__  cpeParams,
-                          float const * __restrict__  bs,
+                          BeamSpotCUDA::Data const * __restrict__  bs,
                           uint16_t const * __restrict__  id,
 			  uint16_t const * __restrict__  x,
 			  uint16_t const * __restrict__  y,
@@ -143,9 +144,9 @@ namespace gpuPixelRecHits {
     // to global and compute phi... 
     cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]);
     // here correct for the beamspot...
-    xg[h]-=bs[0];
-    yg[h]-=bs[1];
-    zg[h]-=bs[2];
+    xg[h]-=bs->x;
+    yg[h]-=bs->y;
+    zg[h]-=bs->z;
 
     rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]);
     iph[h] = unsafe_atan2s<7>(yg[h],xg[h]);