From 41cd273dfa2f0647c0c133de005e6442af1e561c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 29 Mar 2019 06:50:23 -0400 Subject: [PATCH] Migrate the pixel rechits producer and CA to the new heterogeneous framework (#338) Use cleaned hits. Use pixel layer and ladders geometry, and use pixel triplets in the gaps. Optimise GPU memory usage: - reduce the number of memory allocations - fix the size of the cub workspace - allocate memory per event via the caching allocator - use constant memory for geometry and parameters - use shared memory where the content is the same for every thread Optimise kernel launches, and add a protection for empty events and overflows. --- .../python/RecoLocalTracker_cff.py | 6 +- .../interface/PixelTrackingGPUConstants.h | 11 - .../plugins/SiPixelDigisClustersFromSoA.cc | 2 +- .../plugins/SiPixelRawToClusterGPUKernel.cu | 85 +++-- .../plugins/gpuCalibPixel.h | 123 ++----- .../plugins/gpuClusteringConstants.h | 10 +- .../siPixelClustersHeterogeneous_cfi.py | 37 --- .../SiPixelRecHits/interface/PixelCPEFast.h | 1 + .../SiPixelRecHits/interface/pixelCPEforGPU.h | 22 +- .../SiPixelRecHits/plugins/BuildFile.xml | 1 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 189 ++--------- .../SiPixelRecHits/plugins/PixelRecHits.h | 52 +-- .../plugins/SiPixelRecHitCUDA.cc | 118 +++++++ .../plugins/SiPixelRecHitFromSOA.cc | 187 +++++++++++ .../plugins/SiPixelRecHitHeterogeneous.cc | 313 ------------------ .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 74 +++-- .../siPixelRecHitsHeterogeneousProduct.h | 79 ----- .../python/SiPixelRecHits_cfi.py | 33 +- .../SiPixelRecHits/src/PixelCPEFast.cc | 49 ++- .../customizePixelTracksForProfiling.py | 2 - .../PixelTrackFitting/interface/FitResult.h | 4 +- .../plugins/PixelTrackProducerFromCUDA.cc | 1 - .../plugins/BrokenLineFitOnGPU.cu | 68 ++-- .../PixelTriplets/plugins/BuildFile.xml | 2 - .../PixelTriplets/plugins/CAConstants.h | 30 +- .../CAHitNtupletHeterogeneousEDProducer.cc | 41 ++- 
.../plugins/CAHitQuadrupletGeneratorGPU.cc | 46 ++- .../plugins/CAHitQuadrupletGeneratorGPU.h | 21 +- .../CAHitQuadrupletGeneratorKernels.cu | 139 +++++--- .../plugins/CAHitQuadrupletGeneratorKernels.h | 47 +-- .../CAHitQuadrupletGeneratorKernelsAlloc.cu | 41 ++- .../PixelTriplets/plugins/GPUCACell.h | 94 ++++-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 17 - .../PixelTriplets/plugins/HelixFitOnGPU.h | 22 +- .../PixelTriplets/plugins/RecHitsMap.h | 3 + .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 80 +++-- .../PixelTriplets/plugins/gpuFishbone.h | 6 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 271 +++------------ .../plugins/gpuPixelDoubletsAlgos.h | 220 ++++++++++++ .../plugins/pixelTuplesHeterogeneousProduct.h | 6 +- 40 files changed, 1253 insertions(+), 1300 deletions(-) delete mode 100644 RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h delete mode 100644 RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py create mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc create mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc delete mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc delete mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py index a486a83d178f4..08f871e45f8d7 100644 --- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py +++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py @@ -13,7 +13,7 @@ from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import * from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import * -pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting) 
+pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplittingTask) striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits) trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer) @@ -21,10 +21,6 @@ striptrackerlocalreco = cms.Sequence(striptrackerlocalrecoTask) trackerlocalreco = cms.Sequence(trackerlocalrecoTask) -from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import siPixelRecHitHeterogeneous as _siPixelRecHitHeterogeneous -gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneous) - from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import * from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import * diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h b/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h deleted file mode 100644 index ceb831d7865b6..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h -#define RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h - -#include - -namespace PixelGPUConstants { - constexpr uint16_t maxNumberOfHits = 40000; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away - -} - -#endif // RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc index 2c7da14cf72af..ba184d766feaf 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc +++ 
b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -149,7 +149,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con } // fill final clusters - fillClusters((*detDigis).detId()); + if (detDigis) fillClusters((*detDigis).detId()); //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; iEvent.put(digiPutToken_, std::move(collection)); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 8fdb2ed8c90d5..6a832128c1cc2 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -36,6 +36,7 @@ #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" // local includes #include "SiPixelRawToClusterGPUKernel.h" @@ -456,6 +457,54 @@ namespace pixelgpudetails { } // end of Raw to Digi kernel + + __global__ + void fillHitsModuleStart(uint32_t const * __restrict__ cluStart, uint32_t * __restrict__ moduleStart) { + + assert(gpuClustering::MaxNumModules<2048); // easy to extend at least till 32*1024 + assert(1==gridDim.x); + assert(0==blockIdx.x); + + int first = threadIdx.x; + + // limit to MaxHitsInModule; + for (int i=first, iend=gpuClustering::MaxNumModules; i=moduleStart[1023]); + assert(moduleStart[1025]>=moduleStart[1024]); + assert(moduleStart[gpuClustering::MaxNumModules]>=moduleStart[1025]); + + for (int i=first, iend=gpuClustering::MaxNumModules+1; i=moduleStart[i-i]); + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + if (i==96 
|| i==1184 || i==1744 || i==gpuClustering::MaxNumModules) printf("moduleStart %d %d\n",i,moduleStart[i]); + } +#endif + + // avoid overflow + constexpr auto MAX_HITS = gpuClustering::MaxNumClusters; + for (int i=first, iend=gpuClustering::MaxNumModules+1; i MAX_HITS) moduleStart[i] = MAX_HITS; + } + } + + // Interface to outside void SiPixelRawToClusterGPUKernel::makeClustersAsync( const SiPixelFedCablingMapGPU *cablingMap, @@ -478,6 +527,7 @@ namespace pixelgpudetails { edm::Service cs; nModules_Clusters_h = cs->make_host_unique(2, stream); + if (wordCounter) // protect in case of empty event.... { const int threadsPerBlock = 512; const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all @@ -511,19 +561,24 @@ namespace pixelgpudetails { digiErrors_d.copyErrorToHostAsync(stream); } } - // End of Raw2Digi and passing data for cluserisation + // End of Raw2Digi and passing data for clustering { // clusterizer ... using namespace gpuClustering; int threadsPerBlock = 256; - int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(int(wordCounter),int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; + gpuCalibPixel::calibDigis<<>>( digis_d.moduleInd(), digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), gains, - wordCounter); + wordCounter, + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.clusModuleStart() + ); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -532,8 +587,6 @@ namespace pixelgpudetails { << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -546,7 +599,6 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - 
cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( digis_d.c_moduleInd(), digis_d.c_xx(), digis_d.c_yy(), @@ -567,26 +619,19 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); + // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of // synchronization/ExternalWork - // - // Temporary storage - size_t tempScanStorageSize = 0; - { - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); - } - auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream); - // Set first the first element to 0 - cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); - // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize, - clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, - stream.id())); + + // MUST be ONE block + fillHitsModuleStart<<<1, 1024, 0, stream.id()>>>(clusters_d.c_clusInModule(),clusters_d.clusModuleStart()); + // last element holds the number of all clusters cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + } // end clusterizer scope } } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h index 5a681e791f94f..5087516fa009d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h @@ -5,6 +5,9 @@ #include #include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h" + +#include "gpuClusteringConstants.h" + #include 
"HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace gpuCalibPixel { @@ -22,104 +25,46 @@ namespace gpuCalibPixel { uint16_t const * __restrict__ y, uint16_t * adc, SiPixelGainForHLTonGPU const * __restrict__ ped, - int numElements + int numElements, + uint32_t * __restrict__ moduleStart, // just to zero first + uint32_t * __restrict__ nClustersInModule, // just to zero them + uint32_t * __restrict__ clusModuleStart // just to zero first ) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= numElements) return; - if (InvId==id[i]) return; + int first = blockDim.x * blockIdx.x + threadIdx.x; - float conversionFactor = id[i]<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = id[i]<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; - - bool isDeadColumn=false, isNoisyColumn=false; - - int row = x[i]; - int col = y[i]; - auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn); - float pedestal = ret.first; float gain = ret.second; - // float pedestal = 0; float gain = 1.; - if ( isDeadColumn | isNoisyColumn ) - { - id[i]=InvId; adc[i] =0; - printf("bad pixel at %d in %d\n",i,id[i]); - } - else { - float vcal = adc[i] * gain - pedestal*gain; - adc[i] = std::max(100, int( vcal * conversionFactor + offset)); + // zero for next kernels... 
+ if (0==first) clusModuleStart[0] = moduleStart[0]=0; + for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x*blockDim.x) { + nClustersInModule[i]=0; } - // if (threadIdx.x==0) - // printf ("calibrated %d\n",id[i]); -} - - __global__ void calibADCByModule(uint16_t * id, - uint16_t const * __restrict__ x, - uint16_t const * __restrict__ y, - uint16_t * adc, - uint32_t * moduleStart, - SiPixelGainForHLTonGPU const * __restrict__ ped, - int numElements - ) -{ - - - auto first = moduleStart[1 + blockIdx.x]; - - auto me = id[first]; - - assert(me<2000); + for (int i = first; i < numElements; i += gridDim.x*blockDim.x) { + if (InvId==id[i]) continue; - /// depends on "me" + float conversionFactor = id[i]<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; + float offset = id[i]<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; - float conversionFactor = me<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = me<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; + bool isDeadColumn=false, isNoisyColumn=false; - -#ifdef GPU_DEBUG - if (me%100==1) - if (threadIdx.x==0) printf("start pixel calibration for module %d in block %d\n",me,blockIdx.x); -#endif - - first+=threadIdx.x; - - // __syncthreads(); - - float pedestal=0,gain=0; - bool isDeadColumn=false, isNoisyColumn=false; - int oldCol=-1, oldAveragedBlock=-1; - - for (int i=first; inumberOfRowsAveragedOver_; // 80.... ( row<80 will be faster...) 
- if ( (col!=oldCol) | ( averagedBlock != oldAveragedBlock) ) { - oldCol=col; oldAveragedBlock= averagedBlock; - auto ret = ped->getPedAndGain(me,col, row, isDeadColumn, isNoisyColumn); - pedestal = ret.first; gain = ret.second; - } - if ( isDeadColumn | isNoisyColumn ) - { id[i]=InvId; adc[i] =0; } - else { - float vcal = adc[i] * gain - pedestal*gain; - adc[i] = std::max(100, int( vcal * conversionFactor + offset)); - } - } - - __syncthreads(); - //reset start - if(0==threadIdx.x) { - auto & k = moduleStart[1 + blockIdx.x]; - while (id[k]==InvId) ++k; + int row = x[i]; + int col = y[i]; + auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn); + float pedestal = ret.first; float gain = ret.second; + // float pedestal = 0; float gain = 1.; + if ( isDeadColumn | isNoisyColumn ) + { + id[i]=InvId; adc[i] =0; + printf("bad pixel at %d in %d\n",i,id[i]); + } + else { + float vcal = adc[i] * gain - pedestal*gain; + adc[i] = std::max(100, int( vcal * conversionFactor + offset)); + } } - - - } - - + + } } -#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h +#endif diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h index 7b4bb5a1c8c95..dc03de8c690c9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h @@ -1,14 +1,6 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h -#include - -namespace gpuClustering { - constexpr uint32_t MaxNumModules = 2000; - constexpr uint32_t MaxNumPixels = 256 * 2000; // this does not mean maxPixelPerModule == 256! 
- constexpr uint32_t MaxNumClustersPerModules = 1024; - constexpr uint16_t InvId = 9999; // must be > MaxNumModules - -} +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py deleted file mode 100644 index b86520bb28287..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py +++ /dev/null @@ -1,37 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneousDefault_cfi import siPixelClustersHeterogeneousDefault as _siPixelClustersHeterogeneousDefault -siPixelClustersHeterogeneous = _siPixelClustersHeterogeneousDefault.clone() - -# following copied from SiPixelRawToDigi_cfi -siPixelClustersHeterogeneous.IncludeErrors = cms.bool(True) -siPixelClustersHeterogeneous.InputLabel = cms.InputTag("rawDataCollector") -siPixelClustersHeterogeneous.UseQualityInfo = cms.bool(False) -## ErrorList: list of error codes used by tracking to invalidate modules -siPixelClustersHeterogeneous.ErrorList = cms.vint32(29) -## UserErrorList: list of error codes used by Pixel experts for investigation -siPixelClustersHeterogeneous.UserErrorList = cms.vint32(40) -## Use pilot blades -siPixelClustersHeterogeneous.UsePilotBlade = cms.bool(False) -## Use phase1 -siPixelClustersHeterogeneous.UsePhase1 = cms.bool(False) -## Empty Regions PSet means complete unpacking -siPixelClustersHeterogeneous.Regions = cms.PSet( ) -siPixelClustersHeterogeneous.CablingMapLabel = cms.string("") - -# The following is copied from siPixelClusters_cfi, clearly not -# maintainable in the long run -from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel -phase1Pixel.toModify(siPixelClustersHeterogeneous, - VCaltoElectronGain = 
cms.int32(47), # L2-4: 47 +- 4.7 - VCaltoElectronGain_L1 = cms.int32(50), # L1: 49.6 +- 2.6 - VCaltoElectronOffset = cms.int32(-60), # L2-4: -60 +- 130 - VCaltoElectronOffset_L1 = cms.int32(-670), # L1: -670 +- 220 - ChannelThreshold = cms.int32(10), - SeedThreshold = cms.int32(1000), - ClusterThreshold = cms.int32(4000), - ClusterThreshold_L1 = cms.int32(2000) -) - -# The following is copied from SiPixelRawToDigi_cfi -phase1Pixel.toModify(siPixelClustersHeterogeneous, UsePhase1=True) diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h index 9b8924988e848..82d71ce19f01e 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h @@ -83,6 +83,7 @@ class PixelCPEFast final : public PixelCPEBase std::vector> m_detParamsGPU; pixelCPEforGPU::CommonParams m_commonParamsGPU; + pixelCPEforGPU::LayerGeometry m_layerGeometry; struct GPUData { ~GPUData(); diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h index 5776e054fd330..c33ccc85b16cb 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h @@ -6,9 +6,11 @@ #include #include +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "DataFormats/GeometrySurface/interface/SOARotation.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" namespace pixelCPEforGPU { @@ -43,9 +45,16 @@ namespace pixelCPEforGPU { }; - struct ParamsOnGPU { + struct LayerGeometry { + uint32_t layerStart[phase1PixelTopology::numberOfLayers + 1]; + uint8_t layer[phase1PixelTopology::layerIndexSize]; + }; + + struct ParamsOnGPU { + CommonParams * 
m_commonParams; DetParams * m_detParams; + LayerGeometry * m_layerGeometry; constexpr CommonParams const & __restrict__ commonParams() const { @@ -57,6 +66,13 @@ namespace pixelCPEforGPU { DetParams const * __restrict__ l = m_detParams; return l[i]; } + constexpr + LayerGeometry const & __restrict__ layerGeometry() const { + return *m_layerGeometry; + } + + __device__ uint8_t layer(uint16_t id) const { return __ldg(m_layerGeometry->layer+id/phase1PixelTopology::maxModuleStride);}; + }; // SOA (on device) @@ -86,8 +102,8 @@ namespace pixelCPEforGPU { }; - constexpr uint32_t MaxClusInModule=256; - using ClusParams = ClusParamsT<256>; + constexpr uint32_t MaxHitsInModule = gpuClustering::MaxHitsInModule; + using ClusParams = ClusParamsT; constexpr inline void computeAnglesFromDet(DetParams const & __restrict__ detParams, float const x, float const y, float & cotalpha, float & cotbeta) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml index 27ee3af86e102..9385896a5e287 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml @@ -1,4 +1,5 @@ + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 7d19e23eb1119..d5cfda78600a4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -6,6 +6,9 @@ #include // CMSSW headers +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" @@ -14,130 +17,32 @@ namespace { __global__ - void 
setHitsLayerStart(const uint32_t* hitsModuleStart, const uint32_t* layerStart, uint32_t* hitsLayerStart) { + void setHitsLayerStart(uint32_t const * __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const * cpeParams, uint32_t* hitsLayerStart) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < 10) { - hitsLayerStart[i] = hitsModuleStart[layerStart[i]]; - } - else if(i == 10) { - hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules]; - } - } - - template - T *slicePitch(void *ptr, size_t pitch, size_t row) { - return reinterpret_cast( reinterpret_cast(ptr) + pitch*row); - } -} - -namespace pixelgpudetails { - PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { - - constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); - - cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); - - // Coalesce all 32bit and 16bit arrays to two big blobs - // - // This is just a toy. Please don't copy-paste the logic but - // create a proper abstraction (e.g. 
along FWCore/SOA, or - // FWCore/Utilities/interface/SoATuple.h - // - // Order such that the first ones are the ones transferred to CPU - static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious - cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9)); - cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id())); - //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_; - gpu_.charge_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0); - gpu_.xl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1); - gpu_.yl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2); - gpu_.xerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3); - gpu_.yerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4); - gpu_.xg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5); - gpu_.yg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6); - gpu_.zg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7); - gpu_.rg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8); - - // Order such that the first ones are the ones transferred to CPU - cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 7)); - cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*7, cudaStream.id())); - //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_; - gpu_.detInd_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0); - gpu_.mr_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1); - gpu_.mc_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2); - gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); - gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); - gpu_.xsize_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 5); - gpu_.ysize_d 
= slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 6); - - cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); - cudaCheck(cudaMalloc((void **) & gpu_.hws_d, HitsOnGPU::Hist::wsSize())); - cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); - - // Feels a bit dumb but constexpr arrays are not supported for device code - // TODO: should be moved to EventSetup (or better ideas?) - // Would it be better to use "constant memory"? - cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayerStart_, 11 * sizeof(uint32_t))); - cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id())); - cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayer_, phase1PixelTopology::layer.size() * sizeof(uint8_t))); - cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayer_, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size() * sizeof(uint8_t), cudaMemcpyDefault, cudaStream.id())); - - gpu_.phase1TopologyLayerStart_d = d_phase1TopologyLayerStart_; - gpu_.phase1TopologyLayer_d = d_phase1TopologyLayer_; - - gpu_.me_d = gpu_d; - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); - - cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t))); - - // On CPU we can safely use MAX_HITS*sizeof as the pitch. 
Thanks - // to '*256' it is even aligned by cache line - h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t); - cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5)); - h_charge_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 0); - h_xl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 1); - h_yl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 2); - h_xe_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 3); - h_ye_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 4); - - h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t); - cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3)); - h_detInd_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 0); - h_mr_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 1); - h_mc_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 2); + assert(0==hitsModuleStart[0]); + if(i < 11) { + hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; #ifdef GPU_DEBUG - cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t))); + printf ("LayerStart %d %d: %d\n",i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); #endif + } } - PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { - cudaCheck(cudaFree(gpu_.hitsLayerStart_d)); - cudaCheck(cudaFree(gpu_.owner_32bit_)); - cudaCheck(cudaFree(gpu_.owner_16bit_)); - cudaCheck(cudaFree(gpu_.hist_d)); - cudaCheck(cudaFree(gpu_.hws_d)); - cudaCheck(cudaFree(gpu_d)); - cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); - cudaCheck(cudaFree(d_phase1TopologyLayer_)); +} - cudaCheck(cudaFreeHost(h_hitsModuleStart_)); - cudaCheck(cudaFreeHost(h_owner_32bit_)); - cudaCheck(cudaFreeHost(h_owner_16bit_)); -#ifdef GPU_DEBUG - cudaCheck(cudaFreeHost(h_hitsLayerStart_)); -#endif - } +namespace pixelgpudetails { - void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHit2DCUDA PixelRecHitGPUKernel::makeHitsAsync( + SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& 
bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, - bool transferToCPU, - cuda::stream_t<>& stream) { - gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); - gpu_.cpeParams = cpeParams; // copy it for use in clients - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + cuda::stream_t<>& stream + ) const { + + auto nHits = clusters_d.nClusters(); + TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream); int threadsPerBlock = 256; int blocks = digis_d.nModules(); // active modules (with digis) @@ -145,6 +50,7 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif + if(blocks) // protect from empty events gpuPixelRecHits::getHits<<>>( cpeParams, bs_d.data(), @@ -154,54 +60,27 @@ namespace pixelgpudetails { clusters_d.clusInModule(), clusters_d.moduleId(), digis_d.clus(), digis_d.nDigis(), - gpu_.hitsModuleStart_d, - gpu_.charge_d, - gpu_.detInd_d, - gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d, - gpu_.iphi_d, - gpu_.xl_d, gpu_.yl_d, - gpu_.xerr_d, gpu_.yerr_d, - gpu_.mr_d, gpu_.mc_d, - gpu_.xsize_d, gpu_.ysize_d + clusters_d.clusModuleStart(), + hits_d.view() ); cudaCheck(cudaGetLastError()); + // assuming full warp of threads is better than a smaller number... - setHitsLayerStart<<<1, 32, 0, stream.id()>>>(gpu_.hitsModuleStart_d, d_phase1TopologyLayerStart_, gpu_.hitsLayerStart_d); + setHitsLayerStart<<<1, 32, 0, stream.id()>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - // needed only if hits on CPU are required... 
- nhits_ = clusters_d.nClusters(); - if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); -#ifdef GPU_DEBUG - cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); -#endif - - cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_, - gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, - nhits_*sizeof(uint16_t), 3, - cudaMemcpyDefault, stream.id())); - - cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_, - gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, - nhits_*sizeof(uint32_t), 5, - cudaMemcpyDefault, stream.id())); + if (nHits >= TrackingRecHit2DSOAView::maxHits()) { + edm::LogWarning("PixelRecHitGPUKernel" ) << "Hits Overflow " << nHits << " > " << TrackingRecHit2DSOAView::maxHits(); + } -#ifdef GPU_DEBUG - cudaStreamSynchronize(stream.id()); - - std::cout << "hit layerStart "; - for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << h_hitsLayerStart_[i] << ' '; - std::cout << "end:" << h_hitsLayerStart_[10] << std::endl; -#endif - - // for timing test - // cudaStreamSynchronize(stream.id()); - // auto nhits_ = h_hitsLayerStart_[10]; - // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); + if (nHits) { + edm::Service cs; + auto hws = cs->make_device_unique(TrackingRecHit2DSOAView::Hist::wsSize(),stream); + cudautils::fillManyFromVector(hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream.id()); + cudaCheck(cudaGetLastError()); } - - cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); + return hits_d; } -} + +} // end namespace diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 
8e5599c239789..096d6c7492f2a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -4,72 +4,32 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include #include -#include -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - - -namespace pixelCPEforGPU { - struct ParamsOnGPU; -} namespace pixelgpudetails { - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; class PixelRecHitGPUKernel { public: - PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream); - ~PixelRecHitGPUKernel(); + PixelRecHitGPUKernel() = default; + ~PixelRecHitGPUKernel() = default; PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel(PixelRecHitGPUKernel&&) = delete; PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - void makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHit2DCUDA makeHitsAsync( + SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, - bool transferToCPU, - cuda::stream_t<>& stream); - - HitsOnCPU getOutput() const { - return HitsOnCPU{ - h_hitsModuleStart_, h_detInd_, h_charge_, - h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_, - gpu_d, nhits_ - }; - } + cuda::stream_t<>& stream) const; - private: - HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product" - HitsOnGPU gpu_; - uint32_t nhits_ = 0; - uint32_t *d_phase1TopologyLayerStart_ = nullptr; 
- uint8_t *d_phase1TopologyLayer_ = nullptr; - uint32_t *h_hitsModuleStart_ = nullptr; - uint16_t *h_detInd_ = nullptr; - int32_t *h_charge_ = nullptr; - float *h_xl_ = nullptr; - float *h_yl_ = nullptr; - float *h_xe_ = nullptr; - float *h_ye_ = nullptr; - uint16_t *h_mr_ = nullptr; - uint16_t *h_mc_ = nullptr; - void *h_owner_32bit_ = nullptr; - size_t h_owner_32bit_pitch_ = 0; - void *h_owner_16bit_ = nullptr; - size_t h_owner_16bit_pitch_ = 0; -#ifdef GPU_DEBUG - uint32_t *h_hitsLayerStart_ = nullptr; -#endif }; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc new file mode 100644 index 0000000000000..7e17f6c029ac2 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -0,0 +1,118 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + + +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" + +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + +#include 
"RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" + +#include "PixelRecHits.h" // TODO : spit product from kernel + +#include + +class SiPixelRecHitCUDA : public edm::global::EDProducer<> { + +public: + + + explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); + ~SiPixelRecHitCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> tBeamSpot; + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; + + edm::EDPutTokenT> tokenHit_; + + std::string cpeName_; + + pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; + +}; + +SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig): + tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), + token_(consumes>(iConfig.getParameter("src"))), + tokenDigi_(consumes>(iConfig.getParameter("src"))), + tokenHit_(produces>()), + cpeName_(iConfig.getParameter("CPE")) +{} + +void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); + desc.add("src", edm::InputTag("siPixelClustersCUDAPreSplitting")); + desc.add("CPE", "PixelCPEFast"); + descriptions.add("siPixelRecHitCUDA",desc); +} + + +void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { + + // const TrackerGeometry *geom_ = nullptr; + const PixelClusterParameterEstimator *cpe_ = nullptr; + + /* + edm::ESHandle geom; + es.get().get( geom ); + geom_ = geom.product(); + */ + + edm::ESHandle hCPE; + es.get().get(cpeName_, hCPE); + cpe_ = dynamic_cast< const 
PixelCPEBase* >(hCPE.product()); + + PixelCPEFast const * fcpe = dynamic_cast(cpe_); + if (!fcpe) { + throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; + } + + edm::Handle> hclusters; + iEvent.getByToken(token_, hclusters); + + CUDAScopedContext ctx{*hclusters}; + auto const& clusters = ctx.get(*hclusters); + + edm::Handle> hdigis; + iEvent.getByToken(tokenDigi_, hdigis); + auto const& digis = ctx.get(*hdigis); + + edm::Handle> hbs; + iEvent.getByToken(tBeamSpot, hbs); + auto const& bs = ctx.get(*hbs); + + ctx.emplace(iEvent,tokenHit_, + std::move( + gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), ctx.stream()) + )); +} + +DEFINE_FWK_MODULE(SiPixelRecHitCUDA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc new file mode 100644 index 0000000000000..9d3c70aebf2cc --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -0,0 +1,187 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + +#include "DataFormats/Common/interface/DetSetVectorNew.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" + +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/InputTag.h" + +#include 
"Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h" + + +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" + + +#include + +class SiPixelRecHitFromSOA: public edm::stream::EDProducer { + + +public: + + explicit SiPixelRecHitFromSOA(const edm::ParameterSet& iConfig); + ~SiPixelRecHitFromSOA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenHit_; // CUDA hits + edm::EDGetTokenT clusterToken_; // Legacy Clusters + + uint32_t m_nHits; + cudautils::host::unique_ptr m_store16; + cudautils::host::unique_ptr m_store32; + cudautils::host::unique_ptr m_hitsModuleStart; +}; + +SiPixelRecHitFromSOA::SiPixelRecHitFromSOA(const edm::ParameterSet& iConfig): + tokenHit_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), + clusterToken_(consumes(iConfig.getParameter("src"))) +{ + produces(); +} + + +void SiPixelRecHitFromSOA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); + descriptions.add("siPixelRecHitFromSOA",desc); +} + + +void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + + CUDAProduct const& inputDataWrapped = iEvent.get(tokenHit_); + CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + 
m_nHits = inputData.nHits(); + + // std::cout<< "converting " << m_nHits << " Hits"<< std::endl; + + if (0==m_nHits) return; + m_store32 = inputData.localCoordToHostAsync(ctx.stream()); + // m_store16 = inputData.detIndexToHostAsync(ctx.stream(); + m_hitsModuleStart = inputData.hitsModuleStartToHostAsync(ctx.stream()); +} + +void SiPixelRecHitFromSOA::produce(edm::Event& iEvent, edm::EventSetup const& es) { + + auto output = std::make_unique(); + if (0==m_nHits) { + iEvent.put(std::move(output)); + return; + } + + auto xl = m_store32.get(); + auto yl = xl+m_nHits; + auto xe = yl+m_nHits; + auto ye = xe+m_nHits; + + edm::ESHandle geom; + es.get().get( geom ); + geom = geom.product(); + + edm::Handle hclusters; + iEvent.getByToken(clusterToken_, hclusters); + + auto const & input = *hclusters; + + int numberOfDetUnits = 0; + int numberOfClusters = 0; + for (auto DSViter=input.begin(); DSViter != input.end() ; DSViter++) { + numberOfDetUnits++; + unsigned int detid = DSViter->detId(); + DetId detIdObject( detid ); + const GeomDetUnit * genericDet = geom->idToDetUnit( detIdObject ); + auto gind = genericDet->index(); + const PixelGeomDetUnit * pixDet = dynamic_cast(genericDet); + assert(pixDet); + SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*output, detid); + auto fc = m_hitsModuleStart[gind]; + auto lc = m_hitsModuleStart[gind+1]; + auto nhits = lc-fc; + + //std::cout << "in det " << gind << "conv " << nhits << " hits from " << DSViter->size() << " legacy clusters" + // <<' '<< lc <<','<size()); + if (nhits!=DSViter->size()) { + edm::LogWarning("GPUHits2CPU") <<"nhits!= ndigi " << nhits << ' ' << DSViter->size() << std::endl; + } + for (auto const & clust : *DSViter) { + if (ic>=nhits) { + // FIXME add a way to handle this case, or at least notify via edm::LogError + break; + } + auto ij = jnd(clust.originalId()); + if (ij>=TrackingRecHit2DSOAView::maxHits()) break; // overflow... 
+ assert(clust.originalId()>=0); assert(clust.originalId() tuple = cpe_->getParameters( clust, *genericDet ); + LocalPoint lp( std::get<0>(tuple) ); + LocalError le( std::get<1>(tuple) ); + SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) ); + */ + + // Create a persistent edm::Ref to the cluster + edm::Ref< edmNew::DetSetVector, SiPixelCluster > cluster = edmNew::makeRefTo(hclusters, &clust); + // Make a RecHit and add it to the DetSet + SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster); + // + // Now save it ================= + recHitsOnDetUnit.push_back(hit); + // ============================= + + // std::cout << "SiPixelRecHitGPUVI " << numberOfClusters << ' '<< lp << " " << le << std::endl; + + } // <-- End loop on Clusters + + + // LogDebug("SiPixelRecHitGPU") + //std::cout << "SiPixelRecHitGPUVI " + // << " Found " << recHitsOnDetUnit.size() << " RecHits on " << detid //; + // << std::endl; + + + } // <-- End loop on DetUnits + + /* + std::cout << "SiPixelRecHitGPUVI $ det, clus, lost " + << numberOfDetUnits << ' ' + << numberOfClusters << ' ' + << std::endl; + */ + + iEvent.put(std::move(output)); +} + +DEFINE_FWK_MODULE(SiPixelRecHitFromSOA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc deleted file mode 100644 index df0bfb5baa0d6..0000000000000 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ /dev/null @@ -1,313 +0,0 @@ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "DataFormats/Common/interface/DetSetVectorNew.h" -#include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" -#include 
"DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" -#include "FWCore/Framework/interface/ESHandle.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Utilities/interface/InputTag.h" -#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" -#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" -#include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" - -#include "PixelRecHits.h" // TODO : spit product from kernel - -#include - -class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer > { - -public: - using CPUProduct = siPixelRecHitsHeterogeneousProduct::CPUProduct; - using GPUProduct = siPixelRecHitsHeterogeneousProduct::GPUProduct; - using Output = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; - - - explicit SiPixelRecHitHeterogeneous(const edm::ParameterSet& iConfig); - ~SiPixelRecHitHeterogeneous() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - // CPU implementation - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - - // GPU implementation - void beginStreamGPUCuda(edm::StreamID streamId, 
cuda::stream_t<>& cudaStream) override; - void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& iEvent, const edm::Handle& inputhandle, const siPixelRecHitsHeterogeneousProduct::GPUProduct & gpu) const; - - // Commonalities - void initialize(const edm::EventSetup& es); - - // CPU - void run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const; - // GPU - void run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output, const pixelgpudetails::HitsOnCPU& hoc) const; - - - // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> tBeamSpot; - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; - edm::EDGetTokenT clusterToken_; - std::string cpeName_; - - const TrackerGeometry *geom_ = nullptr; - const PixelClusterParameterEstimator *cpe_ = nullptr; - - std::unique_ptr gpuAlgo_; - - bool enableTransfer_; - bool enableConversion_; -}; - -SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& iConfig): - HeterogeneousEDProducer(iConfig), - tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), - token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), - tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), - cpeName_(iConfig.getParameter("CPE")) -{ - enableConversion_ = iConfig.getParameter("gpuEnableConversion"); - enableTransfer_ = enableConversion_ || iConfig.getParameter("gpuEnableTransfer"); - - produces(); - if(enableConversion_) { - clusterToken_ = consumes(iConfig.getParameter("src")); - produces(); - } -} - -void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - - desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); - 
desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); - desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); - desc.add("CPE", "PixelCPEFast"); - - desc.add("gpuEnableTransfer", true); - desc.add("gpuEnableConversion", true); - - HeterogeneousEDProducer::fillPSetDescription(desc); - - descriptions.add("siPixelRecHitHeterogeneous",desc); -} - -void SiPixelRecHitHeterogeneous::initialize(const edm::EventSetup& es) { - edm::ESHandle geom; - es.get().get( geom ); - geom_ = geom.product(); - - edm::ESHandle hCPE; - es.get().get(cpeName_, hCPE); - cpe_ = dynamic_cast< const PixelCPEBase* >(hCPE.product()); -} - -void SiPixelRecHitHeterogeneous::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - throw cms::Exception("NotImplemented") << "CPU version is no longer implemented"; -} - -void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const { - const auto& input = *inputhandle; - - edmNew::DetSetVector::const_iterator DSViter=input.begin(); - for ( ; DSViter != input.end() ; DSViter++) { - unsigned int detid = DSViter->detId(); - DetId detIdObject( detid ); - const GeomDetUnit * genericDet = geom_->idToDetUnit( detIdObject ); - const PixelGeomDetUnit * pixDet = dynamic_cast(genericDet); - assert(pixDet); - SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output,detid); - - edmNew::DetSet::const_iterator clustIt = DSViter->begin(), clustEnd = DSViter->end(); - - for ( ; clustIt != clustEnd; clustIt++) { - std::tuple tuple = cpe_->getParameters( *clustIt, *genericDet ); - LocalPoint lp( std::get<0>(tuple) ); - LocalError le( std::get<1>(tuple) ); - SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) ); - // Create a persistent edm::Ref to the cluster - edm::Ref< edmNew::DetSetVector, SiPixelCluster > cluster = edmNew::makeRefTo( inputhandle, clustIt); - // Make a RecHit and add it to the DetSet - // old : recHitsOnDetUnit.push_back( new 
SiPixelRecHit( lp, le, detIdObject, &*clustIt) ); - SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster); - // - // Now save it ================= - recHitsOnDetUnit.push_back(hit); - } // <-- End loop on Clusters - } // <-- End loop on DetUnits -} - - -void SiPixelRecHitHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - gpuAlgo_ = std::make_unique(cudaStream); -} - -void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { - initialize(iSetup); - - PixelCPEFast const * fcpe = dynamic_cast(cpe_); - if (!fcpe) { - throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; - } - - edm::Handle> hclusters; - iEvent.getByToken(token_, hclusters); - // temporary check (until the migration) - edm::Service cs; - assert(hclusters->device() == cs->getCurrentDevice()); - CUDAScopedContext ctx{*hclusters}; - auto const& clusters = ctx.get(*hclusters); - - edm::Handle> hdigis; - iEvent.getByToken(tokenDigi_, hdigis); - auto const& digis = ctx.get(*hdigis); - - edm::Handle> hbs; - iEvent.getByToken(tBeamSpot, hbs); - auto const& bs = ctx.get(*hbs); - - // We're processing in a stream given by base class, so need to - // synchronize explicitly (implementation is from - // CUDAScopedContext). In practice these should not be needed - // (because of synchronizations upstream), but let's play generic. 
- if (not hclusters->isAvailable()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); - } - if (not hdigis->isAvailable()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hdigis->event()->id(), 0)); - } - if (not hbs->isAvailable()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hbs->event()->id(), 0)); - } - - gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); - -} - -void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_->getOutput()); - - if(enableConversion_) { - // Need the CPU clusters to - // - properly fill the output DetSetVector of hits - // - to set up edm::Refs to the clusters - edm::Handle hclusters; - iEvent.getByToken(clusterToken_, hclusters); - - convertGPUtoCPU(iEvent.event(), hclusters, *output); - } - - iEvent.put(std::move(output), heterogeneous::DisableTransfer{}); -} - -void SiPixelRecHitHeterogeneous::convertGPUtoCPU(edm::Event& iEvent, const edm::Handle& inputhandle, const pixelgpudetails::HitsOnCPU & hoc) const{ - assert(hoc.gpu_d); - auto output = std::make_unique(); - run(inputhandle, *output, hoc); - iEvent.put(std::move(output)); -} - -void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output, const pixelgpudetails::HitsOnCPU& hoc) const { - auto const & input = *inputhandle; - - int numberOfDetUnits = 0; - int numberOfClusters = 0; - for (auto DSViter=input.begin(); DSViter != input.end() ; DSViter++) { - numberOfDetUnits++; - unsigned int detid = DSViter->detId(); - DetId detIdObject( detid ); - const GeomDetUnit * genericDet = geom_->idToDetUnit( detIdObject ); - auto gind = genericDet->index(); - const PixelGeomDetUnit * pixDet = dynamic_cast(genericDet); - assert(pixDet); - SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output, detid); - auto fc = 
hoc.hitsModuleStart[gind]; - auto lc = hoc.hitsModuleStart[gind+1]; - auto nhits = lc-fc; - uint32_t ic=0; - auto jnd = [&](int k) { return fc+k; }; - assert(nhits<=DSViter->size()); - if (nhits!=DSViter->size()) { - edm::LogWarning("GPUHits2CPU") <<"nhits!= ndigi " << nhits << ' ' << DSViter->size() << std::endl; - } - for (auto const & clust : *DSViter) { - if (ic>=nhits) { - // FIXME add a way to handle this case, or at least notify via edm::LogError - break; - } - auto ij = jnd(clust.originalId()); - assert(clust.originalId()>=0); assert(clust.originalId() tuple = cpe_->getParameters( clust, *genericDet ); - LocalPoint lp( std::get<0>(tuple) ); - LocalError le( std::get<1>(tuple) ); - SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) ); - */ - - // Create a persistent edm::Ref to the cluster - edm::Ref< edmNew::DetSetVector, SiPixelCluster > cluster = edmNew::makeRefTo( inputhandle, &clust); - // Make a RecHit and add it to the DetSet - SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster); - // - // Now save it ================= - recHitsOnDetUnit.push_back(hit); - // ============================= - - // std::cout << "SiPixelRecHitGPUVI " << numberOfClusters << ' '<< lp << " " << le << std::endl; - - } // <-- End loop on Clusters - - - // LogDebug("SiPixelRecHitGPU") - //std::cout << "SiPixelRecHitGPUVI " - // << " Found " << recHitsOnDetUnit.size() << " RecHits on " << detid //; - // << std::endl; - - - } // <-- End loop on DetUnits - - /* - std::cout << "SiPixelRecHitGPUVI $ det, clus, lost " - << numberOfDetUnits << ' ' - << numberOfClusters << ' ' - << std::endl; - */ -} - -DEFINE_FWK_MODULE(SiPixelRecHitHeterogeneous); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index caf58c0615dbb..2874df10c16c3 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -6,15 +6,13 @@ #include 
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" - namespace gpuPixelRecHits { - - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, BeamSpotCUDA::Data const * __restrict__ bs, uint16_t const * __restrict__ id, @@ -27,19 +25,15 @@ namespace gpuPixelRecHits { int32_t const * __restrict__ clus, int numElements, uint32_t const * __restrict__ hitsModuleStart, - int32_t * chargeh, - uint16_t * detInd, - float * xg, float * yg, float * zg, float * rg, int16_t * iph, - float * xl, float * yl, - float * xe, float * ye, - uint16_t * mr, uint16_t * mc, - int16_t * xs, int16_t * ys - ) - { + TrackingRecHit2DSOAView * phits + ) +{ + + auto & hits = *phits; // to be moved in common namespace... constexpr uint16_t InvId=9999; // must be > MaxNumModules - constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; + constexpr uint32_t MaxHitsInModule = pixelCPEforGPU::MaxHitsInModule; using ClusParams = pixelCPEforGPU::ClusParams; @@ -66,14 +60,14 @@ namespace gpuPixelRecHits { if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]); #endif - assert(blockDim.x >= MaxClusInModule); + assert(blockDim.x >= MaxHitsInModule); - if (threadIdx.x==0 && nclus > MaxClusInModule) { - printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule); + if (threadIdx.x==0 && nclus > MaxHitsInModule) { + printf("WARNING: too many clusters %d in Module %d. 
Only first %d processed\n", nclus,me,MaxHitsInModule); // zero charge: do not bother to do it in parallel - for (auto d=MaxClusInModule; d= nclus) return; first = hitsModuleStart[me]; auto h = first+ic; // output index in global memory - assert(h < 2000*256); + if (h >= TrackingRecHit2DSOAView::maxHits()) return; // overflow... pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - chargeh[h] = clusParams.charge[ic]; - detInd[h] = me; + // store it + + hits.charge(h) = clusParams.charge[ic]; - xl[h]= clusParams.xpos[ic]; - yl[h]= clusParams.ypos[ic]; + hits.detectorIndex(h) = me; - xs[h]= clusParams.xsize[ic]; - ys[h]= clusParams.ysize[ic]; + float xl,yl; + hits.xLocal(h) = xl = clusParams.xpos[ic]; + hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits.clusterSizeX(h) = clusParams.xsize[ic]; + hits.clusterSizeY(h) = clusParams.ysize[ic]; - xe[h]= clusParams.xerr[ic]*clusParams.xerr[ic]; - ye[h]= clusParams.yerr[ic]*clusParams.yerr[ic]; - mr[h]= clusParams.minRow[ic]; - mc[h]= clusParams.minCol[ic]; - + hits.xerrLocal(h) = clusParams.xerr[ic]*clusParams.xerr[ic]; + hits.yerrLocal(h) = clusParams.yerr[ic]*clusParams.yerr[ic]; + + // keep it local for computations + float xg,yg,zg; // to global and compute phi... - cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]); + cpeParams->detParams(me).frame.toGlobal(xl,yl, xg,yg,zg); // here correct for the beamspot... 
- xg[h]-=bs->x; - yg[h]-=bs->y; - zg[h]-=bs->z; + xg-=bs->x; + yg-=bs->y; + zg-=bs->z; + + hits.xGlobal(h) = xg; + hits.yGlobal(h) = yg; + hits.zGlobal(h) = zg; - rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]); - iph[h] = unsafe_atan2s<7>(yg[h],xg[h]); + hits.rGlobal(h) = std::sqrt(xg*xg+yg*yg); + hits.iphi(h) = unsafe_atan2s<7>(yg,xg); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h deleted file mode 100644 index a90a37da1065f..0000000000000 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_siPixelRecHitsHeterogeneousProduct_h -#define RecoLocalTracker_SiPixelRecHits_plugins_siPixelRecHitsHeterogeneousProduct_h - -#include -#include - -#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" - -namespace pixelCPEforGPU { - struct ParamsOnGPU; -} - -namespace siPixelRecHitsHeterogeneousProduct { - - using CPUProduct = int; // dummy - - static constexpr uint32_t maxHits() { return 65536;} - using hindex_type = uint16_t; // if above is <=2^16 - - struct HitsOnGPU{ - pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned - const uint32_t * hitsModuleStart_d; // forwarded from clusters - uint32_t * hitsLayerStart_d; - int32_t * charge_d; - uint16_t * detInd_d; - float *xg_d, *yg_d, *zg_d, *rg_d; - float *xl_d, *yl_d; - float *xerr_d, *yerr_d; - int16_t * iphi_d; - uint16_t * sortIndex_d; - uint16_t * mr_d; - uint16_t * mc_d; - int16_t * xsize_d; - int16_t * ysize_d; - - - using Hist = HistoContainer; - Hist * hist_d; - uint8_t * hws_d; - - HitsOnGPU const * me_d = nullptr; - 
- // Owning pointers to the 32/16 bit arrays with size MAX_HITS - void *owner_32bit_; - size_t owner_32bit_pitch_; - void *owner_16bit_; - size_t owner_16bit_pitch_; - - // forwarded from PixelRecHit (FIXME) - uint32_t const * phase1TopologyLayerStart_d; - uint8_t const * phase1TopologyLayer_d; - - }; - - struct HitsOnCPU { - uint32_t const * hitsModuleStart = nullptr; - uint16_t const * detInd = nullptr; - int32_t const * charge = nullptr; - float const * xl = nullptr; - float const * yl = nullptr; - float const * xe = nullptr; - float const * ye = nullptr; - uint16_t const * mr = nullptr; - uint16_t const * mc = nullptr; - - HitsOnGPU const * gpu_d = nullptr; - uint32_t nHits; - }; - - using GPUProduct = HitsOnCPU; // FIXME fill cpu vectors on demand - - using HeterogeneousPixelRecHit = HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct >; -} - -#endif // RecoLocalTracker_SiPixelRecHits_plugins_siPixelRecHitsHeterogeneousProduct_h diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 465aa0bb346ce..8995471470f37 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -6,6 +6,37 @@ VerboseLevel = cms.untracked.int32(0) ) -siPixelRecHitsPreSplitting = siPixelRecHits.clone( +_siPixelRecHitsPreSplitting = siPixelRecHits.clone( src = 'siPixelClustersPreSplitting' ) + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelRecHitsPreSplitting = SwitchProducerCUDA( + cpu = _siPixelRecHitsPreSplitting.clone() +) + + + +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDA_cfi import siPixelRecHitCUDA as _siPixelRecHitCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromSOA_cfi import siPixelRecHitFromSOA as _siPixelRecHitFromSOA + +gpu.toModify(siPixelRecHitsPreSplitting, + cuda = 
_siPixelRecHitFromSOA.clone() +) + + +siPixelRecHitsPreSplittingTask = cms.Task(siPixelRecHitsPreSplitting) + +siPixelRecHitsCUDAPreSplitting = _siPixelRecHitCUDA.clone() +siPixelRecHitsLegacyPreSplitting = _siPixelRecHitFromSOA.clone() +siPixelRecHitsPreSplittingTaskCUDA = cms.Task( + siPixelRecHitsCUDAPreSplitting, + siPixelRecHitsLegacyPreSplitting, +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelRecHitsPreSplittingTask_gpu = siPixelRecHitsPreSplittingTask.copy() +_siPixelRecHitsPreSplittingTask_gpu.add(siPixelRecHitsPreSplittingTaskCUDA) +gpu.toReplaceWith(siPixelRecHitsPreSplittingTask, _siPixelRecHitsPreSplittingTask_gpu) + diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc index eb51dd5a2eaeb..1111e4866a8d2 100644 --- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc +++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc @@ -22,6 +22,8 @@ namespace { constexpr float micronsToCm = 1.0e-4; } + + //----------------------------------------------------------------------------- //! The constructor. //----------------------------------------------------------------------------- @@ -68,13 +70,16 @@ PixelCPEFast::PixelCPEFast(edm::ParameterSet const & conf, const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cuda::stream_t<>& cudaStream) const { const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cuda::stream_t<>& stream) { + // and now copy to device... 
cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_detParams, this->m_detParamsGPU.size()*sizeof(pixelCPEforGPU::DetParams))); + cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); cudaCheck(cudaMalloc((void**) & data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU))); cudaCheck(cudaMemcpyAsync(data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_commonParams, &this->m_commonParamsGPU, sizeof(pixelCPEforGPU::CommonParams), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_layerGeometry, &this->m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_detParams, this->m_detParamsGPU.data(), this->m_detParamsGPU.size()*sizeof(pixelCPEforGPU::DetParams), cudaMemcpyDefault, stream.id())); }); return data.d_paramsOnGPU; @@ -86,7 +91,13 @@ void PixelCPEFast::fillParamsForGpu() { m_commonParamsGPU.thePitchX = m_DetParams[0].thePitchX; m_commonParamsGPU.thePitchY = m_DetParams[0].thePitchY; - //uint32_t oldLayer = 0; + uint32_t oldLayer = 0; + uint32_t oldLadder=0; + float rl=0; + float zl = 0; + float miz = 90, mxz=0; + float pl = 0; + int nl=0; m_detParamsGPU.resize(m_DetParams.size()); for (auto i=0U; isurface().rotation()); g.frame = pixelCPEforGPU::Frame(vv.x(),vv.y(),vv.z(),rr); + zl+=vv.z(); + miz = std::min(miz,std::abs(vv.z())); + mxz = std::max(mxz,std::abs(vv.z())); + rl+=vv.perp(); + pl+=vv.phi(); // (not obvious) // errors ..... 
ClusterParamGeneric cp; @@ -196,13 +227,19 @@ void PixelCPEFast::fillParamsForGpu() { } */ - for (int i=0; i<3; ++i) { g.sx[i] = std::sqrt(g.sx[i]*g.sx[i]+lape.xx()); g.sy[i] = std::sqrt(g.sy[i]*g.sy[i]+lape.yy()); } } + + // fill Layer and ladders geometry + memcpy(m_layerGeometry.layerStart, phase1PixelTopology::layerStart, sizeof(phase1PixelTopology::layerStart)); + memcpy(m_layerGeometry.layer, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size()); + + + } PixelCPEFast::~PixelCPEFast() {} diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 58935e9a6991c..1021918c0ce6c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False process.pixelVertices.gpuEnableConversion = False @@ -32,7 +31,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h index 0d9be5a346d0a..e6ab9f93ca306 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -38,7 +38,7 @@ namespace Rfit |cov(X0, R)|cov(Y0, 
R)|cov( R, R)| */ int32_t q; //!< particle charge - float chi2 = 0.0; + float chi2; }; struct line_fit @@ -49,7 +49,7 @@ namespace Rfit |cov(c_t,c_t)|cov(Zip,c_t)| \n |cov(c_t,Zip)|cov(Zip,Zip)| */ - double chi2 = 0.0; + double chi2; }; struct helix_fit diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc index c05fa9172ffe3..c6dd66cab18e5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc @@ -1,7 +1,6 @@ #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/Frameworkfwd.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index a0b2b9e56f59c..894a80616af02 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,12 +11,12 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; - -using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; +using 
HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; @@ -75,14 +75,14 @@ void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; - hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); #ifdef BL_DUMP_HITS if (dump){ - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit],i,hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detInd_d[hit],i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit),i,hhp->xGlobal(hit),hhp->yGlobal(hit),hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detectorIndex(hit),i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); } #endif - hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; } BrokenLine::BL_Fast_fit(hits,fast_fit); @@ -167,65 +167,71 @@ void kernelBLFit( } -void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) { - assert(tuples_d); assert(fast_fit_resultsGPU_); + assert(tuples_d); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - for (uint32_t offset=0; offset cs; + auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * 
sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); + auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); + auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); + + for (uint32_t offset=0; offset<<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>( + kernelBLFit<3><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( + kernelBLFit<4><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( + kernelBLFit<4><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); 
cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>( + kernelBLFit<5><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 3c8397cf572f6..d4140692181bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -11,8 +11,6 @@ - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 48a9ec7adf2f7..a33c613402dcf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -6,32 +6,52 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" // #define ONLY_PHICUT namespace CAConstants { // constants - - constexpr uint32_t maxNumberOfQuadruplets() { return 6*1024; } +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfTuples() { return 3*1024;} +#else + constexpr uint32_t maxNumberOfTuples() { return 6*1024;} +#endif + constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT +#ifndef 
GPU_SMALL_EVENTS constexpr uint32_t maxNumberOfDoublets() { return 262144; } constexpr uint32_t maxCellsPerHit() { return 128; } +#else + constexpr uint32_t maxNumberOfDoublets() { return 262144/2; } + constexpr uint32_t maxCellsPerHit() { return 128/2; } +#endif #else constexpr uint32_t maxNumberOfDoublets() { return 6*262144; } constexpr uint32_t maxCellsPerHit() { return 4*128; } #endif + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets()/4;} + + constexpr uint32_t maxNumberOfLayerPairs() { return 13; } constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxTuples() { return 6*1024;} + constexpr uint32_t maxTuples() { return maxNumberOfTuples();} // types using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct using tindex_type = uint16_t; // for tuples + + using CellNeighbors = GPU::VecArray< uint32_t, 36>; + using CellTracks = GPU::VecArray< tindex_type, 42>; + + using CellNeighborsVector = GPU::SimpleVector; + using CellTracksVector = GPU::SimpleVector; + using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>; using TuplesContainer = OneToManyAssoc; - using HitToTuple = OneToManyAssoc; // 3.5 should be enough + using HitToTuple = OneToManyAssoc; // 3.5 should be enough using TupleMultiplicity = OneToManyAssoc; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc index 8007e9a67df3f..c47f5c2fe4c5a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc @@ -1,5 +1,6 @@ #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/Common/interface/OwnVector.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include 
"FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/MakerMacros.h" @@ -12,7 +13,12 @@ #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "FWCore/ServiceRegistry/interface/Service.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + #include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" #include "RecoTracker/TkHitPairs/interface/IntermediateHitDoublets.h" #include "RecoTracker/TkHitPairs/interface/RegionsSeedingHitSets.h" @@ -33,7 +39,7 @@ class CAHitNtupletHeterogeneousEDProducer heterogeneous::GPUCuda, heterogeneous::CPU>> { public: - using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; + using PixelRecHitsH = TrackingRecHit2DCUDA; using GPUProduct = pixelTuplesHeterogeneousProduct::GPUProduct; using CPUProduct = pixelTuplesHeterogeneousProduct::CPUProduct; using Output = pixelTuplesHeterogeneousProduct::HeterogeneousPixelTuples; @@ -57,7 +63,7 @@ class CAHitNtupletHeterogeneousEDProducer private: edm::EDGetTokenT> regionToken_; - edm::EDGetTokenT gpuHits_; + edm::EDGetTokenT> gpuHits_; edm::EDGetTokenT cpuHits_; edm::RunningAverage localRA_; @@ -74,8 +80,7 @@ class CAHitNtupletHeterogeneousEDProducer CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer( const edm::ParameterSet &iConfig) : HeterogeneousEDProducer(iConfig), - gpuHits_(consumesHeterogeneous(iConfig.getParameter("heterogeneousPixelRecHitSrc"))), - cpuHits_(consumes(iConfig.getParameter("heterogeneousPixelRecHitSrc"))), + gpuHits_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), GPUGenerator_(iConfig, 
consumesCollector()), useRiemannFit_(iConfig.getParameter("useRiemannFit")), enableConversion_(iConfig.getParameter("gpuEnableConversion")), @@ -83,6 +88,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer( { produces(); if(enableConversion_) { + cpuHits_ = consumes(iConfig.getParameter("pixelRecHitLegacySrc")); regionToken_ = consumes>(iConfig.getParameter("trackingRegions")); produces(); } @@ -93,7 +99,8 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions( edm::ParameterSetDescription desc; desc.add("trackingRegions", edm::InputTag("globalTrackingRegionFromBeamSpot")); - desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplitting")); + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsLegacyPreSplitting")); desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); desc.add("gpuEnableTransfer", true); desc.add("gpuEnableConversion", true); @@ -113,20 +120,28 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda( const edm::HeterogeneousEvent &iEvent, const edm::EventSetup &iSetup, cuda::stream_t<> &cudaStream) { - edm::Handle gh; - iEvent.getByToken(gpuHits_, gh); - auto const & gHits = *gh; + edm::Handle> hHits; + iEvent.getByToken(gpuHits_, hHits); - GPUGenerator_.buildDoublets(gHits,cudaStream.id()); + // temporary check (until the migration) + edm::Service cs; + assert(hHits->device() == cs->getCurrentDevice()); + + CUDAScopedContext ctx{*hHits}; + auto const& gHits = ctx.get(*hHits); + + if(not hHits->isAvailable()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hHits->event()->id(), 0)); + } + + GPUGenerator_.buildDoublets(gHits,cudaStream); GPUGenerator_.initEvent(iEvent.event(), iSetup); LogDebug("CAHitNtupletHeterogeneousEDProducer") << "Creating ntuplets on GPU"; - GPUGenerator_.hitNtuplets(gHits, iSetup, useRiemannFit_, enableTransfer_, cudaStream.id()); - - + 
GPUGenerator_.hitNtuplets(gHits, iSetup, useRiemannFit_, enableTransfer_, cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc index 749447ebf88b8..98d8b4ca2bc96 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc @@ -65,7 +65,7 @@ void CAHitQuadrupletGeneratorGPU::hitNtuplets( edm::EventSetup const& es, bool useRiemannFit, bool transferToCPU, - cudaStream_t cudaStream) + cuda::stream_t<> &cudaStream) { hitsOnCPU = &hh; launchKernels(hh, useRiemannFit, transferToCPU, cudaStream); @@ -75,12 +75,29 @@ void CAHitQuadrupletGeneratorGPU::fillResults( const TrackingRegion ®ion, SiPixelRecHitCollectionNew const & rechits, std::vector &result, const edm::EventSetup &es) { + + assert(hitsOnCPU); + auto nhits = hitsOnCPU->nHits(); + + uint32_t hitsModuleStart[gpuClustering::MaxNumModules+1]; + // to be understood where to locate + cudaCheck(cudaMemcpy(hitsModuleStart, hitsOnCPU->hitsModuleStart(), (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault)); + + auto fc = hitsModuleStart; + hitmap_.clear(); auto const & rcs = rechits.data(); - for (auto const & h : rcs) hitmap_.add(h, &h); + hitmap_.resize(nhits); + for (auto const & h : rcs) { + auto const & thit = static_cast(h); + auto detI = thit.det()->index(); + auto const & clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + assert(inHits; int index = 0; auto const & foundQuads = fetchKernelResult(index); @@ -99,8 +116,9 @@ void CAHitQuadrupletGeneratorGPU::fillResults( auto k = foundQuads[quadId][i]; if (k<0) { phits[i] = nullptr;continue; } // (actually break...) 
assert(k &cudaStream) { - kernels.launchKernels(hh, gpu_, cudaStream); + kernels.launchKernels(hh, gpu_, cudaStream.id()); if (useRiemannFit) { - fitter.launchRiemannKernels(hh, hh.nHits, CAConstants::maxNumberOfQuadruplets(), cudaStream); + fitter.launchRiemannKernels(hh, hh.nHits(), CAConstants::maxNumberOfQuadruplets(), cudaStream); } else { - fitter.launchBrokenLineKernels(hh, hh.nHits, CAConstants::maxNumberOfQuadruplets(), cudaStream); + fitter.launchBrokenLineKernels(hh, hh.nHits(), CAConstants::maxNumberOfQuadruplets(), cudaStream); } - kernels.classifyTuples(hh, gpu_, cudaStream); + kernels.classifyTuples(hh, gpu_, cudaStream.id()); if (transferToCPU) { cudaCheck(cudaMemcpyAsync(tuples_,gpu_.tuples_d, sizeof(TuplesOnGPU::Container), - cudaMemcpyDeviceToHost, cudaStream)); + cudaMemcpyDeviceToHost, cudaStream.id())); cudaCheck(cudaMemcpyAsync(helix_fit_results_,gpu_.helix_fit_results_d, sizeof(Rfit::helix_fit)*CAConstants::maxNumberOfQuadruplets(), - cudaMemcpyDeviceToHost, cudaStream)); + cudaMemcpyDeviceToHost, cudaStream.id())); cudaCheck(cudaMemcpyAsync(quality_,gpu_.quality_d, sizeof(Quality)*CAConstants::maxNumberOfQuadruplets(), - cudaMemcpyDeviceToHost, cudaStream)); + cudaMemcpyDeviceToHost, cudaStream.id())); } @@ -233,6 +251,6 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int) return quadsInterface; } -void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { +void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream) { kernels.buildDoublets(hh,stream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 5bf4964ab3488..0c5217daff0bc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -4,11 +4,11 @@ #include #include 
"DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/RZLine.h" #include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" #include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" @@ -20,7 +20,6 @@ #include "RecoTracker/TkSeedGenerator/interface/FastCircleFit.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" #include "CAHitQuadrupletGeneratorKernels.h" #include "HelixFitOnGPU.h" @@ -41,9 +40,9 @@ namespace edm { class CAHitQuadrupletGeneratorGPU { public: - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; - using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + using hindex_type = TrackingRecHit2DSOAView::hindex_type; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using TuplesOnCPU = pixelTuplesHeterogeneousProduct::TuplesOnCPU; @@ -65,16 +64,16 @@ class CAHitQuadrupletGeneratorGPU { void initEvent(const edm::Event& ev, const edm::EventSetup& es); - void buildDoublets(HitsOnCPU const & hh, cudaStream_t stream); + void buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream); void hitNtuplets(HitsOnCPU const & hh, const edm::EventSetup& es, 
bool useRiemannFit, bool transferToCPU, - cudaStream_t stream); + cuda::stream_t<> &cudaStream); TuplesOnCPU getOutput() const { - return TuplesOnCPU { std::move(indToEdm), hitsOnCPU->gpu_d, tuples_, helix_fit_results_, quality_, gpu_d, nTuples_}; + return TuplesOnCPU { std::move(indToEdm), hitsOnCPU->view(), tuples_, helix_fit_results_, quality_, gpu_d, nTuples_}; } void cleanup(cudaStream_t stream); @@ -87,7 +86,7 @@ class CAHitQuadrupletGeneratorGPU { private: - void launchKernels(HitsOnCPU const & hh, bool useRiemannFit, bool transferToCPU, cudaStream_t); + void launchKernels(HitsOnCPU const & hh, bool useRiemannFit, bool transferToCPU, cuda::stream_t<> &cudaStream); std::vector> fetchKernelResult(int); @@ -114,7 +113,7 @@ class CAHitQuadrupletGeneratorGPU { // input HitsOnCPU const * hitsOnCPU=nullptr; - RecHitsMap hitmap_ = RecHitsMap(nullptr); + std::vector hitmap_; }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu index e685c4674e0d9..cff61138d7d29 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu @@ -14,9 +14,14 @@ #include "gpuFishbone.h" #include "CAConstants.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + + + using namespace gpuPixelDoublets; -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; +using HitsOnGPU = CAHitQuadrupletGeneratorKernels::HitsOnGPU; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using Quality = pixelTuplesHeterogeneousProduct::Quality; @@ -25,13 +30,10 @@ using Quality = pixelTuplesHeterogeneousProduct::Quality; __global__ void kernel_checkOverflows(TuplesOnGPU::Container * foundNtuplets, AtomicPairCounter * apc, GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ 
nCells, + CellNeighborsVector const * cellNeighbors, CellTracksVector const * cellTracks, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, uint32_t nHits, CAHitQuadrupletGeneratorKernels::Counters * counters) { - __shared__ uint32_t killedCell; - killedCell=0; - __syncthreads(); - auto idx = threadIdx.x + blockIdx.x * blockDim.x; auto & c = *counters; @@ -64,20 +66,19 @@ void kernel_checkOverflows(TuplesOnGPU::Container * foundNtuplets, AtomicPairCou if (idx < (*nCells) ) { auto &thisCell = cells[idx]; - if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theTracks.full()) //++tooManyTracks[thisCell.theLayerPairId]; + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId<0) atomicAdd(&killedCell,1); + if (thisCell.theDoubletId<0) atomicAdd(&c.nKilledCells,1); + if (thisCell.outerNeighbors().empty()) atomicAdd(&c.nEmptyCells,1); + if (thisCell.tracks().empty()) atomicAdd(&c.nZeroTrackCells,1); } if (idx < nHits) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); } - __syncthreads(); - if (threadIdx.x==0) atomicAdd(&c.nKilledCells,killedCell); - // if (threadIdx.x==0) printf("number of killed cells %d\n",killedCell); } @@ -95,7 +96,7 @@ kernel_fishboneCleaner(GPUCACell const * cells, uint32_t const * __restrict__ nC auto const & thisCell = cells[cellIndex]; if (thisCell.theDoubletId>=0) return; - for (auto it : thisCell.theTracks) quality[it] = bad; + for (auto it : thisCell.tracks()) quality[it] = bad; } @@ -125,13 +126,13 @@ kernel_fastDuplicateRemover(GPUCACell const * cells, uint32_t const * __restrict }; // find maxNh - for (auto it : thisCell.theTracks) { + for (auto 
it : thisCell.tracks()) { if (quality[it] == bad) continue; auto nh = foundNtuplets->size(it); maxNh = std::max(nh,maxNh); } // find min chi2 - for (auto it : thisCell.theTracks) { + for (auto it : thisCell.tracks()) { auto nh = foundNtuplets->size(it); if (nh!=maxNh) continue; if (quality[it]!= bad && @@ -141,7 +142,7 @@ kernel_fastDuplicateRemover(GPUCACell const * cells, uint32_t const * __restrict } } // mark duplicates - for (auto it : thisCell.theTracks) { + for (auto it : thisCell.tracks()) { if (quality[it]!= bad && it!=im) quality[it] = dup; //no race: simple assignment of the same constant } } @@ -151,6 +152,7 @@ void kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to zero them, GPUCACell::Hits const * __restrict__ hhp, GPUCACell * cells, uint32_t const * __restrict__ nCells, + CellNeighborsVector * cellNeighbors, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell) { auto const & hh = *hhp; @@ -179,15 +181,16 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z if (thisCell.check_alignment(hh, cells[otherCell], ptmin, hardCurvCut) ) { - cells[otherCell].theOuterNeighbors.push_back(cellIndex); + cells[otherCell].addOuterNeighbor(cellIndex, *cellNeighbors); } } } __global__ void kernel_find_ntuplets( - GPUCACell::Hits const * __restrict__ hhp, + GPUCACell::Hits const * __restrict__ hhp, GPUCACell * __restrict__ cells, uint32_t const * nCells, + CellTracksVector * cellTracks, TuplesOnGPU::Container * foundNtuplets, AtomicPairCounter * apc, GPUCACell::TupleMultiplicity * tupleMultiplicity, unsigned int minHitsPerNtuplet) @@ -209,7 +212,7 @@ void kernel_find_ntuplets( if (thisCell.theLayerPairId==0 || thisCell.theLayerPairId==3 || thisCell.theLayerPairId==8) { // inner layer is 0 FIXME GPUCACell::TmpTuple stack; stack.reset(); - thisCell.find_ntuplets(hh, cells, *foundNtuplets, *apc, + thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, #ifdef CA_USE_LOCAL_COUNTERS local, 
#else @@ -341,7 +344,7 @@ void kernel_doStatsForHitInTracks(CAHitQuadrupletGeneratorKernels::HitToTuple co } __global__ -void kernel_tripletCleaner(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, +void kernel_tripletCleaner(TrackingRecHit2DSOAView const * __restrict__ hhp, TuplesOnGPU::Container const * __restrict__ ptuples, Rfit::helix_fit const * __restrict__ hfit, Quality * __restrict__ quality, @@ -422,10 +425,10 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... auto maxNumberOfDoublets_ = CAConstants::maxNumberOfDoublets(); - auto nhits = hh.nHits; - assert(nhits <= PixelGPUConstants::maxNumberOfHits); + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); - if (earlyFishbone_) { + if (nhits>1 && earlyFishbone_) { auto nthTot = 64; auto stride = 4; auto blockSize = nthTot/stride; @@ -433,9 +436,9 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... dim3 blks(1,numberOfBlocks,1); dim3 thrs(stride,blockSize,1); fishbone<<>>( - hh.gpu_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_, + hh.view(), + device_theCells_.get(), device_nCells_, + device_isOuterHitOfCell_.get(), nhits, false ); cudaCheck(cudaGetLastError()); @@ -455,15 +458,17 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... 
kernel_connect<<>>( gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.gpu_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_ + hh.view(), + device_theCells_.get(), device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get() ); cudaCheck(cudaGetLastError()); kernel_find_ntuplets<<>>( - hh.gpu_d, - device_theCells_, device_nCells_, + hh.view(), + device_theCells_.get(), device_nCells_, + device_theCellTracks_, gpu_.tuples_d, gpu_.apc_d, device_tupleMultiplicity_, @@ -482,7 +487,7 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... kernel_fillMultiplicity<<>>(gpu_.tuples_d,device_tupleMultiplicity_); cudaCheck(cudaGetLastError()); - if (lateFishbone_) { + if (nhits>1 && lateFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot/stride; @@ -490,9 +495,9 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... dim3 blks(1,numberOfBlocks,1); dim3 thrs(stride,blockSize,1); fishbone<<>>( - hh.gpu_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_, + hh.view(), + device_theCells_.get(), device_nCells_, + device_isOuterHitOfCell_.get(), nhits, true ); cudaCheck(cudaGetLastError()); @@ -502,11 +507,15 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... numberOfBlocks = (std::max(nhits, maxNumberOfDoublets_) + blockSize - 1)/blockSize; kernel_checkOverflows<<>>( gpu_.tuples_d, gpu_.apc_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_, nhits, + device_theCells_.get(), device_nCells_, + device_theCellNeighbors_,device_theCellTracks_, + device_isOuterHitOfCell_.get(), nhits, counters_ ); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); +#endif } @@ -514,16 +523,39 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... 
} -void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { - auto nhits = hh.nHits; +void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream) { + auto nhits = hh.nHits(); + +#ifdef GPU_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + edm::Service cs; + device_isOuterHitOfCell_ = cs->make_device_unique(nhits, stream); + { + int threadsPerBlock = 128; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(),nhits, + device_theCellNeighbors_, device_theCellNeighborsContainer_.get(), + device_theCellTracks_, device_theCellTracksContainer_.get() + ); + cudaCheck(cudaGetLastError()); + } + + device_theCells_ = cs->make_device_unique(CAConstants::maxNumberOfDoublets(), stream); + + if (0==nhits) return; // protect against empty events int stride=1; int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride; int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1,blocks,1); dim3 thrs(stride,threadsPerBlock,1); - gpuPixelDoublets::getDoubletsFromHisto<<>>( - device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_, idealConditions_); + gpuPixelDoublets::getDoubletsFromHisto<<>>( + device_theCells_.get(), device_nCells_, + device_theCellNeighbors_, device_theCellTracks_, + hh.view(), device_isOuterHitOfCell_.get(), idealConditions_); cudaCheck(cudaGetLastError()); } @@ -533,31 +565,40 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple // classify tracks based on kinematics auto numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1)/blockSize; kernel_classifyTracks<<>>(tuples.tuples_d, tuples.helix_fit_results_d, tuples.quality_d); + cudaCheck(cudaGetLastError()); // apply fishbone cleaning 
to good tracks numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1)/blockSize; - kernel_fishboneCleaner<<>>(device_theCells_, device_nCells_,tuples.quality_d); + kernel_fishboneCleaner<<>>(device_theCells_.get(), device_nCells_,tuples.quality_d); + cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1)/blockSize; - kernel_fastDuplicateRemover<<>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d); + kernel_fastDuplicateRemover<<>>(device_theCells_.get(), device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d); + cudaCheck(cudaGetLastError()); // fill hit->track "map" numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1)/blockSize; kernel_countHitInTracks<<>>(tuples.tuples_d,tuples.quality_d,device_hitToTuple_); + cudaCheck(cudaGetLastError()); cudautils::launchFinalize(device_hitToTuple_,device_tmws_,cudaStream); + cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples.tuples_d,tuples.quality_d,device_hitToTuple_); + cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1)/blockSize; - kernel_tripletCleaner<<>>(hh.gpu_d,tuples.tuples_d,tuples.helix_fit_results_d,tuples.quality_d,device_hitToTuple_); + kernel_tripletCleaner<<>>(hh.view(),tuples.tuples_d,tuples.helix_fit_results_d,tuples.quality_d,device_hitToTuple_); + cudaCheck(cudaGetLastError()); if (doStats_) { // counters (add flag???) 
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1)/blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_, counters_); + cudaCheck(cudaGetLastError()); numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1)/blockSize; kernel_doStatsForTracks<<>>(tuples.tuples_d,tuples.quality_d,counters_); + cudaCheck(cudaGetLastError()); } } @@ -566,10 +607,16 @@ __global__ void kernel_printCounters(CAHitQuadrupletGeneratorKernels::Counters const * counters) { auto const & c = *counters; - printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld\n",c.nEvents,c.nHits,c.nCells,c.nTuples,c.nGoodTracks,c.nUsedHits, c.nDupHits, c.nKilledCells); - printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f||\n",c.nEvents,c.nHits/double(c.nEvents),c.nCells/double(c.nEvents), + printf("||Counters | nEvents | nHits | nCells | nTuples | nGoodTracks | nUsedHits | nDupHits | nKilledCells | nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",c.nEvents,c.nHits,c.nCells, + c.nTuples,c.nGoodTracks,c.nUsedHits, c.nDupHits, c.nKilledCells, c.nEmptyCells,c.nZeroTrackCells + ); + printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", + c.nEvents,c.nHits/double(c.nEvents),c.nCells/double(c.nEvents), c.nTuples/double(c.nEvents),c.nGoodTracks/double(c.nEvents), - c.nUsedHits/double(c.nEvents),c.nDupHits/double(c.nEvents),c.nKilledCells/double(c.nEvents)); + c.nUsedHits/double(c.nEvents),c.nDupHits/double(c.nEvents),c.nKilledCells/double(c.nEvents), + c.nEmptyCells/double(c.nCells),c.nZeroTrackCells/double(c.nCells) + ); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h index 357fdcf1ad3f7..1917d13f2934f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h @@ -2,12 +2,12 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitQuadrupletGeneratorKernels_h -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" #include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" @@ -24,10 +24,12 @@ class CAHitQuadrupletGeneratorKernels { unsigned long long nUsedHits; unsigned long long nDupHits; unsigned long long nKilledCells; + unsigned long long nEmptyCells; + unsigned long long nZeroTrackCells; }; - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; @@ -51,7 +53,7 @@ class CAHitQuadrupletGeneratorKernels { void classifyTuples(HitsOnCPU const & hh, TuplesOnGPU & tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const & hh, cudaStream_t stream); + void buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream); void allocateOnGPU(); void deallocateOnGPU(); void cleanup(cudaStream_t cudaStream); @@ -59,24 +61,31 @@ class CAHitQuadrupletGeneratorKernels { private: - Counters * counters_ = nullptr; + Counters * counters_ = nullptr; + + // workspace + CAConstants::CellNeighborsVector * device_theCellNeighbors_ = nullptr; + cudautils::device::unique_ptr device_theCellNeighborsContainer_; + CAConstants::CellTracksVector * device_theCellTracks_ = nullptr; + cudautils::device::unique_ptr device_theCellTracksContainer_; + - // workspace - GPUCACell* device_theCells_ = nullptr; - GPUCACell::OuterHitOfCell* device_isOuterHitOfCell_ = nullptr; - uint32_t* device_nCells_ = nullptr; + 
cudautils::device::unique_ptr device_theCells_; + cudautils::device::unique_ptr device_isOuterHitOfCell_; + uint32_t* device_nCells_ = nullptr; - HitToTuple * device_hitToTuple_ = nullptr; - AtomicPairCounter * device_hitToTuple_apc_ = nullptr; + HitToTuple * device_hitToTuple_ = nullptr; + AtomicPairCounter * device_hitToTuple_apc_ = nullptr; - TupleMultiplicity * device_tupleMultiplicity_ = nullptr; - uint8_t * device_tmws_ = nullptr; + TupleMultiplicity * device_tupleMultiplicity_ = nullptr; + uint8_t * device_tmws_ = nullptr; - const uint32_t minHitsPerNtuplet_; - const bool earlyFishbone_; - const bool lateFishbone_; - const bool idealConditions_; - const bool doStats_; + // params + const uint32_t minHitsPerNtuplet_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitQuadrupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu index 2c65058a1651a..8c5e646bb8325 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu @@ -11,9 +11,10 @@ CAHitQuadrupletGeneratorKernels::deallocateOnGPU() } cudaFree(counters_); - cudaFree(device_theCells_); - cudaFree(device_isOuterHitOfCell_); cudaFree(device_nCells_); + cudaFree(device_theCellNeighbors_); + cudaFree(device_theCellTracks_); + cudaFree(device_hitToTuple_); cudaFree(device_hitToTuple_apc_); cudaFree(device_tupleMultiplicity_); @@ -29,31 +30,37 @@ void CAHitQuadrupletGeneratorKernels::allocateOnGPU() cudaCheck(cudaMalloc(&counters_, sizeof(Counters))); cudaCheck(cudaMemset(counters_,0,sizeof(Counters))); - cudaCheck(cudaMalloc(&device_theCells_, - CAConstants::maxNumberOfLayerPairs() * CAConstants::maxNumberOfDoublets() * 
sizeof(GPUCACell))); cudaCheck(cudaMalloc(&device_nCells_, sizeof(uint32_t))); cudaCheck(cudaMemset(device_nCells_, 0, sizeof(uint32_t))); - cudaCheck(cudaMalloc(&device_isOuterHitOfCell_, - PixelGPUConstants::maxNumberOfHits * sizeof(CAConstants::OuterHitOfCell))); - cudaCheck(cudaMemset(device_isOuterHitOfCell_, 0, - PixelGPUConstants::maxNumberOfHits * sizeof(CAConstants::OuterHitOfCell))); + cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); + cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); + - cudaCheck(cudaMalloc(&device_hitToTuple_, sizeof(HitToTuple))); - cudaCheck(cudaMemset(device_hitToTuple_,0,sizeof(HitToTuple))); // overkill - cudaCheck(cudaMalloc(&device_hitToTuple_apc_, sizeof(AtomicPairCounter))); + cudaCheck(cudaMalloc(&device_hitToTuple_, sizeof(HitToTuple))); + cudaCheck(cudaMemset(device_hitToTuple_,0,sizeof(HitToTuple))); // overkill + cudaCheck(cudaMalloc(&device_hitToTuple_apc_, sizeof(AtomicPairCounter))); - cudaCheck(cudaMalloc(&device_tupleMultiplicity_,sizeof(TupleMultiplicity))); - cudaCheck(cudaMemset(device_tupleMultiplicity_,0,sizeof(TupleMultiplicity))); // overkill + cudaCheck(cudaMalloc(&device_tupleMultiplicity_,sizeof(TupleMultiplicity))); + cudaCheck(cudaMemset(device_tupleMultiplicity_,0,sizeof(TupleMultiplicity))); // overkill - cudaCheck(cudaMalloc(&device_tmws_, std::max(TupleMultiplicity::wsSize(),HitToTuple::wsSize()))); + cudaCheck(cudaMalloc(&device_tmws_, std::max(TupleMultiplicity::wsSize(),HitToTuple::wsSize()))); } void CAHitQuadrupletGeneratorKernels::cleanup(cudaStream_t cudaStream) { + +#ifdef GPU_DEBUG + std::cout << "CAHitQuadrupletGeneratorKernels::cleanup" << std::endl; +#endif + // this lazily resets temporary memory for the next event, and is not 
needed for reading the output - cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0, - PixelGPUConstants::maxNumberOfHits * sizeof(CAConstants::OuterHitOfCell), - cudaStream)); + device_theCells_ = nullptr; + device_isOuterHitOfCell_ = nullptr; + device_theCellNeighborsContainer_ = nullptr; + device_theCellTracksContainer_ = nullptr; + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream)); cudautils::launchZero(device_tupleMultiplicity_,cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index b3e23792a4083..f20a96dc79b73 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,24 +6,28 @@ #include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" - #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" class GPUCACell { public: + using ptrAsInt = unsigned long long; + static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); using OuterHitOfCell = CAConstants::OuterHitOfCell; + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; - - using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + using Hits = TrackingRecHit2DSOAView; + using hindex_type = Hits::hindex_type; using TmpTuple = GPU::VecArray; @@ -35,7 +39,8 @@ class GPUCACell 
{ #ifdef __CUDACC__ __device__ __forceinline__ - void init(Hits const & hh, + void init(CellNeighborsVector & cellNeighbors, CellTracksVector & cellTracks, + Hits const & hh, int layerPairId, int doubletId, hindex_type innerHitId, hindex_type outerHitId) { @@ -44,23 +49,45 @@ class GPUCACell { theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerZ = __ldg(hh.zg_d+innerHitId); - theInnerR = __ldg(hh.rg_d+innerHitId); - theOuterNeighbors.reset(); - theTracks.reset(); + theInnerZ = hh.zGlobal(innerHitId); + theInnerR = hh.rGlobal(innerHitId); + + outerNeighbors().reset(); + tracks().reset(); + assert(outerNeighbors().empty()); + assert(tracks().empty()); + } - __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theOuterHitId); } + __device__ __forceinline__ + int addOuterNeighbor(CellNeighbors::value_t t, 
CellNeighborsVector & cellNeighbors) { + return outerNeighbors().push_back(t); + } + + __device__ __forceinline__ + int addTrack(CellTracks::value_t t, CellTracksVector & cellTracks) { + return tracks().push_back(t); + } + + __device__ __forceinline__ CellTracks & tracks() { return theTracks;} + __device__ __forceinline__ CellTracks const & tracks() const { return theTracks;} + __device__ __forceinline__ CellNeighbors & outerNeighbors() { return theOuterNeighbors;} + __device__ __forceinline__ CellNeighbors const & outerNeighbors() const { return theOuterNeighbors;} + __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return hh.rGlobal(theOuterHitId); } + + __device__ __forceinline__ auto get_inner_iphi(Hits const & hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto get_outer_iphi(Hits const & hh) const { return hh.iphi(theOuterHitId); } + + __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return hh.detectorIndex(theOuterHitId); } 
constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; @@ -143,13 +170,19 @@ class GPUCACell { __device__ inline bool hole(Hits const & hh, GPUCACell const & innerCell) const { - constexpr float r4 = 16.f; + int p = get_outer_iphi(hh); + if (p<0) p+=std::numeric_limits::max(); + p = (64*p)/std::numeric_limits::max(); + p %=2; + float r4 = p==0 ? 15.815 : 16.146; // later on from geom auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); auto z4 = std::abs(zi + (r4-ri)*(zo-zi)/(ro-ri)); - return z4>25.f && z4<33.f; + auto zm = z4-6.7*int(z4/6.7); + auto h = zm<0.2 || zm>6.5; + return h || ( z4>26 && z4<32.f); } @@ -161,6 +194,7 @@ class GPUCACell { inline void find_ntuplets( Hits const & hh, GPUCACell * __restrict__ cells, + CellTracksVector & cellTracks, TuplesOnGPU::Container & foundNtuplets, AtomicPairCounter & apc, CM & tupleMultiplicity, @@ -176,10 +210,10 @@ class GPUCACell { tmpNtuplet.push_back_unsafe(theDoubletId); assert(tmpNtuplet.size()<=4); - if(theOuterNeighbors.size()>0) { - for (int j = 0; j < theOuterNeighbors.size(); ++j) { - auto otherCell = theOuterNeighbors[j]; - cells[otherCell].find_ntuplets(hh, cells, foundNtuplets, apc, tupleMultiplicity, + if(outerNeighbors().size()>0) { + for (int j = 0; j < outerNeighbors().size(); ++j) { + auto otherCell = outerNeighbors()[j]; + cells[otherCell].find_ntuplets(hh, cells, cellTracks, foundNtuplets, apc, tupleMultiplicity, tmpNtuplet, minHitsPerNtuplet); } } else { // if long enough save... @@ -194,7 +228,7 @@ class GPUCACell { hits[nh] = theOuterHitId; auto it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); if (it>=0) { // if negative is overflow.... 
- for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + for (auto c : tmpNtuplet) cells[c].addTrack(it,cellTracks); tupleMultiplicity.countDirect(tmpNtuplet.size()+1); } } @@ -206,9 +240,11 @@ class GPUCACell { #endif // __CUDACC__ - GPU::VecArray< uint32_t, 36> theOuterNeighbors; - GPU::VecArray< uint16_t, 42> theTracks; +private: + CellNeighbors theOuterNeighbors; + CellTracks theTracks; +public: int32_t theDoubletId; int32_t theLayerPairId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 393eb9b020c3d..b2eab7626279e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -9,27 +9,10 @@ void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMu assert(tuples_d); assert(tupleMultiplicity_d); assert(helix_fit_results_d); - cudaCheck(cudaMalloc(&hitsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); - cudaCheck(cudaMemset(hitsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); - - cudaCheck(cudaMalloc(&hits_geGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); - cudaCheck(cudaMemset(hits_geGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); - - cudaCheck(cudaMalloc(&fast_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); - cudaCheck(cudaMemset(fast_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); - - cudaCheck(cudaMalloc(&circle_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); - cudaCheck(cudaMemset(circle_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); - } void HelixFitOnGPU::deallocateOnGPU() { - cudaFree(hitsGPU_); - cudaFree(hits_geGPU_); - cudaFree(fast_fit_resultsGPU_); - cudaFree(circle_fit_resultsGPU_); - } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h 
b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 40a62c4c1c723..a50116c87321d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,9 +4,11 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" -namespace siPixelRecHitsHeterogeneousProduct { - struct HitsOnCPU; -} +#include + +class TrackingRecHit2DSOAView; +class TrackingRecHit2DCUDA; + namespace Rfit { constexpr uint32_t maxNumberOfConcurrentFits() { return 6*1024;} @@ -35,8 +37,8 @@ namespace Rfit { class HelixFitOnGPU { public: - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using TupleMultiplicity = CAConstants::TupleMultiplicity; @@ -45,8 +47,8 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU();} void setBField(double bField) { bField_ = bField;} - void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); + void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); void allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results); void deallocateOnGPU(); @@ -62,12 +64,6 @@ class HelixFitOnGPU { double bField_; Rfit::helix_fit * helix_fit_results_d = nullptr; - // Riemann Fit internals - double *hitsGPU_ = nullptr; - float 
*hits_geGPU_ = nullptr; - double *fast_fit_resultsGPU_ = nullptr; - Rfit::circle_fit *circle_fit_resultsGPU_ = nullptr; - const bool fit5as4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h index d27a639a5a9bf..f7538fc822011 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -8,6 +8,9 @@ #include #include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" +#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiStripCluster/interface/SiStripCluster.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" // store T for each cluster diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index e9111cb0b5db1..ea801b1b46389 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -11,12 +11,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; -using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + +using HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; @@ -66,10 +68,10 @@ void kernelFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, auto hit 
= hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; - hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; } Rfit::Fast_fit(hits,fast_fit); @@ -190,85 +192,93 @@ void kernelLineFit( } -void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) { - assert(tuples_d); assert(fast_fit_resultsGPU_); + assert(tuples_d); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + // Fit internals + edm::Service cs; + auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); + auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); + auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); + auto circle_fit_resultsGPU_holder = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit),stream); + Rfit::circle_fit * circle_fit_resultsGPU_ = (Rfit::circle_fit*)(circle_fit_resultsGPU_holder.get()); + for (uint32_t offset=0; offset<<>>( + kernelFastFit<3><<>>( tuples_d, tupleMultiplicity_d, 3, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), 
fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>( + kernelCircleFit<3><<>>( tupleMultiplicity_d, 3, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<3><<>>( + kernelLineFit<3><<>>( tupleMultiplicity_d, 3, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 4, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( + kernelCircleFit<4><<>>( tupleMultiplicity_d, 4, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( + kernelLineFit<4><<>>( tupleMultiplicity_d, 4, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 5, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( + kernelCircleFit<4><<>>( tupleMultiplicity_d, 5, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + 
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( + kernelLineFit<4><<>>( tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>( + kernelFastFit<5><<>>( tuples_d, tupleMultiplicity_d, 5, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>( + kernelCircleFit<5><<>>( tupleMultiplicity_d, 5, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>( + kernelLineFit<5><<>>( tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 4368bc12ab5f8..e5f8406bd31b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -10,7 +10,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "DataFormats/Math/interface/approx_atan2.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "GPUCACell.h" @@ -32,8 +31,7 @@ namespace gpuPixelDoublets { auto const & hh = *hhp; 
- uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; - auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; + auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id);}; // x run faster... auto idy = threadIdx.y + blockIdx.y * blockDim.y; @@ -54,7 +52,7 @@ namespace gpuPixelDoublets { auto sg=0; for (uint32_t ic=0; ic -#include -#include -#include -#include +#include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h" -#include "DataFormats/Math/interface/approx_atan2.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - -#include "GPUCACell.h" -#include "CAConstants.h" - - -// useful for benchmark -// #define ONLY_PHICUT -// #define USE_ZCUT -// #define NO_CLSCUT +#define CONSTANT_VAR __constant__ namespace gpuPixelDoublets { - constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant - - template - __device__ - __forceinline__ - void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, - uint32_t nPairs, - GPUCACell * cells, - uint32_t * nCells, - int16_t const * __restrict__ iphi, - Hist const & __restrict__ hist, - uint32_t const * __restrict__ offsets, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - int16_t const * __restrict__ phicuts, -#ifdef USE_ZCUT - float const * __restrict__ minz, - float const * __restrict__ maxz, -#endif - float const * __restrict__ maxr, bool ideal_cond) - { - -#ifndef NO_CLSCUT - // ysize cuts (z in the barrel) times 8 - constexpr int minYsizeB1=36; - constexpr int minYsizeB2=28; - constexpr int maxDYsize12=28; - constexpr int maxDYsize=20; -#endif - - auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; - - // nPairsMax to be optimized later 
(originally was 64). - // If it should be much bigger, consider using a block-wide parallel prefix scan, - // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = 16; - assert(nPairs <= nPairsMax); - uint32_t innerLayerCumulativeSize[nPairsMax]; - innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); - for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); - } - auto ntot = innerLayerCumulativeSize[nPairs-1]; - - // x runs faster - auto idy = blockIdx.y * blockDim.y + threadIdx.y; - auto first = threadIdx.x; - auto stride = blockDim.x; - for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { - - uint32_t pairLayerId=0; - while (j >= innerLayerCumulativeSize[pairLayerId++]); - --pairLayerId; // move to lower_bound ?? - - assert(pairLayerId < nPairs); - assert(j < innerLayerCumulativeSize[pairLayerId]); - assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); - - uint8_t inner = layerPairs[2*pairLayerId]; - uint8_t outer = layerPairs[2*pairLayerId+1]; - assert(outer > inner); - - auto hoff = Hist::histOff(outer); - - auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; - i += offsets[inner]; - - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - - assert(i >= offsets[inner]); - assert(i < offsets[inner+1]); - - // found hit corresponding to our cuda thread, now do the job - auto mez = __ldg(hh.zg_d+i); - -#ifdef USE_ZCUT - // this statement is responsible for a 10% slow down of the kernel once all following cuts are optimized... - if (mezmaxz[pairLayerId]) continue; -#endif - -#ifndef NO_CLSCUT - auto mes = __ldg(hh.ysize_d+i); - - // if ideal treat inner ladder as outer - auto mi = __ldg(hh.detInd_d+i); - if (inner==0) assert(mi<96); - const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... 
- - // auto mesx = __ldg(hh.xsize_d+i); - // if (mesx<0) continue; // remove edges in x as overlap will take care - - if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 - if (mes>0 && mes3) // B2 and F1 - if (mes>0 && mes (ro-ri)*(ro-ri); - }; - auto z0cutoff = [&](int j) { - auto zo = __ldg(hh.zg_d+j); - auto ro = __ldg(hh.rg_d+j); - auto dr = ro-mer; - return dr > maxr[pairLayerId] || - dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; - }; - -#ifndef NO_CLSCUT - auto zsizeCut = [&](int j) { - auto onlyBarrel = outer<4; - auto so = __ldg(hh.ysize_d+j); - //auto sox = __ldg(hh.xsize_d+j); - auto dy = inner==0 ? ( isOuterLadder ? maxDYsize12: 100 ) : maxDYsize; - return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; - }; -#endif - - auto iphicut = phicuts[pairLayerId]; - - auto kl = Hist::bin(int16_t(mep-iphicut)); - auto kh = Hist::bin(int16_t(mep+iphicut)); - auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; - -#ifdef GPU_DEBUG - int tot = 0; - int nmin = 0; - int tooMany=0; -#endif - - auto khh = kh; - incr(khh); - for (auto kk = kl; kk != khh; incr(kk)) { -#ifdef GPU_DEBUG - if (kk != kl && kk != kh) - nmin += hist.size(kk+hoff); -#endif - auto const * __restrict__ p = hist.begin(kk+hoff); - auto const * __restrict__ e = hist.end(kk+hoff); - p+=first; - for (;p < e; p+=stride) { - auto oi=__ldg(p); - assert(oi>=offsets[outer]); - assert(oi iphicut) - continue; -#ifndef ONLY_PHICUT -#ifndef NO_CLSCUT - if (zsizeCut(oi)) continue; -#endif - if (z0cutoff(oi) || ptcut(oi)) continue; -#endif - auto ind = atomicAdd(nCells, 1); - if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? 
- // int layerPairId, int doubletId, int innerHitId, int outerHitId) - cells[ind].init(hh, pairLayerId, ind, i, oi); - isOuterHitOfCell[oi].push_back(ind); -#ifdef GPU_DEBUG - if (isOuterHitOfCell[oi].full()) ++tooMany; - ++tot; -#endif - } - } -#ifdef GPU_DEBUG - if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); -#endif - } // loop in block... - } + using namespace gpuPixelDoubletsAlgos; - constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y - constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; - - __global__ - __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP) - void getDoubletsFromHisto(GPUCACell * cells, - uint32_t * nCells, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - bool ideal_cond) - { constexpr int nPairs = 13; - constexpr const uint8_t layerPairs[2*nPairs] = { + CONSTANT_VAR const uint8_t layerPairs[2*nPairs] = { 0, 1, 1, 2, 2, 3, // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, // neg @@ -228,41 +21,71 @@ namespace gpuPixelDoublets { constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - constexpr const int16_t phicuts[nPairs] { + CONSTANT_VAR const int16_t phicuts[nPairs] { phi0p05, phi0p05, phi0p06, phi0p07, phi0p06, phi0p06, phi0p05, phi0p05, phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 }; -#ifdef USE_ZCUT - float const minz[nPairs] = { + CONSTANT_VAR float const minz[nPairs] = { -20., -22., -22., -30., -30.,-30., -70., -70., 0., 10., 15., -70., -70. }; - float const maxz[nPairs] = { + CONSTANT_VAR float const maxz[nPairs] = { 20., 22., 22., 0., -10., -15., 70., 70., 30., 30., 30., 70., 70. }; -#endif - float const maxr[nPairs] = { + CONSTANT_VAR float const maxr[nPairs] = { 20., 20., 20., 9., 7., 6., 5., 5., 9., 7., 6., 5., 5. 
}; + + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant + + constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); + + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __global__ + void initDoublets(GPUCACell::OuterHitOfCell * isOuterHitOfCell, int nHits, + CellNeighborsVector * cellNeighbors, CellNeighbors * cellNeighborsContainer, + CellTracksVector * cellTracks, CellTracks * cellTracksContainer + ) + { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i=first; i +#include +#include +#include +#include + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + +#include "GPUCACell.h" +#include "CAConstants.h" + + +// useful for benchmark +// #define ONLY_PHICUT +// #define NO_ZCUT +// #define NO_CLSCUT + +namespace gpuPixelDoubletsAlgos { + + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant + + constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); + + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __device__ + __forceinline__ + void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell * cells, + uint32_t * nCells, + CellNeighborsVector * cellNeighbors, CellTracksVector * cellTracks, + TrackingRecHit2DSOAView const & __restrict__ hh, + GPUCACell::OuterHitOfCell * isOuterHitOfCell, + int16_t const * __restrict__ 
phicuts, + float const * __restrict__ minz, + float const * __restrict__ maxz, + float const * __restrict__ maxr, + bool ideal_cond) + { + +#ifndef NO_CLSCUT + // ysize cuts (z in the barrel) times 8 + constexpr int minYsizeB1=36; + constexpr int minYsizeB2=28; + constexpr int maxDYsize12=28; + constexpr int maxDYsize=20; +#endif + + using Hist = TrackingRecHit2DSOAView::Hist; + + auto const & __restrict__ hist = hh.phiBinner(); + uint32_t const * __restrict__ offsets = hh.hitsLayerStart(); + assert(offsets); + + auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; + + // nPairsMax to be optimized later (originally was 64). + // If it should be much bigger, consider using a block-wide parallel prefix scan, + // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + const int nPairsMax = 16; + assert(nPairs <= nPairsMax); + __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; + __shared__ uint32_t ntot; + if (threadIdx.y==0 && threadIdx.x==0) { + innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + for (uint32_t i = 1; i < nPairs; ++i) { + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); + } + ntot = innerLayerCumulativeSize[nPairs-1]; + } + __syncthreads(); + + // x runs faster + auto idy = blockIdx.y * blockDim.y + threadIdx.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { + + uint32_t pairLayerId=0; + while (j >= innerLayerCumulativeSize[pairLayerId++]); + --pairLayerId; // move to lower_bound ?? + + assert(pairLayerId < nPairs); + assert(j < innerLayerCumulativeSize[pairLayerId]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); + + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer > inner); + + auto hoff = Hist::histOff(outer); + + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; + + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + + assert(i >= offsets[inner]); + assert(i < offsets[inner+1]); + + // found hit corresponding to our cuda thread, now do the job + auto mez = hh.zGlobal(i); + +#ifndef NO_ZCUT + if (mezmaxz[pairLayerId]) continue; +#endif + +#ifndef NO_CLSCUT + auto mes = hh.clusterSizeY(i); + + // if ideal treat inner ladder as outer + auto mi = hh.detectorIndex(i); + if (inner==0) assert(mi<96); + const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... + + + if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 + if (mes>0 && mes3) // B2 and F1 + if (mes>0 && mes (ro-ri)*(ro-ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh.zGlobal(j);; + auto ro = hh.rGlobal(j); + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; + +#ifndef NO_CLSCUT + auto zsizeCut = [&](int j) { + auto onlyBarrel = outer<4; + auto so = hh.clusterSizeY(j); + auto dy = inner==0 ? ( isOuterLadder ? 
maxDYsize12: 100 ) : maxDYsize; + return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; + }; +#endif + + auto iphicut = phicuts[pairLayerId]; + + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); + auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; + +#ifdef GPU_DEBUG + int tot = 0; + int nmin = 0; + int tooMany=0; +#endif + + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { +#ifdef GPU_DEBUG + if (kk != kl && kk != kh) + nmin += hist.size(kk+hoff); +#endif + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + p+=first; + for (;p < e; p+=stride) { + auto oi=__ldg(p); + assert(oi>=offsets[outer]); + assert(oi iphicut) + continue; +#ifndef ONLY_PHICUT +#ifndef NO_CLSCUT + if (zsizeCut(oi)) continue; +#endif + if (z0cutoff(oi) || ptcut(oi)) continue; +#endif + auto ind = atomicAdd(nCells, 1); + if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? + // int layerPairId, int doubletId, int innerHitId, int outerHitId) + cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi); + isOuterHitOfCell[oi].push_back(ind); +#ifdef GPU_DEBUG + if (isOuterHitOfCell[oi].full()) ++tooMany; + ++tot; +#endif + } + } +#ifdef GPU_DEBUG + if (tooMany > 0) + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); +#endif + } // loop in block... 
+ } + +} // namespace end + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h b/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h index a1c4a26f2fff6..54851ecda1179 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h @@ -12,9 +12,7 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h" -namespace siPixelRecHitsHeterogeneousProduct { - struct HitsOnGPU; -} +class TrackingRecHit2DSOAView; namespace pixelTuplesHeterogeneousProduct { @@ -42,7 +40,7 @@ namespace pixelTuplesHeterogeneousProduct { using Container = TuplesOnGPU::Container; - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hitsOnGPU_d = nullptr; // forwarding + TrackingRecHit2DSOAView const * hitsOnGPU_d = nullptr; // forwarding Container const * tuples = nullptr;