From 68a82524dcf31698ee8cdedc9b7d1859404c9e11 Mon Sep 17 00:00:00 2001 From: Felice Pantaleo Date: Thu, 13 Sep 2018 16:40:00 +0200 Subject: [PATCH] reduce the maximum number of Pixel hits in CA (#163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change approach: use one global array of hits and isOuterHitOfCell instead of one per layer. This allows to allocate memory for the maximum number of hits over all layers, instead of the maximum number of hits per layer times the number of layers. The distribution of the number of pixel hits for data at pileup 50 is reasonably Gaussian around 18300 ± 3500; set the the maxNumberOfHits to 40000, which is > 6 σ away. Start moving constants for pixel-related GPU algorithms to a dedicated header file. --- .../interface/PixelTrackingGPUConstants.h | 11 +++++++++ .../plugins/CAHitQuadrupletGeneratorGPU.cu | 24 +++++++++---------- .../plugins/CAHitQuadrupletGeneratorGPU.h | 8 +++---- 3 files changed, 26 insertions(+), 17 deletions(-) create mode 100644 RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h b/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h new file mode 100644 index 0000000000000..ceb831d7865b6 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h @@ -0,0 +1,11 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h +#define RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h + +#include + +namespace PixelGPUConstants { + constexpr uint16_t maxNumberOfHits = 40000; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away + +} + +#endif // RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 4b87896bc6f3b..ba387c4a25b8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -14,7 +14,7 @@ __global__ void kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, GPUCACell *cells, uint32_t const * nCells, GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, - uint32_t nHits) { + uint32_t nHits, uint32_t maxNumberOfDoublets) { auto idx = threadIdx.x + blockIdx.x * blockDim.x; #ifdef GPU_DEBUG @@ -28,9 +28,8 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, } if (idx < nHits) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; - printf("OuterHitOfCell overflow %d\n", idx); + printf("OuterHitOfCell overflow %d\n", idx); } - } @@ -38,7 +37,7 @@ __global__ void kernel_connect(GPU::SimpleVector *foundNtuplets, GPUCACell *cells, uint32_t const * nCells, GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, - float ptmin, + float ptmin, float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { @@ -93,7 +92,7 @@ kernel_print_found_ntuplets(GPU::SimpleVector *foundNtuplets, int ma (*foundNtuplets)[i].hitId[2], (*foundNtuplets)[i].hitId[3] ); - + } } @@ -124,9 +123,9 @@ void CAHitQuadrupletGeneratorGPU::allocateOnGPU() cudaCheck(cudaMemset(device_nCells_, 0, sizeof(uint32_t))); cudaCheck(cudaMalloc(&device_isOuterHitOfCell_, - maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray))); + PixelGPUConstants::maxNumberOfHits * sizeof(GPU::VecArray))); cudaCheck(cudaMemset(device_isOuterHitOfCell_, 0, - maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray))); + PixelGPUConstants::maxNumberOfHits * sizeof(GPU::VecArray))); h_foundNtupletsVec_.resize(maxNumberOfRegions_); h_foundNtupletsData_.resize(maxNumberOfRegions_); @@ -158,15 +157,15 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, h_foundNtupletsVec_[regionIndex]->reset(); auto nhits = hh.nHits; - + assert(nhits <= PixelGPUConstants::maxNumberOfHits); auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512; kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel device_theCells_, device_nCells_, device_isOuterHitOfCell_, - region.ptMin(), + region.ptMin(), region.originRBound(), caThetaCut, caPhiCut, caHardPtCut, - maxNumberOfDoublets_, maxNumberOfHits_ + maxNumberOfDoublets_, PixelGPUConstants::maxNumberOfHits ); cudaCheck(cudaGetLastError()); @@ -181,7 +180,8 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], device_theCells_, device_nCells_, - device_isOuterHitOfCell_, nhits + device_isOuterHitOfCell_, nhits, + maxNumberOfDoublets_ ); @@ -201,7 +201,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, void CAHitQuadrupletGeneratorGPU::cleanup(cudaStream_t cudaStream) { // this lazily resets temporary memory for the next event, and is not needed for reading the output cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0, - maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray), + PixelGPUConstants::maxNumberOfHits * sizeof(GPU::VecArray), cudaStream)); cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream)); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 013581fdfd5b6..c06f1be2f257b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -7,9 +7,11 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/RZLine.h" #include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" +#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" #include "RecoTracker/TkHitPairs/interface/HitPairGeneratorFromLayerPair.h" #include "RecoTracker/TkHitPairs/interface/IntermediateHitDoublets.h" #include "RecoTracker/TkHitPairs/interface/LayerHitMapCache.h" @@ -18,8 +20,6 @@ #include "RecoTracker/TkSeedGenerator/interface/FastCircleFit.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" - #include "GPUCACell.h" @@ -67,7 +67,6 @@ class CAHitQuadrupletGeneratorGPU { void deallocateOnGPU(); private: -// LayerCacheType theLayerCache; std::unique_ptr theComparitor; @@ -151,11 +150,10 @@ class CAHitQuadrupletGeneratorGPU { const float caHardPtCut = 0.f; static constexpr int maxNumberOfQuadruplets_ = 10000; - static constexpr int maxCellsPerHit_ = 256; // 2048; // 512; + static constexpr int maxCellsPerHit_ = 256; static constexpr int maxNumberOfLayerPairs_ = 13; static constexpr int maxNumberOfLayers_ = 10; static constexpr int maxNumberOfDoublets_ = 262144; - static constexpr int maxNumberOfHits_ = 20000; static constexpr int maxNumberOfRegions_ = 2; std::vector*> h_foundNtupletsVec_;