diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 593821fe805ed..f9b4b2f8a4c16 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -27,6 +27,13 @@ namespace cms { extern thread_local dim3 blockIdx; extern thread_local dim3 gridDim; + template + T1 atomicCAS(T1* address, T1 compare, T2 val) { + T1 old = *address; + *address = old == compare ? val : old; + return old; + } + template T1 atomicInc(T1* a, T2 b) { auto ret = *a; diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 6d4d5f4e7cc5e..33dc6a18ffa2a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -41,9 +41,9 @@ namespace cms { namespace cuda { // limited to 32*32 elements.... - template - __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci, - T* __restrict__ co, + template + __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci, + VT* co, uint32_t size, T* ws #ifndef __CUDA_ARCH__ @@ -138,7 +138,9 @@ namespace cms { // in principle not limited.... template - __global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) { + __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) { + volatile T const* ci = ici; + volatile T* co = ico; __shared__ T ws[32]; #ifdef __CUDA_ARCH__ assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below @@ -152,6 +154,7 @@ namespace cms { // count blocks that finished __shared__ bool isLastBlockDone; if (0 == threadIdx.x) { + __threadfence(); auto value = atomicAdd(pc, 1); // block counter isLastBlockDone = (value == (int(gridDim.x) - 1)); } diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h index 8158f414b07d4..3770dbac574d9 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h @@ -13,11 +13,11 @@ namespace cms::cuda::allocator { // Use caching or not constexpr bool useCaching = true; // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 8; + constexpr unsigned int binGrowth = 2; // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 1; + constexpr unsigned int minBin = 8; // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 10; + constexpr unsigned int maxBin = 30; // Total storage for the allocator. 0 means no limit. constexpr size_t maxCachedBytes = 0; // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index fce0c23596137..0ebbdf3ed3705 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -27,7 +27,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } @@ -37,7 +37,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } @@ -49,7 +49,7 @@ namespace CAConstants { #ifndef ONLY_PHICUT using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #else using CellNeighbors = cms::cuda::VecArray; using CellTracks = cms::cuda::VecArray; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 4eafb6dccd31c..4d4791b87ad3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_isOuterHitOfCell_.reset( (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); + + cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); @@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr assert(nActualPairs <= gpuPixelDoublets::nPairs); gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -101,7 +109,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -129,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 541ab5ed905f5..8a213eee2f579 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif + + // free space asap + // device_isOuterHitOfCell_.reset(); } template <> @@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); + + cellStorage_ = cms::cuda::make_device_unique( + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + { int threadsPerBlock = 128; // at least one block! int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); cudaCheck(cudaGetLastError()); } @@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr dim3 thrs(stride, threadsPerBlock, 1); gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index e112e9d17adeb..7ab3ed010927e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -179,10 +179,11 @@ class CAHitNtupletGeneratorKernels { private: // workspace - CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; - unique_ptr device_theCellNeighborsContainer_; - CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; - unique_ptr device_theCellTracksContainer_; + unique_ptr cellStorage_; + unique_ptr device_theCellNeighbors_; + CAConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CAConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 05bf4f09f7f93..fb750267f5c37 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -12,12 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - /* not used at the moment - cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); - cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); - */ + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); device_hitToTuple_ = Traits::template make_unique(stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 654b37c076f99..691395887dddb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -79,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 4a8240706efc2..3e16728a002dd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -168,14 +168,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 6e1c2a587e212..e913b77fe0953 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -56,24 +56,56 @@ class GPUCACell { theInnerZ = hh.zGlobal(innerHitId); theInnerR = hh.rGlobal(innerHitId); - outerNeighbors().reset(); - tracks().reset(); + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; assert(outerNeighbors().empty()); assert(tracks().empty()); } __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe waisted.... + if (i > 0) { + cellNeighbors[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellNeighbors[0]); + atomicCAS((ptrAsInt*)(&theOuterNeighbors), + zero, + (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif + } else + return -1; + } + __threadfence(); return outerNeighbors().push_back(t); } __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe waisted.... + if (i > 0) { + cellTracks[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellTracks[0]); + atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... +#else + theTracks = &cellTracks[i]; +#endif + } else + return -1; + } + __threadfence(); return tracks().push_back(t); } - __device__ __forceinline__ CellTracks& tracks() { return theTracks; } - __device__ __forceinline__ CellTracks const& tracks() const { return theTracks; } - __device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; } - __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; } + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } @@ -297,8 +329,8 @@ class GPUCACell { } private: - CellNeighbors theOuterNeighbors; - CellTracks theTracks; + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; public: int32_t theDoubletId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 8e0b05dcb6c8a..5b0d3e8833a52 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -73,6 +73,17 @@ namespace gpuPixelDoublets { int first = blockIdx.x * blockDim.x + threadIdx.x; for (int i = first; i < nHits; i += gridDim.x * blockDim.x) isOuterHitOfCell[i].reset(); + + if (0 == first) { + cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); + cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + auto i = cellNeighbors->extend(); + assert(0 == i); + (*cellNeighbors)[0].reset(); + i = cellTracks->extend(); + assert(0 == i); + (*cellTracks)[0].reset(); + } } constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9b8b315e93937..92fa4370faa70 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -27,3 +27,8 @@ + + + + + diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp new file mode 100644 index 0000000000000..5c57eb7005691 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp @@ -0,0 +1,25 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h" + +#include +#include + +template +void print() { + std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl; +} + +int main() { + using namespace CAConstants; + + print(); + print(); + print(); + print(); + print(); + print(); + print(); + + print(); + + return 0; +}