From defe0756d3d12d912f1976571ca0a4ff9a367a81 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 13 Aug 2018 16:19:08 +0200 Subject: [PATCH] Speed up CPU side of GPU rechits (#125) - allocate HitsOnCPU buffers once per job per edm stream - coalesce multiple 32 bit and multiple 16 bit rechit buffers to two larger buffers; the allocation is done with cudaMallocPitch, the transfer with cudaMemcpy2DAsync - initialise the full memory buffer to keep cuda-memcheck happy --- .../SiPixelRecHits/plugins/PixelRecHits.cu | 125 +++++++++++------- .../SiPixelRecHits/plugins/PixelRecHits.h | 22 ++- 2 files changed, 96 insertions(+), 51 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 01bac5534035d..03821b9b16fb0 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -24,27 +24,52 @@ namespace { hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules]; } } + + template + T *slicePitch(void *ptr, size_t pitch, size_t row) { + return reinterpret_cast( reinterpret_cast(ptr) + pitch*row); + } } namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { + constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; + cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); - cudaCheck(cudaMalloc((void **) & gpu_.charge_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.xg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.zg_d, (gpuClustering::MaxNumModules * 
256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.rg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.xl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, (gpuClustering::MaxNumModules * 256) * sizeof(int16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.mr_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.mc_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); + + // Coalesce all 32bit and 16bit arrays to two big blobs + // + // This is just a toy. Please don't copy-paste the logic but + // create a proper abstraction (e.g. 
along FWCore/SOA, or + // FWCore/Utilities/interface/SoATuple.h + // + // Order such that the first ones are the ones transferred to CPU + static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious + cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9)); + cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id())); + //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_; + gpu_.charge_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0); + gpu_.xl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1); + gpu_.yl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2); + gpu_.xerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3); + gpu_.yerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4); + gpu_.xg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5); + gpu_.yg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6); + gpu_.zg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7); + gpu_.rg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8); + + // Order such that the first ones are the ones transferred to CPU + cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5)); + cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*5, cudaStream.id())); + //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_; + gpu_.detInd_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0); + gpu_.mr_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1); + gpu_.mc_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2); + gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); + gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); + cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); 
cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; @@ -57,6 +82,23 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id())); cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t))); + + // On CPU we can safely use MAX_HITS*sizeof as the pitch. Thanks + // to '*256' it is even aligned by cache line + h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t); + cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5)); + h_charge_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 0); + h_xl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 1); + h_yl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 2); + h_xe_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 3); + h_ye_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 4); + + h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t); + cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3)); + h_detInd_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 0); + h_mr_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 1); + h_mc_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 2); + #ifdef GPU_DEBUG cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t))); #endif @@ -64,25 +106,15 @@ namespace pixelgpudetails { PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { cudaCheck(cudaFree(gpu_.bs_d)); cudaCheck(cudaFree(gpu_.hitsLayerStart_d)); - cudaCheck(cudaFree(gpu_.charge_d)); - cudaCheck(cudaFree(gpu_.detInd_d)); - cudaCheck(cudaFree(gpu_.xg_d)); - cudaCheck(cudaFree(gpu_.yg_d)); - cudaCheck(cudaFree(gpu_.zg_d)); - cudaCheck(cudaFree(gpu_.rg_d)); - cudaCheck(cudaFree(gpu_.xl_d)); - cudaCheck(cudaFree(gpu_.yl_d)); - cudaCheck(cudaFree(gpu_.xerr_d)); - cudaCheck(cudaFree(gpu_.yerr_d)); - cudaCheck(cudaFree(gpu_.iphi_d)); - cudaCheck(cudaFree(gpu_.sortIndex_d)); - cudaCheck(cudaFree(gpu_.mr_d)); - 
cudaCheck(cudaFree(gpu_.mc_d)); + cudaCheck(cudaFree(gpu_.owner_32bit_)); + cudaCheck(cudaFree(gpu_.owner_16bit_)); cudaCheck(cudaFree(gpu_.hist_d)); cudaCheck(cudaFree(gpu_d)); cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); cudaCheck(cudaFreeHost(h_hitsModuleStart_)); + cudaCheck(cudaFreeHost(h_owner_32bit_)); + cudaCheck(cudaFreeHost(h_owner_16bit_)); #ifdef GPU_DEBUG cudaCheck(cudaFreeHost(h_hitsLayerStart_)); #endif @@ -92,9 +124,9 @@ namespace pixelgpudetails { float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusModuleStart_d; - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); + gpu_.hitsModuleStart_d = input.clusModuleStart_d; + cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) @@ -123,20 +155,21 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // needed only if hits on CPU are required... 
+ nhits_ = input.nClusters; cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #endif - auto nhits = input.nClusters; - cpu_ = std::make_unique(nhits); - cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits * sizeof(int32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + + cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_, + gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, + nhits_*sizeof(uint16_t), 3, + cudaMemcpyDefault, stream.id())); + + cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_, + gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, + nhits_*sizeof(uint32_t), 5, + cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG cudaStreamSynchronize(stream.id()); @@ -148,15 +181,9 @@ namespace pixelgpudetails { // for timing test // cudaStreamSynchronize(stream.id()); - // auto nhits = h_hitsLayerStart_[10]; + // auto nhits_ = h_hitsLayerStart_[10]; // radixSortMultiWrapper<<<10, 256, 0, 
c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); - cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits, 256, stream.id()); - } - - std::unique_ptr&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) { - cpu_->gpu_d = gpu_d; - memcpy(cpu_->hitsModuleStart, h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)); - return std::move(cpu_); + cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 9e66a9a89a17a..3d49e5138d84a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -36,14 +36,32 @@ namespace pixelgpudetails { pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream); - std::unique_ptr&& getOutput(cuda::stream_t<>& stream); + HitsOnCPU getOutput() const { + return HitsOnCPU{ + h_hitsModuleStart_, h_detInd_, h_charge_, + h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_, + gpu_d, nhits_ + }; + } private: HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product" HitsOnGPU gpu_; - std::unique_ptr cpu_; + uint32_t nhits_ = 0; uint32_t *d_phase1TopologyLayerStart_ = nullptr; uint32_t *h_hitsModuleStart_ = nullptr; + uint16_t *h_detInd_ = nullptr; + int32_t *h_charge_ = nullptr; + float *h_xl_ = nullptr; + float *h_yl_ = nullptr; + float *h_xe_ = nullptr; + float *h_ye_ = nullptr; + uint16_t *h_mr_ = nullptr; + uint16_t *h_mc_ = nullptr; + void *h_owner_32bit_ = nullptr; + size_t h_owner_32bit_pitch_ = 0; + void *h_owner_16bit_ = nullptr; + size_t h_owner_16bit_pitch_ = 0; #ifdef GPU_DEBUG uint32_t *h_hitsLayerStart_ = nullptr; #endif