From fcfce445f5cb00421006af7905a017d410ee38b9 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 9 Aug 2018 11:29:24 +0200 Subject: [PATCH 1/4] Make rechits more async by moving the memory allocation to first --- .../SiPixelRecHits/plugins/PixelRecHits.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 01bac5534035d..d42bf7297dd57 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -92,9 +92,14 @@ namespace pixelgpudetails { float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusModuleStart_d; - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + // memory allocation needs to be first to not to device-synchronize in between + // even better would be to avoid the allocation... + const auto nhits = input.nClusters; + cpu_ = std::make_unique(nhits); + + cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); + gpu_.hitsModuleStart_d = input.clusModuleStart_d; + cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) @@ -127,8 +132,6 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #endif - auto nhits = input.nClusters; - cpu_ = std::make_unique(nhits); cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits * sizeof(int32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); From 4cf67743bb1547dfe2152acb8a495d4bc8cbcc9f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 9 Aug 2018 13:38:00 +0200 Subject: [PATCH 2/4] Allocate HitsOnCPU buffers once per job per EDM stream --- .../SiPixelRecHits/plugins/PixelRecHits.cu | 78 ++++++++++--------- .../SiPixelRecHits/plugins/PixelRecHits.h | 18 ++++- .../plugins/SiPixelRecHitHeterogeneous.cc | 2 +- .../siPixelRecHitsHeterogeneousProduct.h | 32 +++----- 4 files changed, 70 insertions(+), 60 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index d42bf7297dd57..648278d6e53ba 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -29,22 +29,24 @@ namespace { namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { + constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; + cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); - cudaCheck(cudaMalloc((void **) & gpu_.charge_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.xg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.zg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.rg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.xl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, (gpuClustering::MaxNumModules * 256) * sizeof(int16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.mr_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.mc_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t))); + cudaCheck(cudaMalloc((void **) & gpu_.charge_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, MAX_HITS * sizeof(uint16_t))); + cudaCheck(cudaMalloc((void **) & gpu_.xg_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.yg_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.zg_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.rg_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.xl_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.yl_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, MAX_HITS * sizeof(float))); + cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, MAX_HITS * sizeof(int16_t))); + cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, MAX_HITS * sizeof(uint16_t))); + cudaCheck(cudaMalloc((void **) & gpu_.mr_d, MAX_HITS * sizeof(uint16_t))); + cudaCheck(cudaMalloc((void **) & gpu_.mc_d, MAX_HITS * sizeof(uint16_t))); cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; @@ -57,6 +59,14 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id())); cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t))); + cudaCheck(cudaMallocHost(&h_detInd_, MAX_HITS*sizeof(uint16_t))); + cudaCheck(cudaMallocHost(&h_charge_, MAX_HITS*sizeof(int32_t))); + cudaCheck(cudaMallocHost(&h_xl_, MAX_HITS*sizeof(float))); + cudaCheck(cudaMallocHost(&h_yl_, MAX_HITS*sizeof(float))); + cudaCheck(cudaMallocHost(&h_xe_, MAX_HITS*sizeof(float))); + cudaCheck(cudaMallocHost(&h_ye_, MAX_HITS*sizeof(float))); + cudaCheck(cudaMallocHost(&h_mr_, MAX_HITS*sizeof(uint16_t))); + cudaCheck(cudaMallocHost(&h_mc_, MAX_HITS*sizeof(uint16_t))); #ifdef GPU_DEBUG cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t))); #endif @@ -83,6 +93,14 @@ namespace pixelgpudetails { cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); cudaCheck(cudaFreeHost(h_hitsModuleStart_)); + cudaCheck(cudaFreeHost(h_detInd_)); + cudaCheck(cudaFreeHost(h_charge_)); + cudaCheck(cudaFreeHost(h_xl_)); + cudaCheck(cudaFreeHost(h_yl_)); + cudaCheck(cudaFreeHost(h_xe_)); + cudaCheck(cudaFreeHost(h_ye_)); + cudaCheck(cudaFreeHost(h_mr_)); + cudaCheck(cudaFreeHost(h_mc_)); #ifdef GPU_DEBUG cudaCheck(cudaFreeHost(h_hitsLayerStart_)); #endif @@ -92,11 +110,6 @@ namespace pixelgpudetails { float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream) { - // memory allocation needs to be first to not to device-synchronize in between - // even better would be to avoid the allocation... - const auto nhits = input.nClusters; - cpu_ = std::make_unique(nhits); - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); gpu_.hitsModuleStart_d = input.clusModuleStart_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); @@ -128,18 +141,19 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // needed only if hits on CPU are required... + nhits_ = input.nClusters; cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #endif - cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits * sizeof(int32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_detInd_, gpu_.detInd_d, nhits_*sizeof(int16_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_charge_, gpu_.charge_d, nhits_ * sizeof(int32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_xl_, gpu_.xl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_yl_, gpu_.yl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_xe_, gpu_.xerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_ye_, gpu_.yerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_mr_, gpu_.mr_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(h_mc_, gpu_.mc_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG cudaStreamSynchronize(stream.id()); @@ -151,15 +165,9 @@ namespace pixelgpudetails { // for timing test // cudaStreamSynchronize(stream.id()); - // auto nhits = h_hitsLayerStart_[10]; + // auto nhits_ = h_hitsLayerStart_[10]; // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); - cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits, 256, stream.id()); - } - - std::unique_ptr&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) { - cpu_->gpu_d = gpu_d; - memcpy(cpu_->hitsModuleStart, h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)); - return std::move(cpu_); + cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 9e66a9a89a17a..0921a41a1fd89 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -36,14 +36,28 @@ namespace pixelgpudetails { pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream); - std::unique_ptr&& getOutput(cuda::stream_t<>& stream); + HitsOnCPU getOutput() const { + return HitsOnCPU{ + h_hitsModuleStart_, h_detInd_, h_charge_, + h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_, + gpu_d, nhits_ + }; + } private: HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product" HitsOnGPU gpu_; - std::unique_ptr cpu_; + uint32_t nhits_ = 0; uint32_t *d_phase1TopologyLayerStart_ = nullptr; uint32_t *h_hitsModuleStart_ = nullptr; + uint16_t *h_detInd_ = nullptr; + int32_t *h_charge_ = nullptr; + float *h_xl_ = nullptr; + float *h_yl_ = nullptr; + float *h_xe_ = nullptr; + float *h_ye_ = nullptr; + uint16_t *h_mr_ = nullptr; + uint16_t *h_mc_ = nullptr; #ifdef GPU_DEBUG uint32_t *h_hitsLayerStart_ = nullptr; #endif diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 05b6846b9aacb..27811f2d020b0 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i } void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { - auto output = gpuAlgo_->getOutput(cudaStream); + auto output = std::make_unique(gpuAlgo_->getOutput()); // Need the CPU clusters to // - properly fill the output DetSetVector of hits diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 864f7f4f7b2aa..49d90c94c0465 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -36,30 +36,18 @@ namespace siPixelRecHitsHeterogeneousProduct { }; struct HitsOnCPU { - HitsOnCPU() = default; + uint32_t const * hitsModuleStart = nullptr; + uint16_t const * detInd = nullptr; + int32_t const * charge = nullptr; + float const * xl = nullptr; + float const * yl = nullptr; + float const * xe = nullptr; + float const * ye = nullptr; + uint16_t const * mr = nullptr; + uint16_t const * mc = nullptr; - explicit HitsOnCPU(uint32_t nhits) : - detInd(nhits), - charge(nhits), - xl(nhits), - yl(nhits), - xe(nhits), - ye(nhits), - mr(nhits), - mc(nhits), - nHits(nhits) - { } - - uint32_t hitsModuleStart[2001]; - std::vector> detInd; - std::vector> charge; - std::vector> xl, yl; - std::vector> xe, ye; - std::vector> mr; - std::vector> mc; - - uint32_t nHits; HitsOnGPU const * gpu_d = nullptr; + uint32_t nHits; }; using GPUProduct = HitsOnCPU; // FIXME fill cpu vectors on demand From 2a710effcf4905909b60f71e138d5a22be88e074 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 10 Aug 2018 09:56:09 +0200 Subject: [PATCH 3/4] Coalesce 32bit and 16bit rechit buffers to two using cudaMallocPitch, use cudaMemcpy2DAsync to transfer Trying to reduce memory copy overheads --- .../SiPixelRecHits/plugins/PixelRecHits.cu | 118 ++++++++++-------- .../SiPixelRecHits/plugins/PixelRecHits.h | 4 + .../siPixelRecHitsHeterogeneousProduct.h | 6 + 3 files changed, 76 insertions(+), 52 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 648278d6e53ba..3be5d0030de3e 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -24,6 +24,11 @@ namespace { hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules]; } } + + template + T *slicePitch(void *ptr, size_t pitch, size_t row) { + return reinterpret_cast( reinterpret_cast(ptr) + pitch*row); + } } namespace pixelgpudetails { @@ -33,20 +38,36 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); - cudaCheck(cudaMalloc((void **) & gpu_.charge_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, MAX_HITS * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.xg_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yg_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.zg_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.rg_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.xl_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yl_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, MAX_HITS * sizeof(float))); - cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, MAX_HITS * sizeof(int16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, MAX_HITS * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.mr_d, MAX_HITS * sizeof(uint16_t))); - cudaCheck(cudaMalloc((void **) & gpu_.mc_d, MAX_HITS * sizeof(uint16_t))); + + // Coalesce all 32bit and 16bit arrays to two big blobs + // + // This is just a toy. Please don't copy-paste the logic but + // create a proper abstraction (e.g. along FWCore/SOA, or + // FWCore/Utilities/interface/SoATuple.h + // + // Order such that the first ones are the ones transferred to CPU + static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious + cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9)); + //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_; + gpu_.charge_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0); + gpu_.xl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1); + gpu_.yl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2); + gpu_.xerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3); + gpu_.yerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4); + gpu_.xg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5); + gpu_.yg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6); + gpu_.zg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7); + gpu_.rg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8); + + // Order such that the first ones are the ones transferred to CPU + cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5)); + //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_; + gpu_.detInd_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0); + gpu_.mr_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1); + gpu_.mc_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2); + gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); + gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); + cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; @@ -59,14 +80,23 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id())); cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&h_detInd_, MAX_HITS*sizeof(uint16_t))); - cudaCheck(cudaMallocHost(&h_charge_, MAX_HITS*sizeof(int32_t))); - cudaCheck(cudaMallocHost(&h_xl_, MAX_HITS*sizeof(float))); - cudaCheck(cudaMallocHost(&h_yl_, MAX_HITS*sizeof(float))); - cudaCheck(cudaMallocHost(&h_xe_, MAX_HITS*sizeof(float))); - cudaCheck(cudaMallocHost(&h_ye_, MAX_HITS*sizeof(float))); - cudaCheck(cudaMallocHost(&h_mr_, MAX_HITS*sizeof(uint16_t))); - cudaCheck(cudaMallocHost(&h_mc_, MAX_HITS*sizeof(uint16_t))); + + // On CPU we can safely use MAX_HITS*sizeof as the pitch. Thanks + // to '*256' it is even aligned by cache line + h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t); + cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5)); + h_charge_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 0); + h_xl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 1); + h_yl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 2); + h_xe_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 3); + h_ye_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 4); + + h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t); + cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3)); + h_detInd_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 0); + h_mr_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 1); + h_mc_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 2); + #ifdef GPU_DEBUG cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t))); #endif @@ -74,33 +104,15 @@ namespace pixelgpudetails { PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { cudaCheck(cudaFree(gpu_.bs_d)); cudaCheck(cudaFree(gpu_.hitsLayerStart_d)); - cudaCheck(cudaFree(gpu_.charge_d)); - cudaCheck(cudaFree(gpu_.detInd_d)); - cudaCheck(cudaFree(gpu_.xg_d)); - cudaCheck(cudaFree(gpu_.yg_d)); - cudaCheck(cudaFree(gpu_.zg_d)); - cudaCheck(cudaFree(gpu_.rg_d)); - cudaCheck(cudaFree(gpu_.xl_d)); - cudaCheck(cudaFree(gpu_.yl_d)); - cudaCheck(cudaFree(gpu_.xerr_d)); - cudaCheck(cudaFree(gpu_.yerr_d)); - cudaCheck(cudaFree(gpu_.iphi_d)); - cudaCheck(cudaFree(gpu_.sortIndex_d)); - cudaCheck(cudaFree(gpu_.mr_d)); - cudaCheck(cudaFree(gpu_.mc_d)); + cudaCheck(cudaFree(gpu_.owner_32bit_)); + cudaCheck(cudaFree(gpu_.owner_16bit_)); cudaCheck(cudaFree(gpu_.hist_d)); cudaCheck(cudaFree(gpu_d)); cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); cudaCheck(cudaFreeHost(h_hitsModuleStart_)); - cudaCheck(cudaFreeHost(h_detInd_)); - cudaCheck(cudaFreeHost(h_charge_)); - cudaCheck(cudaFreeHost(h_xl_)); - cudaCheck(cudaFreeHost(h_yl_)); - cudaCheck(cudaFreeHost(h_xe_)); - cudaCheck(cudaFreeHost(h_ye_)); - cudaCheck(cudaFreeHost(h_mr_)); - cudaCheck(cudaFreeHost(h_mc_)); + cudaCheck(cudaFreeHost(h_owner_32bit_)); + cudaCheck(cudaFreeHost(h_owner_16bit_)); #ifdef GPU_DEBUG cudaCheck(cudaFreeHost(h_hitsLayerStart_)); #endif @@ -146,14 +158,16 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #endif - cudaCheck(cudaMemcpyAsync(h_detInd_, gpu_.detInd_d, nhits_*sizeof(int16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_charge_, gpu_.charge_d, nhits_ * sizeof(int32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_xl_, gpu_.xl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_yl_, gpu_.yl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_xe_, gpu_.xerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_ye_, gpu_.yerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_mr_, gpu_.mr_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(h_mc_, gpu_.mc_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + + cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_, + gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, + nhits_*sizeof(uint16_t), 3, + cudaMemcpyDefault, stream.id())); + + cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_, + gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, + nhits_*sizeof(uint32_t), 5, + cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG cudaStreamSynchronize(stream.id()); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 0921a41a1fd89..3d49e5138d84a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -58,6 +58,10 @@ namespace pixelgpudetails { float *h_ye_ = nullptr; uint16_t *h_mr_ = nullptr; uint16_t *h_mc_ = nullptr; + void *h_owner_32bit_ = nullptr; + size_t h_owner_32bit_pitch_ = 0; + void *h_owner_16bit_ = nullptr; + size_t h_owner_16bit_pitch_ = 0; #ifdef GPU_DEBUG uint32_t *h_hitsLayerStart_ = nullptr; #endif diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 49d90c94c0465..5ae74d047338f 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -33,6 +33,12 @@ namespace siPixelRecHitsHeterogeneousProduct { Hist * hist_d; HitsOnGPU const * me_d = nullptr; + + // Owning pointers to the 32/16 bit arrays with size MAX_HITS + void *owner_32bit_; + size_t owner_32bit_pitch_; + void *owner_16bit_; + size_t owner_16bit_pitch_; }; struct HitsOnCPU { From c04392ddc0b037faa506de0255b2773ad2165f7c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 13 Aug 2018 09:15:05 +0200 Subject: [PATCH 4/4] Initialize memory --- RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 3be5d0030de3e..03821b9b16fb0 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -48,6 +48,7 @@ namespace pixelgpudetails { // Order such that the first ones are the ones transferred to CPU static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9)); + cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id())); //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_; gpu_.charge_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0); gpu_.xl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1); @@ -61,6 +62,7 @@ namespace pixelgpudetails { // Order such that the first ones are the ones transferred to CPU cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5)); + cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*5, cudaStream.id())); //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_; gpu_.detInd_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0); gpu_.mr_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1);