From fcfce445f5cb00421006af7905a017d410ee38b9 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 9 Aug 2018 11:29:24 +0200
Subject: [PATCH 1/4] Make rechits more async by moving the memory allocation
 to first

---
 .../SiPixelRecHits/plugins/PixelRecHits.cu          | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 01bac5534035d..d42bf7297dd57 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -92,9 +92,14 @@ namespace pixelgpudetails {
                                            float const * bs,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            cuda::stream_t<>& stream) {
-   cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
-   gpu_.hitsModuleStart_d = input.clusModuleStart_d;
-   cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
+    // memory allocation needs to be first to not to device-synchronize in between
+    // even better would be to avoid the allocation...
+    const auto nhits = input.nClusters;
+    cpu_ = std::make_unique<HitsOnCPU>(nhits);
+
+    cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
+    gpu_.hitsModuleStart_d = input.clusModuleStart_d;
+    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
 
     int threadsPerBlock = 256;
     int blocks = input.nModules; // active modules (with digis)
@@ -127,8 +132,6 @@ namespace pixelgpudetails {
 #ifdef GPU_DEBUG
     cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
 #endif
-    auto nhits = input.nClusters;
-    cpu_ = std::make_unique<HitsOnCPU>(nhits);
     cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits * sizeof(int32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));

From 4cf67743bb1547dfe2152acb8a495d4bc8cbcc9f Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 9 Aug 2018 13:38:00 +0200
Subject: [PATCH 2/4] Allocate HitsOnCPU buffers once per job per EDM stream

---
 .../SiPixelRecHits/plugins/PixelRecHits.cu    | 78 ++++++++++---------
 .../SiPixelRecHits/plugins/PixelRecHits.h     | 18 ++++-
 .../plugins/SiPixelRecHitHeterogeneous.cc     |  2 +-
 .../siPixelRecHitsHeterogeneousProduct.h      | 32 +++-----
 4 files changed, 70 insertions(+), 60 deletions(-)

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index d42bf7297dd57..648278d6e53ba 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -29,22 +29,24 @@ namespace {
 namespace pixelgpudetails {
   PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {
 
+    constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256;
+
     cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
     cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.charge_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.xg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.yg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.zg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.rg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.xl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.yl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, (gpuClustering::MaxNumModules * 256) * sizeof(int16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.mr_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.mc_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
+    cudaCheck(cudaMalloc((void **) & gpu_.charge_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, MAX_HITS * sizeof(uint16_t)));
+    cudaCheck(cudaMalloc((void **) & gpu_.xg_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.yg_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.zg_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.rg_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.xl_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.yl_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, MAX_HITS * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, MAX_HITS * sizeof(int16_t)));
+    cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, MAX_HITS * sizeof(uint16_t)));
+    cudaCheck(cudaMalloc((void **) & gpu_.mr_d, MAX_HITS * sizeof(uint16_t)));
+    cudaCheck(cudaMalloc((void **) & gpu_.mc_d, MAX_HITS * sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist)));
     cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
     gpu_.me_d = gpu_d;
@@ -57,6 +59,14 @@ namespace pixelgpudetails {
     cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id()));
 
     cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)));
+    cudaCheck(cudaMallocHost(&h_detInd_, MAX_HITS*sizeof(uint16_t)));
+    cudaCheck(cudaMallocHost(&h_charge_, MAX_HITS*sizeof(int32_t)));
+    cudaCheck(cudaMallocHost(&h_xl_, MAX_HITS*sizeof(float)));
+    cudaCheck(cudaMallocHost(&h_yl_, MAX_HITS*sizeof(float)));
+    cudaCheck(cudaMallocHost(&h_xe_, MAX_HITS*sizeof(float)));
+    cudaCheck(cudaMallocHost(&h_ye_, MAX_HITS*sizeof(float)));
+    cudaCheck(cudaMallocHost(&h_mr_, MAX_HITS*sizeof(uint16_t)));
+    cudaCheck(cudaMallocHost(&h_mc_, MAX_HITS*sizeof(uint16_t)));
 #ifdef GPU_DEBUG
     cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t)));
 #endif
@@ -83,6 +93,14 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
 
     cudaCheck(cudaFreeHost(h_hitsModuleStart_));
+    cudaCheck(cudaFreeHost(h_detInd_));
+    cudaCheck(cudaFreeHost(h_charge_));
+    cudaCheck(cudaFreeHost(h_xl_));
+    cudaCheck(cudaFreeHost(h_yl_));
+    cudaCheck(cudaFreeHost(h_xe_));
+    cudaCheck(cudaFreeHost(h_ye_));
+    cudaCheck(cudaFreeHost(h_mr_));
+    cudaCheck(cudaFreeHost(h_mc_));
 #ifdef GPU_DEBUG
     cudaCheck(cudaFreeHost(h_hitsLayerStart_));
 #endif
@@ -92,11 +110,6 @@ namespace pixelgpudetails {
                                            float const * bs,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            cuda::stream_t<>& stream) {
-    // memory allocation needs to be first to not to device-synchronize in between
-    // even better would be to avoid the allocation...
-    const auto nhits = input.nClusters;
-    cpu_ = std::make_unique<HitsOnCPU>(nhits);
-
     cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
     gpu_.hitsModuleStart_d = input.clusModuleStart_d;
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
@@ -128,18 +141,19 @@ namespace pixelgpudetails {
     cudaCheck(cudaGetLastError());
 
     // needed only if hits on CPU are required...
+    nhits_ = input.nClusters;
     cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
 #ifdef GPU_DEBUG
     cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
 #endif
-    cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits * sizeof(int32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_detInd_, gpu_.detInd_d, nhits_*sizeof(int16_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_charge_, gpu_.charge_d, nhits_ * sizeof(int32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_xl_, gpu_.xl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_yl_, gpu_.yl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_xe_, gpu_.xerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_ye_, gpu_.yerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_mr_, gpu_.mr_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(h_mc_, gpu_.mc_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
 
 #ifdef GPU_DEBUG
     cudaStreamSynchronize(stream.id());
@@ -151,15 +165,9 @@ namespace pixelgpudetails {
 
     // for timing test
     // cudaStreamSynchronize(stream.id());
-    // auto nhits = h_hitsLayerStart_[10];
+    // auto nhits_ = h_hitsLayerStart_[10];
     // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d);
 
-    cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits, 256, stream.id());
-  }
-
-  std::unique_ptr<HitsOnCPU>&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) {
-    cpu_->gpu_d = gpu_d;
-    memcpy(cpu_->hitsModuleStart, h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
-    return std::move(cpu_);
+    cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id());
   }
 }
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
index 9e66a9a89a17a..0921a41a1fd89 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -36,14 +36,28 @@ namespace pixelgpudetails {
                        pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                        cuda::stream_t<>& stream);
 
-    std::unique_ptr<HitsOnCPU>&& getOutput(cuda::stream_t<>& stream);
+    HitsOnCPU getOutput() const {
+      return HitsOnCPU{
+        h_hitsModuleStart_, h_detInd_, h_charge_,
+        h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_,
+        gpu_d, nhits_
+      };
+    }
 
   private:
     HitsOnGPU * gpu_d;  // copy of the structure on the gpu itself: this is the "Product" 
     HitsOnGPU gpu_;
-    std::unique_ptr<HitsOnCPU> cpu_;
+    uint32_t nhits_ = 0;
     uint32_t *d_phase1TopologyLayerStart_ = nullptr;
     uint32_t *h_hitsModuleStart_ = nullptr;
+    uint16_t *h_detInd_ = nullptr;
+    int32_t *h_charge_ = nullptr;
+    float *h_xl_ = nullptr;
+    float *h_yl_ = nullptr;
+    float *h_xe_ = nullptr;
+    float *h_ye_ = nullptr;
+    uint16_t *h_mr_ = nullptr;
+    uint16_t *h_mc_ = nullptr;
 #ifdef GPU_DEBUG
     uint32_t *h_hitsLayerStart_ = nullptr;
 #endif
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
index 05b6846b9aacb..27811f2d020b0 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
@@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i
 }
 
 void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
-  auto output = gpuAlgo_->getOutput(cudaStream);
+  auto output = std::make_unique<GPUProduct>(gpuAlgo_->getOutput());
 
  // Need the CPU clusters to
   // - properly fill the output DetSetVector of hits
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 864f7f4f7b2aa..49d90c94c0465 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -36,30 +36,18 @@ namespace siPixelRecHitsHeterogeneousProduct {
   };
 
   struct HitsOnCPU {
-    HitsOnCPU() = default;
+    uint32_t const * hitsModuleStart = nullptr;
+    uint16_t const * detInd = nullptr;
+    int32_t const * charge = nullptr;
+    float const * xl = nullptr;
+    float const * yl = nullptr;
+    float const * xe = nullptr;
+    float const * ye = nullptr;
+    uint16_t const * mr = nullptr;
+    uint16_t const * mc = nullptr;
 
-    explicit HitsOnCPU(uint32_t nhits) :
-      detInd(nhits),
-      charge(nhits),
-      xl(nhits),
-      yl(nhits),
-      xe(nhits),
-      ye(nhits),
-      mr(nhits),
-      mc(nhits),
-      nHits(nhits)
-    { }
-
-    uint32_t hitsModuleStart[2001];
-    std::vector<uint16_t,  CUDAHostAllocator<uint16_t>> detInd;
-    std::vector<int32_t,  CUDAHostAllocator<int32_t>> charge;
-    std::vector<float,    CUDAHostAllocator<float>> xl, yl;
-    std::vector<float,    CUDAHostAllocator<float>> xe, ye;
-    std::vector<uint16_t, CUDAHostAllocator<uint16_t>> mr;
-    std::vector<uint16_t, CUDAHostAllocator<uint16_t>> mc;
-
-    uint32_t nHits;
     HitsOnGPU const * gpu_d = nullptr;
+    uint32_t nHits;
   };
 
   using GPUProduct = HitsOnCPU;  // FIXME fill cpu vectors on demand

From 2a710effcf4905909b60f71e138d5a22be88e074 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 10 Aug 2018 09:56:09 +0200
Subject: [PATCH 3/4] Coalesce 32bit and 16bit rechit buffers to two using
 cudaMallocPitch, use cudaMemcpy2DAsync to transfer

Trying to reduce memory copy overheads
---
 .../SiPixelRecHits/plugins/PixelRecHits.cu    | 118 ++++++++++--------
 .../SiPixelRecHits/plugins/PixelRecHits.h     |   4 +
 .../siPixelRecHitsHeterogeneousProduct.h      |   6 +
 3 files changed, 76 insertions(+), 52 deletions(-)

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 648278d6e53ba..3be5d0030de3e 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -24,6 +24,11 @@ namespace {
       hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules];
     }
   }
+
+  template <typename T>
+  T *slicePitch(void *ptr, size_t pitch, size_t row) {
+    return reinterpret_cast<T *>( reinterpret_cast<char *>(ptr) + pitch*row);
+  }
 }
 
 namespace pixelgpudetails {
@@ -33,20 +38,36 @@ namespace pixelgpudetails {
 
     cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
     cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.charge_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, MAX_HITS * sizeof(uint16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.xg_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.yg_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.zg_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.rg_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.xl_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.yl_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, MAX_HITS * sizeof(float)));
-    cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, MAX_HITS * sizeof(int16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, MAX_HITS * sizeof(uint16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.mr_d, MAX_HITS * sizeof(uint16_t)));
-    cudaCheck(cudaMalloc((void **) & gpu_.mc_d, MAX_HITS * sizeof(uint16_t)));
+
+    // Coalesce all 32bit and 16bit arrays to two big blobs
+    //
+    // This is just a toy. Please don't copy-paste the logic but
+    // create a proper abstraction (e.g. along FWCore/SOA, or
+    // FWCore/Utilities/interface/SoATuple.h
+    //
+    // Order such that the first ones are the ones transferred to CPU
+    static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious
+    cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9));
+    //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_;
+    gpu_.charge_d = slicePitch<int32_t>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0);
+    gpu_.xl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1);
+    gpu_.yl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2);
+    gpu_.xerr_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3);
+    gpu_.yerr_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4);
+    gpu_.xg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5);
+    gpu_.yg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6);
+    gpu_.zg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7);
+    gpu_.rg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8);
+
+    // Order such that the first ones are the ones transferred to CPU
+    cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5));
+    //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_;
+    gpu_.detInd_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0);
+    gpu_.mr_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1);
+    gpu_.mc_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2);
+    gpu_.iphi_d = slicePitch<int16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3);
+    gpu_.sortIndex_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4);
+
     cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist)));
     cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
     gpu_.me_d = gpu_d;
@@ -59,14 +80,23 @@ namespace pixelgpudetails {
     cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id()));
 
     cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)));
-    cudaCheck(cudaMallocHost(&h_detInd_, MAX_HITS*sizeof(uint16_t)));
-    cudaCheck(cudaMallocHost(&h_charge_, MAX_HITS*sizeof(int32_t)));
-    cudaCheck(cudaMallocHost(&h_xl_, MAX_HITS*sizeof(float)));
-    cudaCheck(cudaMallocHost(&h_yl_, MAX_HITS*sizeof(float)));
-    cudaCheck(cudaMallocHost(&h_xe_, MAX_HITS*sizeof(float)));
-    cudaCheck(cudaMallocHost(&h_ye_, MAX_HITS*sizeof(float)));
-    cudaCheck(cudaMallocHost(&h_mr_, MAX_HITS*sizeof(uint16_t)));
-    cudaCheck(cudaMallocHost(&h_mc_, MAX_HITS*sizeof(uint16_t)));
+
+    // On CPU we can safely use MAX_HITS*sizeof as the pitch. Thanks
+    // to '*256' it is even aligned by cache line
+    h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t); 
+    cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5));
+    h_charge_ = slicePitch<int32_t>(h_owner_32bit_, h_owner_32bit_pitch_, 0);
+    h_xl_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 1);
+    h_yl_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 2);
+    h_xe_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 3);
+    h_ye_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 4);
+
+    h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t);
+    cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3));
+    h_detInd_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 0);
+    h_mr_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 1);
+    h_mc_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 2);
+
 #ifdef GPU_DEBUG
     cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t)));
 #endif
@@ -74,33 +104,15 @@ namespace pixelgpudetails {
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
     cudaCheck(cudaFree(gpu_.bs_d));
     cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
-    cudaCheck(cudaFree(gpu_.charge_d));
-    cudaCheck(cudaFree(gpu_.detInd_d));
-    cudaCheck(cudaFree(gpu_.xg_d));
-    cudaCheck(cudaFree(gpu_.yg_d));
-    cudaCheck(cudaFree(gpu_.zg_d));
-    cudaCheck(cudaFree(gpu_.rg_d));
-    cudaCheck(cudaFree(gpu_.xl_d));
-    cudaCheck(cudaFree(gpu_.yl_d));
-    cudaCheck(cudaFree(gpu_.xerr_d));
-    cudaCheck(cudaFree(gpu_.yerr_d));
-    cudaCheck(cudaFree(gpu_.iphi_d));
-    cudaCheck(cudaFree(gpu_.sortIndex_d));
-    cudaCheck(cudaFree(gpu_.mr_d));
-    cudaCheck(cudaFree(gpu_.mc_d));
+    cudaCheck(cudaFree(gpu_.owner_32bit_));
+    cudaCheck(cudaFree(gpu_.owner_16bit_));
     cudaCheck(cudaFree(gpu_.hist_d));
     cudaCheck(cudaFree(gpu_d));
     cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
 
     cudaCheck(cudaFreeHost(h_hitsModuleStart_));
-    cudaCheck(cudaFreeHost(h_detInd_));
-    cudaCheck(cudaFreeHost(h_charge_));
-    cudaCheck(cudaFreeHost(h_xl_));
-    cudaCheck(cudaFreeHost(h_yl_));
-    cudaCheck(cudaFreeHost(h_xe_));
-    cudaCheck(cudaFreeHost(h_ye_));
-    cudaCheck(cudaFreeHost(h_mr_));
-    cudaCheck(cudaFreeHost(h_mc_));
+    cudaCheck(cudaFreeHost(h_owner_32bit_));
+    cudaCheck(cudaFreeHost(h_owner_16bit_));
 #ifdef GPU_DEBUG
     cudaCheck(cudaFreeHost(h_hitsLayerStart_));
 #endif
@@ -146,14 +158,16 @@ namespace pixelgpudetails {
 #ifdef GPU_DEBUG
     cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
 #endif
-    cudaCheck(cudaMemcpyAsync(h_detInd_, gpu_.detInd_d, nhits_*sizeof(int16_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_charge_, gpu_.charge_d, nhits_ * sizeof(int32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_xl_, gpu_.xl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_yl_, gpu_.yl_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_xe_, gpu_.xerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_ye_, gpu_.yerr_d, nhits_ * sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_mr_, gpu_.mr_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(h_mc_, gpu_.mc_d, nhits_ * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+
+    cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_,
+                                gpu_.owner_16bit_, gpu_.owner_16bit_pitch_,
+                                nhits_*sizeof(uint16_t), 3,
+                                cudaMemcpyDefault, stream.id()));
+
+    cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_,
+                                gpu_.owner_32bit_, gpu_.owner_32bit_pitch_,
+                                nhits_*sizeof(uint32_t), 5,
+                                cudaMemcpyDefault, stream.id()));
 
 #ifdef GPU_DEBUG
     cudaStreamSynchronize(stream.id());
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
index 0921a41a1fd89..3d49e5138d84a 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -58,6 +58,10 @@ namespace pixelgpudetails {
     float *h_ye_ = nullptr;
     uint16_t *h_mr_ = nullptr;
     uint16_t *h_mc_ = nullptr;
+    void *h_owner_32bit_ = nullptr;
+    size_t h_owner_32bit_pitch_ = 0;
+    void *h_owner_16bit_ = nullptr;
+    size_t h_owner_16bit_pitch_ = 0;
 #ifdef GPU_DEBUG
     uint32_t *h_hitsLayerStart_ = nullptr;
 #endif
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 49d90c94c0465..5ae74d047338f 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -33,6 +33,12 @@ namespace siPixelRecHitsHeterogeneousProduct {
      Hist * hist_d;
 
      HitsOnGPU const * me_d = nullptr;
+
+    // Owning pointers to the 32/16 bit arrays with size MAX_HITS
+    void *owner_32bit_;
+    size_t owner_32bit_pitch_;
+    void *owner_16bit_;
+    size_t owner_16bit_pitch_;
   };
 
   struct HitsOnCPU {

From c04392ddc0b037faa506de0255b2773ad2165f7c Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Mon, 13 Aug 2018 09:15:05 +0200
Subject: [PATCH 4/4] Initialize memory

---
 RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 3be5d0030de3e..03821b9b16fb0 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -48,6 +48,7 @@ namespace pixelgpudetails {
     // Order such that the first ones are the ones transferred to CPU
     static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious
     cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9));
+    cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id()));
     //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_;
     gpu_.charge_d = slicePitch<int32_t>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0);
     gpu_.xl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1);
@@ -61,6 +62,7 @@ namespace pixelgpudetails {
 
     // Order such that the first ones are the ones transferred to CPU
     cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5));
+    cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*5, cudaStream.id()));
     //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_;
     gpu_.detInd_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0);
     gpu_.mr_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1);