From 41cd273dfa2f0647c0c133de005e6442af1e561c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 29 Mar 2019 06:50:23 -0400 Subject: [PATCH] Migrate the pixel rechits producer and CA to the new heterogeneous framework (#338) Use cleaned hits. Use pixel layer and ladders geometry, and use pixel triplets in the gaps. Optimise GPU memory usage: - reduce the number of memory allocations - fix the size of the cub workspace - allocate memory per event via the caching allocator - use constant memory for geometry and parameters - use shared memory where the content is the same for every thread Optimise kernel launches, and add a protection for empty events and overflows. --- .../python/RecoLocalTracker_cff.py | 6 +- .../interface/PixelTrackingGPUConstants.h | 11 - .../plugins/SiPixelDigisClustersFromSoA.cc | 2 +- .../plugins/SiPixelRawToClusterGPUKernel.cu | 85 +++-- .../plugins/gpuCalibPixel.h | 123 ++----- .../plugins/gpuClusteringConstants.h | 10 +- .../siPixelClustersHeterogeneous_cfi.py | 37 --- .../SiPixelRecHits/interface/PixelCPEFast.h | 1 + .../SiPixelRecHits/interface/pixelCPEforGPU.h | 22 +- .../SiPixelRecHits/plugins/BuildFile.xml | 1 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 189 ++--------- .../SiPixelRecHits/plugins/PixelRecHits.h | 52 +-- .../plugins/SiPixelRecHitCUDA.cc | 118 +++++++ .../plugins/SiPixelRecHitFromSOA.cc | 187 +++++++++++ .../plugins/SiPixelRecHitHeterogeneous.cc | 313 ------------------ .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 74 +++-- .../siPixelRecHitsHeterogeneousProduct.h | 79 ----- .../python/SiPixelRecHits_cfi.py | 33 +- .../SiPixelRecHits/src/PixelCPEFast.cc | 49 ++- .../customizePixelTracksForProfiling.py | 2 - .../PixelTrackFitting/interface/FitResult.h | 4 +- .../plugins/PixelTrackProducerFromCUDA.cc | 1 - .../plugins/BrokenLineFitOnGPU.cu | 68 ++-- .../PixelTriplets/plugins/BuildFile.xml | 2 - .../PixelTriplets/plugins/CAConstants.h | 30 +- .../CAHitNtupletHeterogeneousEDProducer.cc | 41 ++- 
.../plugins/CAHitQuadrupletGeneratorGPU.cc | 46 ++- .../plugins/CAHitQuadrupletGeneratorGPU.h | 21 +- .../CAHitQuadrupletGeneratorKernels.cu | 139 +++++--- .../plugins/CAHitQuadrupletGeneratorKernels.h | 47 +-- .../CAHitQuadrupletGeneratorKernelsAlloc.cu | 41 ++- .../PixelTriplets/plugins/GPUCACell.h | 94 ++++-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 17 - .../PixelTriplets/plugins/HelixFitOnGPU.h | 22 +- .../PixelTriplets/plugins/RecHitsMap.h | 3 + .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 80 +++-- .../PixelTriplets/plugins/gpuFishbone.h | 6 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 271 +++------------ .../plugins/gpuPixelDoubletsAlgos.h | 220 ++++++++++++ .../plugins/pixelTuplesHeterogeneousProduct.h | 6 +- 40 files changed, 1253 insertions(+), 1300 deletions(-) delete mode 100644 RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h delete mode 100644 RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py create mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc create mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc delete mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc delete mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py index a486a83d178f4..08f871e45f8d7 100644 --- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py +++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py @@ -13,7 +13,7 @@ from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import * from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import * -pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting) 
+pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplittingTask) striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits) trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer) @@ -21,10 +21,6 @@ striptrackerlocalreco = cms.Sequence(striptrackerlocalrecoTask) trackerlocalreco = cms.Sequence(trackerlocalrecoTask) -from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import siPixelRecHitHeterogeneous as _siPixelRecHitHeterogeneous -gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneous) - from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import * from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import * diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h b/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h deleted file mode 100644 index ceb831d7865b6..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h -#define RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h - -#include - -namespace PixelGPUConstants { - constexpr uint16_t maxNumberOfHits = 40000; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away - -} - -#endif // RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc index 2c7da14cf72af..ba184d766feaf 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc +++ 
b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -149,7 +149,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con } // fill final clusters - fillClusters((*detDigis).detId()); + if (detDigis) fillClusters((*detDigis).detId()); //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; iEvent.put(digiPutToken_, std::move(collection)); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 8fdb2ed8c90d5..6a832128c1cc2 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -36,6 +36,7 @@ #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" // local includes #include "SiPixelRawToClusterGPUKernel.h" @@ -456,6 +457,54 @@ namespace pixelgpudetails { } // end of Raw to Digi kernel + + __global__ + void fillHitsModuleStart(uint32_t const * __restrict__ cluStart, uint32_t * __restrict__ moduleStart) { + + assert(gpuClustering::MaxNumModules<2048); // easy to extend at least till 32*1024 + assert(1==gridDim.x); + assert(0==blockIdx.x); + + int first = threadIdx.x; + + // limit to MaxHitsInModule; + for (int i=first, iend=gpuClustering::MaxNumModules; i=moduleStart[1023]); + assert(moduleStart[1025]>=moduleStart[1024]); + assert(moduleStart[gpuClustering::MaxNumModules]>=moduleStart[1025]); + + for (int i=first, iend=gpuClustering::MaxNumModules+1; i=moduleStart[i-i]); + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + if (i==96 
|| i==1184 || i==1744 || i==gpuClustering::MaxNumModules) printf("moduleStart %d %d\n",i,moduleStart[i]); + } +#endif + + // avoid overflow + constexpr auto MAX_HITS = gpuClustering::MaxNumClusters; + for (int i=first, iend=gpuClustering::MaxNumModules+1; i MAX_HITS) moduleStart[i] = MAX_HITS; + } + } + + // Interface to outside void SiPixelRawToClusterGPUKernel::makeClustersAsync( const SiPixelFedCablingMapGPU *cablingMap, @@ -478,6 +527,7 @@ namespace pixelgpudetails { edm::Service cs; nModules_Clusters_h = cs->make_host_unique(2, stream); + if (wordCounter) // protect in case of empty event.... { const int threadsPerBlock = 512; const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all @@ -511,19 +561,24 @@ namespace pixelgpudetails { digiErrors_d.copyErrorToHostAsync(stream); } } - // End of Raw2Digi and passing data for cluserisation + // End of Raw2Digi and passing data for clustering { // clusterizer ... using namespace gpuClustering; int threadsPerBlock = 256; - int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(int(wordCounter),int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; + gpuCalibPixel::calibDigis<<>>( digis_d.moduleInd(), digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), gains, - wordCounter); + wordCounter, + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.clusModuleStart() + ); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -532,8 +587,6 @@ namespace pixelgpudetails { << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -546,7 +599,6 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - 
cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( digis_d.c_moduleInd(), digis_d.c_xx(), digis_d.c_yy(), @@ -567,26 +619,19 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); + // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of // synchronization/ExternalWork - // - // Temporary storage - size_t tempScanStorageSize = 0; - { - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); - } - auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream); - // Set first the first element to 0 - cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); - // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize, - clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, - stream.id())); + + // MUST be ONE block + fillHitsModuleStart<<<1, 1024, 0, stream.id()>>>(clusters_d.c_clusInModule(),clusters_d.clusModuleStart()); + // last element holds the number of all clusters cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + } // end clusterizer scope } } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h index 5a681e791f94f..5087516fa009d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h @@ -5,6 +5,9 @@ #include #include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h" + +#include "gpuClusteringConstants.h" + #include 
"HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace gpuCalibPixel { @@ -22,104 +25,46 @@ namespace gpuCalibPixel { uint16_t const * __restrict__ y, uint16_t * adc, SiPixelGainForHLTonGPU const * __restrict__ ped, - int numElements + int numElements, + uint32_t * __restrict__ moduleStart, // just to zero first + uint32_t * __restrict__ nClustersInModule, // just to zero them + uint32_t * __restrict__ clusModuleStart // just to zero first ) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= numElements) return; - if (InvId==id[i]) return; + int first = blockDim.x * blockIdx.x + threadIdx.x; - float conversionFactor = id[i]<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = id[i]<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; - - bool isDeadColumn=false, isNoisyColumn=false; - - int row = x[i]; - int col = y[i]; - auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn); - float pedestal = ret.first; float gain = ret.second; - // float pedestal = 0; float gain = 1.; - if ( isDeadColumn | isNoisyColumn ) - { - id[i]=InvId; adc[i] =0; - printf("bad pixel at %d in %d\n",i,id[i]); - } - else { - float vcal = adc[i] * gain - pedestal*gain; - adc[i] = std::max(100, int( vcal * conversionFactor + offset)); + // zero for next kernels... 
+ if (0==first) clusModuleStart[0] = moduleStart[0]=0; + for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x*blockDim.x) { + nClustersInModule[i]=0; } - // if (threadIdx.x==0) - // printf ("calibrated %d\n",id[i]); -} - - __global__ void calibADCByModule(uint16_t * id, - uint16_t const * __restrict__ x, - uint16_t const * __restrict__ y, - uint16_t * adc, - uint32_t * moduleStart, - SiPixelGainForHLTonGPU const * __restrict__ ped, - int numElements - ) -{ - - - auto first = moduleStart[1 + blockIdx.x]; - - auto me = id[first]; - - assert(me<2000); + for (int i = first; i < numElements; i += gridDim.x*blockDim.x) { + if (InvId==id[i]) continue; - /// depends on "me" + float conversionFactor = id[i]<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; + float offset = id[i]<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; - float conversionFactor = me<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = me<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; + bool isDeadColumn=false, isNoisyColumn=false; - -#ifdef GPU_DEBUG - if (me%100==1) - if (threadIdx.x==0) printf("start pixel calibration for module %d in block %d\n",me,blockIdx.x); -#endif - - first+=threadIdx.x; - - // __syncthreads(); - - float pedestal=0,gain=0; - bool isDeadColumn=false, isNoisyColumn=false; - int oldCol=-1, oldAveragedBlock=-1; - - for (int i=first; inumberOfRowsAveragedOver_; // 80.... ( row<80 will be faster...) 
- if ( (col!=oldCol) | ( averagedBlock != oldAveragedBlock) ) { - oldCol=col; oldAveragedBlock= averagedBlock; - auto ret = ped->getPedAndGain(me,col, row, isDeadColumn, isNoisyColumn); - pedestal = ret.first; gain = ret.second; - } - if ( isDeadColumn | isNoisyColumn ) - { id[i]=InvId; adc[i] =0; } - else { - float vcal = adc[i] * gain - pedestal*gain; - adc[i] = std::max(100, int( vcal * conversionFactor + offset)); - } - } - - __syncthreads(); - //reset start - if(0==threadIdx.x) { - auto & k = moduleStart[1 + blockIdx.x]; - while (id[k]==InvId) ++k; + int row = x[i]; + int col = y[i]; + auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn); + float pedestal = ret.first; float gain = ret.second; + // float pedestal = 0; float gain = 1.; + if ( isDeadColumn | isNoisyColumn ) + { + id[i]=InvId; adc[i] =0; + printf("bad pixel at %d in %d\n",i,id[i]); + } + else { + float vcal = adc[i] * gain - pedestal*gain; + adc[i] = std::max(100, int( vcal * conversionFactor + offset)); + } } - - - } - - + + } } -#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h +#endif diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h index 7b4bb5a1c8c95..dc03de8c690c9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h @@ -1,14 +1,6 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h -#include - -namespace gpuClustering { - constexpr uint32_t MaxNumModules = 2000; - constexpr uint32_t MaxNumPixels = 256 * 2000; // this does not mean maxPixelPerModule == 256! 
- constexpr uint32_t MaxNumClustersPerModules = 1024; - constexpr uint16_t InvId = 9999; // must be > MaxNumModules - -} +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py deleted file mode 100644 index b86520bb28287..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersHeterogeneous_cfi.py +++ /dev/null @@ -1,37 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneousDefault_cfi import siPixelClustersHeterogeneousDefault as _siPixelClustersHeterogeneousDefault -siPixelClustersHeterogeneous = _siPixelClustersHeterogeneousDefault.clone() - -# following copied from SiPixelRawToDigi_cfi -siPixelClustersHeterogeneous.IncludeErrors = cms.bool(True) -siPixelClustersHeterogeneous.InputLabel = cms.InputTag("rawDataCollector") -siPixelClustersHeterogeneous.UseQualityInfo = cms.bool(False) -## ErrorList: list of error codes used by tracking to invalidate modules -siPixelClustersHeterogeneous.ErrorList = cms.vint32(29) -## UserErrorList: list of error codes used by Pixel experts for investigation -siPixelClustersHeterogeneous.UserErrorList = cms.vint32(40) -## Use pilot blades -siPixelClustersHeterogeneous.UsePilotBlade = cms.bool(False) -## Use phase1 -siPixelClustersHeterogeneous.UsePhase1 = cms.bool(False) -## Empty Regions PSet means complete unpacking -siPixelClustersHeterogeneous.Regions = cms.PSet( ) -siPixelClustersHeterogeneous.CablingMapLabel = cms.string("") - -# The following is copied from siPixelClusters_cfi, clearly not -# maintainable in the long run -from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel -phase1Pixel.toModify(siPixelClustersHeterogeneous, - VCaltoElectronGain = 
cms.int32(47), # L2-4: 47 +- 4.7 - VCaltoElectronGain_L1 = cms.int32(50), # L1: 49.6 +- 2.6 - VCaltoElectronOffset = cms.int32(-60), # L2-4: -60 +- 130 - VCaltoElectronOffset_L1 = cms.int32(-670), # L1: -670 +- 220 - ChannelThreshold = cms.int32(10), - SeedThreshold = cms.int32(1000), - ClusterThreshold = cms.int32(4000), - ClusterThreshold_L1 = cms.int32(2000) -) - -# The following is copied from SiPixelRawToDigi_cfi -phase1Pixel.toModify(siPixelClustersHeterogeneous, UsePhase1=True) diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h index 9b8924988e848..82d71ce19f01e 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h @@ -83,6 +83,7 @@ class PixelCPEFast final : public PixelCPEBase std::vector> m_detParamsGPU; pixelCPEforGPU::CommonParams m_commonParamsGPU; + pixelCPEforGPU::LayerGeometry m_layerGeometry; struct GPUData { ~GPUData(); diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h index 5776e054fd330..c33ccc85b16cb 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h @@ -6,9 +6,11 @@ #include #include +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "DataFormats/GeometrySurface/interface/SOARotation.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" namespace pixelCPEforGPU { @@ -43,9 +45,16 @@ namespace pixelCPEforGPU { }; - struct ParamsOnGPU { + struct LayerGeometry { + uint32_t layerStart[phase1PixelTopology::numberOfLayers + 1]; + uint8_t layer[phase1PixelTopology::layerIndexSize]; + }; + + struct ParamsOnGPU { + CommonParams * 
m_commonParams; DetParams * m_detParams; + LayerGeometry * m_layerGeometry; constexpr CommonParams const & __restrict__ commonParams() const { @@ -57,6 +66,13 @@ namespace pixelCPEforGPU { DetParams const * __restrict__ l = m_detParams; return l[i]; } + constexpr + LayerGeometry const & __restrict__ layerGeometry() const { + return *m_layerGeometry; + } + + __device__ uint8_t layer(uint16_t id) const { return __ldg(m_layerGeometry->layer+id/phase1PixelTopology::maxModuleStride);}; + }; // SOA (on device) @@ -86,8 +102,8 @@ namespace pixelCPEforGPU { }; - constexpr uint32_t MaxClusInModule=256; - using ClusParams = ClusParamsT<256>; + constexpr uint32_t MaxHitsInModule = gpuClustering::MaxHitsInModule; + using ClusParams = ClusParamsT; constexpr inline void computeAnglesFromDet(DetParams const & __restrict__ detParams, float const x, float const y, float & cotalpha, float & cotbeta) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml index 27ee3af86e102..9385896a5e287 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml @@ -1,4 +1,5 @@ + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 7d19e23eb1119..d5cfda78600a4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -6,6 +6,9 @@ #include // CMSSW headers +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" @@ -14,130 +17,32 @@ namespace { __global__ - void 
setHitsLayerStart(const uint32_t* hitsModuleStart, const uint32_t* layerStart, uint32_t* hitsLayerStart) { + void setHitsLayerStart(uint32_t const * __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const * cpeParams, uint32_t* hitsLayerStart) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < 10) { - hitsLayerStart[i] = hitsModuleStart[layerStart[i]]; - } - else if(i == 10) { - hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules]; - } - } - - template - T *slicePitch(void *ptr, size_t pitch, size_t row) { - return reinterpret_cast( reinterpret_cast(ptr) + pitch*row); - } -} - -namespace pixelgpudetails { - PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { - - constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); - - cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); - - // Coalesce all 32bit and 16bit arrays to two big blobs - // - // This is just a toy. Please don't copy-paste the logic but - // create a proper abstraction (e.g. 
along FWCore/SOA, or - // FWCore/Utilities/interface/SoATuple.h - // - // Order such that the first ones are the ones transferred to CPU - static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious - cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9)); - cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id())); - //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_; - gpu_.charge_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0); - gpu_.xl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1); - gpu_.yl_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2); - gpu_.xerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3); - gpu_.yerr_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4); - gpu_.xg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5); - gpu_.yg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6); - gpu_.zg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7); - gpu_.rg_d = slicePitch(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8); - - // Order such that the first ones are the ones transferred to CPU - cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 7)); - cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*7, cudaStream.id())); - //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_; - gpu_.detInd_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0); - gpu_.mr_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1); - gpu_.mc_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2); - gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); - gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); - gpu_.xsize_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 5); - gpu_.ysize_d 
= slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 6); - - cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); - cudaCheck(cudaMalloc((void **) & gpu_.hws_d, HitsOnGPU::Hist::wsSize())); - cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); - - // Feels a bit dumb but constexpr arrays are not supported for device code - // TODO: should be moved to EventSetup (or better ideas?) - // Would it be better to use "constant memory"? - cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayerStart_, 11 * sizeof(uint32_t))); - cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id())); - cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayer_, phase1PixelTopology::layer.size() * sizeof(uint8_t))); - cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayer_, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size() * sizeof(uint8_t), cudaMemcpyDefault, cudaStream.id())); - - gpu_.phase1TopologyLayerStart_d = d_phase1TopologyLayerStart_; - gpu_.phase1TopologyLayer_d = d_phase1TopologyLayer_; - - gpu_.me_d = gpu_d; - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); - - cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t))); - - // On CPU we can safely use MAX_HITS*sizeof as the pitch. 
Thanks - // to '*256' it is even aligned by cache line - h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t); - cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5)); - h_charge_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 0); - h_xl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 1); - h_yl_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 2); - h_xe_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 3); - h_ye_ = slicePitch(h_owner_32bit_, h_owner_32bit_pitch_, 4); - - h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t); - cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3)); - h_detInd_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 0); - h_mr_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 1); - h_mc_ = slicePitch(h_owner_16bit_, h_owner_16bit_pitch_, 2); + assert(0==hitsModuleStart[0]); + if(i < 11) { + hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; #ifdef GPU_DEBUG - cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t))); + printf ("LayerStart %d %d: %d\n",i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); #endif + } } - PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { - cudaCheck(cudaFree(gpu_.hitsLayerStart_d)); - cudaCheck(cudaFree(gpu_.owner_32bit_)); - cudaCheck(cudaFree(gpu_.owner_16bit_)); - cudaCheck(cudaFree(gpu_.hist_d)); - cudaCheck(cudaFree(gpu_.hws_d)); - cudaCheck(cudaFree(gpu_d)); - cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); - cudaCheck(cudaFree(d_phase1TopologyLayer_)); +} - cudaCheck(cudaFreeHost(h_hitsModuleStart_)); - cudaCheck(cudaFreeHost(h_owner_32bit_)); - cudaCheck(cudaFreeHost(h_owner_16bit_)); -#ifdef GPU_DEBUG - cudaCheck(cudaFreeHost(h_hitsLayerStart_)); -#endif - } +namespace pixelgpudetails { - void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHit2DCUDA PixelRecHitGPUKernel::makeHitsAsync( + SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& 
bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, - bool transferToCPU, - cuda::stream_t<>& stream) { - gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); - gpu_.cpeParams = cpeParams; // copy it for use in clients - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + cuda::stream_t<>& stream + ) const { + + auto nHits = clusters_d.nClusters(); + TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream); int threadsPerBlock = 256; int blocks = digis_d.nModules(); // active modules (with digis) @@ -145,6 +50,7 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif + if(blocks) // protect from empty events gpuPixelRecHits::getHits<<>>( cpeParams, bs_d.data(), @@ -154,54 +60,27 @@ namespace pixelgpudetails { clusters_d.clusInModule(), clusters_d.moduleId(), digis_d.clus(), digis_d.nDigis(), - gpu_.hitsModuleStart_d, - gpu_.charge_d, - gpu_.detInd_d, - gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d, - gpu_.iphi_d, - gpu_.xl_d, gpu_.yl_d, - gpu_.xerr_d, gpu_.yerr_d, - gpu_.mr_d, gpu_.mc_d, - gpu_.xsize_d, gpu_.ysize_d + clusters_d.clusModuleStart(), + hits_d.view() ); cudaCheck(cudaGetLastError()); + // assuming full warp of threads is better than a smaller number... - setHitsLayerStart<<<1, 32, 0, stream.id()>>>(gpu_.hitsModuleStart_d, d_phase1TopologyLayerStart_, gpu_.hitsLayerStart_d); + setHitsLayerStart<<<1, 32, 0, stream.id()>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - // needed only if hits on CPU are required... 
- nhits_ = clusters_d.nClusters(); - if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); -#ifdef GPU_DEBUG - cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); -#endif - - cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_, - gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, - nhits_*sizeof(uint16_t), 3, - cudaMemcpyDefault, stream.id())); - - cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_, - gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, - nhits_*sizeof(uint32_t), 5, - cudaMemcpyDefault, stream.id())); + if (nHits >= TrackingRecHit2DSOAView::maxHits()) { + edm::LogWarning("PixelRecHitGPUKernel" ) << "Hits Overflow " << nHits << " > " << TrackingRecHit2DSOAView::maxHits(); + } -#ifdef GPU_DEBUG - cudaStreamSynchronize(stream.id()); - - std::cout << "hit layerStart "; - for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << h_hitsLayerStart_[i] << ' '; - std::cout << "end:" << h_hitsLayerStart_[10] << std::endl; -#endif - - // for timing test - // cudaStreamSynchronize(stream.id()); - // auto nhits_ = h_hitsLayerStart_[10]; - // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); + if (nHits) { + edm::Service cs; + auto hws = cs->make_device_unique(TrackingRecHit2DSOAView::Hist::wsSize(),stream); + cudautils::fillManyFromVector(hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream.id()); + cudaCheck(cudaGetLastError()); } - - cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); + return hits_d; } -} + +} // end namespace diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 
8e5599c239789..096d6c7492f2a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -4,72 +4,32 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include #include -#include -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - - -namespace pixelCPEforGPU { - struct ParamsOnGPU; -} namespace pixelgpudetails { - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; class PixelRecHitGPUKernel { public: - PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream); - ~PixelRecHitGPUKernel(); + PixelRecHitGPUKernel() = default; + ~PixelRecHitGPUKernel() = default; PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel(PixelRecHitGPUKernel&&) = delete; PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - void makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHit2DCUDA makeHitsAsync( + SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const * cpeParams, - bool transferToCPU, - cuda::stream_t<>& stream); - - HitsOnCPU getOutput() const { - return HitsOnCPU{ - h_hitsModuleStart_, h_detInd_, h_charge_, - h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_, - gpu_d, nhits_ - }; - } + cuda::stream_t<>& stream) const; - private: - HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product" - HitsOnGPU gpu_; - uint32_t nhits_ = 0; - uint32_t *d_phase1TopologyLayerStart_ = nullptr; 
- uint8_t *d_phase1TopologyLayer_ = nullptr; - uint32_t *h_hitsModuleStart_ = nullptr; - uint16_t *h_detInd_ = nullptr; - int32_t *h_charge_ = nullptr; - float *h_xl_ = nullptr; - float *h_yl_ = nullptr; - float *h_xe_ = nullptr; - float *h_ye_ = nullptr; - uint16_t *h_mr_ = nullptr; - uint16_t *h_mc_ = nullptr; - void *h_owner_32bit_ = nullptr; - size_t h_owner_32bit_pitch_ = 0; - void *h_owner_16bit_ = nullptr; - size_t h_owner_16bit_pitch_ = 0; -#ifdef GPU_DEBUG - uint32_t *h_hitsLayerStart_ = nullptr; -#endif }; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc new file mode 100644 index 0000000000000..7e17f6c029ac2 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -0,0 +1,118 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + + +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" + +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + +#include 
"RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" + +#include "PixelRecHits.h" // TODO : spit product from kernel + +#include + +class SiPixelRecHitCUDA : public edm::global::EDProducer<> { + +public: + + + explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); + ~SiPixelRecHitCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> tBeamSpot; + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; + + edm::EDPutTokenT> tokenHit_; + + std::string cpeName_; + + pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; + +}; + +SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig): + tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), + token_(consumes>(iConfig.getParameter("src"))), + tokenDigi_(consumes>(iConfig.getParameter("src"))), + tokenHit_(produces>()), + cpeName_(iConfig.getParameter("CPE")) +{} + +void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); + desc.add("src", edm::InputTag("siPixelClustersCUDAPreSplitting")); + desc.add("CPE", "PixelCPEFast"); + descriptions.add("siPixelRecHitCUDA",desc); +} + + +void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { + + // const TrackerGeometry *geom_ = nullptr; + const PixelClusterParameterEstimator *cpe_ = nullptr; + + /* + edm::ESHandle geom; + es.get().get( geom ); + geom_ = geom.product(); + */ + + edm::ESHandle hCPE; + es.get().get(cpeName_, hCPE); + cpe_ = dynamic_cast< const 
PixelCPEBase* >(hCPE.product()); + + PixelCPEFast const * fcpe = dynamic_cast(cpe_); + if (!fcpe) { + throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; + } + + edm::Handle> hclusters; + iEvent.getByToken(token_, hclusters); + + CUDAScopedContext ctx{*hclusters}; + auto const& clusters = ctx.get(*hclusters); + + edm::Handle> hdigis; + iEvent.getByToken(tokenDigi_, hdigis); + auto const& digis = ctx.get(*hdigis); + + edm::Handle> hbs; + iEvent.getByToken(tBeamSpot, hbs); + auto const& bs = ctx.get(*hbs); + + ctx.emplace(iEvent,tokenHit_, + std::move( + gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), ctx.stream()) + )); +} + +DEFINE_FWK_MODULE(SiPixelRecHitCUDA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc new file mode 100644 index 0000000000000..9d3c70aebf2cc --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -0,0 +1,187 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + +#include "DataFormats/Common/interface/DetSetVectorNew.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" + +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/InputTag.h" + +#include 
"Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h" + + +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" + + +#include + +class SiPixelRecHitFromSOA: public edm::stream::EDProducer { + + +public: + + explicit SiPixelRecHitFromSOA(const edm::ParameterSet& iConfig); + ~SiPixelRecHitFromSOA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenHit_; // CUDA hits + edm::EDGetTokenT clusterToken_; // Legacy Clusters + + uint32_t m_nHits; + cudautils::host::unique_ptr m_store16; + cudautils::host::unique_ptr m_store32; + cudautils::host::unique_ptr m_hitsModuleStart; +}; + +SiPixelRecHitFromSOA::SiPixelRecHitFromSOA(const edm::ParameterSet& iConfig): + tokenHit_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), + clusterToken_(consumes(iConfig.getParameter("src"))) +{ + produces(); +} + + +void SiPixelRecHitFromSOA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); + descriptions.add("siPixelRecHitFromSOA",desc); +} + + +void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + + CUDAProduct const& inputDataWrapped = iEvent.get(tokenHit_); + CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + 
m_nHits = inputData.nHits(); + + // std::cout<< "converting " << m_nHits << " Hits"<< std::endl; + + if (0==m_nHits) return; + m_store32 = inputData.localCoordToHostAsync(ctx.stream()); + // m_store16 = inputData.detIndexToHostAsync(ctx.stream(); + m_hitsModuleStart = inputData.hitsModuleStartToHostAsync(ctx.stream()); +} + +void SiPixelRecHitFromSOA::produce(edm::Event& iEvent, edm::EventSetup const& es) { + + auto output = std::make_unique(); + if (0==m_nHits) { + iEvent.put(std::move(output)); + return; + } + + auto xl = m_store32.get(); + auto yl = xl+m_nHits; + auto xe = yl+m_nHits; + auto ye = xe+m_nHits; + + edm::ESHandle geom; + es.get().get( geom ); + geom = geom.product(); + + edm::Handle hclusters; + iEvent.getByToken(clusterToken_, hclusters); + + auto const & input = *hclusters; + + int numberOfDetUnits = 0; + int numberOfClusters = 0; + for (auto DSViter=input.begin(); DSViter != input.end() ; DSViter++) { + numberOfDetUnits++; + unsigned int detid = DSViter->detId(); + DetId detIdObject( detid ); + const GeomDetUnit * genericDet = geom->idToDetUnit( detIdObject ); + auto gind = genericDet->index(); + const PixelGeomDetUnit * pixDet = dynamic_cast(genericDet); + assert(pixDet); + SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*output, detid); + auto fc = m_hitsModuleStart[gind]; + auto lc = m_hitsModuleStart[gind+1]; + auto nhits = lc-fc; + + //std::cout << "in det " << gind << "conv " << nhits << " hits from " << DSViter->size() << " legacy clusters" + // <<' '<< lc <<','<size()); + if (nhits!=DSViter->size()) { + edm::LogWarning("GPUHits2CPU") <<"nhits!= ndigi " << nhits << ' ' << DSViter->size() << std::endl; + } + for (auto const & clust : *DSViter) { + if (ic>=nhits) { + // FIXME add a way to handle this case, or at least notify via edm::LogError + break; + } + auto ij = jnd(clust.originalId()); + if (ij>=TrackingRecHit2DSOAView::maxHits()) break; // overflow... 
+ assert(clust.originalId()>=0); assert(clust.originalId() tuple = cpe_->getParameters( clust, *genericDet ); + LocalPoint lp( std::get<0>(tuple) ); + LocalError le( std::get<1>(tuple) ); + SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) ); + */ + + // Create a persistent edm::Ref to the cluster + edm::Ref< edmNew::DetSetVector, SiPixelCluster > cluster = edmNew::makeRefTo(hclusters, &clust); + // Make a RecHit and add it to the DetSet + SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster); + // + // Now save it ================= + recHitsOnDetUnit.push_back(hit); + // ============================= + + // std::cout << "SiPixelRecHitGPUVI " << numberOfClusters << ' '<< lp << " " << le << std::endl; + + } // <-- End loop on Clusters + + + // LogDebug("SiPixelRecHitGPU") + //std::cout << "SiPixelRecHitGPUVI " + // << " Found " << recHitsOnDetUnit.size() << " RecHits on " << detid //; + // << std::endl; + + + } // <-- End loop on DetUnits + + /* + std::cout << "SiPixelRecHitGPUVI $ det, clus, lost " + << numberOfDetUnits << ' ' + << numberOfClusters << ' ' + << std::endl; + */ + + iEvent.put(std::move(output)); +} + +DEFINE_FWK_MODULE(SiPixelRecHitFromSOA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc deleted file mode 100644 index df0bfb5baa0d6..0000000000000 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ /dev/null @@ -1,313 +0,0 @@ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "DataFormats/Common/interface/DetSetVectorNew.h" -#include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" -#include 
"DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" -#include "FWCore/Framework/interface/ESHandle.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Utilities/interface/InputTag.h" -#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" -#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" -#include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" - -#include "PixelRecHits.h" // TODO : spit product from kernel - -#include - -class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer > { - -public: - using CPUProduct = siPixelRecHitsHeterogeneousProduct::CPUProduct; - using GPUProduct = siPixelRecHitsHeterogeneousProduct::GPUProduct; - using Output = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; - - - explicit SiPixelRecHitHeterogeneous(const edm::ParameterSet& iConfig); - ~SiPixelRecHitHeterogeneous() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - // CPU implementation - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - - // GPU implementation - void beginStreamGPUCuda(edm::StreamID streamId, 
cuda::stream_t<>& cudaStream) override; - void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& iEvent, const edm::Handle& inputhandle, const siPixelRecHitsHeterogeneousProduct::GPUProduct & gpu) const; - - // Commonalities - void initialize(const edm::EventSetup& es); - - // CPU - void run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const; - // GPU - void run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output, const pixelgpudetails::HitsOnCPU& hoc) const; - - - // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> tBeamSpot; - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; - edm::EDGetTokenT clusterToken_; - std::string cpeName_; - - const TrackerGeometry *geom_ = nullptr; - const PixelClusterParameterEstimator *cpe_ = nullptr; - - std::unique_ptr gpuAlgo_; - - bool enableTransfer_; - bool enableConversion_; -}; - -SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& iConfig): - HeterogeneousEDProducer(iConfig), - tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), - token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), - tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), - cpeName_(iConfig.getParameter("CPE")) -{ - enableConversion_ = iConfig.getParameter("gpuEnableConversion"); - enableTransfer_ = enableConversion_ || iConfig.getParameter("gpuEnableTransfer"); - - produces(); - if(enableConversion_) { - clusterToken_ = consumes(iConfig.getParameter("src")); - produces(); - } -} - -void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - - desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); - 
desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); - desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); - desc.add("CPE", "PixelCPEFast"); - - desc.add("gpuEnableTransfer", true); - desc.add("gpuEnableConversion", true); - - HeterogeneousEDProducer::fillPSetDescription(desc); - - descriptions.add("siPixelRecHitHeterogeneous",desc); -} - -void SiPixelRecHitHeterogeneous::initialize(const edm::EventSetup& es) { - edm::ESHandle geom; - es.get().get( geom ); - geom_ = geom.product(); - - edm::ESHandle hCPE; - es.get().get(cpeName_, hCPE); - cpe_ = dynamic_cast< const PixelCPEBase* >(hCPE.product()); -} - -void SiPixelRecHitHeterogeneous::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - throw cms::Exception("NotImplemented") << "CPU version is no longer implemented"; -} - -void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const { - const auto& input = *inputhandle; - - edmNew::DetSetVector::const_iterator DSViter=input.begin(); - for ( ; DSViter != input.end() ; DSViter++) { - unsigned int detid = DSViter->detId(); - DetId detIdObject( detid ); - const GeomDetUnit * genericDet = geom_->idToDetUnit( detIdObject ); - const PixelGeomDetUnit * pixDet = dynamic_cast(genericDet); - assert(pixDet); - SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output,detid); - - edmNew::DetSet::const_iterator clustIt = DSViter->begin(), clustEnd = DSViter->end(); - - for ( ; clustIt != clustEnd; clustIt++) { - std::tuple tuple = cpe_->getParameters( *clustIt, *genericDet ); - LocalPoint lp( std::get<0>(tuple) ); - LocalError le( std::get<1>(tuple) ); - SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) ); - // Create a persistent edm::Ref to the cluster - edm::Ref< edmNew::DetSetVector, SiPixelCluster > cluster = edmNew::makeRefTo( inputhandle, clustIt); - // Make a RecHit and add it to the DetSet - // old : recHitsOnDetUnit.push_back( new 
SiPixelRecHit( lp, le, detIdObject, &*clustIt) ); - SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster); - // - // Now save it ================= - recHitsOnDetUnit.push_back(hit); - } // <-- End loop on Clusters - } // <-- End loop on DetUnits -} - - -void SiPixelRecHitHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - gpuAlgo_ = std::make_unique(cudaStream); -} - -void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { - initialize(iSetup); - - PixelCPEFast const * fcpe = dynamic_cast(cpe_); - if (!fcpe) { - throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; - } - - edm::Handle> hclusters; - iEvent.getByToken(token_, hclusters); - // temporary check (until the migration) - edm::Service cs; - assert(hclusters->device() == cs->getCurrentDevice()); - CUDAScopedContext ctx{*hclusters}; - auto const& clusters = ctx.get(*hclusters); - - edm::Handle> hdigis; - iEvent.getByToken(tokenDigi_, hdigis); - auto const& digis = ctx.get(*hdigis); - - edm::Handle> hbs; - iEvent.getByToken(tBeamSpot, hbs); - auto const& bs = ctx.get(*hbs); - - // We're processing in a stream given by base class, so need to - // synchronize explicitly (implementation is from - // CUDAScopedContext). In practice these should not be needed - // (because of synchronizations upstream), but let's play generic. 
- if (not hclusters->isAvailable()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0)); - } - if (not hdigis->isAvailable()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hdigis->event()->id(), 0)); - } - if (not hbs->isAvailable()) { - cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hbs->event()->id(), 0)); - } - - gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); - -} - -void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_->getOutput()); - - if(enableConversion_) { - // Need the CPU clusters to - // - properly fill the output DetSetVector of hits - // - to set up edm::Refs to the clusters - edm::Handle hclusters; - iEvent.getByToken(clusterToken_, hclusters); - - convertGPUtoCPU(iEvent.event(), hclusters, *output); - } - - iEvent.put(std::move(output), heterogeneous::DisableTransfer{}); -} - -void SiPixelRecHitHeterogeneous::convertGPUtoCPU(edm::Event& iEvent, const edm::Handle& inputhandle, const pixelgpudetails::HitsOnCPU & hoc) const{ - assert(hoc.gpu_d); - auto output = std::make_unique(); - run(inputhandle, *output, hoc); - iEvent.put(std::move(output)); -} - -void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output, const pixelgpudetails::HitsOnCPU& hoc) const { - auto const & input = *inputhandle; - - int numberOfDetUnits = 0; - int numberOfClusters = 0; - for (auto DSViter=input.begin(); DSViter != input.end() ; DSViter++) { - numberOfDetUnits++; - unsigned int detid = DSViter->detId(); - DetId detIdObject( detid ); - const GeomDetUnit * genericDet = geom_->idToDetUnit( detIdObject ); - auto gind = genericDet->index(); - const PixelGeomDetUnit * pixDet = dynamic_cast(genericDet); - assert(pixDet); - SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output, detid); - auto fc = 
hoc.hitsModuleStart[gind]; - auto lc = hoc.hitsModuleStart[gind+1]; - auto nhits = lc-fc; - uint32_t ic=0; - auto jnd = [&](int k) { return fc+k; }; - assert(nhits<=DSViter->size()); - if (nhits!=DSViter->size()) { - edm::LogWarning("GPUHits2CPU") <<"nhits!= ndigi " << nhits << ' ' << DSViter->size() << std::endl; - } - for (auto const & clust : *DSViter) { - if (ic>=nhits) { - // FIXME add a way to handle this case, or at least notify via edm::LogError - break; - } - auto ij = jnd(clust.originalId()); - assert(clust.originalId()>=0); assert(clust.originalId() tuple = cpe_->getParameters( clust, *genericDet ); - LocalPoint lp( std::get<0>(tuple) ); - LocalError le( std::get<1>(tuple) ); - SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) ); - */ - - // Create a persistent edm::Ref to the cluster - edm::Ref< edmNew::DetSetVector, SiPixelCluster > cluster = edmNew::makeRefTo( inputhandle, &clust); - // Make a RecHit and add it to the DetSet - SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster); - // - // Now save it ================= - recHitsOnDetUnit.push_back(hit); - // ============================= - - // std::cout << "SiPixelRecHitGPUVI " << numberOfClusters << ' '<< lp << " " << le << std::endl; - - } // <-- End loop on Clusters - - - // LogDebug("SiPixelRecHitGPU") - //std::cout << "SiPixelRecHitGPUVI " - // << " Found " << recHitsOnDetUnit.size() << " RecHits on " << detid //; - // << std::endl; - - - } // <-- End loop on DetUnits - - /* - std::cout << "SiPixelRecHitGPUVI $ det, clus, lost " - << numberOfDetUnits << ' ' - << numberOfClusters << ' ' - << std::endl; - */ -} - -DEFINE_FWK_MODULE(SiPixelRecHitHeterogeneous); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index caf58c0615dbb..2874df10c16c3 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -6,15 +6,13 @@ #include 
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" - namespace gpuPixelRecHits { - - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, BeamSpotCUDA::Data const * __restrict__ bs, uint16_t const * __restrict__ id, @@ -27,19 +25,15 @@ namespace gpuPixelRecHits { int32_t const * __restrict__ clus, int numElements, uint32_t const * __restrict__ hitsModuleStart, - int32_t * chargeh, - uint16_t * detInd, - float * xg, float * yg, float * zg, float * rg, int16_t * iph, - float * xl, float * yl, - float * xe, float * ye, - uint16_t * mr, uint16_t * mc, - int16_t * xs, int16_t * ys - ) - { + TrackingRecHit2DSOAView * phits + ) +{ + + auto & hits = *phits; // to be moved in common namespace... constexpr uint16_t InvId=9999; // must be > MaxNumModules - constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; + constexpr uint32_t MaxHitsInModule = pixelCPEforGPU::MaxHitsInModule; using ClusParams = pixelCPEforGPU::ClusParams; @@ -66,14 +60,14 @@ namespace gpuPixelRecHits { if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]); #endif - assert(blockDim.x >= MaxClusInModule); + assert(blockDim.x >= MaxHitsInModule); - if (threadIdx.x==0 && nclus > MaxClusInModule) { - printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule); + if (threadIdx.x==0 && nclus > MaxHitsInModule) { + printf("WARNING: too many clusters %d in Module %d. 
Only first %d processed\n", nclus,me,MaxHitsInModule); // zero charge: do not bother to do it in parallel - for (auto d=MaxClusInModule; d= nclus) return; first = hitsModuleStart[me]; auto h = first+ic; // output index in global memory - assert(h < 2000*256); + if (h >= TrackingRecHit2DSOAView::maxHits()) return; // overflow... pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - chargeh[h] = clusParams.charge[ic]; - detInd[h] = me; + // store it + + hits.charge(h) = clusParams.charge[ic]; - xl[h]= clusParams.xpos[ic]; - yl[h]= clusParams.ypos[ic]; + hits.detectorIndex(h) = me; - xs[h]= clusParams.xsize[ic]; - ys[h]= clusParams.ysize[ic]; + float xl,yl; + hits.xLocal(h) = xl = clusParams.xpos[ic]; + hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits.clusterSizeX(h) = clusParams.xsize[ic]; + hits.clusterSizeY(h) = clusParams.ysize[ic]; - xe[h]= clusParams.xerr[ic]*clusParams.xerr[ic]; - ye[h]= clusParams.yerr[ic]*clusParams.yerr[ic]; - mr[h]= clusParams.minRow[ic]; - mc[h]= clusParams.minCol[ic]; - + hits.xerrLocal(h) = clusParams.xerr[ic]*clusParams.xerr[ic]; + hits.yerrLocal(h) = clusParams.yerr[ic]*clusParams.yerr[ic]; + + // keep it local for computations + float xg,yg,zg; // to global and compute phi... - cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]); + cpeParams->detParams(me).frame.toGlobal(xl,yl, xg,yg,zg); // here correct for the beamspot... 
- xg[h]-=bs->x; - yg[h]-=bs->y; - zg[h]-=bs->z; + xg-=bs->x; + yg-=bs->y; + zg-=bs->z; + + hits.xGlobal(h) = xg; + hits.yGlobal(h) = yg; + hits.zGlobal(h) = zg; - rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]); - iph[h] = unsafe_atan2s<7>(yg[h],xg[h]); + hits.rGlobal(h) = std::sqrt(xg*xg+yg*yg); + hits.iphi(h) = unsafe_atan2s<7>(yg,xg); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h deleted file mode 100644 index a90a37da1065f..0000000000000 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_siPixelRecHitsHeterogeneousProduct_h -#define RecoLocalTracker_SiPixelRecHits_plugins_siPixelRecHitsHeterogeneousProduct_h - -#include -#include - -#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" - -namespace pixelCPEforGPU { - struct ParamsOnGPU; -} - -namespace siPixelRecHitsHeterogeneousProduct { - - using CPUProduct = int; // dummy - - static constexpr uint32_t maxHits() { return 65536;} - using hindex_type = uint16_t; // if above is <=2^16 - - struct HitsOnGPU{ - pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned - const uint32_t * hitsModuleStart_d; // forwarded from clusters - uint32_t * hitsLayerStart_d; - int32_t * charge_d; - uint16_t * detInd_d; - float *xg_d, *yg_d, *zg_d, *rg_d; - float *xl_d, *yl_d; - float *xerr_d, *yerr_d; - int16_t * iphi_d; - uint16_t * sortIndex_d; - uint16_t * mr_d; - uint16_t * mc_d; - int16_t * xsize_d; - int16_t * ysize_d; - - - using Hist = HistoContainer; - Hist * hist_d; - uint8_t * hws_d; - - HitsOnGPU const * me_d = nullptr; - 
- // Owning pointers to the 32/16 bit arrays with size MAX_HITS - void *owner_32bit_; - size_t owner_32bit_pitch_; - void *owner_16bit_; - size_t owner_16bit_pitch_; - - // forwarded from PixelRecHit (FIXME) - uint32_t const * phase1TopologyLayerStart_d; - uint8_t const * phase1TopologyLayer_d; - - }; - - struct HitsOnCPU { - uint32_t const * hitsModuleStart = nullptr; - uint16_t const * detInd = nullptr; - int32_t const * charge = nullptr; - float const * xl = nullptr; - float const * yl = nullptr; - float const * xe = nullptr; - float const * ye = nullptr; - uint16_t const * mr = nullptr; - uint16_t const * mc = nullptr; - - HitsOnGPU const * gpu_d = nullptr; - uint32_t nHits; - }; - - using GPUProduct = HitsOnCPU; // FIXME fill cpu vectors on demand - - using HeterogeneousPixelRecHit = HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct >; -} - -#endif // RecoLocalTracker_SiPixelRecHits_plugins_siPixelRecHitsHeterogeneousProduct_h diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 465aa0bb346ce..8995471470f37 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -6,6 +6,37 @@ VerboseLevel = cms.untracked.int32(0) ) -siPixelRecHitsPreSplitting = siPixelRecHits.clone( +_siPixelRecHitsPreSplitting = siPixelRecHits.clone( src = 'siPixelClustersPreSplitting' ) + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelRecHitsPreSplitting = SwitchProducerCUDA( + cpu = _siPixelRecHitsPreSplitting.clone() +) + + + +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDA_cfi import siPixelRecHitCUDA as _siPixelRecHitCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromSOA_cfi import siPixelRecHitFromSOA as _siPixelRecHitFromSOA + +gpu.toModify(siPixelRecHitsPreSplitting, + cuda = 
_siPixelRecHitFromSOA.clone() +) + + +siPixelRecHitsPreSplittingTask = cms.Task(siPixelRecHitsPreSplitting) + +siPixelRecHitsCUDAPreSplitting = _siPixelRecHitCUDA.clone() +siPixelRecHitsLegacyPreSplitting = _siPixelRecHitFromSOA.clone() +siPixelRecHitsPreSplittingTaskCUDA = cms.Task( + siPixelRecHitsCUDAPreSplitting, + siPixelRecHitsLegacyPreSplitting, +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelRecHitsPreSplittingTask_gpu = siPixelRecHitsPreSplittingTask.copy() +_siPixelRecHitsPreSplittingTask_gpu.add(siPixelRecHitsPreSplittingTaskCUDA) +gpu.toReplaceWith(siPixelRecHitsPreSplittingTask, _siPixelRecHitsPreSplittingTask_gpu) + diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc index eb51dd5a2eaeb..1111e4866a8d2 100644 --- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc +++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc @@ -22,6 +22,8 @@ namespace { constexpr float micronsToCm = 1.0e-4; } + + //----------------------------------------------------------------------------- //! The constructor. //----------------------------------------------------------------------------- @@ -68,13 +70,16 @@ PixelCPEFast::PixelCPEFast(edm::ParameterSet const & conf, const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cuda::stream_t<>& cudaStream) const { const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cuda::stream_t<>& stream) { + // and now copy to device... 
cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_detParams, this->m_detParamsGPU.size()*sizeof(pixelCPEforGPU::DetParams))); + cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); cudaCheck(cudaMalloc((void**) & data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU))); cudaCheck(cudaMemcpyAsync(data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_commonParams, &this->m_commonParamsGPU, sizeof(pixelCPEforGPU::CommonParams), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_layerGeometry, &this->m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_detParams, this->m_detParamsGPU.data(), this->m_detParamsGPU.size()*sizeof(pixelCPEforGPU::DetParams), cudaMemcpyDefault, stream.id())); }); return data.d_paramsOnGPU; @@ -86,7 +91,13 @@ void PixelCPEFast::fillParamsForGpu() { m_commonParamsGPU.thePitchX = m_DetParams[0].thePitchX; m_commonParamsGPU.thePitchY = m_DetParams[0].thePitchY; - //uint32_t oldLayer = 0; + uint32_t oldLayer = 0; + uint32_t oldLadder=0; + float rl=0; + float zl = 0; + float miz = 90, mxz=0; + float pl = 0; + int nl=0; m_detParamsGPU.resize(m_DetParams.size()); for (auto i=0U; isurface().rotation()); g.frame = pixelCPEforGPU::Frame(vv.x(),vv.y(),vv.z(),rr); + zl+=vv.z(); + miz = std::min(miz,std::abs(vv.z())); + mxz = std::max(mxz,std::abs(vv.z())); + rl+=vv.perp(); + pl+=vv.phi(); // (not obvious) // errors ..... 
ClusterParamGeneric cp; @@ -196,13 +227,19 @@ void PixelCPEFast::fillParamsForGpu() { } */ - for (int i=0; i<3; ++i) { g.sx[i] = std::sqrt(g.sx[i]*g.sx[i]+lape.xx()); g.sy[i] = std::sqrt(g.sy[i]*g.sy[i]+lape.yy()); } } + + // fill Layer and ladders geometry + memcpy(m_layerGeometry.layerStart, phase1PixelTopology::layerStart, sizeof(phase1PixelTopology::layerStart)); + memcpy(m_layerGeometry.layer, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size()); + + + } PixelCPEFast::~PixelCPEFast() {} diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 58935e9a6991c..1021918c0ce6c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False process.pixelVertices.gpuEnableConversion = False @@ -32,7 +31,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h index 0d9be5a346d0a..e6ab9f93ca306 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -38,7 +38,7 @@ namespace Rfit |cov(X0, R)|cov(Y0, 
R)|cov( R, R)| */ int32_t q; //!< particle charge - float chi2 = 0.0; + float chi2; }; struct line_fit @@ -49,7 +49,7 @@ namespace Rfit |cov(c_t,c_t)|cov(Zip,c_t)| \n |cov(c_t,Zip)|cov(Zip,Zip)| */ - double chi2 = 0.0; + double chi2; }; struct helix_fit diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc index c05fa9172ffe3..c6dd66cab18e5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromCUDA.cc @@ -1,7 +1,6 @@ #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/Frameworkfwd.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index a0b2b9e56f59c..894a80616af02 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,12 +11,12 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; - -using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; +using 
HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; @@ -75,14 +75,14 @@ void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; - hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); #ifdef BL_DUMP_HITS if (dump){ - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit],i,hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detInd_d[hit],i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit),i,hhp->xGlobal(hit),hhp->yGlobal(hit),hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detectorIndex(hit),i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); } #endif - hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; } BrokenLine::BL_Fast_fit(hits,fast_fit); @@ -167,65 +167,71 @@ void kernelBLFit( } -void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) { - assert(tuples_d); assert(fast_fit_resultsGPU_); + assert(tuples_d); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - for (uint32_t offset=0; offset cs; + auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * 
sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); + auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); + auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); + + for (uint32_t offset=0; offset<<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>( + kernelBLFit<3><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( + kernelBLFit<4><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( + kernelBLFit<4><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); 
cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>( + kernelBLFit<5><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 3c8397cf572f6..d4140692181bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -11,8 +11,6 @@ - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 48a9ec7adf2f7..a33c613402dcf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -6,32 +6,52 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" // #define ONLY_PHICUT namespace CAConstants { // constants - - constexpr uint32_t maxNumberOfQuadruplets() { return 6*1024; } +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfTuples() { return 3*1024;} +#else + constexpr uint32_t maxNumberOfTuples() { return 6*1024;} +#endif + constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT +#ifndef 
GPU_SMALL_EVENTS constexpr uint32_t maxNumberOfDoublets() { return 262144; } constexpr uint32_t maxCellsPerHit() { return 128; } +#else + constexpr uint32_t maxNumberOfDoublets() { return 262144/2; } + constexpr uint32_t maxCellsPerHit() { return 128/2; } +#endif #else constexpr uint32_t maxNumberOfDoublets() { return 6*262144; } constexpr uint32_t maxCellsPerHit() { return 4*128; } #endif + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets()/4;} + + constexpr uint32_t maxNumberOfLayerPairs() { return 13; } constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxTuples() { return 6*1024;} + constexpr uint32_t maxTuples() { return maxNumberOfTuples();} // types using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct using tindex_type = uint16_t; // for tuples + + using CellNeighbors = GPU::VecArray< uint32_t, 36>; + using CellTracks = GPU::VecArray< tindex_type, 42>; + + using CellNeighborsVector = GPU::SimpleVector; + using CellTracksVector = GPU::SimpleVector; + using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>; using TuplesContainer = OneToManyAssoc; - using HitToTuple = OneToManyAssoc; // 3.5 should be enough + using HitToTuple = OneToManyAssoc; // 3.5 should be enough using TupleMultiplicity = OneToManyAssoc; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc index 8007e9a67df3f..c47f5c2fe4c5a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc @@ -1,5 +1,6 @@ #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/Common/interface/OwnVector.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include 
"FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/MakerMacros.h" @@ -12,7 +13,12 @@ #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "FWCore/ServiceRegistry/interface/Service.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + #include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" #include "RecoTracker/TkHitPairs/interface/IntermediateHitDoublets.h" #include "RecoTracker/TkHitPairs/interface/RegionsSeedingHitSets.h" @@ -33,7 +39,7 @@ class CAHitNtupletHeterogeneousEDProducer heterogeneous::GPUCuda, heterogeneous::CPU>> { public: - using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; + using PixelRecHitsH = TrackingRecHit2DCUDA; using GPUProduct = pixelTuplesHeterogeneousProduct::GPUProduct; using CPUProduct = pixelTuplesHeterogeneousProduct::CPUProduct; using Output = pixelTuplesHeterogeneousProduct::HeterogeneousPixelTuples; @@ -57,7 +63,7 @@ class CAHitNtupletHeterogeneousEDProducer private: edm::EDGetTokenT> regionToken_; - edm::EDGetTokenT gpuHits_; + edm::EDGetTokenT> gpuHits_; edm::EDGetTokenT cpuHits_; edm::RunningAverage localRA_; @@ -74,8 +80,7 @@ class CAHitNtupletHeterogeneousEDProducer CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer( const edm::ParameterSet &iConfig) : HeterogeneousEDProducer(iConfig), - gpuHits_(consumesHeterogeneous(iConfig.getParameter("heterogeneousPixelRecHitSrc"))), - cpuHits_(consumes(iConfig.getParameter("heterogeneousPixelRecHitSrc"))), + gpuHits_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), GPUGenerator_(iConfig, 
consumesCollector()), useRiemannFit_(iConfig.getParameter("useRiemannFit")), enableConversion_(iConfig.getParameter("gpuEnableConversion")), @@ -83,6 +88,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer( { produces(); if(enableConversion_) { + cpuHits_ = consumes(iConfig.getParameter("pixelRecHitLegacySrc")); regionToken_ = consumes>(iConfig.getParameter("trackingRegions")); produces(); } @@ -93,7 +99,8 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions( edm::ParameterSetDescription desc; desc.add("trackingRegions", edm::InputTag("globalTrackingRegionFromBeamSpot")); - desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplitting")); + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsLegacyPreSplitting")); desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); desc.add("gpuEnableTransfer", true); desc.add("gpuEnableConversion", true); @@ -113,20 +120,28 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda( const edm::HeterogeneousEvent &iEvent, const edm::EventSetup &iSetup, cuda::stream_t<> &cudaStream) { - edm::Handle gh; - iEvent.getByToken(gpuHits_, gh); - auto const & gHits = *gh; + edm::Handle> hHits; + iEvent.getByToken(gpuHits_, hHits); - GPUGenerator_.buildDoublets(gHits,cudaStream.id()); + // temporary check (until the migration) + edm::Service cs; + assert(hHits->device() == cs->getCurrentDevice()); + + CUDAScopedContext ctx{*hHits}; + auto const& gHits = ctx.get(*hHits); + + if(not hHits->isAvailable()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hHits->event()->id(), 0)); + } + + GPUGenerator_.buildDoublets(gHits,cudaStream); GPUGenerator_.initEvent(iEvent.event(), iSetup); LogDebug("CAHitNtupletHeterogeneousEDProducer") << "Creating ntuplets on GPU"; - GPUGenerator_.hitNtuplets(gHits, iSetup, useRiemannFit_, enableTransfer_, cudaStream.id()); - - + 
GPUGenerator_.hitNtuplets(gHits, iSetup, useRiemannFit_, enableTransfer_, cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc index 749447ebf88b8..98d8b4ca2bc96 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc @@ -65,7 +65,7 @@ void CAHitQuadrupletGeneratorGPU::hitNtuplets( edm::EventSetup const& es, bool useRiemannFit, bool transferToCPU, - cudaStream_t cudaStream) + cuda::stream_t<> &cudaStream) { hitsOnCPU = &hh; launchKernels(hh, useRiemannFit, transferToCPU, cudaStream); @@ -75,12 +75,29 @@ void CAHitQuadrupletGeneratorGPU::fillResults( const TrackingRegion ®ion, SiPixelRecHitCollectionNew const & rechits, std::vector &result, const edm::EventSetup &es) { + + assert(hitsOnCPU); + auto nhits = hitsOnCPU->nHits(); + + uint32_t hitsModuleStart[gpuClustering::MaxNumModules+1]; + // to be understood where to locate + cudaCheck(cudaMemcpy(hitsModuleStart, hitsOnCPU->hitsModuleStart(), (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault)); + + auto fc = hitsModuleStart; + hitmap_.clear(); auto const & rcs = rechits.data(); - for (auto const & h : rcs) hitmap_.add(h, &h); + hitmap_.resize(nhits); + for (auto const & h : rcs) { + auto const & thit = static_cast(h); + auto detI = thit.det()->index(); + auto const & clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + assert(inHits; int index = 0; auto const & foundQuads = fetchKernelResult(index); @@ -99,8 +116,9 @@ void CAHitQuadrupletGeneratorGPU::fillResults( auto k = foundQuads[quadId][i]; if (k<0) { phits[i] = nullptr;continue; } // (actually break...) 
assert(k &cudaStream) { - kernels.launchKernels(hh, gpu_, cudaStream); + kernels.launchKernels(hh, gpu_, cudaStream.id()); if (useRiemannFit) { - fitter.launchRiemannKernels(hh, hh.nHits, CAConstants::maxNumberOfQuadruplets(), cudaStream); + fitter.launchRiemannKernels(hh, hh.nHits(), CAConstants::maxNumberOfQuadruplets(), cudaStream); } else { - fitter.launchBrokenLineKernels(hh, hh.nHits, CAConstants::maxNumberOfQuadruplets(), cudaStream); + fitter.launchBrokenLineKernels(hh, hh.nHits(), CAConstants::maxNumberOfQuadruplets(), cudaStream); } - kernels.classifyTuples(hh, gpu_, cudaStream); + kernels.classifyTuples(hh, gpu_, cudaStream.id()); if (transferToCPU) { cudaCheck(cudaMemcpyAsync(tuples_,gpu_.tuples_d, sizeof(TuplesOnGPU::Container), - cudaMemcpyDeviceToHost, cudaStream)); + cudaMemcpyDeviceToHost, cudaStream.id())); cudaCheck(cudaMemcpyAsync(helix_fit_results_,gpu_.helix_fit_results_d, sizeof(Rfit::helix_fit)*CAConstants::maxNumberOfQuadruplets(), - cudaMemcpyDeviceToHost, cudaStream)); + cudaMemcpyDeviceToHost, cudaStream.id())); cudaCheck(cudaMemcpyAsync(quality_,gpu_.quality_d, sizeof(Quality)*CAConstants::maxNumberOfQuadruplets(), - cudaMemcpyDeviceToHost, cudaStream)); + cudaMemcpyDeviceToHost, cudaStream.id())); } @@ -233,6 +251,6 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int) return quadsInterface; } -void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { +void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream) { kernels.buildDoublets(hh,stream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 5bf4964ab3488..0c5217daff0bc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -4,11 +4,11 @@ #include #include 
"DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/RZLine.h" #include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" #include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" @@ -20,7 +20,6 @@ #include "RecoTracker/TkSeedGenerator/interface/FastCircleFit.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" #include "CAHitQuadrupletGeneratorKernels.h" #include "HelixFitOnGPU.h" @@ -41,9 +40,9 @@ namespace edm { class CAHitQuadrupletGeneratorGPU { public: - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; - using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + using hindex_type = TrackingRecHit2DSOAView::hindex_type; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using TuplesOnCPU = pixelTuplesHeterogeneousProduct::TuplesOnCPU; @@ -65,16 +64,16 @@ class CAHitQuadrupletGeneratorGPU { void initEvent(const edm::Event& ev, const edm::EventSetup& es); - void buildDoublets(HitsOnCPU const & hh, cudaStream_t stream); + void buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream); void hitNtuplets(HitsOnCPU const & hh, const edm::EventSetup& es, 
bool useRiemannFit, bool transferToCPU, - cudaStream_t stream); + cuda::stream_t<> &cudaStream); TuplesOnCPU getOutput() const { - return TuplesOnCPU { std::move(indToEdm), hitsOnCPU->gpu_d, tuples_, helix_fit_results_, quality_, gpu_d, nTuples_}; + return TuplesOnCPU { std::move(indToEdm), hitsOnCPU->view(), tuples_, helix_fit_results_, quality_, gpu_d, nTuples_}; } void cleanup(cudaStream_t stream); @@ -87,7 +86,7 @@ class CAHitQuadrupletGeneratorGPU { private: - void launchKernels(HitsOnCPU const & hh, bool useRiemannFit, bool transferToCPU, cudaStream_t); + void launchKernels(HitsOnCPU const & hh, bool useRiemannFit, bool transferToCPU, cuda::stream_t<> &cudaStream); std::vector> fetchKernelResult(int); @@ -114,7 +113,7 @@ class CAHitQuadrupletGeneratorGPU { // input HitsOnCPU const * hitsOnCPU=nullptr; - RecHitsMap hitmap_ = RecHitsMap(nullptr); + std::vector hitmap_; }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu index e685c4674e0d9..cff61138d7d29 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu @@ -14,9 +14,14 @@ #include "gpuFishbone.h" #include "CAConstants.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + + + using namespace gpuPixelDoublets; -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; +using HitsOnGPU = CAHitQuadrupletGeneratorKernels::HitsOnGPU; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using Quality = pixelTuplesHeterogeneousProduct::Quality; @@ -25,13 +30,10 @@ using Quality = pixelTuplesHeterogeneousProduct::Quality; __global__ void kernel_checkOverflows(TuplesOnGPU::Container * foundNtuplets, AtomicPairCounter * apc, GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ 
nCells, + CellNeighborsVector const * cellNeighbors, CellTracksVector const * cellTracks, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, uint32_t nHits, CAHitQuadrupletGeneratorKernels::Counters * counters) { - __shared__ uint32_t killedCell; - killedCell=0; - __syncthreads(); - auto idx = threadIdx.x + blockIdx.x * blockDim.x; auto & c = *counters; @@ -64,20 +66,19 @@ void kernel_checkOverflows(TuplesOnGPU::Container * foundNtuplets, AtomicPairCou if (idx < (*nCells) ) { auto &thisCell = cells[idx]; - if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theTracks.full()) //++tooManyTracks[thisCell.theLayerPairId]; + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId<0) atomicAdd(&killedCell,1); + if (thisCell.theDoubletId<0) atomicAdd(&c.nKilledCells,1); + if (thisCell.outerNeighbors().empty()) atomicAdd(&c.nEmptyCells,1); + if (thisCell.tracks().empty()) atomicAdd(&c.nZeroTrackCells,1); } if (idx < nHits) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); } - __syncthreads(); - if (threadIdx.x==0) atomicAdd(&c.nKilledCells,killedCell); - // if (threadIdx.x==0) printf("number of killed cells %d\n",killedCell); } @@ -95,7 +96,7 @@ kernel_fishboneCleaner(GPUCACell const * cells, uint32_t const * __restrict__ nC auto const & thisCell = cells[cellIndex]; if (thisCell.theDoubletId>=0) return; - for (auto it : thisCell.theTracks) quality[it] = bad; + for (auto it : thisCell.tracks()) quality[it] = bad; } @@ -125,13 +126,13 @@ kernel_fastDuplicateRemover(GPUCACell const * cells, uint32_t const * __restrict }; // find maxNh - for (auto it : thisCell.theTracks) { + for (auto 
it : thisCell.tracks()) { if (quality[it] == bad) continue; auto nh = foundNtuplets->size(it); maxNh = std::max(nh,maxNh); } // find min chi2 - for (auto it : thisCell.theTracks) { + for (auto it : thisCell.tracks()) { auto nh = foundNtuplets->size(it); if (nh!=maxNh) continue; if (quality[it]!= bad && @@ -141,7 +142,7 @@ kernel_fastDuplicateRemover(GPUCACell const * cells, uint32_t const * __restrict } } // mark duplicates - for (auto it : thisCell.theTracks) { + for (auto it : thisCell.tracks()) { if (quality[it]!= bad && it!=im) quality[it] = dup; //no race: simple assignment of the same constant } } @@ -151,6 +152,7 @@ void kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to zero them, GPUCACell::Hits const * __restrict__ hhp, GPUCACell * cells, uint32_t const * __restrict__ nCells, + CellNeighborsVector * cellNeighbors, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell) { auto const & hh = *hhp; @@ -179,15 +181,16 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z if (thisCell.check_alignment(hh, cells[otherCell], ptmin, hardCurvCut) ) { - cells[otherCell].theOuterNeighbors.push_back(cellIndex); + cells[otherCell].addOuterNeighbor(cellIndex, *cellNeighbors); } } } __global__ void kernel_find_ntuplets( - GPUCACell::Hits const * __restrict__ hhp, + GPUCACell::Hits const * __restrict__ hhp, GPUCACell * __restrict__ cells, uint32_t const * nCells, + CellTracksVector * cellTracks, TuplesOnGPU::Container * foundNtuplets, AtomicPairCounter * apc, GPUCACell::TupleMultiplicity * tupleMultiplicity, unsigned int minHitsPerNtuplet) @@ -209,7 +212,7 @@ void kernel_find_ntuplets( if (thisCell.theLayerPairId==0 || thisCell.theLayerPairId==3 || thisCell.theLayerPairId==8) { // inner layer is 0 FIXME GPUCACell::TmpTuple stack; stack.reset(); - thisCell.find_ntuplets(hh, cells, *foundNtuplets, *apc, + thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, #ifdef CA_USE_LOCAL_COUNTERS local, 
#else @@ -341,7 +344,7 @@ void kernel_doStatsForHitInTracks(CAHitQuadrupletGeneratorKernels::HitToTuple co } __global__ -void kernel_tripletCleaner(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, +void kernel_tripletCleaner(TrackingRecHit2DSOAView const * __restrict__ hhp, TuplesOnGPU::Container const * __restrict__ ptuples, Rfit::helix_fit const * __restrict__ hfit, Quality * __restrict__ quality, @@ -422,10 +425,10 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... auto maxNumberOfDoublets_ = CAConstants::maxNumberOfDoublets(); - auto nhits = hh.nHits; - assert(nhits <= PixelGPUConstants::maxNumberOfHits); + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); - if (earlyFishbone_) { + if (nhits>1 && earlyFishbone_) { auto nthTot = 64; auto stride = 4; auto blockSize = nthTot/stride; @@ -433,9 +436,9 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... dim3 blks(1,numberOfBlocks,1); dim3 thrs(stride,blockSize,1); fishbone<<>>( - hh.gpu_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_, + hh.view(), + device_theCells_.get(), device_nCells_, + device_isOuterHitOfCell_.get(), nhits, false ); cudaCheck(cudaGetLastError()); @@ -455,15 +458,17 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... 
kernel_connect<<>>( gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.gpu_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_ + hh.view(), + device_theCells_.get(), device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get() ); cudaCheck(cudaGetLastError()); kernel_find_ntuplets<<>>( - hh.gpu_d, - device_theCells_, device_nCells_, + hh.view(), + device_theCells_.get(), device_nCells_, + device_theCellTracks_, gpu_.tuples_d, gpu_.apc_d, device_tupleMultiplicity_, @@ -482,7 +487,7 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... kernel_fillMultiplicity<<>>(gpu_.tuples_d,device_tupleMultiplicity_); cudaCheck(cudaGetLastError()); - if (lateFishbone_) { + if (nhits>1 && lateFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot/stride; @@ -490,9 +495,9 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... dim3 blks(1,numberOfBlocks,1); dim3 thrs(stride,blockSize,1); fishbone<<>>( - hh.gpu_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_, + hh.view(), + device_theCells_.get(), device_nCells_, + device_isOuterHitOfCell_.get(), nhits, true ); cudaCheck(cudaGetLastError()); @@ -502,11 +507,15 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... numberOfBlocks = (std::max(nhits, maxNumberOfDoublets_) + blockSize - 1)/blockSize; kernel_checkOverflows<<>>( gpu_.tuples_d, gpu_.apc_d, - device_theCells_, device_nCells_, - device_isOuterHitOfCell_, nhits, + device_theCells_.get(), device_nCells_, + device_theCellNeighbors_,device_theCellTracks_, + device_isOuterHitOfCell_.get(), nhits, counters_ ); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); +#endif } @@ -514,16 +523,39 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... 
} -void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { - auto nhits = hh.nHits; +void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream) { + auto nhits = hh.nHits(); + +#ifdef GPU_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + edm::Service cs; + device_isOuterHitOfCell_ = cs->make_device_unique(nhits, stream); + { + int threadsPerBlock = 128; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(),nhits, + device_theCellNeighbors_, device_theCellNeighborsContainer_.get(), + device_theCellTracks_, device_theCellTracksContainer_.get() + ); + cudaCheck(cudaGetLastError()); + } + + device_theCells_ = cs->make_device_unique(CAConstants::maxNumberOfDoublets(), stream); + + if (0==nhits) return; // protect against empty events int stride=1; int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride; int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1,blocks,1); dim3 thrs(stride,threadsPerBlock,1); - gpuPixelDoublets::getDoubletsFromHisto<<>>( - device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_, idealConditions_); + gpuPixelDoublets::getDoubletsFromHisto<<>>( + device_theCells_.get(), device_nCells_, + device_theCellNeighbors_, device_theCellTracks_, + hh.view(), device_isOuterHitOfCell_.get(), idealConditions_); cudaCheck(cudaGetLastError()); } @@ -533,31 +565,40 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple // classify tracks based on kinematics auto numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1)/blockSize; kernel_classifyTracks<<>>(tuples.tuples_d, tuples.helix_fit_results_d, tuples.quality_d); + cudaCheck(cudaGetLastError()); // apply fishbone cleaning 
to good tracks numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1)/blockSize; - kernel_fishboneCleaner<<>>(device_theCells_, device_nCells_,tuples.quality_d); + kernel_fishboneCleaner<<>>(device_theCells_.get(), device_nCells_,tuples.quality_d); + cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1)/blockSize; - kernel_fastDuplicateRemover<<>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d); + kernel_fastDuplicateRemover<<>>(device_theCells_.get(), device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d); + cudaCheck(cudaGetLastError()); // fill hit->track "map" numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1)/blockSize; kernel_countHitInTracks<<>>(tuples.tuples_d,tuples.quality_d,device_hitToTuple_); + cudaCheck(cudaGetLastError()); cudautils::launchFinalize(device_hitToTuple_,device_tmws_,cudaStream); + cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples.tuples_d,tuples.quality_d,device_hitToTuple_); + cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1)/blockSize; - kernel_tripletCleaner<<>>(hh.gpu_d,tuples.tuples_d,tuples.helix_fit_results_d,tuples.quality_d,device_hitToTuple_); + kernel_tripletCleaner<<>>(hh.view(),tuples.tuples_d,tuples.helix_fit_results_d,tuples.quality_d,device_hitToTuple_); + cudaCheck(cudaGetLastError()); if (doStats_) { // counters (add flag???) 
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1)/blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_, counters_); + cudaCheck(cudaGetLastError()); numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1)/blockSize; kernel_doStatsForTracks<<>>(tuples.tuples_d,tuples.quality_d,counters_); + cudaCheck(cudaGetLastError()); } } @@ -566,10 +607,16 @@ __global__ void kernel_printCounters(CAHitQuadrupletGeneratorKernels::Counters const * counters) { auto const & c = *counters; - printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld\n",c.nEvents,c.nHits,c.nCells,c.nTuples,c.nGoodTracks,c.nUsedHits, c.nDupHits, c.nKilledCells); - printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f||\n",c.nEvents,c.nHits/double(c.nEvents),c.nCells/double(c.nEvents), + printf("||Counters | nEvents | nHits | nCells | nTuples | nGoodTracks | nUsedHits | nDupHits | nKilledCells | nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",c.nEvents,c.nHits,c.nCells, + c.nTuples,c.nGoodTracks,c.nUsedHits, c.nDupHits, c.nKilledCells, c.nEmptyCells,c.nZeroTrackCells + ); + printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", + c.nEvents,c.nHits/double(c.nEvents),c.nCells/double(c.nEvents), c.nTuples/double(c.nEvents),c.nGoodTracks/double(c.nEvents), - c.nUsedHits/double(c.nEvents),c.nDupHits/double(c.nEvents),c.nKilledCells/double(c.nEvents)); + c.nUsedHits/double(c.nEvents),c.nDupHits/double(c.nEvents),c.nKilledCells/double(c.nEvents), + c.nEmptyCells/double(c.nCells),c.nZeroTrackCells/double(c.nCells) + ); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h index 357fdcf1ad3f7..1917d13f2934f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.h @@ -2,12 +2,12 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitQuadrupletGeneratorKernels_h -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" #include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" @@ -24,10 +24,12 @@ class CAHitQuadrupletGeneratorKernels { unsigned long long nUsedHits; unsigned long long nDupHits; unsigned long long nKilledCells; + unsigned long long nEmptyCells; + unsigned long long nZeroTrackCells; }; - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; @@ -51,7 +53,7 @@ class CAHitQuadrupletGeneratorKernels { void classifyTuples(HitsOnCPU const & hh, TuplesOnGPU & tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const & hh, cudaStream_t stream); + void buildDoublets(HitsOnCPU const & hh, cuda::stream_t<>& stream); void allocateOnGPU(); void deallocateOnGPU(); void cleanup(cudaStream_t cudaStream); @@ -59,24 +61,31 @@ class CAHitQuadrupletGeneratorKernels { private: - Counters * counters_ = nullptr; + Counters * counters_ = nullptr; + + // workspace + CAConstants::CellNeighborsVector * device_theCellNeighbors_ = nullptr; + cudautils::device::unique_ptr device_theCellNeighborsContainer_; + CAConstants::CellTracksVector * device_theCellTracks_ = nullptr; + cudautils::device::unique_ptr device_theCellTracksContainer_; + - // workspace - GPUCACell* device_theCells_ = nullptr; - GPUCACell::OuterHitOfCell* device_isOuterHitOfCell_ = nullptr; - uint32_t* device_nCells_ = nullptr; + 
cudautils::device::unique_ptr device_theCells_; + cudautils::device::unique_ptr device_isOuterHitOfCell_; + uint32_t* device_nCells_ = nullptr; - HitToTuple * device_hitToTuple_ = nullptr; - AtomicPairCounter * device_hitToTuple_apc_ = nullptr; + HitToTuple * device_hitToTuple_ = nullptr; + AtomicPairCounter * device_hitToTuple_apc_ = nullptr; - TupleMultiplicity * device_tupleMultiplicity_ = nullptr; - uint8_t * device_tmws_ = nullptr; + TupleMultiplicity * device_tupleMultiplicity_ = nullptr; + uint8_t * device_tmws_ = nullptr; - const uint32_t minHitsPerNtuplet_; - const bool earlyFishbone_; - const bool lateFishbone_; - const bool idealConditions_; - const bool doStats_; + // params + const uint32_t minHitsPerNtuplet_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitQuadrupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu index 2c65058a1651a..8c5e646bb8325 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernelsAlloc.cu @@ -11,9 +11,10 @@ CAHitQuadrupletGeneratorKernels::deallocateOnGPU() } cudaFree(counters_); - cudaFree(device_theCells_); - cudaFree(device_isOuterHitOfCell_); cudaFree(device_nCells_); + cudaFree(device_theCellNeighbors_); + cudaFree(device_theCellTracks_); + cudaFree(device_hitToTuple_); cudaFree(device_hitToTuple_apc_); cudaFree(device_tupleMultiplicity_); @@ -29,31 +30,37 @@ void CAHitQuadrupletGeneratorKernels::allocateOnGPU() cudaCheck(cudaMalloc(&counters_, sizeof(Counters))); cudaCheck(cudaMemset(counters_,0,sizeof(Counters))); - cudaCheck(cudaMalloc(&device_theCells_, - CAConstants::maxNumberOfLayerPairs() * CAConstants::maxNumberOfDoublets() * 
sizeof(GPUCACell))); cudaCheck(cudaMalloc(&device_nCells_, sizeof(uint32_t))); cudaCheck(cudaMemset(device_nCells_, 0, sizeof(uint32_t))); - cudaCheck(cudaMalloc(&device_isOuterHitOfCell_, - PixelGPUConstants::maxNumberOfHits * sizeof(CAConstants::OuterHitOfCell))); - cudaCheck(cudaMemset(device_isOuterHitOfCell_, 0, - PixelGPUConstants::maxNumberOfHits * sizeof(CAConstants::OuterHitOfCell))); + cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); + cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); + - cudaCheck(cudaMalloc(&device_hitToTuple_, sizeof(HitToTuple))); - cudaCheck(cudaMemset(device_hitToTuple_,0,sizeof(HitToTuple))); // overkill - cudaCheck(cudaMalloc(&device_hitToTuple_apc_, sizeof(AtomicPairCounter))); + cudaCheck(cudaMalloc(&device_hitToTuple_, sizeof(HitToTuple))); + cudaCheck(cudaMemset(device_hitToTuple_,0,sizeof(HitToTuple))); // overkill + cudaCheck(cudaMalloc(&device_hitToTuple_apc_, sizeof(AtomicPairCounter))); - cudaCheck(cudaMalloc(&device_tupleMultiplicity_,sizeof(TupleMultiplicity))); - cudaCheck(cudaMemset(device_tupleMultiplicity_,0,sizeof(TupleMultiplicity))); // overkill + cudaCheck(cudaMalloc(&device_tupleMultiplicity_,sizeof(TupleMultiplicity))); + cudaCheck(cudaMemset(device_tupleMultiplicity_,0,sizeof(TupleMultiplicity))); // overkill - cudaCheck(cudaMalloc(&device_tmws_, std::max(TupleMultiplicity::wsSize(),HitToTuple::wsSize()))); + cudaCheck(cudaMalloc(&device_tmws_, std::max(TupleMultiplicity::wsSize(),HitToTuple::wsSize()))); } void CAHitQuadrupletGeneratorKernels::cleanup(cudaStream_t cudaStream) { + +#ifdef GPU_DEBUG + std::cout << "CAHitQuadrupletGeneratorKernels::cleanup" << std::endl; +#endif + // this lazily resets temporary memory for the next event, and is not 
needed for reading the output - cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0, - PixelGPUConstants::maxNumberOfHits * sizeof(CAConstants::OuterHitOfCell), - cudaStream)); + device_theCells_ = nullptr; + device_isOuterHitOfCell_ = nullptr; + device_theCellNeighborsContainer_ = nullptr; + device_theCellTracksContainer_ = nullptr; + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream)); cudautils::launchZero(device_tupleMultiplicity_,cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index b3e23792a4083..f20a96dc79b73 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,24 +6,28 @@ #include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" - #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" class GPUCACell { public: + using ptrAsInt = unsigned long long; + static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); using OuterHitOfCell = CAConstants::OuterHitOfCell; + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; - - using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + using Hits = TrackingRecHit2DSOAView; + using hindex_type = Hits::hindex_type; using TmpTuple = GPU::VecArray; @@ -35,7 +39,8 @@ class GPUCACell 
{ #ifdef __CUDACC__ __device__ __forceinline__ - void init(Hits const & hh, + void init(CellNeighborsVector & cellNeighbors, CellTracksVector & cellTracks, + Hits const & hh, int layerPairId, int doubletId, hindex_type innerHitId, hindex_type outerHitId) { @@ -44,23 +49,45 @@ class GPUCACell { theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerZ = __ldg(hh.zg_d+innerHitId); - theInnerR = __ldg(hh.rg_d+innerHitId); - theOuterNeighbors.reset(); - theTracks.reset(); + theInnerZ = hh.zGlobal(innerHitId); + theInnerR = hh.rGlobal(innerHitId); + + outerNeighbors().reset(); + tracks().reset(); + assert(outerNeighbors().empty()); + assert(tracks().empty()); + } - __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theOuterHitId); } + __device__ __forceinline__ + int addOuterNeighbor(CellNeighbors::value_t t, 
CellNeighborsVector & cellNeighbors) { + return outerNeighbors().push_back(t); + } + + __device__ __forceinline__ + int addTrack(CellTracks::value_t t, CellTracksVector & cellTracks) { + return tracks().push_back(t); + } + + __device__ __forceinline__ CellTracks & tracks() { return theTracks;} + __device__ __forceinline__ CellTracks const & tracks() const { return theTracks;} + __device__ __forceinline__ CellNeighbors & outerNeighbors() { return theOuterNeighbors;} + __device__ __forceinline__ CellNeighbors const & outerNeighbors() const { return theOuterNeighbors;} + __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return hh.rGlobal(theOuterHitId); } + + __device__ __forceinline__ auto get_inner_iphi(Hits const & hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto get_outer_iphi(Hits const & hh) const { return hh.iphi(theOuterHitId); } + + __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return hh.detectorIndex(theOuterHitId); } 
constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; @@ -143,13 +170,19 @@ class GPUCACell { __device__ inline bool hole(Hits const & hh, GPUCACell const & innerCell) const { - constexpr float r4 = 16.f; + int p = get_outer_iphi(hh); + if (p<0) p+=std::numeric_limits::max(); + p = (64*p)/std::numeric_limits::max(); + p %=2; + float r4 = p==0 ? 15.815 : 16.146; // later on from geom auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); auto z4 = std::abs(zi + (r4-ri)*(zo-zi)/(ro-ri)); - return z4>25.f && z4<33.f; + auto zm = z4-6.7*int(z4/6.7); + auto h = zm<0.2 || zm>6.5; + return h || ( z4>26 && z4<32.f); } @@ -161,6 +194,7 @@ class GPUCACell { inline void find_ntuplets( Hits const & hh, GPUCACell * __restrict__ cells, + CellTracksVector & cellTracks, TuplesOnGPU::Container & foundNtuplets, AtomicPairCounter & apc, CM & tupleMultiplicity, @@ -176,10 +210,10 @@ class GPUCACell { tmpNtuplet.push_back_unsafe(theDoubletId); assert(tmpNtuplet.size()<=4); - if(theOuterNeighbors.size()>0) { - for (int j = 0; j < theOuterNeighbors.size(); ++j) { - auto otherCell = theOuterNeighbors[j]; - cells[otherCell].find_ntuplets(hh, cells, foundNtuplets, apc, tupleMultiplicity, + if(outerNeighbors().size()>0) { + for (int j = 0; j < outerNeighbors().size(); ++j) { + auto otherCell = outerNeighbors()[j]; + cells[otherCell].find_ntuplets(hh, cells, cellTracks, foundNtuplets, apc, tupleMultiplicity, tmpNtuplet, minHitsPerNtuplet); } } else { // if long enough save... @@ -194,7 +228,7 @@ class GPUCACell { hits[nh] = theOuterHitId; auto it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); if (it>=0) { // if negative is overflow.... 
- for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + for (auto c : tmpNtuplet) cells[c].addTrack(it,cellTracks); tupleMultiplicity.countDirect(tmpNtuplet.size()+1); } } @@ -206,9 +240,11 @@ class GPUCACell { #endif // __CUDACC__ - GPU::VecArray< uint32_t, 36> theOuterNeighbors; - GPU::VecArray< uint16_t, 42> theTracks; +private: + CellNeighbors theOuterNeighbors; + CellTracks theTracks; +public: int32_t theDoubletId; int32_t theLayerPairId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 393eb9b020c3d..b2eab7626279e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -9,27 +9,10 @@ void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMu assert(tuples_d); assert(tupleMultiplicity_d); assert(helix_fit_results_d); - cudaCheck(cudaMalloc(&hitsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); - cudaCheck(cudaMemset(hitsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); - - cudaCheck(cudaMalloc(&hits_geGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); - cudaCheck(cudaMemset(hits_geGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); - - cudaCheck(cudaMalloc(&fast_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); - cudaCheck(cudaMemset(fast_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); - - cudaCheck(cudaMalloc(&circle_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); - cudaCheck(cudaMemset(circle_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); - } void HelixFitOnGPU::deallocateOnGPU() { - cudaFree(hitsGPU_); - cudaFree(hits_geGPU_); - cudaFree(fast_fit_resultsGPU_); - cudaFree(circle_fit_resultsGPU_); - } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h 
b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 40a62c4c1c723..a50116c87321d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,9 +4,11 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" -namespace siPixelRecHitsHeterogeneousProduct { - struct HitsOnCPU; -} +#include + +class TrackingRecHit2DSOAView; +class TrackingRecHit2DCUDA; + namespace Rfit { constexpr uint32_t maxNumberOfConcurrentFits() { return 6*1024;} @@ -35,8 +37,8 @@ namespace Rfit { class HelixFitOnGPU { public: - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using TupleMultiplicity = CAConstants::TupleMultiplicity; @@ -45,8 +47,8 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU();} void setBField(double bField) { bField_ = bField;} - void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); + void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); void allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results); void deallocateOnGPU(); @@ -62,12 +64,6 @@ class HelixFitOnGPU { double bField_; Rfit::helix_fit * helix_fit_results_d = nullptr; - // Riemann Fit internals - double *hitsGPU_ = nullptr; - float 
*hits_geGPU_ = nullptr; - double *fast_fit_resultsGPU_ = nullptr; - Rfit::circle_fit *circle_fit_resultsGPU_ = nullptr; - const bool fit5as4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h index d27a639a5a9bf..f7538fc822011 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -8,6 +8,9 @@ #include #include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" +#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiStripCluster/interface/SiStripCluster.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" // store T for each cluster diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index e9111cb0b5db1..ea801b1b46389 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -11,12 +11,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; -using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + +using HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; @@ -66,10 +68,10 @@ void kernelFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, auto hit 
= hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; - hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; } Rfit::Fast_fit(hits,fast_fit); @@ -190,85 +192,93 @@ void kernelLineFit( } -void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) { - assert(tuples_d); assert(fast_fit_resultsGPU_); + assert(tuples_d); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + // Fit internals + edm::Service cs; + auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); + auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); + auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); + auto circle_fit_resultsGPU_holder = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit),stream); + Rfit::circle_fit * circle_fit_resultsGPU_ = (Rfit::circle_fit*)(circle_fit_resultsGPU_holder.get()); + for (uint32_t offset=0; offset<<>>( + kernelFastFit<3><<>>( tuples_d, tupleMultiplicity_d, 3, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), 
fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>( + kernelCircleFit<3><<>>( tupleMultiplicity_d, 3, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<3><<>>( + kernelLineFit<3><<>>( tupleMultiplicity_d, 3, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 4, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( + kernelCircleFit<4><<>>( tupleMultiplicity_d, 4, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( + kernelLineFit<4><<>>( tupleMultiplicity_d, 4, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 5, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( + kernelCircleFit<4><<>>( tupleMultiplicity_d, 5, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + 
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( + kernelLineFit<4><<>>( tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>( + kernelFastFit<5><<>>( tuples_d, tupleMultiplicity_d, 5, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>( + kernelCircleFit<5><<>>( tupleMultiplicity_d, 5, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>( + kernelLineFit<5><<>>( tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 4368bc12ab5f8..e5f8406bd31b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -10,7 +10,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "DataFormats/Math/interface/approx_atan2.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "GPUCACell.h" @@ -32,8 +31,7 @@ namespace gpuPixelDoublets { auto const & hh = *hhp; 
- uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; - auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; + auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id);}; // x run faster... auto idy = threadIdx.y + blockIdx.y * blockDim.y; @@ -54,7 +52,7 @@ namespace gpuPixelDoublets { auto sg=0; for (uint32_t ic=0; ic -#include -#include -#include -#include +#include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h" -#include "DataFormats/Math/interface/approx_atan2.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - -#include "GPUCACell.h" -#include "CAConstants.h" - - -// useful for benchmark -// #define ONLY_PHICUT -// #define USE_ZCUT -// #define NO_CLSCUT +#define CONSTANT_VAR __constant__ namespace gpuPixelDoublets { - constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant - - template - __device__ - __forceinline__ - void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, - uint32_t nPairs, - GPUCACell * cells, - uint32_t * nCells, - int16_t const * __restrict__ iphi, - Hist const & __restrict__ hist, - uint32_t const * __restrict__ offsets, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - int16_t const * __restrict__ phicuts, -#ifdef USE_ZCUT - float const * __restrict__ minz, - float const * __restrict__ maxz, -#endif - float const * __restrict__ maxr, bool ideal_cond) - { - -#ifndef NO_CLSCUT - // ysize cuts (z in the barrel) times 8 - constexpr int minYsizeB1=36; - constexpr int minYsizeB2=28; - constexpr int maxDYsize12=28; - constexpr int maxDYsize=20; -#endif - - auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; - - // nPairsMax to be optimized later 
(originally was 64). - // If it should be much bigger, consider using a block-wide parallel prefix scan, - // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = 16; - assert(nPairs <= nPairsMax); - uint32_t innerLayerCumulativeSize[nPairsMax]; - innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); - for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); - } - auto ntot = innerLayerCumulativeSize[nPairs-1]; - - // x runs faster - auto idy = blockIdx.y * blockDim.y + threadIdx.y; - auto first = threadIdx.x; - auto stride = blockDim.x; - for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { - - uint32_t pairLayerId=0; - while (j >= innerLayerCumulativeSize[pairLayerId++]); - --pairLayerId; // move to lower_bound ?? - - assert(pairLayerId < nPairs); - assert(j < innerLayerCumulativeSize[pairLayerId]); - assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); - - uint8_t inner = layerPairs[2*pairLayerId]; - uint8_t outer = layerPairs[2*pairLayerId+1]; - assert(outer > inner); - - auto hoff = Hist::histOff(outer); - - auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; - i += offsets[inner]; - - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - - assert(i >= offsets[inner]); - assert(i < offsets[inner+1]); - - // found hit corresponding to our cuda thread, now do the job - auto mez = __ldg(hh.zg_d+i); - -#ifdef USE_ZCUT - // this statement is responsible for a 10% slow down of the kernel once all following cuts are optimized... - if (mezmaxz[pairLayerId]) continue; -#endif - -#ifndef NO_CLSCUT - auto mes = __ldg(hh.ysize_d+i); - - // if ideal treat inner ladder as outer - auto mi = __ldg(hh.detInd_d+i); - if (inner==0) assert(mi<96); - const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... 
- - // auto mesx = __ldg(hh.xsize_d+i); - // if (mesx<0) continue; // remove edges in x as overlap will take care - - if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 - if (mes>0 && mes3) // B2 and F1 - if (mes>0 && mes (ro-ri)*(ro-ri); - }; - auto z0cutoff = [&](int j) { - auto zo = __ldg(hh.zg_d+j); - auto ro = __ldg(hh.rg_d+j); - auto dr = ro-mer; - return dr > maxr[pairLayerId] || - dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; - }; - -#ifndef NO_CLSCUT - auto zsizeCut = [&](int j) { - auto onlyBarrel = outer<4; - auto so = __ldg(hh.ysize_d+j); - //auto sox = __ldg(hh.xsize_d+j); - auto dy = inner==0 ? ( isOuterLadder ? maxDYsize12: 100 ) : maxDYsize; - return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; - }; -#endif - - auto iphicut = phicuts[pairLayerId]; - - auto kl = Hist::bin(int16_t(mep-iphicut)); - auto kh = Hist::bin(int16_t(mep+iphicut)); - auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; - -#ifdef GPU_DEBUG - int tot = 0; - int nmin = 0; - int tooMany=0; -#endif - - auto khh = kh; - incr(khh); - for (auto kk = kl; kk != khh; incr(kk)) { -#ifdef GPU_DEBUG - if (kk != kl && kk != kh) - nmin += hist.size(kk+hoff); -#endif - auto const * __restrict__ p = hist.begin(kk+hoff); - auto const * __restrict__ e = hist.end(kk+hoff); - p+=first; - for (;p < e; p+=stride) { - auto oi=__ldg(p); - assert(oi>=offsets[outer]); - assert(oi iphicut) - continue; -#ifndef ONLY_PHICUT -#ifndef NO_CLSCUT - if (zsizeCut(oi)) continue; -#endif - if (z0cutoff(oi) || ptcut(oi)) continue; -#endif - auto ind = atomicAdd(nCells, 1); - if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? 
- // int layerPairId, int doubletId, int innerHitId, int outerHitId) - cells[ind].init(hh, pairLayerId, ind, i, oi); - isOuterHitOfCell[oi].push_back(ind); -#ifdef GPU_DEBUG - if (isOuterHitOfCell[oi].full()) ++tooMany; - ++tot; -#endif - } - } -#ifdef GPU_DEBUG - if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); -#endif - } // loop in block... - } + using namespace gpuPixelDoubletsAlgos; - constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y - constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; - - __global__ - __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP) - void getDoubletsFromHisto(GPUCACell * cells, - uint32_t * nCells, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - bool ideal_cond) - { constexpr int nPairs = 13; - constexpr const uint8_t layerPairs[2*nPairs] = { + CONSTANT_VAR const uint8_t layerPairs[2*nPairs] = { 0, 1, 1, 2, 2, 3, // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, // neg @@ -228,41 +21,71 @@ namespace gpuPixelDoublets { constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - constexpr const int16_t phicuts[nPairs] { + CONSTANT_VAR const int16_t phicuts[nPairs] { phi0p05, phi0p05, phi0p06, phi0p07, phi0p06, phi0p06, phi0p05, phi0p05, phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 }; -#ifdef USE_ZCUT - float const minz[nPairs] = { + CONSTANT_VAR float const minz[nPairs] = { -20., -22., -22., -30., -30.,-30., -70., -70., 0., 10., 15., -70., -70. }; - float const maxz[nPairs] = { + CONSTANT_VAR float const maxz[nPairs] = { 20., 22., 22., 0., -10., -15., 70., 70., 30., 30., 30., 70., 70. }; -#endif - float const maxr[nPairs] = { + CONSTANT_VAR float const maxr[nPairs] = { 20., 20., 20., 9., 7., 6., 5., 5., 9., 7., 6., 5., 5. 
}; + + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant + + constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); + + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __global__ + void initDoublets(GPUCACell::OuterHitOfCell * isOuterHitOfCell, int nHits, + CellNeighborsVector * cellNeighbors, CellNeighbors * cellNeighborsContainer, + CellTracksVector * cellTracks, CellTracks * cellTracksContainer + ) + { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i=first; i +#include +#include +#include +#include + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + +#include "GPUCACell.h" +#include "CAConstants.h" + + +// useful for benchmark +// #define ONLY_PHICUT +// #define NO_ZCUT +// #define NO_CLSCUT + +namespace gpuPixelDoubletsAlgos { + + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant + + constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); + + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __device__ + __forceinline__ + void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell * cells, + uint32_t * nCells, + CellNeighborsVector * cellNeighbors, CellTracksVector * cellTracks, + TrackingRecHit2DSOAView const & __restrict__ hh, + GPUCACell::OuterHitOfCell * isOuterHitOfCell, + int16_t const * __restrict__ 
phicuts, + float const * __restrict__ minz, + float const * __restrict__ maxz, + float const * __restrict__ maxr, + bool ideal_cond) + { + +#ifndef NO_CLSCUT + // ysize cuts (z in the barrel) times 8 + constexpr int minYsizeB1=36; + constexpr int minYsizeB2=28; + constexpr int maxDYsize12=28; + constexpr int maxDYsize=20; +#endif + + using Hist = TrackingRecHit2DSOAView::Hist; + + auto const & __restrict__ hist = hh.phiBinner(); + uint32_t const * __restrict__ offsets = hh.hitsLayerStart(); + assert(offsets); + + auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; + + // nPairsMax to be optimized later (originally was 64). + // If it should be much bigger, consider using a block-wide parallel prefix scan, + // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + const int nPairsMax = 16; + assert(nPairs <= nPairsMax); + __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; + __shared__ uint32_t ntot; + if (threadIdx.y==0 && threadIdx.x==0) { + innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + for (uint32_t i = 1; i < nPairs; ++i) { + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); + } + ntot = innerLayerCumulativeSize[nPairs-1]; + } + __syncthreads(); + + // x runs faster + auto idy = blockIdx.y * blockDim.y + threadIdx.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { + + uint32_t pairLayerId=0; + while (j >= innerLayerCumulativeSize[pairLayerId++]); + --pairLayerId; // move to lower_bound ?? + + assert(pairLayerId < nPairs); + assert(j < innerLayerCumulativeSize[pairLayerId]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); + + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer > inner); + + auto hoff = Hist::histOff(outer); + + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; + + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + + assert(i >= offsets[inner]); + assert(i < offsets[inner+1]); + + // found hit corresponding to our cuda thread, now do the job + auto mez = hh.zGlobal(i); + +#ifndef NO_ZCUT + if (mezmaxz[pairLayerId]) continue; +#endif + +#ifndef NO_CLSCUT + auto mes = hh.clusterSizeY(i); + + // if ideal treat inner ladder as outer + auto mi = hh.detectorIndex(i); + if (inner==0) assert(mi<96); + const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... + + + if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 + if (mes>0 && mes3) // B2 and F1 + if (mes>0 && mes (ro-ri)*(ro-ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh.zGlobal(j);; + auto ro = hh.rGlobal(j); + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; + +#ifndef NO_CLSCUT + auto zsizeCut = [&](int j) { + auto onlyBarrel = outer<4; + auto so = hh.clusterSizeY(j); + auto dy = inner==0 ? ( isOuterLadder ? 
maxDYsize12: 100 ) : maxDYsize; + return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; + }; +#endif + + auto iphicut = phicuts[pairLayerId]; + + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); + auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; + +#ifdef GPU_DEBUG + int tot = 0; + int nmin = 0; + int tooMany=0; +#endif + + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { +#ifdef GPU_DEBUG + if (kk != kl && kk != kh) + nmin += hist.size(kk+hoff); +#endif + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + p+=first; + for (;p < e; p+=stride) { + auto oi=__ldg(p); + assert(oi>=offsets[outer]); + assert(oi iphicut) + continue; +#ifndef ONLY_PHICUT +#ifndef NO_CLSCUT + if (zsizeCut(oi)) continue; +#endif + if (z0cutoff(oi) || ptcut(oi)) continue; +#endif + auto ind = atomicAdd(nCells, 1); + if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? + // int layerPairId, int doubletId, int innerHitId, int outerHitId) + cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi); + isOuterHitOfCell[oi].push_back(ind); +#ifdef GPU_DEBUG + if (isOuterHitOfCell[oi].full()) ++tooMany; + ++tot; +#endif + } + } +#ifdef GPU_DEBUG + if (tooMany > 0) + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); +#endif + } // loop in block... 
+ } + +} // namespace end + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h b/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h index a1c4a26f2fff6..54851ecda1179 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h @@ -12,9 +12,7 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h" -namespace siPixelRecHitsHeterogeneousProduct { - struct HitsOnGPU; -} +class TrackingRecHit2DSOAView; namespace pixelTuplesHeterogeneousProduct { @@ -42,7 +40,7 @@ namespace pixelTuplesHeterogeneousProduct { using Container = TuplesOnGPU::Container; - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hitsOnGPU_d = nullptr; // forwarding + TrackingRecHit2DSOAView const * hitsOnGPU_d = nullptr; // forwarding Container const * tuples = nullptr;