diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index d462be2c5dd7b..e79a32c21daa0 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -16,7 +16,7 @@ class TrackSoAT {
   static constexpr int32_t stride() { return S; }
 
   using Quality = trackQuality::Quality;
-  using hindex_type = uint16_t;
+  using hindex_type = uint32_t;
   using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S, 5 * S>;
 
   // Always check quality is at least loose!
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
index 808feb2a4218f..6a83a66b60fbd 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
@@ -15,10 +15,12 @@ namespace pixelCPEforGPU {
 class TrackingRecHit2DSOAView {
 public:
   static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; }
-  using hindex_type = uint16_t;  // if above is <=2^16
+  using hindex_type = uint32_t;  // if above is <=2^32
 
-  using Hist =
-      cms::cuda::HistoContainer<int16_t, 128, gpuClustering::MaxNumClusters, 8 * sizeof(int16_t), uint16_t, 10>;
+  using PhiBinner =
+      cms::cuda::HistoContainer<int16_t, 128, gpuClustering::MaxNumClusters, 8 * sizeof(int16_t), hindex_type, 10>;
+
+  using Hist = PhiBinner;  // FIXME
 
   using AverageGeometry = phase1PixelTopology::AverageGeometry;
 
@@ -93,9 +95,9 @@ class TrackingRecHit2DSOAView {
 
   uint32_t* m_hitsLayerStart;
 
-  Hist* m_hist;
+  PhiBinner* m_hist;  // FIXME use a more descriptive name consistently
 
   uint32_t m_nHits;
 };
 
-#endif
+#endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
index 0ebbdf3ed3705..9eea4f528fcdb 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
@@ -44,7 +44,7 @@ namespace CAConstants {
   constexpr uint32_t maxTuples() { return maxNumberOfTuples(); }
 
   // types
-  using hindex_type = uint16_t;  // FIXME from siPixelRecHitsHeterogeneousProduct
+  using hindex_type = uint32_t;  // FIXME from siPixelRecHitsHeterogeneousProduct
   using tindex_type = uint16_t;  //  for tuples
 
 #ifndef ONLY_PHICUT
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index 4d4791b87ad3b..1646cb503ff81 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -134,6 +134,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   if (m_params.doStats_) {
     kernel_checkOverflows(tuples_d,
                           device_tupleMultiplicity_.get(),
+                          device_hitToTuple_.get(),
                           device_hitTuple_apc_,
                           device_theCells_.get(),
                           device_nCells_,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index 8a213eee2f579..a8dac7992f4fa 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -125,21 +125,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
     cudaCheck(cudaGetLastError());
   }
 
-  if (m_params.doStats_) {
-    numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize;
-    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d,
-                                                                        device_tupleMultiplicity_.get(),
-                                                                        device_hitTuple_apc_,
-                                                                        device_theCells_.get(),
-                                                                        device_nCells_,
-                                                                        device_theCellNeighbors_.get(),
-                                                                        device_theCellTracks_.get(),
-                                                                        device_isOuterHitOfCell_.get(),
-                                                                        nhits,
-                                                                        m_params.maxNumberOfDoublets_,
-                                                                        counters_);
-    cudaCheck(cudaGetLastError());
-  }
 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
   cudaCheck(cudaGetLastError());
@@ -278,6 +263,24 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
     cudaCheck(cudaGetLastError());
   }
 
+  if (m_params.doStats_) {
+    auto nhits = hh.nHits();
+    numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize;
+    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d,
+                                                                        device_tupleMultiplicity_.get(),
+                                                                        device_hitToTuple_.get(),
+                                                                        device_hitTuple_apc_,
+                                                                        device_theCells_.get(),
+                                                                        device_nCells_,
+                                                                        device_theCellNeighbors_.get(),
+                                                                        device_theCellTracks_.get(),
+                                                                        device_isOuterHitOfCell_.get(),
+                                                                        nhits,
+                                                                        m_params.maxNumberOfDoublets_,
+                                                                        counters_);
+    cudaCheck(cudaGetLastError());
+  }
+
   if (m_params.doStats_) {
     // counters (add flag???)
     numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 691395887dddb..3a935efbe2b4b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -30,7 +30,8 @@ using TkSoA = pixelTrack::TrackSoA;
 using HitContainer = pixelTrack::HitContainer;
 
 __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
-                                      CAConstants::TupleMultiplicity *tupleMultiplicity,
+                                      CAConstants::TupleMultiplicity const *tupleMultiplicity,
+                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                       cms::cuda::AtomicPairCounter *apc,
                                       GPUCACell const *__restrict__ cells,
                                       uint32_t const *__restrict__ nCells,
@@ -95,7 +96,7 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
       atomicAdd(&c.nKilledCells, 1);
     if (0 == thisCell.theUsed)
       atomicAdd(&c.nEmptyCells, 1);
-    if (thisCell.tracks().empty())
+    if (0 == hitToTuple->size(thisCell.get_inner_hit_id()) && 0 == hitToTuple->size(thisCell.get_outer_hit_id()))
       atomicAdd(&c.nZeroTrackCells, 1);
   }