Skip to content

Commit

Permalink
Migrate the pixel rechits producer and CA to the new heterogeneous framework (#338)
Browse files Browse the repository at this point in the history

Use cleaned hits.
Use pixel layer and ladders geometry, and use pixel triplets in the gaps.

Optimise GPU memory usage:
  - reduce the number of memory allocations
  - fix the size of the cub workspace
  - allocate memory per event via the caching allocator
  - use constant memory for geometry and parameters
  - use shared memory where the content is the same for every thread

Optimise kernel launches, and add a protection for empty events and overflows.
  • Loading branch information
VinInn authored and fwyzard committed May 14, 2019
1 parent 70254ee commit 41cd273
Show file tree
Hide file tree
Showing 40 changed files with 1,253 additions and 1,300 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,14 @@
from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import *
from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import *

pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting)
pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplittingTask)
striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits)
trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer)

pixeltrackerlocalreco = cms.Sequence(pixeltrackerlocalrecoTask)
striptrackerlocalreco = cms.Sequence(striptrackerlocalrecoTask)
trackerlocalreco = cms.Sequence(trackerlocalrecoTask)

from Configuration.ProcessModifiers.gpu_cff import gpu
from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import siPixelRecHitHeterogeneous as _siPixelRecHitHeterogeneous
gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneous)

from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con
}

// fill final clusters
fillClusters((*detDigis).detId());
if (detDigis) fillClusters((*detDigis).detId());
//std::cout << "filled " << totCluseFilled << " clusters" << std::endl;

iEvent.put(digiPutToken_, std::move(collection));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h"
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"

// local includes
#include "SiPixelRawToClusterGPUKernel.h"
Expand Down Expand Up @@ -456,6 +457,54 @@ namespace pixelgpudetails {

} // end of Raw to Digi kernel


__global__
void fillHitsModuleStart(uint32_t const * __restrict__ cluStart, uint32_t * __restrict__ moduleStart) {

  // Compute the per-module hit start offsets as an inclusive prefix sum of the
  // per-module cluster counts, clamping each module to maxHitsInModule() and
  // the grand total to MaxNumClusters.
  //
  // cluStart[i]       : number of clusters in module i (i in [0, MaxNumModules))
  // moduleStart[0]    : expected to be 0 on entry (zeroed by the caller)
  // moduleStart[i+1]  : on exit, cumulative number of hits up to and including module i;
  //                     moduleStart[MaxNumModules] holds the (clamped) total.
  //
  // MUST be launched as a single block (the block-wide scans below cover the
  // whole array); 2048 = 2 * 1024 is the current two-scan capacity.
  assert(gpuClustering::MaxNumModules < 2048);  // easy to extend at least till 32*1024
  assert(1 == gridDim.x);
  assert(0 == blockIdx.x);

  int first = threadIdx.x;

  // Clamp each module's cluster count to MaxHitsInModule.
  for (int i = first, iend = gpuClustering::MaxNumModules; i < iend; i += blockDim.x) {
    moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]);
  }

  // Block-wide prefix scan is limited to 1024 elements, so scan in two halves
  // and then propagate the first half's total into the second half.
  __shared__ uint32_t ws[32];
  blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws);
  blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws);

  for (int i = first + 1025, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) {
    moduleStart[i] += moduleStart[1024];
  }
  __syncthreads();

#ifdef GPU_DEBUG
  assert(0 == moduleStart[0]);
  auto c0 = std::min(gpuClustering::maxHitsInModule(), cluStart[0]);
  assert(c0 == moduleStart[1]);
  assert(moduleStart[1024] >= moduleStart[1023]);
  assert(moduleStart[1025] >= moduleStart[1024]);
  assert(moduleStart[gpuClustering::MaxNumModules] >= moduleStart[1025]);

  for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) {
    // fixed: compare against the previous element (was `moduleStart[i-i]`,
    // which always reads index 0 and never checks monotonicity)
    if (0 != i) assert(moduleStart[i] >= moduleStart[i - 1]);
    // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID]
    // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856]
    if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::MaxNumModules) printf("moduleStart %d %d\n", i, moduleStart[i]);
  }
#endif

  // Clamp the cumulative offsets so the total number of hits never exceeds
  // the capacity of the downstream hit buffers.
  constexpr auto MAX_HITS = gpuClustering::MaxNumClusters;
  for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) {
    if (moduleStart[i] > MAX_HITS) moduleStart[i] = MAX_HITS;
  }
}


// Interface to outside
void SiPixelRawToClusterGPUKernel::makeClustersAsync(
const SiPixelFedCablingMapGPU *cablingMap,
Expand All @@ -478,6 +527,7 @@ namespace pixelgpudetails {
edm::Service<CUDAService> cs;
nModules_Clusters_h = cs->make_host_unique<uint32_t[]>(2, stream);

if (wordCounter) // protect in case of empty event....
{
const int threadsPerBlock = 512;
const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
Expand Down Expand Up @@ -511,19 +561,24 @@ namespace pixelgpudetails {
digiErrors_d.copyErrorToHostAsync(stream);
}
}
// End of Raw2Digi and passing data for cluserisation
// End of Raw2Digi and passing data for clustering

{
// clusterizer ...
using namespace gpuClustering;
int threadsPerBlock = 256;
int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
int blocks = (std::max(int(wordCounter),int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock;


gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream.id()>>>(
digis_d.moduleInd(),
digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(),
gains,
wordCounter);
wordCounter,
clusters_d.moduleStart(),
clusters_d.clusInModule(),
clusters_d.clusModuleStart()
);
cudaCheck(cudaGetLastError());

#ifdef GPU_DEBUG
Expand All @@ -532,8 +587,6 @@ namespace pixelgpudetails {
<< " blocks of " << threadsPerBlock << " threads\n";
#endif

cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id()));

countModules<<<blocks, threadsPerBlock, 0, stream.id()>>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
cudaCheck(cudaGetLastError());

Expand All @@ -546,7 +599,6 @@ namespace pixelgpudetails {
std::cout << "CUDA findClus kernel launch with " << blocks
<< " blocks of " << threadsPerBlock << " threads\n";
#endif
cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id()));
findClus<<<blocks, threadsPerBlock, 0, stream.id()>>>(
digis_d.c_moduleInd(),
digis_d.c_xx(), digis_d.c_yy(),
Expand All @@ -567,26 +619,19 @@ namespace pixelgpudetails {
cudaCheck(cudaGetLastError());



// count the module start indices already here (instead of
// rechits) so that the number of clusters/hits can be made
// available in the rechit producer without additional points of
// synchronization/ExternalWork
//
// Temporary storage
size_t tempScanStorageSize = 0;
{
uint32_t *tmp = nullptr;
cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
}
auto tempScanStorage_d = cs->make_device_unique<uint32_t[]>(tempScanStorageSize, stream);
// Set first the first element to 0
cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize,
clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules,
stream.id()));

// MUST be ONE block
fillHitsModuleStart<<<1, 1024, 0, stream.id()>>>(clusters_d.c_clusInModule(),clusters_d.clusModuleStart());

// last element holds the number of all clusters
cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));


} // end clusterizer scope
}
}
123 changes: 34 additions & 89 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
#include <cstdio>

#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h"

#include "gpuClusteringConstants.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuCalibPixel {
Expand All @@ -22,104 +25,46 @@ namespace gpuCalibPixel {
uint16_t const * __restrict__ y,
uint16_t * adc,
SiPixelGainForHLTonGPU const * __restrict__ ped,
int numElements
int numElements,
uint32_t * __restrict__ moduleStart, // just to zero first
uint32_t * __restrict__ nClustersInModule, // just to zero them
uint32_t * __restrict__ clusModuleStart // just to zero first
)
{

int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= numElements) return;
if (InvId==id[i]) return;
int first = blockDim.x * blockIdx.x + threadIdx.x;

float conversionFactor = id[i]<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
float offset = id[i]<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;

bool isDeadColumn=false, isNoisyColumn=false;

int row = x[i];
int col = y[i];
auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn);
float pedestal = ret.first; float gain = ret.second;
// float pedestal = 0; float gain = 1.;
if ( isDeadColumn | isNoisyColumn )
{
id[i]=InvId; adc[i] =0;
printf("bad pixel at %d in %d\n",i,id[i]);
}
else {
float vcal = adc[i] * gain - pedestal*gain;
adc[i] = std::max(100, int( vcal * conversionFactor + offset));
// zero for next kernels...
if (0==first) clusModuleStart[0] = moduleStart[0]=0;
for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x*blockDim.x) {
nClustersInModule[i]=0;
}

// if (threadIdx.x==0)
// printf ("calibrated %d\n",id[i]);
}

__global__ void calibADCByModule(uint16_t * id,
uint16_t const * __restrict__ x,
uint16_t const * __restrict__ y,
uint16_t * adc,
uint32_t * moduleStart,
SiPixelGainForHLTonGPU const * __restrict__ ped,
int numElements
)
{


auto first = moduleStart[1 + blockIdx.x];

auto me = id[first];

assert(me<2000);
for (int i = first; i < numElements; i += gridDim.x*blockDim.x) {
if (InvId==id[i]) continue;

/// depends on "me"
float conversionFactor = id[i]<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
float offset = id[i]<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;

float conversionFactor = me<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
float offset = me<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;
bool isDeadColumn=false, isNoisyColumn=false;


#ifdef GPU_DEBUG
if (me%100==1)
if (threadIdx.x==0) printf("start pixel calibration for module %d in block %d\n",me,blockIdx.x);
#endif

first+=threadIdx.x;

// __syncthreads();

float pedestal=0,gain=0;
bool isDeadColumn=false, isNoisyColumn=false;
int oldCol=-1, oldAveragedBlock=-1;

for (int i=first; i<numElements; i+=blockDim.x) {
if (id[i]==InvId) continue; // not valid
if (id[i]!=me) break; // end of module
int row = x[i];
int col = y[i];
int averagedBlock = row / ped->numberOfRowsAveragedOver_; // 80.... ( row<80 will be faster...)
if ( (col!=oldCol) | ( averagedBlock != oldAveragedBlock) ) {
oldCol=col; oldAveragedBlock= averagedBlock;
auto ret = ped->getPedAndGain(me,col, row, isDeadColumn, isNoisyColumn);
pedestal = ret.first; gain = ret.second;
}
if ( isDeadColumn | isNoisyColumn )
{ id[i]=InvId; adc[i] =0; }
else {
float vcal = adc[i] * gain - pedestal*gain;
adc[i] = std::max(100, int( vcal * conversionFactor + offset));
}
}

__syncthreads();
//reset start
if(0==threadIdx.x) {
auto & k = moduleStart[1 + blockIdx.x];
while (id[k]==InvId) ++k;
int row = x[i];
int col = y[i];
auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn);
float pedestal = ret.first; float gain = ret.second;
// float pedestal = 0; float gain = 1.;
if ( isDeadColumn | isNoisyColumn )
{
id[i]=InvId; adc[i] =0;
printf("bad pixel at %d in %d\n",i,id[i]);
}
else {
float vcal = adc[i] * gain - pedestal*gain;
adc[i] = std::max(100, int( vcal * conversionFactor + offset));
}
}


}



}
}

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h
#endif
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h

#include <cstdint>

namespace gpuClustering {
constexpr uint32_t MaxNumModules = 2000;
constexpr uint32_t MaxNumPixels = 256 * 2000; // this does not mean maxPixelPerModule == 256!
constexpr uint32_t MaxNumClustersPerModules = 1024;
constexpr uint16_t InvId = 9999; // must be > MaxNumModules

}
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h

This file was deleted.

1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class PixelCPEFast final : public PixelCPEBase

std::vector<pixelCPEforGPU::DetParams, CUDAHostAllocator<pixelCPEforGPU::DetParams>> m_detParamsGPU;
pixelCPEforGPU::CommonParams m_commonParamsGPU;
pixelCPEforGPU::LayerGeometry m_layerGeometry;

struct GPUData {
~GPUData();
Expand Down
Loading

0 comments on commit 41cd273

Please sign in to comment.