Skip to content

Commit

Permalink
reduce the maximum number of Pixel hits in CA (#163)
Browse files Browse the repository at this point in the history
Change approach: use one global array of hits and isOuterHitOfCell instead of one per layer.
This allows to allocate memory for the maximum number of hits over all layers, instead of the maximum number of hits per layer times the number of layers.

The distribution of the number of pixel hits for data at pileup 50 is reasonably Gaussian around 18300 ± 3500; set the the maxNumberOfHits to 40000, which is > 6 σ away.

Start moving constants for pixel-related GPU algorithms to a dedicated header file.
  • Loading branch information
felicepantaleo authored and fwyzard committed Sep 13, 2018
1 parent 8293c5e commit 68a8252
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h
#define RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h

#include <cstdint>

namespace PixelGPUConstants {
constexpr uint16_t maxNumberOfHits = 40000; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away

}

#endif // RecoLocalTracker_SiPixelClusterizer_interface_PixelTrackingGPUConstants_h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ __global__ void
kernel_checkOverflows(GPU::SimpleVector<Quadruplet> *foundNtuplets,
GPUCACell *cells, uint32_t const * nCells,
GPU::VecArray< unsigned int, 256> *isOuterHitOfCell,
uint32_t nHits) {
uint32_t nHits, uint32_t maxNumberOfDoublets) {

auto idx = threadIdx.x + blockIdx.x * blockDim.x;
#ifdef GPU_DEBUG
Expand All @@ -28,17 +28,16 @@ kernel_checkOverflows(GPU::SimpleVector<Quadruplet> *foundNtuplets,
}
if (idx < nHits) {
if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell;
printf("OuterHitOfCell overflow %d\n", idx);
printf("OuterHitOfCell overflow %d\n", idx);
}

}


__global__ void
kernel_connect(GPU::SimpleVector<Quadruplet> *foundNtuplets,
GPUCACell *cells, uint32_t const * nCells,
GPU::VecArray< unsigned int, 256> *isOuterHitOfCell,
float ptmin,
float ptmin,
float region_origin_radius, const float thetaCut,
const float phiCut, const float hardPtCut,
unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {
Expand Down Expand Up @@ -93,7 +92,7 @@ kernel_print_found_ntuplets(GPU::SimpleVector<Quadruplet> *foundNtuplets, int ma
(*foundNtuplets)[i].hitId[2],
(*foundNtuplets)[i].hitId[3]
);

}
}

Expand Down Expand Up @@ -124,9 +123,9 @@ void CAHitQuadrupletGeneratorGPU::allocateOnGPU()
cudaCheck(cudaMemset(device_nCells_, 0, sizeof(uint32_t)));

cudaCheck(cudaMalloc(&device_isOuterHitOfCell_,
maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>)));
PixelGPUConstants::maxNumberOfHits * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>)));
cudaCheck(cudaMemset(device_isOuterHitOfCell_, 0,
maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>)));
PixelGPUConstants::maxNumberOfHits * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>)));

h_foundNtupletsVec_.resize(maxNumberOfRegions_);
h_foundNtupletsData_.resize(maxNumberOfRegions_);
Expand Down Expand Up @@ -158,15 +157,15 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
h_foundNtupletsVec_[regionIndex]->reset();

auto nhits = hh.nHits;

assert(nhits <= PixelGPUConstants::maxNumberOfHits);
auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512;
kernel_connect<<<numberOfBlocks, 512, 0, cudaStream>>>(
d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
region.ptMin(),
region.ptMin(),
region.originRBound(), caThetaCut, caPhiCut, caHardPtCut,
maxNumberOfDoublets_, maxNumberOfHits_
maxNumberOfDoublets_, PixelGPUConstants::maxNumberOfHits
);
cudaCheck(cudaGetLastError());

Expand All @@ -181,7 +180,8 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
kernel_checkOverflows<<<numberOfBlocks, 512, 0, cudaStream>>>(
d_foundNtupletsVec_[regionIndex],
device_theCells_, device_nCells_,
device_isOuterHitOfCell_, nhits
device_isOuterHitOfCell_, nhits,
maxNumberOfDoublets_
);


Expand All @@ -201,7 +201,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
void CAHitQuadrupletGeneratorGPU::cleanup(cudaStream_t cudaStream) {
// this lazily resets temporary memory for the next event, and is not needed for reading the output
cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0,
maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>),
PixelGPUConstants::maxNumberOfHits * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>),
cudaStream));
cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), cudaStream));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h"
#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
#include "RecoPixelVertexing/PixelTrackFitting/interface/RZLine.h"
#include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h"
#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h"
#include "RecoTracker/TkHitPairs/interface/HitPairGeneratorFromLayerPair.h"
#include "RecoTracker/TkHitPairs/interface/IntermediateHitDoublets.h"
#include "RecoTracker/TkHitPairs/interface/LayerHitMapCache.h"
Expand All @@ -18,8 +20,6 @@
#include "RecoTracker/TkSeedGenerator/interface/FastCircleFit.h"
#include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h"
#include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h"
#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h"


#include "GPUCACell.h"

Expand Down Expand Up @@ -67,7 +67,6 @@ class CAHitQuadrupletGeneratorGPU {
void deallocateOnGPU();

private:
// LayerCacheType theLayerCache;

std::unique_ptr<SeedComparitor> theComparitor;

Expand Down Expand Up @@ -151,11 +150,10 @@ class CAHitQuadrupletGeneratorGPU {
const float caHardPtCut = 0.f;

static constexpr int maxNumberOfQuadruplets_ = 10000;
static constexpr int maxCellsPerHit_ = 256; // 2048; // 512;
static constexpr int maxCellsPerHit_ = 256;
static constexpr int maxNumberOfLayerPairs_ = 13;
static constexpr int maxNumberOfLayers_ = 10;
static constexpr int maxNumberOfDoublets_ = 262144;
static constexpr int maxNumberOfHits_ = 20000;
static constexpr int maxNumberOfRegions_ = 2;

std::vector<GPU::SimpleVector<Quadruplet>*> h_foundNtupletsVec_;
Expand Down

0 comments on commit 68a8252

Please sign in to comment.