Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up CPU side of GPU rechits #125

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 76 additions & 49 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,52 @@ namespace {
hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules];
}
}

// Return a typed pointer to row `row` of a pitched 2D allocation
// (e.g. one obtained from cudaMallocPitch).
//
// ptr   - base address of the allocation
// pitch - row stride in BYTES (not in elements of T)
// row   - zero-based row index
//
// The arithmetic must be done on a char* because the pitch is a byte
// count; only the final result is reinterpreted as T*.
template <typename T>
T *slicePitch(void *ptr, size_t pitch, size_t row) {
// void* -> char* is a static_cast; reserve reinterpret_cast for the
// final byte-address -> T* reinterpretation.
return reinterpret_cast<T *>(static_cast<char *>(ptr) + pitch * row);
}
}

namespace pixelgpudetails {
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {

constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256;

cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
cudaCheck(cudaMalloc((void **) & gpu_.charge_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.detInd_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
cudaCheck(cudaMalloc((void **) & gpu_.xg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.yg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.zg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.rg_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.xl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.yl_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.xerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.yerr_d, (gpuClustering::MaxNumModules * 256) * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.iphi_d, (gpuClustering::MaxNumModules * 256) * sizeof(int16_t)));
cudaCheck(cudaMalloc((void **) & gpu_.sortIndex_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
cudaCheck(cudaMalloc((void **) & gpu_.mr_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));
cudaCheck(cudaMalloc((void **) & gpu_.mc_d, (gpuClustering::MaxNumModules * 256) * sizeof(uint16_t)));

// Coalesce all 32bit and 16bit arrays to two big blobs
//
// This is just a toy. Please don't copy-paste the logic but
// create a proper abstraction (e.g. along FWCore/SOA, or
// FWCore/Utilities/interface/SoATuple.h
//
// Order such that the first ones are the ones transferred to CPU
static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious
cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9));
cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id()));
//edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_;
gpu_.charge_d = slicePitch<int32_t>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0);
gpu_.xl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1);
gpu_.yl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2);
gpu_.xerr_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3);
gpu_.yerr_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4);
gpu_.xg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5);
gpu_.yg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6);
gpu_.zg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7);
gpu_.rg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8);

// Order such that the first ones are the ones transferred to CPU
cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5));
cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*5, cudaStream.id()));
//edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_;
gpu_.detInd_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0);
gpu_.mr_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1);
gpu_.mc_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2);
gpu_.iphi_d = slicePitch<int16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3);
gpu_.sortIndex_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4);

cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist)));
cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
gpu_.me_d = gpu_d;
Expand All @@ -57,32 +82,39 @@ namespace pixelgpudetails {
cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id()));

cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)));

// On CPU we can safely use MAX_HITS*sizeof as the pitch. Thanks
// to '*256' it is even aligned by cache line
h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t);
cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5));
h_charge_ = slicePitch<int32_t>(h_owner_32bit_, h_owner_32bit_pitch_, 0);
h_xl_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 1);
h_yl_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 2);
h_xe_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 3);
h_ye_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 4);

h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t);
cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3));
h_detInd_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 0);
h_mr_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 1);
h_mc_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 2);

#ifdef GPU_DEBUG
cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t)));
#endif
}
PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.bs_d));
cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
cudaCheck(cudaFree(gpu_.charge_d));
cudaCheck(cudaFree(gpu_.detInd_d));
cudaCheck(cudaFree(gpu_.xg_d));
cudaCheck(cudaFree(gpu_.yg_d));
cudaCheck(cudaFree(gpu_.zg_d));
cudaCheck(cudaFree(gpu_.rg_d));
cudaCheck(cudaFree(gpu_.xl_d));
cudaCheck(cudaFree(gpu_.yl_d));
cudaCheck(cudaFree(gpu_.xerr_d));
cudaCheck(cudaFree(gpu_.yerr_d));
cudaCheck(cudaFree(gpu_.iphi_d));
cudaCheck(cudaFree(gpu_.sortIndex_d));
cudaCheck(cudaFree(gpu_.mr_d));
cudaCheck(cudaFree(gpu_.mc_d));
cudaCheck(cudaFree(gpu_.owner_32bit_));
cudaCheck(cudaFree(gpu_.owner_16bit_));
cudaCheck(cudaFree(gpu_.hist_d));
cudaCheck(cudaFree(gpu_d));
cudaCheck(cudaFree(d_phase1TopologyLayerStart_));

cudaCheck(cudaFreeHost(h_hitsModuleStart_));
cudaCheck(cudaFreeHost(h_owner_32bit_));
cudaCheck(cudaFreeHost(h_owner_16bit_));
#ifdef GPU_DEBUG
cudaCheck(cudaFreeHost(h_hitsLayerStart_));
#endif
Expand All @@ -92,9 +124,9 @@ namespace pixelgpudetails {
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream) {
cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
gpu_.hitsModuleStart_d = input.clusModuleStart_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
gpu_.hitsModuleStart_d = input.clusModuleStart_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
Expand Down Expand Up @@ -123,20 +155,21 @@ namespace pixelgpudetails {
cudaCheck(cudaGetLastError());

// needed only if hits on CPU are required...
nhits_ = input.nClusters;
cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
#ifdef GPU_DEBUG
cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
#endif
auto nhits = input.nClusters;
cpu_ = std::make_unique<HitsOnCPU>(nhits);
cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits * sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits * sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits * sizeof(uint16_t), cudaMemcpyDefault, stream.id()));

cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_,
gpu_.owner_16bit_, gpu_.owner_16bit_pitch_,
nhits_*sizeof(uint16_t), 3,
cudaMemcpyDefault, stream.id()));

cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_,
gpu_.owner_32bit_, gpu_.owner_32bit_pitch_,
nhits_*sizeof(uint32_t), 5,
cudaMemcpyDefault, stream.id()));

#ifdef GPU_DEBUG
cudaStreamSynchronize(stream.id());
Expand All @@ -148,15 +181,9 @@ namespace pixelgpudetails {

// for timing test
// cudaStreamSynchronize(stream.id());
// auto nhits = h_hitsLayerStart_[10];
// auto nhits_ = h_hitsLayerStart_[10];
// radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d);

cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits, 256, stream.id());
}

std::unique_ptr<HitsOnCPU>&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) {
cpu_->gpu_d = gpu_d;
memcpy(cpu_->hitsModuleStart, h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
return std::move(cpu_);
cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id());
}
}
22 changes: 20 additions & 2 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,32 @@ namespace pixelgpudetails {
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream);

std::unique_ptr<HitsOnCPU>&& getOutput(cuda::stream_t<>& stream);
// Build a host-side, non-owning view (HitsOnCPU) of the rec-hit data.
//
// The returned struct aggregates raw pointers into this object's pinned
// host buffers (h_hitsModuleStart_, h_detInd_, h_charge_, h_xl_, h_yl_,
// h_xe_, h_ye_, h_mr_, h_mc_), the device-side HitsOnGPU pointer
// (gpu_d), and the hit count recorded by the last makeHitsAsync() call
// (nhits_).
//
// NOTE(review): the view is only valid while this PixelRecHitGPUKernel
// is alive, and the buffer contents are only defined once the async
// copies issued on the CUDA stream have completed -- callers presumably
// synchronize the stream before reading; confirm at the call sites.
HitsOnCPU getOutput() const {
return HitsOnCPU{
h_hitsModuleStart_, h_detInd_, h_charge_,
h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_,
gpu_d, nhits_
};
}

private:
HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product"
HitsOnGPU gpu_;
std::unique_ptr<HitsOnCPU> cpu_;
uint32_t nhits_ = 0;
uint32_t *d_phase1TopologyLayerStart_ = nullptr;
uint32_t *h_hitsModuleStart_ = nullptr;
uint16_t *h_detInd_ = nullptr;
int32_t *h_charge_ = nullptr;
float *h_xl_ = nullptr;
float *h_yl_ = nullptr;
float *h_xe_ = nullptr;
float *h_ye_ = nullptr;
uint16_t *h_mr_ = nullptr;
uint16_t *h_mc_ = nullptr;
void *h_owner_32bit_ = nullptr;
size_t h_owner_32bit_pitch_ = 0;
void *h_owner_16bit_ = nullptr;
size_t h_owner_16bit_pitch_ = 0;
#ifdef GPU_DEBUG
uint32_t *h_hitsLayerStart_ = nullptr;
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i
}

void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
auto output = gpuAlgo_->getOutput(cudaStream);
auto output = std::make_unique<GPUProduct>(gpuAlgo_->getOutput());

// Need the CPU clusters to
// - properly fill the output DetSetVector of hits
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,33 +33,27 @@ namespace siPixelRecHitsHeterogeneousProduct {
Hist * hist_d;

HitsOnGPU const * me_d = nullptr;

// Owning pointers to the 32/16 bit arrays with size MAX_HITS
void *owner_32bit_;
size_t owner_32bit_pitch_;
void *owner_16bit_;
size_t owner_16bit_pitch_;
};

struct HitsOnCPU {
HitsOnCPU() = default;

explicit HitsOnCPU(uint32_t nhits) :
detInd(nhits),
charge(nhits),
xl(nhits),
yl(nhits),
xe(nhits),
ye(nhits),
mr(nhits),
mc(nhits),
nHits(nhits)
{ }

uint32_t hitsModuleStart[2001];
std::vector<uint16_t, CUDAHostAllocator<uint16_t>> detInd;
std::vector<int32_t, CUDAHostAllocator<int32_t>> charge;
std::vector<float, CUDAHostAllocator<float>> xl, yl;
std::vector<float, CUDAHostAllocator<float>> xe, ye;
std::vector<uint16_t, CUDAHostAllocator<uint16_t>> mr;
std::vector<uint16_t, CUDAHostAllocator<uint16_t>> mc;
uint32_t const * hitsModuleStart = nullptr;
uint16_t const * detInd = nullptr;
int32_t const * charge = nullptr;
float const * xl = nullptr;
float const * yl = nullptr;
float const * xe = nullptr;
float const * ye = nullptr;
uint16_t const * mr = nullptr;
uint16_t const * mc = nullptr;

uint32_t nHits;
HitsOnGPU const * gpu_d = nullptr;
uint32_t nHits;
};

using GPUProduct = HitsOnCPU; // FIXME fill cpu vectors on demand
Expand Down