Skip to content

Commit

Permalink
Various fixes and cleanup (#87)
Browse files Browse the repository at this point in the history
  - replace `exclusive_scan` with `memset` + `inclusive_scan` to avoid an invalid read
  - fix memory sizes in allocations and copies
  - add a missing stream synchronize
  - set `recordWatcherUpdatedSinceLastTransfer_` to avoid spurious copies
  • Loading branch information
makortel authored and fwyzard committed Oct 20, 2020
1 parent 4d75de4 commit e86e73c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
namespace pixelgpudetails {

SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD * sizeof(unsigned int);
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
cudaMallocHost(&word, sizeof(unsigned int)*WSIZE);
cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE);

Expand Down Expand Up @@ -680,8 +680,8 @@ namespace pixelgpudetails {

// std::cout << "found " << nModulesActive << " Modules active" << std::endl;

// TODO: I suspect we need a cudaStreamSynchronize before using nModules below
// In order to avoid the cudaStreamSynchronize, create a new kernel which launches countModules and findClus.
cudaStreamSynchronize(stream.id());

threadsPerBlock = 256;
blocks = nModulesActive;
Expand Down
19 changes: 11 additions & 8 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,13 @@ namespace pixelgpudetails {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
// First, set the initial element to 0
cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to compute the partial sums for the remaining elements
thrust::inclusive_scan(thrust::cuda::par.on(stream.id()),
input.clusInModule_d,
input.clusInModule_d + gpuClustering::MaxNumModules + 1,
gpu_.hitsModuleStart_d);
input.clusInModule_d + gpuClustering::MaxNumModules,
&gpu_.hitsModuleStart_d[1]);

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
Expand Down Expand Up @@ -126,11 +129,11 @@ namespace pixelgpudetails {
HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
Expand Down

0 comments on commit e86e73c

Please sign in to comment.