Skip to content

Commit

Permalink
Various fixes and cleanup (#87)
Browse files Browse the repository at this point in the history
  - replace `exclusive_scan` with `memset` + `inclusive_scan` to avoid an invalid read
  - fix memory sizes in allocations and copies
  - add a missing stream synchronize
  - set `recordWatcherUpdatedSinceLastTransfer_` to avoid spurious copies
  • Loading branch information
makortel authored and fwyzard committed Oct 20, 2020
1 parent 4d75de4 commit e86e73c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
namespace pixelgpudetails {

SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD * sizeof(unsigned int);
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
cudaMallocHost(&word, sizeof(unsigned int)*WSIZE);
cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE);

Expand Down Expand Up @@ -680,8 +680,8 @@ namespace pixelgpudetails {

// std::cout << "found " << nModulesActive << " Modules active" << std::endl;

// TODO: I suspect we need a cudaStreamSynchronize before using nModules below
// In order to avoid the cudaStreamSynchronize, create a new kernel which launches countModules and findClus.
cudaStreamSynchronize(stream.id());

threadsPerBlock = 256;
blocks = nModulesActive;
Expand Down
19 changes: 11 additions & 8 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,13 @@ namespace pixelgpudetails {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
// First, set the initial element to 0
cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to compute the partial sums for the remaining elements
thrust::inclusive_scan(thrust::cuda::par.on(stream.id()),
input.clusInModule_d,
input.clusInModule_d + gpuClustering::MaxNumModules + 1,
gpu_.hitsModuleStart_d);
input.clusInModule_d + gpuClustering::MaxNumModules,
&gpu_.hitsModuleStart_d[1]);

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
Expand Down Expand Up @@ -126,11 +129,11 @@ namespace pixelgpudetails {
HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
Expand Down

0 comments on commit e86e73c

Please sign in to comment.