Various fixes and cleanup (#101)

Fix errors found by cuda-memcheck: - properly initialise device memory - fix various cudaMemcpy calls Remove unused debug variables and function declarations, and #ifdef some debug printouts. Call cudaDeviceReset() before exiting, via the destructor of CUDAService. This explicitly destroys and cleans up all resources associated with the current device, and is useful to check for memory leaks with cuda-memcheck --tool memcheck --leak-check full.
cms-patatrack · Oct 19, 2020 · c8e25f4 · c8e25f4
1 parent ccc6f95
commit c8e25f4
Show file tree

Hide file tree

Showing 5 changed files with 5 additions and 24 deletions.
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -90,10 +90,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & moduleId_d,    (MaxNumModules)*sizeof(uint32_t) ));
-
-    cudaCheck(cudaMalloc((void**) & debug_d,        MAX_WORD32_SIZE));
   }
-
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
     // free device memory used for RawToDigi on GPU
@@ -114,7 +111,6 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clus_d));
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
-    cudaCheck(cudaFree(debug_d));
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
@@ -662,7 +658,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaGetLastError());
 
     // calibrated adc
-    cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
 
     /*
     std::cout
@@ -700,14 +696,12 @@ namespace pixelgpudetails {
                  moduleStart_d,
                  clusInModule_d, moduleId_d,
                  clus_d,
-                 debug_d,
                  wordCounter
     );
 
     // clusters
     cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
 
-
     cudaStreamSynchronize(stream.id());
     cudaCheck(cudaGetLastError());
 

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -208,9 +208,7 @@ namespace pixelgpudetails {
     int32_t *  clus_d;
     uint32_t * clusInModule_d;
     uint32_t * moduleId_d;
-    uint32_t * debug_d;
   };
-
 
   // configuration and memory buffers alocated on the GPU
   struct context {
@@ -234,18 +232,6 @@ namespace pixelgpudetails {
     uint32_t * debug_d;
   };
 
-  // wrapper function to call RawToDigi on the GPU from host side
-  void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice,
-                         SiPixelGainForHLTonGPU * const ped,
-                         const uint32_t wordCounter, uint32_t *word,
-                         const uint32_t fedCounter,  uint8_t *fedId_h,
-                         bool convertADCtoElectrons, uint32_t * pdigi_h,
-                         uint32_t *rawIdArr_h, GPU::SimpleVector<error_obj> *error_h,
-                         GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
-                         uint16_t * adc_h, int32_t * clus_h,
-                         bool useQualityInfo, bool includeErrors, bool debug,
-                         uint32_t & nModulesActive);
-
   // void initCablingMap();
   context initDeviceMemory();
   void freeMemory(context &);

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -36,7 +36,7 @@ namespace gpuClustering {
                            uint16_t const * adc,
                            uint32_t const * moduleStart,
                            uint32_t * clusInModule, uint32_t * moduleId,
-                           int32_t * clus, uint32_t * debug,
+                           int32_t * clus,
                            int numElements)
   {
     __shared__ bool go;
@@ -98,7 +98,6 @@ namespace gpuClustering {
         if (id[i] == InvId)             // not valid
           continue;
         assert(id[i] == me); // break;  // end of module
-        ++debug[i];
         auto js = i + 1;
         auto jm = jmax[k];
         jmax[k] = i + 1;

diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
@@ -154,7 +154,7 @@ void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablin
     }
   }
 
-  cuda::memory::async::copy(modToUnpHost.data(), modToUnpDevice.get(), modToUnpHost.size() * sizeof(unsigned char), cudaStream.id());
+  cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.data(), modToUnpHost.size() * sizeof(unsigned char), cudaStream.id());
 }
 
 

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -108,9 +108,11 @@ namespace pixelgpudetails {
     for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
     hitsLayerStart_[10]=nhits;
 
+#ifdef GPU_DEBUG
     std::cout << "hit layerStart "; 
     for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
     std::cout << "end:" << hitsLayerStart_[10] << std::endl;
+#endif
 
     cudaCheck(cudaMemcpyAsync(gpu_.hitsLayerStart_d, hitsLayerStart_, (11) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));