Improve CUDAService log messages (#153)
Make the messages from the CUDAService more descriptive, and add a test file to run the CUDAService.
fwyzard authored Sep 2, 2018
1 parent 46ea6cc commit 305d260
Showing 2 changed files with 104 additions and 5 deletions.
96 changes: 91 additions & 5 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -31,6 +31,51 @@ void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
}
}

constexpr
unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
switch (major * 10 + minor) {
// Fermi architecture
case 20: // SM 2.0: GF100 class
return 32;
case 21: // SM 2.1: GF10x class
return 48;

// Kepler architecture
case 30: // SM 3.0: GK10x class
case 32: // SM 3.2: GK10x class
case 35: // SM 3.5: GK11x class
case 37: // SM 3.7: GK21x class
return 192;

// Maxwell architecture
case 50: // SM 5.0: GM10x class
case 52: // SM 5.2: GM20x class
case 53: // SM 5.3: GM20x class
return 128;

// Pascal architecture
case 60: // SM 6.0: GP100 class
return 64;
case 61: // SM 6.1: GP10x class
case 62: // SM 6.2: GP10x class
return 128;

// Volta architecture
case 70: // SM 7.0: GV100 class
case 72: // SM 7.2: GV11b class
return 64;

// Turing architecture
    case 75: // SM 7.5: TU10x class
return 64;

// unknown architecture
default:
return 0;
}
}
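
For illustration only (not part of this commit): a minimal standalone sketch that combines a cores-per-SM mapping like the one above with the multiprocessor count reported by the CUDA runtime, to print the total number of CUDA cores per device. The file name and the reduced mapping table are assumptions made for this example.

// cuda_cores_sketch.cc -- illustrative editor sketch, not part of the commit
#include <cstdio>
#include <cuda_runtime.h>

// reduced version of the cores-per-SM mapping shown above (Pascal, Volta and Turing only)
constexpr unsigned int coresPerSM(int major, int minor) {
  switch (major * 10 + minor) {
    case 60:          return 64;   // Pascal GP100
    case 61: case 62: return 128;  // Pascal GP10x
    case 70: case 72: return 64;   // Volta
    case 75:          return 64;   // Turing TU10x
    default:          return 0;    // unknown architecture
  }
}

int main() {
  int devices = 0;
  if (cudaGetDeviceCount(&devices) != cudaSuccess) {
    std::printf("failed to initialize the CUDA runtime\n");
    return 1;
  }
  for (int i = 0; i < devices; ++i) {
    cudaDeviceProp properties;
    cudaGetDeviceProperties(&properties, i);
    unsigned int cores = properties.multiProcessorCount * coresPerSM(properties.major, properties.minor);
    std::printf("CUDA device %d: %d SMs, %u CUDA cores\n", i, properties.multiProcessorCount, cores);
  }
  return 0;
}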


CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) {
bool configEnabled = config.getUntrackedParameter<bool>("enabled");
if (not configEnabled) {
@@ -40,7 +85,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&

auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << ".\n" << "Disabling the CUDAService.";
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
@@ -66,9 +111,25 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
cudaDeviceProp properties;
cudaCheck(cudaGetDeviceProperties(&properties, i));
log << "CUDA device " << i << ": " << properties.name << '\n';
log << " compute capability: " << properties.major << "." << properties.minor << '\n';

// compute capabilities
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
static constexpr const char* computeModeDescription[] = {
"default (shared)", // cudaComputeModeDefault
"exclusive (single thread)", // cudaComputeModeExclusive
"prohibited", // cudaComputeModeProhibited
"exclusive (single process)", // cudaComputeModeExclusiveProcess
"unknown"
};
log << " compute mode:" << std::right << std::setw(27) << computeModeDescription[std::min(properties.computeMode, (int) std::size(computeModeDescription) - 1)] << '\n';

// TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

@@ -77,6 +138,31 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t freeMemory, totalMemory;
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6) << totalMemory / (1 << 20) << " MB total\n";
log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";

// L1 cache behaviour
static constexpr const char* l1CacheModeDescription[] = {
"unknown",
"local memory",
"global memory",
"local and global memory"
};
int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
log << '\n';

log << "Other capabilities\n";
log << " " << (properties.canMapHostMemory ? "can" : "cannot") << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support") << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot") << " access pageable memory via the host's page tables\n";
log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot") << " access host registered memory at the same virtual address as the host\n";
log << " " << (properties.unifiedAddressing ? "shares" : "does not share") << " a unified address space with the host\n";
log << " " << (properties.managedMemory ? "supports" : "does not support") << " allocating managed memory on this system\n";
log << " " << (properties.concurrentManagedAccess ? "can" : "cannot") << " coherently access managed memory concurrently with the host\n";
log << " " << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot") << " directly access managed memory on the device without migration\n";
    log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support") << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support") << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
log << '\n';

// set and read the CUDA device flags.
@@ -145,11 +231,11 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t value;
log << "CUDA limits\n";
cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
log << " printf buffer size: " << std::setw(10) << value << '\n';
log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
log << " stack size: " << std::setw(10) << value << '\n';
log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
log << " malloc heap size: " << std::setw(10) << value << '\n';
log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
log << " runtime sync depth: " << std::setw(10) << value << '\n';
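
The CUDA limits reported above can also be adjusted through the CUDA runtime API, which is presumably what the setCudaLimit helper declared at the top of this file wraps. As an illustration only (not part of this commit), a minimal sketch that requests a 4 MB printf buffer and reads back the value actually granted; the 4 MB figure is an arbitrary choice for the example.

// cuda_limits_sketch.cc -- illustrative editor sketch, not part of the commit
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  // request a 4 MB printf buffer, then read back what the runtime actually granted
  size_t requested = 4 * (1 << 20);
  if (cudaDeviceSetLimit(cudaLimitPrintfFifoSize, requested) != cudaSuccess) {
    std::printf("failed to set cudaLimitPrintfFifoSize\n");
    return 1;
  }
  size_t value = 0;
  cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize);
  std::printf("printf buffer size: %zu MB\n", value / (1 << 20));
  return 0;
}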
13 changes: 13 additions & 0 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.py
@@ -0,0 +1,13 @@
import FWCore.ParameterSet.Config as cms

process = cms.Process( "TEST" )

process.load('FWCore.MessageService.MessageLogger_cfi')
process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
process.MessageLogger.categories.append("CUDAService")

process.source = cms.Source("EmptySource")

process.maxEvents = cms.untracked.PSet(
input = cms.untracked.int32( 0 )
)
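
Assuming a CMSSW environment in which the CUDAService and a CUDA-capable GPU are available, this configuration can presumably be run with cmsRun HeterogeneousCore/CUDAServices/test/testCUDAService.py; with maxEvents set to 0 it constructs the services without processing any events, so it simply exercises the CUDAService log messages improved by this commit.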
