diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 5051a012af9e0..7efbaa0d00498 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -31,6 +31,51 @@ void setCudaLimit(cudaLimit limit, const char* name, size_t request) { } } +constexpr +unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { + switch (major * 10 + minor) { + // Fermi architecture + case 20: // SM 2.0: GF100 class + return 32; + case 21: // SM 2.1: GF10x class + return 48; + + // Kepler architecture + case 30: // SM 3.0: GK10x class + case 32: // SM 3.2: GK10x class + case 35: // SM 3.5: GK11x class + case 37: // SM 3.7: GK21x class + return 192; + + // Maxwell architecture + case 50: // SM 5.0: GM10x class + case 52: // SM 5.2: GM20x class + case 53: // SM 5.3: GM20x class + return 128; + + // Pascal architecture + case 60: // SM 6.0: GP100 class + return 64; + case 61: // SM 6.1: GP10x class + case 62: // SM 6.2: GP10x class + return 128; + + // Volta architecture + case 70: // SM 7.0: GV100 class + case 72: // SM 7.2: GV11b class + return 64; + + // Turing architecture + case 75: // SM 7.5: TU10x class + return 64; + + // unknown architecture + default: + return 0; + } +} + + CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) { bool configEnabled = config.getUntrackedParameter<bool>("enabled"); if (not configEnabled) { @@ -40,7 +85,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& auto status = cudaGetDeviceCount(&numberOfDevices_); if (cudaSuccess != status) { - edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << ".\n" << "Disabling the CUDAService."; + edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService."; return; } edm::LogInfo log("CUDAService"); @@ -66,9 +111,25 @@ 
CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& cudaDeviceProp properties; cudaCheck(cudaGetDeviceProperties(&properties, i)); log << "CUDA device " << i << ": " << properties.name << '\n'; - log << " compute capability: " << properties.major << "." << properties.minor << '\n'; + + // compute capabilities + log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n"; computeCapabilities_.emplace_back(properties.major, properties.minor); + log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n'; + log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n'; + log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n"; + // compute mode + static constexpr const char* computeModeDescription[] = { + "default (shared)", // cudaComputeModeDefault + "exclusive (single thread)", // cudaComputeModeExclusive + "prohibited", // cudaComputeModeProhibited + "exclusive (single process)", // cudaComputeModeExclusiveProcess + "unknown" + }; + log << " compute mode:" << std::right << std::setw(27) << computeModeDescription[std::min(properties.computeMode, (int) std::size(computeModeDescription) - 1)] << '\n'; + + // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort() cudaCheck(cudaSetDevice(i)); cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost)); @@ -77,6 +138,31 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& size_t freeMemory, totalMemory; cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6) << totalMemory / (1 << 20) << " MB total\n"; + log << " constant memory: " << std::setw(6) << 
properties.totalConstMem / (1 << 10) << " kB\n"; + log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n"; + + // L1 cache behaviour + static constexpr const char* l1CacheModeDescription[] = { + "unknown", + "local memory", + "global memory", + "local and global memory" + }; + int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported; + log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n'; + log << '\n'; + + log << "Other capabilities\n"; + log << " " << (properties.canMapHostMemory ? "can" : "cannot") << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n"; + log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support") << " coherently accessing pageable memory without calling cudaHostRegister() on it\n"; + log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot") << " access pageable memory via the host's page tables\n"; + log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot") << " access host registered memory at the same virtual address as the host\n"; + log << " " << (properties.unifiedAddressing ? "shares" : "does not share") << " a unified address space with the host\n"; + log << " " << (properties.managedMemory ? "supports" : "does not support") << " allocating managed memory on this system\n"; + log << " " << (properties.concurrentManagedAccess ? "can" : "cannot") << " coherently access managed memory concurrently with the host\n"; + log << " " << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot") << " directly access managed memory on the device without migration\n"; + log << " " << (properties.cooperativeLaunch ? "supports" : "does not support") << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n"; + log << " " << (properties.cooperativeMultiDeviceLaunch ? 
"supports" : "does not support") << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n"; log << '\n'; // set and read the CUDA device flags. @@ -145,11 +231,11 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& size_t value; log << "CUDA limits\n"; cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize)); - log << " printf buffer size: " << std::setw(10) << value << '\n'; + log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n"; cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize)); - log << " stack size: " << std::setw(10) << value << '\n'; + log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n"; cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize)); - log << " malloc heap size: " << std::setw(10) << value << '\n'; + log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n"; if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth)); log << " runtime sync depth: " << std::setw(10) << value << '\n'; diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.py b/HeterogeneousCore/CUDAServices/test/testCUDAService.py new file mode 100644 index 0000000000000..06edefcac2b66 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.py @@ -0,0 +1,13 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process( "TEST" ) + +process.load('FWCore.MessageService.MessageLogger_cfi') +process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi') +process.MessageLogger.categories.append("CUDAService") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( + input = cms.untracked.int32( 0 ) +)