diff --git a/kitsune/experiments/inc/kitsune-tapir.mk b/kitsune/experiments/inc/kitsune-tapir.mk
index e922d599e0ed42..9ce554fd322717 100644
--- a/kitsune/experiments/inc/kitsune-tapir.mk
+++ b/kitsune/experiments/inc/kitsune-tapir.mk
@@ -4,7 +4,7 @@
 #
 KITSUNE_PREFIX?=/projects/kitsune/${host_arch}/16.x
 KITSUNE_OPTLEVEL?=3
-KITSUNE_ABI_OPTLEVEL?=2
+KITSUNE_ABI_OPTLEVEL?=3
 KITSUNE_OPTFLAGS?=-O$(KITSUNE_OPTLEVEL)
 
 # For now we disable stripmining on GPUs.
diff --git a/kitsune/experiments/raytracer/makefile b/kitsune/experiments/raytracer/makefile
index efa2c76347e47b..46d3e2125144d9 100644
--- a/kitsune/experiments/raytracer/makefile
+++ b/kitsune/experiments/raytracer/makefile
@@ -26,6 +26,7 @@ raytracer-forall.cuda.${host_arch}: raytracer-forall.cpp
 	@echo $@
 	@$(TIME_CMD) $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
 	@$(FILE_SIZE)
+
 raytracer-forall.hip.${host_arch}: raytracer-forall.cpp
 	@echo $@
 	@$(TIME_CMD) $(KIT_CXX) -v $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
diff --git a/kitsune/runtime/cuda/cuda.cpp b/kitsune/runtime/cuda/cuda.cpp
index bb55a4bc21f0cf..4491845295c851 100644
--- a/kitsune/runtime/cuda/cuda.cpp
+++ b/kitsune/runtime/cuda/cuda.cpp
@@ -52,11 +52,10 @@
 //===----------------------------------------------------------------------===//
 // TODO:
-// * Need to do a better job tracking and freeing resources as necessary.
-// * Need to ponder a path for better stream usage (probably related to
-//   more complex code generation on the compiler side).
-//
-//
+// * Need a few options for stream usage that will likely require some
+//   compiler-side static analysis and additional entry points for
+//   runtime tuning (consider: blocked prefetches, prefetch streams, etc.).
+
 #include <cassert>
 #include <list>
 #include <vector>
@@ -122,6 +121,13 @@
 extern unsigned _kitrt_MaxPrefetchStreams;
 static unsigned _kitrt_CurPrefetchStream = 0;
 std::vector<CUstream> _kitrt_PrefetchStreams;
 
+struct KitRTPrefetchRequest {
+  void *addr;
+  size_t size;
+};
+
+std::list<KitRTPrefetchRequest> _kitrt_PrefetchRequests;
+
 // NOTE: Over a series of CUDA releases it is worthwhile to
 // check in on the header files for replacement versioned
 // entry points into the driver API.  These are typically
@@ -326,13 +332,9 @@ bool __kitrt_cuInit() {
     _kitrtUseHeuristicLaunchParameters = false;
   }
 
-  if (__kitrt_prefetchEnabled()) {
-    fprintf(stderr, "kitrt: prefetching enabled.\n");
-  }
-
   if (__kitrt_prefetchStreamsEnabled()) {
     for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
-      CUstream stream; 
+      CUstream stream;
       CU_SAFE_CALL(cuStreamCreate_p(&stream, CU_STREAM_DEFAULT));
       _kitrt_PrefetchStreams.push_back(stream);
     }
@@ -347,14 +349,14 @@ void __kitrt_cuDestroy() {
   void __kitrt_cuFreeManagedMem(void *vp);
   __kitrt_destroyMemoryMap(__kitrt_cuFreeManagedMem);
 
-  /*for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
-    CUstream stream = _kitrt_PrefetchStreams[si];
-    CU_SAFE_CALL(hipStreamDestroy_p(stream));
+  if (__kitrt_prefetchStreamsEnabled()) {
+    for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
+      CUstream stream = _kitrt_PrefetchStreams[si];
+      CU_SAFE_CALL(cuStreamDestroy_v2_p(stream));
+    }
   }
-  */
   // Note that all resources associated with the context will be destroyed.
-  CU_SAFE_CALL(cuDevicePrimaryCtxRelease_v2_p(_kitrtCUdevice));
   CU_SAFE_CALL(cuDevicePrimaryCtxReset_v2_p(_kitrtCUdevice));
   _kitrt_cuIsInitialized = false;
 }
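
Note: the hunk above adds the _kitrt_PrefetchRequests queue, but none of the hunks in
this diff drain it.  A minimal sketch of a flush routine, assuming a hypothetical
__kitrt_cuFlushPrefetchRequests() entry point built only from symbols this diff already
uses (the helper is an illustration, not part of the change itself):

// Hypothetical sketch, not part of this diff: issue any queued prefetch
// requests and clear the queue.  __kitrt_cuMemPrefetch() is defined in the
// hunks below and re-checks the per-allocation prefetched flag itself.
void __kitrt_cuFlushPrefetchRequests() {
  for (const KitRTPrefetchRequest &R : _kitrt_PrefetchRequests)
    __kitrt_cuMemPrefetch(R.addr);
  _kitrt_PrefetchRequests.clear();
}
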
@@ -489,20 +491,65 @@ bool __kitrt_cuIsMemManaged(void *vp) {
 
 // ---- Memory/data prefetch and data movement support.
 
+void __kitrt_cuPrefetchRequest(void *vp) {
+  size_t size = 0;
+  if (not __kitrt_isMemPrefetched(vp, &size)) {
+    if (size > 0) {
+      struct KitRTPrefetchRequest R;
+      R.addr = vp;
+      R.size = size;
+      _kitrt_PrefetchRequests.push_back(R);
+    }
+  }
+}
+
+
 void __kitrt_cuMemPrefetchOnStream(void *vp, void *stream) {
   assert(vp && "unexpected null pointer!");
-  bool is_read_only, is_write_only;
-  size_t size = __kitrt_getMemAllocSize(vp, &is_read_only, &is_write_only);
-  if (size > 0) {
-    // If we have a size, we know this pointer is from managed memory...
-    if (is_read_only) {
-      CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
-                                 CU_MEM_ADVISE_SET_READ_MOSTLY,
-                                 _kitrtCUdevice));
-    } else {
+  size_t size = 0;
+  if (not __kitrt_isMemPrefetched(vp, &size)) {
+    if (size > 0) {
+
+      //if (is_read_only) {
+      //  CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
+      //                             CU_MEM_ADVISE_SET_READ_MOSTLY,
+      //                             _kitrtCUdevice));
+      //} else {
+      //  CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
+      //                             CU_MEM_ADVISE_UNSET_READ_MOSTLY,
+      //                             _kitrtCUdevice));
+      //}
+
+      // Our semantics assume that a prefetch request suggests an inbound
+      // kernel launch.  Setting the preferred location does not cause
+      // data to migrate to that location immediately.  Instead, it guides
+      // the migration policy when a fault occurs on that memory region.  If
+      // the data is already in its preferred location and the faulting
+      // processor can establish a mapping without requiring the data to be
+      // migrated, then data migration will be avoided.  On the other hand, if
+      // the data is not in its preferred location or if a direct mapping cannot
+      // be established, then it will be migrated to the processor accessing it.
+      // It is important to note that setting the preferred location does not
+      // prevent data prefetching done using cuMemPrefetchAsync().  Having a
+      // preferred location can override the page thrash detection and
+      // resolution logic in the Unified Memory driver.  Normally, if a page is
+      // detected to be constantly thrashing between host and device
+      // memory, the page may eventually be pinned to host memory.  But if the
+      // preferred location is set as device memory, then the page will continue
+      // to thrash indefinitely.  If CU_MEM_ADVISE_SET_READ_MOSTLY is also set on
+      // this memory region or any subset of it, then the policies associated
+      // with that advice will override the policies of this advice, unless read
+      // accesses from device will not result in a read-only copy being created
+      // on that device as outlined in description for the advice
+      // CU_MEM_ADVISE_SET_READ_MOSTLY.
       CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
-                                 CU_MEM_ADVISE_UNSET_READ_MOSTLY,
-                                 _kitrtCUdevice));
+                                 CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
+                                 _kitrtCUdevice));
+
+      CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitrtCUdevice,
+                                        (CUstream)stream));
+      __kitrt_markMemPrefetched(vp);
     }
   // Our semantics assume that a prefetch request suggests an inbound
   // kernel launch.  Setting the preferred location does not cause
@@ -535,12 +582,23 @@ void __kitrt_cuMemPrefetchOnStream(void *vp, void *stream) {
   }
 }
 
+
 void __kitrt_cuMemPrefetch(void *vp) {
   assert(vp && "unexpected null pointer!");
   __kitrt_cuMemPrefetchOnStream(vp, NULL);
 }
 
+
 void __kitrt_cuStreamSetMemPrefetch(void *vp) {
+  // Prefetching with streams has some rules that make a guaranteed
+  // behavior difficult...  For a busy stream, the prefetch is
+  // deferred to a background thread by the driver to maintain stream
+  // ordering.  This background thread executes the prefetch when all
+  // prior operations in the stream are completed.  For idle streams,
+  // the driver can either defer the operation or not, but the driver
+  // often (how often?) does not defer because of the associated
+  // overhead.  The exact details for when the driver may defer vary
+  // across driver versions.
   assert(vp && "unexpected null pointer!");
   CUstream stream = _kitrt_PrefetchStreams[_kitrt_CurPrefetchStream];
   __kitrt_cuMemPrefetchOnStream(vp, (void*)stream);
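
As written, __kitrt_cuStreamSetMemPrefetch() always picks
_kitrt_PrefetchStreams[_kitrt_CurPrefetchStream] and the lines shown never advance the
index.  If the rotation does not happen in code outside this diff, a round-robin step
along these lines (an assumption, not part of the change) would spread prefetches across
the available streams:

  // Hypothetical: advance the prefetch stream index round-robin so that
  // consecutive requests land on different streams.
  _kitrt_CurPrefetchStream =
      (_kitrt_CurPrefetchStream + 1) % _kitrt_MaxPrefetchStreams;
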
@@ -1002,7 +1060,7 @@ void __kitrt_cuSynchronizeStreams() {
   CU_SAFE_CALL(cuCtxSynchronize());
   while (not _kitrtActiveStreams.empty()) {
     CUstream stream = _kitrtActiveStreams.front();
-    CU_SAFE_CALL(cuStreamDestroy(stream));
+    CU_SAFE_CALL(cuStreamDestroy_v2_p(stream));
     _kitrtActiveStreams.pop_front();
   }
 }
diff --git a/kitsune/runtime/hip/hip.cpp b/kitsune/runtime/hip/hip.cpp
index 35fdb9dd642bd9..ccdbb069d2e3f6 100644
--- a/kitsune/runtime/hip/hip.cpp
+++ b/kitsune/runtime/hip/hip.cpp
@@ -365,11 +365,13 @@ void __kitrt_hipDestroy() {
   extern void __kitrt_hipFreeManagedMem(void *);
   __kitrt_destroyMemoryMap(__kitrt_hipFreeManagedMem);
 
-  /*for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
-    hipStream_t stream = *(_kitrt_PrefetchStreams[si]);
-    HIP_SAFE_CALL(hipStreamDestroy_p(stream));
+
+  if (__kitrt_prefetchStreamsEnabled()) {
+    for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
+      hipStream_t stream = *(_kitrt_PrefetchStreams[si]);
+      HIP_SAFE_CALL(hipStreamDestroy_p(stream));
+    }
   }
-  */
 
   HIP_SAFE_CALL(hipDeviceReset_p());
   _kitrt_hipIsInitialized = false;
diff --git a/kitsune/runtime/kitrt.cpp b/kitsune/runtime/kitrt.cpp
index 261cd5cb83ba5b..cceda641a0176d 100644
--- a/kitsune/runtime/kitrt.cpp
+++ b/kitsune/runtime/kitrt.cpp
@@ -103,10 +103,12 @@ void __kitrt_CommonInit() {
   __kitrt_getEnvValue("KITRT_MAX_NUM_PREFETCH_STREAMS",
                       _kitrt_MaxPrefetchStreams);
   __kitrt_getEnvValue("KITRT_DEVICE_ID", _kitrt_DefaultDeviceID);
-  if (__kitrt_prefetchEnabled())
-    fprintf(stderr, "kitrt: prefetch enabled.\n");
-  if (__kitrt_prefetchStreamsEnabled())
-    fprintf(stderr, "kitrt: maximum prefetch streams: %d\n", _kitrt_MaxPrefetchStreams);
+  if (_kitrtVerboseMode) {
+    if (__kitrt_prefetchEnabled())
+      fprintf(stderr, "kitrt: prefetch enabled.\n");
+    if (__kitrt_prefetchStreamsEnabled())
+      fprintf(stderr, "kitrt: maximum prefetch streams: %d\n", _kitrt_MaxPrefetchStreams);
+  }
 }
 
 void __kitrt_setVerboseMode(bool Enable) {
diff --git a/kitsune/runtime/memory_map.cpp b/kitsune/runtime/memory_map.cpp
index e5c85b2afa9308..818717f7842775 100644
--- a/kitsune/runtime/memory_map.cpp
+++ b/kitsune/runtime/memory_map.cpp
@@ -143,10 +143,12 @@ void __kitrt_clearMemAdvice(void *addr) {
   }
 }
 
-bool __kitrt_isMemPrefetched(void *addr) {
+bool __kitrt_isMemPrefetched(void *addr, size_t *size) {
   assert(addr != nullptr && "unexpected null pointer!");
   KitRTAllocMap::const_iterator cit = _kitrtAllocMap.find(addr);
   if (cit != _kitrtAllocMap.end()) {
+    if (size != nullptr)
+      *size = cit->second.size;
     return cit->second.prefetched;
   } else {
 #ifdef _KITRT_VERBOSE_
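
The new optional out-parameter lets a caller retrieve the allocation size with the same
map lookup.  A small usage sketch mirroring the cuda.cpp call sites above (illustrative
only; 'vp' stands in for any candidate pointer):

  // Illustrative use of the extended query: 'size' is only meaningful when
  // the address is a known managed allocation.
  size_t size = 0;
  if (!__kitrt_isMemPrefetched(vp, &size) && size > 0) {
    // vp is managed and not yet prefetched; prefetch it or queue a request.
  }
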
diff --git a/kitsune/runtime/memory_map.h b/kitsune/runtime/memory_map.h
index 10ee760bc7921f..0bcad61e6b5519 100644
--- a/kitsune/runtime/memory_map.h
+++ b/kitsune/runtime/memory_map.h
@@ -110,7 +110,8 @@ extern void __kitrt_memNeedsPrefetch(void *addr);
 
 /// @brief Return the prefetch status of the given allocation.
 /// @param addr: The pointer to the managed allocation.
-bool __kitrt_isMemPrefetched(void *addr);
+/// @param size: If non-null, receives the size of the allocation in bytes.
+bool __kitrt_isMemPrefetched(void *addr, size_t *size = nullptr);
 
 /// @brief Is the given managed allocation marked as read-only?
 /// @param addr: The pointer to the managed allocation.
diff --git a/llvm/lib/Transforms/Tapir/CudaABI.cpp b/llvm/lib/Transforms/Tapir/CudaABI.cpp
index f15b062f97f4a4..9edb1f0367b476 100644
--- a/llvm/lib/Transforms/Tapir/CudaABI.cpp
+++ b/llvm/lib/Transforms/Tapir/CudaABI.cpp
@@ -56,7 +56,7 @@ using namespace llvm;
 #define DEBUG_TYPE "cuabi" // support for -debug-only=cuabi
 
 static const std::string CUABI_PREFIX = "__cuabi";
-static const std::string CUABI_KERNEL_NAME_PREFIX = CUABI_PREFIX + ".kern.";
+static const std::string CUABI_KERNEL_NAME_PREFIX = CUABI_PREFIX + "_kern_";
 
 // NOTE: At this point in time we do not provide support for the older range
 //       of GPU architectures.  We favor 64-bit and SM_60 or newer, which
@@ -115,12 +115,6 @@
 static cl::opt<unsigned> OptLevel("cuabi-opt-level", cl::init(3), cl::NotHidden,
                                   cl::desc("Specify the GPU kernel optimization level."));
-static const OptimizationLevel *optLevels[4] = {
-  &OptimizationLevel::O0,
-  &OptimizationLevel::O1,
-  &OptimizationLevel::O2,
-  &OptimizationLevel::O3
-};
 
 /// Enable an extra set of passes over the host-side code after the
 /// code has been transformed (e.g., loops replaced with kernel launch
@@ -1380,11 +1374,7 @@ CudaABIOutputFile CudaABI::assemblePTXFile(CudaABIOutputFile &PTXFile) {
 
   // For now let's always warn if we spill registers...
   PTXASArgList.push_back("--warn-on-spills");
-
-  if (Verbose)
-    PTXASArgList.push_back("--verbose");
-  else
-    LLVM_DEBUG(PTXASArgList.push_back("--verbose"));
+  PTXASArgList.push_back("--verbose");
 
   if (Debug) {
     PTXASArgList.push_back("--device-debug");
@@ -1417,7 +1407,9 @@ CudaABIOutputFile CudaABI::assemblePTXFile(CudaABIOutputFile &PTXFile) {
     break;
   case 3:
     PTXASArgList.push_back("3");
-    PTXASArgList.push_back("--extensible-whole-program");
+    // TODO: Some compiled codes (e.g., the raytracer test) crash with a
+    // corrupted kernel error (module load time) if this flag is enabled.
+    //PTXASArgList.push_back("--extensible-whole-program");
     break;
   default:
     llvm_unreachable_internal("unhandled/unexpected optimization level",
@@ -1994,9 +1986,17 @@ CudaABIOutputFile CudaABI::generatePTX() {
   PipelineTuningOptions pto;
   pto.LoopVectorization = OptLevel > 2;
   pto.SLPVectorization = OptLevel > 2;
-  pto.LoopUnrolling = OptLevel >= 2;;
+  pto.LoopUnrolling = OptLevel >= 2;
   pto.LoopInterleaving = OptLevel > 2;
-  pto.LoopStripmine = false;
+  pto.LoopStripmine = OptLevel > 2;
+  OptimizationLevel optLevels[] = {
+    OptimizationLevel::O0,
+    OptimizationLevel::O1,
+    OptimizationLevel::O2,
+    OptimizationLevel::O3,
+  };
+  OptimizationLevel optLevel = optLevels[OptLevel];
+
   LoopAnalysisManager lam;
   FunctionAnalysisManager fam;
   CGSCCAnalysisManager cgam;
@@ -2008,8 +2008,9 @@ CudaABIOutputFile CudaABI::generatePTX() {
   pb.registerLoopAnalyses(lam);
   PTXTargetMachine->registerPassBuilderCallbacks(pb);
   pb.crossRegisterProxies(lam, fam, cgam, mam);
-  ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(*optLevels[OptLevel]);
+  ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
   mpm.addPass(VerifierPass());
+  LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
   mpm.run(KernelModule, mam);
   LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n");
 }
@@ -2072,7 +2073,13 @@ void CudaABI::postProcessModule() {
   FunctionAnalysisManager fam;
   CGSCCAnalysisManager cgam;
   ModuleAnalysisManager mam;
-
+  OptimizationLevel optLevels[] = {
+    OptimizationLevel::O0,
+    OptimizationLevel::O1,
+    OptimizationLevel::O2,
+    OptimizationLevel::O3,
+  };
+  OptimizationLevel optLevel = optLevels[OptLevel];
   PassBuilder pb(PTXTargetMachine, pto);
   pb.registerModuleAnalyses(mam);
   pb.registerCGSCCAnalyses(cgam);
@@ -2081,7 +2088,7 @@ void CudaABI::postProcessModule() {
   PTXTargetMachine->registerPassBuilderCallbacks(pb);
   pb.crossRegisterProxies(lam, fam, cgam, mam);
 
-  ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(*optLevels[OptLevel]);
+  ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
   mpm.addPass(VerifierPass());
   mpm.run(M, mam);
   LLVM_DEBUG(dbgs() << "\tpasses complete.\n");
@@ -2110,14 +2117,14 @@ CudaABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) {
     // If we have debug info in the module use a line number
     // based naming scheme for kernels.
     unsigned LineNumber = TL->getLoop()->getStartLoc()->getLine();
-    KernelName = CUABI_PREFIX + ModuleName + "_" + Twine(LineNumber).str();
+    KernelName = CUABI_KERNEL_NAME_PREFIX + ModuleName + "_" + Twine(LineNumber).str();
   } else {
     //SmallString<255> ModName(Twine(ModuleName).str());
     //sys::path::replace_extension(ModName, "");
     //KernelName = CUABI_PREFIX + ModName.c_str();
     // In the non-debug mode we use a consecutive numbering scheme for our
     // kernel names (this is currently handled via the 'make unique' parameter).
-    KernelName = CUABI_PREFIX + KernelName;
+    KernelName = CUABI_KERNEL_NAME_PREFIX + KernelName;
   }
 
   CudaLoop *Outliner = new CudaLoop(M, KernelModule, KernelName, this);
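
One caution on the new table-based mapping: optLevels[OptLevel] indexes the array with
the user-settable -cuabi-opt-level value, so a value above 3 would read past the end of
the table (the removed pointer table had the same exposure).  A bounds-checked helper
along these lines (a sketch, not part of this diff; the helper name is hypothetical)
would make both call sites safe:

// Hypothetical helper: clamp the command-line level into the table.
// Requires <algorithm> for std::min.
static OptimizationLevel getKernelOptLevel(unsigned Level) {
  static const OptimizationLevel Levels[] = {
      OptimizationLevel::O0, OptimizationLevel::O1,
      OptimizationLevel::O2, OptimizationLevel::O3};
  return Levels[std::min(Level, 3u)];
}
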