Skip to content

Commit

Permalink
Working on some runtime tweaks and clean up. Traced a new crash to
Browse files Browse the repository at this point in the history
the use of a ptxas whole-program optimization flag.
  • Loading branch information
pmccormick committed Nov 27, 2023
1 parent 9512eb5 commit e7d0c09
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 58 deletions.
2 changes: 1 addition & 1 deletion kitsune/experiments/inc/kitsune-tapir.mk
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
KITSUNE_PREFIX?=/projects/kitsune/${host_arch}/16.x
KITSUNE_OPTLEVEL?=3
KITSUNE_ABI_OPTLEVEL?=2
KITSUNE_ABI_OPTLEVEL?=3
KITSUNE_OPTFLAGS?=-O$(KITSUNE_OPTLEVEL)

# For now we disable stripmining on GPUs.
Expand Down
1 change: 1 addition & 0 deletions kitsune/experiments/raytracer/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ raytracer-forall.cuda.${host_arch}: raytracer-forall.cpp
@echo $@
@$(TIME_CMD) $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
@$(FILE_SIZE)

raytracer-forall.hip.${host_arch}: raytracer-forall.cpp
@echo $@
@$(TIME_CMD) $(KIT_CXX) -v $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
Expand Down
112 changes: 85 additions & 27 deletions kitsune/runtime/cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,10 @@
//===----------------------------------------------------------------------===//

// TODO:
// * Need to do a better job tracking and freeing resources as necessary.
// * Need to ponder a path for better stream usage (probably related to
// more complex code generation on the compiler side).
//
//
// * Need a few options for stream usage that will likely require some
// compiler-side static analysis and additional entry points for
// runtime tuning. (consider: blocked prefetches, prefetch streams, etc.).

#include <cassert>
#include <cstdio>
#include <cstdlib>
Expand Down Expand Up @@ -122,6 +121,13 @@ extern unsigned _kitrt_MaxPrefetchStreams;
static unsigned _kitrt_CurPrefetchStream = 0;
std::vector<CUstream> _kitrt_PrefetchStreams;

// A deferred prefetch request: captures the base address and size of a
// managed allocation that should be prefetched at a later point (requests
// are queued by __kitrt_cuPrefetchRequest below).
struct KitRTPrefetchRequest {
void *addr;   // base address of the managed allocation
size_t size;  // allocation size in bytes (as tracked by the memory map)
};

// Outstanding (not-yet-issued) prefetch requests, in arrival order.
std::list<KitRTPrefetchRequest> _kitrt_PrefetchRequests;

// NOTE: Over a series of CUDA releases it is worthwhile to
// check in on the header files for replacement versioned
// entry points into the driver API. These are typically
Expand Down Expand Up @@ -326,13 +332,9 @@ bool __kitrt_cuInit() {
_kitrtUseHeuristicLaunchParameters = false;
}

if (__kitrt_prefetchEnabled()) {
fprintf(stderr, "kitrt: prefetching enabled.\n");
}

if (__kitrt_prefetchStreamsEnabled()) {
for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
CUstream stream;
CUstream stream;
CU_SAFE_CALL(cuStreamCreate_p(&stream, CU_STREAM_DEFAULT));
_kitrt_PrefetchStreams.push_back(stream);
}
Expand All @@ -347,14 +349,14 @@ void __kitrt_cuDestroy() {
void __kitrt_cuFreeManagedMem(void *vp);
__kitrt_destroyMemoryMap(__kitrt_cuFreeManagedMem);

/*for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
CUstream stream = _kitrt_PrefetchStreams[si];
CU_SAFE_CALL(hipStreamDestroy_p(stream));
if (__kitrt_prefetchStreamsEnabled()) {
for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
CUstream stream = _kitrt_PrefetchStreams[si];
CU_SAFE_CALL(cuStreamDestroy_v2_p(stream));
}
}
*/

// Note that all resources associated with the context will be destroyed.
CU_SAFE_CALL(cuDevicePrimaryCtxRelease_v2_p(_kitrtCUdevice));
CU_SAFE_CALL(cuDevicePrimaryCtxReset_v2_p(_kitrtCUdevice));
_kitrt_cuIsInitialized = false;
}
Expand Down Expand Up @@ -489,20 +491,65 @@ bool __kitrt_cuIsMemManaged(void *vp) {

// ---- Memory/data prefetch and data movement support.

/// @brief Queue a deferred prefetch request for a managed allocation.
/// @param vp: Pointer to a managed memory allocation.
///
/// If the allocation has not already been prefetched, and the memory map
/// reports a non-zero size for it, the request is appended to the pending
/// request list. Pointers unknown to the memory map (size stays 0) and
/// already-prefetched allocations are silently ignored.
void __kitrt_cuPrefetchRequest(void *vp) {
  assert(vp && "unexpected null pointer!"); // match sibling entry points.
  size_t size = 0;
  if (not __kitrt_isMemPrefetched(vp, &size) && size > 0)
    _kitrt_PrefetchRequests.push_back({vp, size});
}


void __kitrt_cuMemPrefetchOnStream(void *vp, void *stream) {
assert(vp && "unexpected null pointer!");
bool is_read_only, is_write_only;
size_t size = __kitrt_getMemAllocSize(vp, &is_read_only, &is_write_only);
if (size > 0) {
// If we have a size, we know this pointer is from managed memory...
if (is_read_only) {
CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
CU_MEM_ADVISE_SET_READ_MOSTLY,
_kitrtCUdevice));
} else {
size_t size = 0;
if (not __kitrt_isMemPrefetched(vp, &size)) {
if (size > 0) {

//if (is_read_only) {
// CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
// CU_MEM_ADVISE_SET_READ_MOSTLY,
// _kitrtCUdevice));
//} else {
// CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
// CU_MEM_ADVISE_UNSET_READ_MOSTLY,
// _kitrtCUdevice));
//}


// Our semantics assume that a prefetch request suggests an inbound
// kernel launch. Setting the preferred location does not cause
// data to migrate to that location immediately. Instead, it guides
// the migration policy when a fault occurs on that memory region. If
// the data is already in its preferred location and the faulting
// processor can establish a mapping without requiring the data to be
// migrated, then data migration will be avoided. On the other hand, if
// the data is not in its preferred location or if a direct mapping cannot
// be established, then it will be migrated to the processor accessing it.
// It is important to note that setting the preferred location does not
// prevent data prefetching done using cuMemPrefetchAsync(). Having a
// preferred location can override the page thrash detection and
// resolution logic in the Unified Memory driver. Normally, if a page is
// detected to be constantly thrashing between host and device
// memory, the page may eventually be pinned to host memory. But if the
// preferred location is set as device memory, then the page will continue
// to thrash indefinitely. If CU_MEM_ADVISE_SET_READ_MOSTLY is also set on
// this memory region or any subset of it, then the policies associated
// with that advice will override the policies of this advice, unless read
// accesses from device will not result in a read-only copy being created
// on that device as outlined in description for the advice
// CU_MEM_ADVISE_SET_READ_MOSTLY.
CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
CU_MEM_ADVISE_UNSET_READ_MOSTLY,
_kitrtCUdevice));
CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
_kitrtCUdevice));

CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitrtCUdevice,
(CUstream)stream));
__kitrt_markMemPrefetched(vp);
}
// Our semantics assume that a prefetch request suggests an inbound
// kernel launch. Setting the preferred location does not cause
Expand Down Expand Up @@ -535,12 +582,23 @@ void __kitrt_cuMemPrefetchOnStream(void *vp, void *stream) {
}
}


/// @brief Prefetch the given managed allocation on the default stream.
/// @param vp: Pointer to a managed memory allocation.
void __kitrt_cuMemPrefetch(void *vp) {
  assert(vp && "unexpected null pointer!");
  // nullptr (modern C++ spelling, not NULL) selects the default CUDA stream.
  __kitrt_cuMemPrefetchOnStream(vp, nullptr);
}


void __kitrt_cuStreamSetMemPrefetch(void *vp) {
// Prefetching with streams has some rules that make a guaranteed
// behavior difficult... For a busy stream, the prefetch is
// deferred to a background thread by the driver to maintain stream
// ordering. This background thread executes the prefetch when all
// prior operations in the stream are completed. For idle streams,
// the driver can either defer the operation or not, but the driver
// often (how often?) does not defer because of the associated
// overhead. The exact details for when the driver may defer vary
// across driver versions.
assert(vp && "unexpected null pointer!");
CUstream stream = _kitrt_PrefetchStreams[_kitrt_CurPrefetchStream];
__kitrt_cuMemPrefetchOnStream(vp, (void*)stream);
Expand Down Expand Up @@ -1002,7 +1060,7 @@ void __kitrt_cuSynchronizeStreams() {
CU_SAFE_CALL(cuCtxSynchronize());
while (not _kitrtActiveStreams.empty()) {
CUstream stream = _kitrtActiveStreams.front();
CU_SAFE_CALL(cuStreamDestroy(stream));
CU_SAFE_CALL(cuStreamDestroy_v2_p(stream));
_kitrtActiveStreams.pop_front();
}
}
Expand Down
10 changes: 6 additions & 4 deletions kitsune/runtime/hip/hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,13 @@ void __kitrt_hipDestroy() {
extern void __kitrt_hipFreeManagedMem(void *);
__kitrt_destroyMemoryMap(__kitrt_hipFreeManagedMem);

/*for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
hipStream_t stream = *(_kitrt_PrefetchStreams[si]);
HIP_SAFE_CALL(hipStreamDestroy_p(stream));

if (__kitrt_prefetchStreamsEnabled()) {
for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
hipStream_t stream = *(_kitrt_PrefetchStreams[si]);
HIP_SAFE_CALL(hipStreamDestroy_p(stream));
}
}
*/
HIP_SAFE_CALL(hipDeviceReset_p());

_kitrt_hipIsInitialized = false;
Expand Down
10 changes: 6 additions & 4 deletions kitsune/runtime/kitrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,12 @@ void __kitrt_CommonInit() {
__kitrt_getEnvValue("KITRT_MAX_NUM_PREFETCH_STREAMS", _kitrt_MaxPrefetchStreams);
__kitrt_getEnvValue("KITRT_DEVICE_ID", _kitrt_DefaultDeviceID);

if (__kitrt_prefetchEnabled())
fprintf(stderr, "kitrt: prefetch enabled.\n");
if (__kitrt_prefetchStreamsEnabled())
fprintf(stderr, "kitrt: maximum prefetch streams: %d\n", _kitrt_MaxPrefetchStreams);
if (_kitrtVerboseMode) {
if (__kitrt_prefetchEnabled())
fprintf(stderr, "kitrt: prefetch enabled.\n");
if (__kitrt_prefetchStreamsEnabled())
fprintf(stderr, "kitrt: maximum prefetch streams: %d\n", _kitrt_MaxPrefetchStreams);
}
}

void __kitrt_setVerboseMode(bool Enable) {
Expand Down
4 changes: 3 additions & 1 deletion kitsune/runtime/memory_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,12 @@ void __kitrt_clearMemAdvice(void *addr) {
}
}

bool __kitrt_isMemPrefetched(void *addr) {
bool __kitrt_isMemPrefetched(void *addr, size_t *size) {
assert(addr != nullptr && "unexpected null pointer!");
KitRTAllocMap::const_iterator cit = _kitrtAllocMap.find(addr);
if (cit != _kitrtAllocMap.end()) {
if (size != nullptr)
*size = cit->second.size;
return cit->second.prefetched;
} else {
#ifdef _KITRT_VERBOSE_
Expand Down
2 changes: 1 addition & 1 deletion kitsune/runtime/memory_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ extern void __kitrt_memNeedsPrefetch(void *addr);

/// @brief Return the prefetch status of the given allocation.
/// @param addr: The pointer to the managed allocation.
bool __kitrt_isMemPrefetched(void *addr);
bool __kitrt_isMemPrefetched(void *addr, size_t *size = nullptr);

/// @brief Is the given managed allocation marked as ready-only?
/// @param addr: The pointer to the managed allocation.
Expand Down
47 changes: 27 additions & 20 deletions llvm/lib/Transforms/Tapir/CudaABI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ using namespace llvm;
#define DEBUG_TYPE "cuabi" // support for -debug-only=cuabi

static const std::string CUABI_PREFIX = "__cuabi";
static const std::string CUABI_KERNEL_NAME_PREFIX = CUABI_PREFIX + ".kern.";
static const std::string CUABI_KERNEL_NAME_PREFIX = CUABI_PREFIX + "_kern_";

// NOTE: At this point in time we do not provide support for the older range
// of GPU architectures. We favor 64-bit and SM_60 or newer, which
Expand Down Expand Up @@ -115,12 +115,6 @@ static cl::opt<bool>
static cl::opt<unsigned>
OptLevel("cuabi-opt-level", cl::init(3), cl::NotHidden,
cl::desc("Specify the GPU kernel optimization level."));
static const OptimizationLevel *optLevels[4] = {
&OptimizationLevel::O0,
&OptimizationLevel::O1,
&OptimizationLevel::O2,
&OptimizationLevel::O3
};

/// Enable an extra set of passes over the host-side code after the
/// code has been transformed (e.g., loops replaced with kernel launch
Expand Down Expand Up @@ -1380,11 +1374,7 @@ CudaABIOutputFile CudaABI::assemblePTXFile(CudaABIOutputFile &PTXFile) {

// For now let's always warn if we spill registers...
PTXASArgList.push_back("--warn-on-spills");

if (Verbose)
PTXASArgList.push_back("--verbose");
else
LLVM_DEBUG(PTXASArgList.push_back("--verbose"));
PTXASArgList.push_back("--verbose");

if (Debug) {
PTXASArgList.push_back("--device-debug");
Expand Down Expand Up @@ -1417,7 +1407,9 @@ CudaABIOutputFile CudaABI::assemblePTXFile(CudaABIOutputFile &PTXFile) {
break;
case 3:
PTXASArgList.push_back("3");
PTXASArgList.push_back("--extensible-whole-program");
// TODO: Some compiled codes (e.g., the raytracer test) crash with a
// corrupted kernel error (module load time) if this flag is enabled.
//PTXASArgList.push_back("--extensible-whole-program");
break;
default:
llvm_unreachable_internal("unhandled/unexpected optimization level",
Expand Down Expand Up @@ -1994,9 +1986,17 @@ CudaABIOutputFile CudaABI::generatePTX() {
PipelineTuningOptions pto;
pto.LoopVectorization = OptLevel > 2;
pto.SLPVectorization = OptLevel > 2;
pto.LoopUnrolling = OptLevel >= 2;;
pto.LoopUnrolling = OptLevel >= 2;
pto.LoopInterleaving = OptLevel > 2;
pto.LoopStripmine = false;
pto.LoopStripmine = OptLevel > 2;
OptimizationLevel optLevels[] = {
OptimizationLevel::O0,
OptimizationLevel::O1,
OptimizationLevel::O2,
OptimizationLevel::O3,
};
OptimizationLevel optLevel = optLevels[OptLevel];

LoopAnalysisManager lam;
FunctionAnalysisManager fam;
CGSCCAnalysisManager cgam;
Expand All @@ -2008,8 +2008,9 @@ CudaABIOutputFile CudaABI::generatePTX() {
pb.registerLoopAnalyses(lam);
PTXTargetMachine->registerPassBuilderCallbacks(pb);
pb.crossRegisterProxies(lam, fam, cgam, mam);
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(*optLevels[OptLevel]);
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
mpm.addPass(VerifierPass());
LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
mpm.run(KernelModule, mam);
LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n");
}
Expand Down Expand Up @@ -2072,7 +2073,13 @@ void CudaABI::postProcessModule() {
FunctionAnalysisManager fam;
CGSCCAnalysisManager cgam;
ModuleAnalysisManager mam;

OptimizationLevel optLevels[] = {
OptimizationLevel::O0,
OptimizationLevel::O1,
OptimizationLevel::O2,
OptimizationLevel::O3,
};
OptimizationLevel optLevel = optLevels[OptLevel];
PassBuilder pb(PTXTargetMachine, pto);
pb.registerModuleAnalyses(mam);
pb.registerCGSCCAnalyses(cgam);
Expand All @@ -2081,7 +2088,7 @@ void CudaABI::postProcessModule() {
PTXTargetMachine->registerPassBuilderCallbacks(pb);
pb.crossRegisterProxies(lam, fam, cgam, mam);

ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(*optLevels[OptLevel]);
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
mpm.addPass(VerifierPass());
mpm.run(M, mam);
LLVM_DEBUG(dbgs() << "\tpasses complete.\n");
Expand Down Expand Up @@ -2110,14 +2117,14 @@ CudaABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) {
// If we have debug info in the module use a line number
// based naming scheme for kernels.
unsigned LineNumber = TL->getLoop()->getStartLoc()->getLine();
KernelName = CUABI_PREFIX + ModuleName + "_" + Twine(LineNumber).str();
KernelName = CUABI_KERNEL_NAME_PREFIX + ModuleName + "_" + Twine(LineNumber).str();
} else {
//SmallString<255> ModName(Twine(ModuleName).str());
//sys::path::replace_extension(ModName, "");
//KernelName = CUABI_PREFIX + ModName.c_str();
// In the non-debug mode we use a consecutive numbering scheme for our
// kernel names (this is currently handled via the 'make unique' parameter).
KernelName = CUABI_PREFIX + KernelName;
KernelName = CUABI_KERNEL_NAME_PREFIX + KernelName;
}

CudaLoop *Outliner = new CudaLoop(M, KernelModule, KernelName, this);
Expand Down

0 comments on commit e7d0c09

Please sign in to comment.