Skip to content

Commit

Permalink
Working on some runtime tweaks and clean up. Traced a new crash to
Browse files Browse the repository at this point in the history
the use of a ptxas whole-program optimization flag.
  • Loading branch information
pmccormick committed Nov 27, 2023
1 parent 9512eb5 commit e7d0c09
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 58 deletions.
2 changes: 1 addition & 1 deletion kitsune/experiments/inc/kitsune-tapir.mk
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
KITSUNE_PREFIX?=/projects/kitsune/${host_arch}/16.x
KITSUNE_OPTLEVEL?=3
KITSUNE_ABI_OPTLEVEL?=2
KITSUNE_ABI_OPTLEVEL?=3
KITSUNE_OPTFLAGS?=-O$(KITSUNE_OPTLEVEL)

# For now we disable stripmining on GPUs.
Expand Down
1 change: 1 addition & 0 deletions kitsune/experiments/raytracer/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ raytracer-forall.cuda.${host_arch}: raytracer-forall.cpp
@echo $@
@$(TIME_CMD) $(KIT_CXX) $(TAPIR_CUDA_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
@$(FILE_SIZE)

raytracer-forall.hip.${host_arch}: raytracer-forall.cpp
@echo $@
@$(TIME_CMD) $(KIT_CXX) -v $(TAPIR_HIP_FLAGS) -o $@ $< -Xlinker -rpath=$(KITSUNE_PREFIX)/lib
Expand Down
112 changes: 85 additions & 27 deletions kitsune/runtime/cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,10 @@
//===----------------------------------------------------------------------===//

// TODO:
// * Need to do a better job tracking and freeing resources as necessary.
// * Need to ponder a path for better stream usage (probably related to
// more complex code generation on the compiler side).
//
//
// * Need a few options for stream usage that will likely require some
// compiler-side static analysis and additional entry points for
// runtime tuning. (consider: blocked prefetches, prefetch streams, etc.).

#include <cassert>
#include <cstdio>
#include <cstdlib>
Expand Down Expand Up @@ -122,6 +121,13 @@ extern unsigned _kitrt_MaxPrefetchStreams;
static unsigned _kitrt_CurPrefetchStream = 0;
std::vector<CUstream> _kitrt_PrefetchStreams;

// A deferred prefetch request: captures the base address and size of a
// managed allocation that should be prefetched at a later point (requests
// are queued by __kitrt_cuPrefetchRequest below).
struct KitRTPrefetchRequest {
void *addr;   // base address of the managed allocation
size_t size;  // allocation size in bytes (as tracked by the memory map)
};

// Outstanding (not-yet-issued) prefetch requests, in arrival order.
std::list<KitRTPrefetchRequest> _kitrt_PrefetchRequests;

// NOTE: Over a series of CUDA releases it is worthwhile to
// check in on the header files for replacement versioned
// entry points into the driver API. These are typically
Expand Down Expand Up @@ -326,13 +332,9 @@ bool __kitrt_cuInit() {
_kitrtUseHeuristicLaunchParameters = false;
}

if (__kitrt_prefetchEnabled()) {
fprintf(stderr, "kitrt: prefetching enabled.\n");
}

if (__kitrt_prefetchStreamsEnabled()) {
for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
CUstream stream;
CUstream stream;
CU_SAFE_CALL(cuStreamCreate_p(&stream, CU_STREAM_DEFAULT));
_kitrt_PrefetchStreams.push_back(stream);
}
Expand All @@ -347,14 +349,14 @@ void __kitrt_cuDestroy() {
void __kitrt_cuFreeManagedMem(void *vp);
__kitrt_destroyMemoryMap(__kitrt_cuFreeManagedMem);

/*for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
CUstream stream = _kitrt_PrefetchStreams[si];
CU_SAFE_CALL(hipStreamDestroy_p(stream));
if (__kitrt_prefetchStreamsEnabled()) {
for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
CUstream stream = _kitrt_PrefetchStreams[si];
CU_SAFE_CALL(cuStreamDestroy_v2_p(stream));
}
}
*/

// Note that all resources associated with the context will be destroyed.
CU_SAFE_CALL(cuDevicePrimaryCtxRelease_v2_p(_kitrtCUdevice));
CU_SAFE_CALL(cuDevicePrimaryCtxReset_v2_p(_kitrtCUdevice));
_kitrt_cuIsInitialized = false;
}
Expand Down Expand Up @@ -489,20 +491,65 @@ bool __kitrt_cuIsMemManaged(void *vp) {

// ---- Memory/data prefetch and data movement support.

/// @brief Queue a deferred prefetch request for a managed allocation.
/// @param vp: Pointer to a managed memory allocation.
///
/// If the allocation has not already been prefetched, and the memory map
/// reports a non-zero size for it, the request is appended to the pending
/// request list. Pointers unknown to the memory map (size stays 0) and
/// already-prefetched allocations are silently ignored.
void __kitrt_cuPrefetchRequest(void *vp) {
  assert(vp && "unexpected null pointer!"); // match sibling entry points.
  size_t size = 0;
  if (not __kitrt_isMemPrefetched(vp, &size) && size > 0)
    _kitrt_PrefetchRequests.push_back({vp, size});
}


void __kitrt_cuMemPrefetchOnStream(void *vp, void *stream) {
assert(vp && "unexpected null pointer!");
bool is_read_only, is_write_only;
size_t size = __kitrt_getMemAllocSize(vp, &is_read_only, &is_write_only);
if (size > 0) {
// If we have a size, we know this pointer is from managed memory...
if (is_read_only) {
CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
CU_MEM_ADVISE_SET_READ_MOSTLY,
_kitrtCUdevice));
} else {
size_t size = 0;
if (not __kitrt_isMemPrefetched(vp, &size)) {
if (size > 0) {

//if (is_read_only) {
// CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
// CU_MEM_ADVISE_SET_READ_MOSTLY,
// _kitrtCUdevice));
//} else {
// CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
// CU_MEM_ADVISE_UNSET_READ_MOSTLY,
// _kitrtCUdevice));
//}


// Our semantics assume that a prefetch request suggests an inbound
// kernel launch. Setting the preferred location does not cause
// data to migrate to that location immediately. Instead, it guides
// the migration policy when a fault occurs on that memory region. If
// the data is already in its preferred location and the faulting
// processor can establish a mapping without requiring the data to be
// migrated, then data migration will be avoided. On the other hand, if
// the data is not in its preferred location or if a direct mapping cannot
// be established, then it will be migrated to the processor accessing it.
// It is important to note that setting the preferred location does not
// prevent data prefetching done using cuMemPrefetchAsync(). Having a
// preferred location can override the page thrash detection and
// resolution logic in the Unified Memory driver. Normally, if a page is
// detected to be constantly thrashing between host and device
// memory, the page may eventually be pinned to host memory. But if the
// preferred location is set as device memory, then the page will continue
// to thrash indefinitely. If CU_MEM_ADVISE_SET_READ_MOSTLY is also set on
// this memory region or any subset of it, then the policies associated
// with that advice will override the policies of this advice, unless read
// accesses from device will not result in a read-only copy being created
// on that device as outlined in description for the advice
// CU_MEM_ADVISE_SET_READ_MOSTLY.
CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size,
CU_MEM_ADVISE_UNSET_READ_MOSTLY,
_kitrtCUdevice));
CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
_kitrtCUdevice));

CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitrtCUdevice,
(CUstream)stream));
__kitrt_markMemPrefetched(vp);
}
// Our semantics assume that a prefetch request suggests an inbound
// kernel launch. Setting the preferred location does not cause
Expand Down Expand Up @@ -535,12 +582,23 @@ void __kitrt_cuMemPrefetchOnStream(void *vp, void *stream) {
}
}


/// @brief Prefetch the given managed allocation on the default stream.
/// @param vp: Pointer to a managed memory allocation.
void __kitrt_cuMemPrefetch(void *vp) {
  assert(vp && "unexpected null pointer!");
  // nullptr (modern C++ spelling, not NULL) selects the default CUDA stream.
  __kitrt_cuMemPrefetchOnStream(vp, nullptr);
}


void __kitrt_cuStreamSetMemPrefetch(void *vp) {
// Prefetching with streams has some rules that make a guaranteed
// behavior difficult... For a busy stream, the prefetch is
// deferred to a background thread by the driver to maintain stream
// ordering. This background thread executes the prefetch when all
// prior operations in the stream are completed. For idle streams,
// the driver can either defer the operation or not, but the driver
// often (how often?) does not defer because of the associated
// overhead. The exact details for when the driver may defer vary
// across driver versions.
assert(vp && "unexpected null pointer!");
CUstream stream = _kitrt_PrefetchStreams[_kitrt_CurPrefetchStream];
__kitrt_cuMemPrefetchOnStream(vp, (void*)stream);
Expand Down Expand Up @@ -1002,7 +1060,7 @@ void __kitrt_cuSynchronizeStreams() {
CU_SAFE_CALL(cuCtxSynchronize());
while (not _kitrtActiveStreams.empty()) {
CUstream stream = _kitrtActiveStreams.front();
CU_SAFE_CALL(cuStreamDestroy(stream));
CU_SAFE_CALL(cuStreamDestroy_v2_p(stream));
_kitrtActiveStreams.pop_front();
}
}
Expand Down
10 changes: 6 additions & 4 deletions kitsune/runtime/hip/hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,13 @@ void __kitrt_hipDestroy() {
extern void __kitrt_hipFreeManagedMem(void *);
__kitrt_destroyMemoryMap(__kitrt_hipFreeManagedMem);

/*for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
hipStream_t stream = *(_kitrt_PrefetchStreams[si]);
HIP_SAFE_CALL(hipStreamDestroy_p(stream));

if (__kitrt_prefetchStreamsEnabled()) {
for(unsigned si = 0; si < _kitrt_MaxPrefetchStreams; si++) {
hipStream_t stream = *(_kitrt_PrefetchStreams[si]);
HIP_SAFE_CALL(hipStreamDestroy_p(stream));
}
}
*/
HIP_SAFE_CALL(hipDeviceReset_p());

_kitrt_hipIsInitialized = false;
Expand Down
10 changes: 6 additions & 4 deletions kitsune/runtime/kitrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,12 @@ void __kitrt_CommonInit() {
__kitrt_getEnvValue("KITRT_MAX_NUM_PREFETCH_STREAMS", _kitrt_MaxPrefetchStreams);
__kitrt_getEnvValue("KITRT_DEVICE_ID", _kitrt_DefaultDeviceID);

if (__kitrt_prefetchEnabled())
fprintf(stderr, "kitrt: prefetch enabled.\n");
if (__kitrt_prefetchStreamsEnabled())
fprintf(stderr, "kitrt: maximum prefetch streams: %d\n", _kitrt_MaxPrefetchStreams);
if (_kitrtVerboseMode) {
if (__kitrt_prefetchEnabled())
fprintf(stderr, "kitrt: prefetch enabled.\n");
if (__kitrt_prefetchStreamsEnabled())
fprintf(stderr, "kitrt: maximum prefetch streams: %d\n", _kitrt_MaxPrefetchStreams);
}
}

void __kitrt_setVerboseMode(bool Enable) {
Expand Down
4 changes: 3 additions & 1 deletion kitsune/runtime/memory_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,12 @@ void __kitrt_clearMemAdvice(void *addr) {
}
}

bool __kitrt_isMemPrefetched(void *addr) {
bool __kitrt_isMemPrefetched(void *addr, size_t *size) {
assert(addr != nullptr && "unexpected null pointer!");
KitRTAllocMap::const_iterator cit = _kitrtAllocMap.find(addr);
if (cit != _kitrtAllocMap.end()) {
if (size != nullptr)
*size = cit->second.size;
return cit->second.prefetched;
} else {
#ifdef _KITRT_VERBOSE_
Expand Down
2 changes: 1 addition & 1 deletion kitsune/runtime/memory_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ extern void __kitrt_memNeedsPrefetch(void *addr);

/// @brief Return the prefetch status of the given allocation.
/// @param addr: The pointer to the managed allocation.
bool __kitrt_isMemPrefetched(void *addr);
bool __kitrt_isMemPrefetched(void *addr, size_t *size = nullptr);

/// @brief Is the given managed allocation marked as ready-only?
/// @param addr: The pointer to the managed allocation.
Expand Down
47 changes: 27 additions & 20 deletions llvm/lib/Transforms/Tapir/CudaABI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ using namespace llvm;
#define DEBUG_TYPE "cuabi" // support for -debug-only=cuabi

static const std::string CUABI_PREFIX = "__cuabi";
static const std::string CUABI_KERNEL_NAME_PREFIX = CUABI_PREFIX + ".kern.";
static const std::string CUABI_KERNEL_NAME_PREFIX = CUABI_PREFIX + "_kern_";

// NOTE: At this point in time we do not provide support for the older range
// of GPU architectures. We favor 64-bit and SM_60 or newer, which
Expand Down Expand Up @@ -115,12 +115,6 @@ static cl::opt<bool>
static cl::opt<unsigned>
OptLevel("cuabi-opt-level", cl::init(3), cl::NotHidden,
cl::desc("Specify the GPU kernel optimization level."));
static const OptimizationLevel *optLevels[4] = {
&OptimizationLevel::O0,
&OptimizationLevel::O1,
&OptimizationLevel::O2,
&OptimizationLevel::O3
};

/// Enable an extra set of passes over the host-side code after the
/// code has been transformed (e.g., loops replaced with kernel launch
Expand Down Expand Up @@ -1380,11 +1374,7 @@ CudaABIOutputFile CudaABI::assemblePTXFile(CudaABIOutputFile &PTXFile) {

// For now let's always warn if we spill registers...
PTXASArgList.push_back("--warn-on-spills");

if (Verbose)
PTXASArgList.push_back("--verbose");
else
LLVM_DEBUG(PTXASArgList.push_back("--verbose"));
PTXASArgList.push_back("--verbose");

if (Debug) {
PTXASArgList.push_back("--device-debug");
Expand Down Expand Up @@ -1417,7 +1407,9 @@ CudaABIOutputFile CudaABI::assemblePTXFile(CudaABIOutputFile &PTXFile) {
break;
case 3:
PTXASArgList.push_back("3");
PTXASArgList.push_back("--extensible-whole-program");
// TODO: Some compiled codes (e.g., the raytracer test) crash with a
// corrupted kernel error (module load time) if this flag is enabled.
//PTXASArgList.push_back("--extensible-whole-program");
break;
default:
llvm_unreachable_internal("unhandled/unexpected optimization level",
Expand Down Expand Up @@ -1994,9 +1986,17 @@ CudaABIOutputFile CudaABI::generatePTX() {
PipelineTuningOptions pto;
pto.LoopVectorization = OptLevel > 2;
pto.SLPVectorization = OptLevel > 2;
pto.LoopUnrolling = OptLevel >= 2;;
pto.LoopUnrolling = OptLevel >= 2;
pto.LoopInterleaving = OptLevel > 2;
pto.LoopStripmine = false;
pto.LoopStripmine = OptLevel > 2;
OptimizationLevel optLevels[] = {
OptimizationLevel::O0,
OptimizationLevel::O1,
OptimizationLevel::O2,
OptimizationLevel::O3,
};
OptimizationLevel optLevel = optLevels[OptLevel];

LoopAnalysisManager lam;
FunctionAnalysisManager fam;
CGSCCAnalysisManager cgam;
Expand All @@ -2008,8 +2008,9 @@ CudaABIOutputFile CudaABI::generatePTX() {
pb.registerLoopAnalyses(lam);
PTXTargetMachine->registerPassBuilderCallbacks(pb);
pb.crossRegisterProxies(lam, fam, cgam, mam);
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(*optLevels[OptLevel]);
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
mpm.addPass(VerifierPass());
LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
mpm.run(KernelModule, mam);
LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n");
}
Expand Down Expand Up @@ -2072,7 +2073,13 @@ void CudaABI::postProcessModule() {
FunctionAnalysisManager fam;
CGSCCAnalysisManager cgam;
ModuleAnalysisManager mam;

OptimizationLevel optLevels[] = {
OptimizationLevel::O0,
OptimizationLevel::O1,
OptimizationLevel::O2,
OptimizationLevel::O3,
};
OptimizationLevel optLevel = optLevels[OptLevel];
PassBuilder pb(PTXTargetMachine, pto);
pb.registerModuleAnalyses(mam);
pb.registerCGSCCAnalyses(cgam);
Expand All @@ -2081,7 +2088,7 @@ void CudaABI::postProcessModule() {
PTXTargetMachine->registerPassBuilderCallbacks(pb);
pb.crossRegisterProxies(lam, fam, cgam, mam);

ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(*optLevels[OptLevel]);
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
mpm.addPass(VerifierPass());
mpm.run(M, mam);
LLVM_DEBUG(dbgs() << "\tpasses complete.\n");
Expand Down Expand Up @@ -2110,14 +2117,14 @@ CudaABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) {
// If we have debug info in the module use a line number
// based naming scheme for kernels.
unsigned LineNumber = TL->getLoop()->getStartLoc()->getLine();
KernelName = CUABI_PREFIX + ModuleName + "_" + Twine(LineNumber).str();
KernelName = CUABI_KERNEL_NAME_PREFIX + ModuleName + "_" + Twine(LineNumber).str();
} else {
//SmallString<255> ModName(Twine(ModuleName).str());
//sys::path::replace_extension(ModName, "");
//KernelName = CUABI_PREFIX + ModName.c_str();
// In the non-debug mode we use a consecutive numbering scheme for our
// kernel names (this is currently handled via the 'make unique' parameter).
KernelName = CUABI_PREFIX + KernelName;
KernelName = CUABI_KERNEL_NAME_PREFIX + KernelName;
}

CudaLoop *Outliner = new CudaLoop(M, KernelModule, KernelName, this);
Expand Down

0 comments on commit e7d0c09

Please sign in to comment.