Skip to content

Commit

Permalink
Fixes for LTO...
Browse files Browse the repository at this point in the history
  • Loading branch information
pmccormick committed Feb 13, 2024
1 parent f9094d3 commit dbfc195
Show file tree
Hide file tree
Showing 11 changed files with 150 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,14 +242,14 @@ int main(int argc, char** argv)
<< "*** " << elapsed_time << ", " << elapsed_time << "\n"
<< "----\n\n";

dealloc(ff_variable);
/*dealloc(ff_variable);
dealloc(areas);
dealloc(elements_surrounding_elements);
dealloc(normals);
dealloc(variables);
dealloc(old_variables);
dealloc(fluxes);
dealloc(step_factors);

*/
return 0;
}
}
7 changes: 4 additions & 3 deletions kitsune/experiments/euler3d-multi-file/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ VPATH := ./forall/

forall_lto_src = forall/compute_flux.cpp \
forall/compute_step_factor.cpp \
forall/time_step.cpp
forall/time_step.cpp \
forall/euler3d-forall.cpp

forall_src = forall/euler3d-forall.cpp

Expand Down Expand Up @@ -40,9 +41,9 @@ forall_cuda_lto_objs := $(patsubst %.cpp, forall/%.cuda.lto.o, $(notdir $(forall
forall_cuda_objs := $(patsubst %.cpp, forall/%.cuda.o, $(notdir $(forall_src)))

forall/%.cuda.lto.o: forall/%.cpp
${KIT_CXX} -c -flto ${TAPIR_CUDA_FLAGS} -o $@ $<
${KIT_CXX} -c -flto -ftapir=cuda -O${KITSUNE_OPTLEVEL} -o $@ $<
euler3d-forall.cuda.${host_arch}: ${forall_cuda_lto_objs}
${KIT_CXX} -flto -fuse-ld=lld ${TAPIR_CUDA_LTO_FLAGS} -o $@ ${forall_cuda_lto_objs} \
${KIT_CXX} -flto -fuse-ld=lld -ftapir=cuda ${TAPIR_CUDA_LTO_FLAGS} -o $@ ${forall_cuda_lto_objs} \
-Xlinker -rpath=${KITSUNE_PREFIX}/lib

clean:
Expand Down
6 changes: 5 additions & 1 deletion kitsune/experiments/inc/kitsune-tapir.mk
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ TAPIR_CUDA_FLAGS=$(TAPIR_CUDA_TARGET) $(TAPIR_CUDA_TARGET_FLAGS)
#-mllvm -cuabi-run-post-opts \
# -mllvm -cuabi-streams=true \
TAPIR_CUDA_LTO_FLAGS?=-Wl,--tapir-target=cuda,--lto-O${KITSUNE_OPTLEVEL},--mllvm=-cuabi-opt-level=$(KITSUNE_ABI_OPTLEVEL),--mllvm=-cuabi-arch=$(CUDA_ARCH)
TAPIR_CUDA_LTO_FLAGS?=-Wl,--tapir-target=cuda\
-Wl,--threads=1\
-Wl,--lto-O${KITSUNE_OPTLEVEL}\
-Wl,-mllvm=-cuabi-opt-level=$(KITSUNE_ABI_OPTLEVEL)\
-Wl,-mllvm=-cuabi-arch=$(CUDA_ARCH)

ifneq ($(KITSUNE_VERBOSE),)
TAPIR_CUDA_FLAGS+=-mllvm -debug-only=cuabi $(TAPIR_CUDA_DEBUG_FLAGS)
Expand Down
33 changes: 33 additions & 0 deletions kitsune/experiments/lto-tests/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Build rules for the multi-file CUDA LTO vector-addition test.
#
# Each source file is compiled separately to an LTO object with the CUDA
# Tapir target, and the objects are then linked into a single executable
# through lld.
include ../experiments.mk

targets += vecadd.cuda.${host_arch}

VPATH := ./

# Translation units that make up the test (one object per file).
forall_lto_src = vecadd-init.cpp vecadd-add.cpp vecadd-forall.cpp

# 'all' and 'clean' name actions, not files; without this a stray file
# called 'all' or 'clean' would stop the rules from running.
.PHONY: all clean

all: ${targets}

#############################
# cuda target
#
forall_cuda_lto_objs := $(patsubst %.cpp, %.cuda.lto.o, $(notdir $(forall_lto_src)))

# NOTE(review): the optimization level is pinned to -O1 here rather than
# taken from a variable -- confirm this is deliberate for the LTO test.
%.cuda.lto.o: %.cpp
	${KIT_CXX} -ftapir=cuda -c -flto -O1 -o $@ $<
# ${KITSUNE_PREFIX}/bin/llvm-dis -o $@.ll $@

# Link step; afterwards the symbols of the produced binary are listed
# with cuobjdump.
vecadd.cuda.${host_arch}: ${forall_cuda_lto_objs}
	${KIT_CXX} -flto -fuse-ld=lld -ftapir=cuda ${TAPIR_CUDA_LTO_FLAGS} -o $@ ${forall_cuda_lto_objs} \
	  -Xlinker -rpath=${KITSUNE_PREFIX}/lib
	cuobjdump -symbols $@

# Every command is prefixed with '-' so that cleaning is best-effort.
clean:
	-rm -f *.${host_arch} *.o *.lto.o
	-rm -f *.fatbin
	-rm -rf *-cfg-tmp
	-rm -f *.bc
	-rm -f *.ppm *.jpg
	-rm -f *.ll *.ptx *.csv *.log *.s *.fbin *.tapir
	-rm -f *.dat
12 changes: 12 additions & 0 deletions kitsune/experiments/lto-tests/vecadd-add.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#include <cstdlib>
#include <kitsune.h>

extern "C" {
void vec_add(const float *A, const float *B, float *C, uint64_t N) {
forall(size_t i = 0; i < N; ++i)
C[i] = A[i] + B[i];
}
}



62 changes: 62 additions & 0 deletions kitsune/experiments/lto-tests/vecadd-forall.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <kitsune.h>

extern "C" void fill(float *data, uint64_t N);
extern "C" void vec_add(const float *A, const float *B, float *C, uint64_t N);

int main (int argc, char* argv[]) {
using namespace std;
uint64_t size = 1024 * 1024 * 256;
unsigned int iterations = 10;
cout << setprecision(5);
cout << "\n";
cout << "---- vector addition benchmark (forall) ----\n"
<< " Vector size: " << size << " elements.\n\n";
cout << " Allocating arrays and filling with random values..."
<< std::flush;
float *A = alloc<float>(size);
float *B = alloc<float>(size);
float *C = alloc<float>(size);
fill(A, size);
fill(B, size);
cout << " done.\n\n";

double elapsed_time;
double min_time = 100000.0;
double max_time = 0.0;
for(unsigned t = 0; t < iterations; t++) {
auto start_time = chrono::steady_clock::now();
vec_add(A, B, C, size);
auto end_time = chrono::steady_clock::now();
elapsed_time = chrono::duration<double>(end_time-start_time).count();
if (elapsed_time < min_time)
min_time = elapsed_time;
if (elapsed_time > max_time)
max_time = elapsed_time;
cout << "\t" << t << ". iteration time: " << elapsed_time << ".\n";
}
cout << " Checking final result..." << std::flush;

size_t error_count = 0;
for(size_t i = 0; i < size; i++) {
float sum = A[i] + B[i];
if (C[i] != sum)
error_count++;
}
if (error_count) {
cout << " incorrect result found! ("
<< error_count << " errors found)\n\n";
return 1;
} else {
cout << " pass (answers match).\n\n"
<< " Total time: " << elapsed_time
<< " seconds. (" << size / elapsed_time << " elements/sec.)\n"
<< "*** " << min_time << ", " << max_time << "\n"
<< "----\n\n";
}

return error_count;
}

13 changes: 13 additions & 0 deletions kitsune/experiments/lto-tests/vecadd-init.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include <cstdlib>
#include <kitsune.h>

extern "C" {
void fill(float *data, uint64_t N) {
float base_value = rand() / (float)RAND_MAX;
forall(size_t i = 0; i < N; ++i)
data[i] = base_value + i;
}
}



5 changes: 3 additions & 2 deletions kitsune/experiments/vecadd/vecadd-forall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

template<typename T>
void random_fill(T *data, size_t N) {
for(size_t i = 0; i < N; ++i)
data[i] = rand() / (T)RAND_MAX;
T base_value = rand() / (T)RAND_MAX;
forall(size_t i = 0; i < N; ++i)
data[i] = base_value + i;
}

int main (int argc, char* argv[]) {
Expand Down
2 changes: 2 additions & 0 deletions kitsune/runtime/cuda/streams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ extern "C" {

CUstream __kitcuda_get_thread_stream() {
KIT_NVTX_PUSH("kitcuda:get_thread_stream", KIT_NVTX_STREAM);

pid_t tid = gettid();

_kitcuda_stream_mutex.lock();
KitCudaStreamMap::iterator sit = _kitcuda_stream_map.find(tid);
CUstream stream;
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Transforms/Tapir/CudaABI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ void CudaLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) {
LLVM_DEBUG(dbgs() << "\t\t- gathering and analyzing global values...\n");
std::set<GlobalValue *> UsedGlobalValues;
Loop &L = *TL.getLoop();

for (Loop *SL : L)
for (BasicBlock *BB : SL->blocks())
collect(*BB, UsedGlobalValues);
Expand Down Expand Up @@ -737,7 +738,7 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out,
// occurred before outlining.
Function *KernelF = Out.Outline;
KernelF->setName(KernelName);

KernelF->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage);
KernelF->removeFnAttr("target-cpu");
KernelF->removeFnAttr("target-features");
KernelF->removeFnAttr("personality");
Expand Down Expand Up @@ -1956,6 +1957,8 @@ CudaABIOutputFile CudaABI::generatePTX() {
LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
mpm.run(KernelModule, mam);
LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n");
LLVM_DEBUG(saveModuleToFile(&KernelModule, KernelModule.getName().str() +
".postopt.LTO.ll"));
}

// Setup the passes and request that the output goes to the
Expand Down
28 changes: 9 additions & 19 deletions llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1742,39 +1742,29 @@ PreservedAnalyses LoopSpawningPass::run(Module &M, ModuleAnalysisManager &AM) {
for (Loop *L : LoopWorkList)
Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
}

errs() << "before target id stuff...\n";
assert(SavedF && "unexpected null saved function?\n");
// FIXME: Are there any chances of WorkList being empty? Why not
// just use the head of the WorkList here vs. SavedF?
TapirTargetID TargetID = GetTLI(*SavedF).getTapirTarget();
errs() << " got a target id : " << int(TargetID) << "...\n";
std::shared_ptr<TapirTarget> Target(getTapirTargetFromID(M, TargetID));
errs() << " created a target...\n";

bool HasParallelism = false;
std::map<TapirTargetID, std::shared_ptr<TapirTarget>> Targets;
errs() << "after target id stuff...\n";

// Now process each loop.
bool HasParallelism = false;
for (Function *F : WorkList) {
TapirTargetID TargetID = GetTLI(*F).getTapirTarget();
std::shared_ptr<TapirTarget> Target(getTapirTargetFromID(M, TargetID));
HasParallelism |=
LoopSpawningImpl(*F, GetDT(*F), GetLI(*F), GetTI(*F), GetSE(*F),
GetAC(*F), GetTTI(*F), TargetID, GetORE(*F), Targets)
.run();
}

// if parallelism was discovered during loop spawning postprocess each target
if (HasParallelism)
// FIXME: The order of target processing here possibly breaks a "inside-out"
// contract (loosely speaking) for ordering. In nested constructs this
// leaves us with a partially completed code transformation when we pop
// up a level of code nesting. This is important for nested loops with
// different targets...
// FIXME: The order of target processing here possibly breaks an
// "inside-out" contract (loosely speaking) for ordering. In nested
// constructs this leaves us with a partially completed code
// transformation when we pop up a level of code nesting. This is
// important for nested loops with different targets...
for (const auto &[TID, ThisTarget] : Targets)
ThisTarget->postProcessModule();

Changed |= HasParallelism;

if (Changed)
return PreservedAnalyses::none();
return PreservedAnalyses::all();
Expand Down

0 comments on commit dbfc195

Please sign in to comment.