From 2668be237f4c59e7b33a3e27117324db876e486f Mon Sep 17 00:00:00 2001 From: Patrick McCormick <> Date: Mon, 27 Nov 2023 15:09:29 -0700 Subject: [PATCH] work on hip performance details. --- .../experiments/euler3d/euler3d-forall.cpp | 31 ++++++++++--------- .../experiments/euler3d/euler3d-kokkos.cpp | 7 ++--- kitsune/experiments/inc/kitsune-tapir.mk | 3 +- llvm/lib/Transforms/Tapir/HipABI.cpp | 24 +++++++++----- 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/kitsune/experiments/euler3d/euler3d-forall.cpp b/kitsune/experiments/euler3d/euler3d-forall.cpp index db7ecf1109baa5..8e8a15f4900927 100644 --- a/kitsune/experiments/euler3d/euler3d-forall.cpp +++ b/kitsune/experiments/euler3d/euler3d-forall.cpp @@ -41,9 +41,8 @@ struct Float3 { inline __attribute__((always_inline)) void cpy(float* dst, const float* src, int N) { - forall(unsigned int i = 0; i < N; i++) { + forall(unsigned int i = 0; i < N; i++) dst[i] = src[i]; - } } void dump(float* variables, int nel, int nelr) @@ -137,7 +136,8 @@ float compute_pressure(float density, float density_energy, float speed_sqd) { - return (float(GAMMA)-float(1.0f))*(density_energy - float(0.5f)*density*speed_sqd); + return (float(GAMMA)-float(1.0f))*(density_energy - + float(0.5f)*density*speed_sqd); } inline __attribute__((always_inline)) @@ -146,7 +146,6 @@ float compute_speed_of_sound(float density, float pressure) return sqrtf(float(GAMMA)*pressure/density); } -//inline __attribute__((always_inline)) void compute_step_factor(int nelr, const float* __restrict variables, const float* areas, @@ -194,11 +193,11 @@ void compute_flux(int nelr, using namespace std; const float smoothing_coefficient = 0.2f; - forall(int blk = 0; blk < nelr/block_length; ++blk) { - int b_start = blk*block_length; - int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length; + forall(unsigned int blk = 0; blk < nelr/block_length; ++blk) { + unsigned int b_start = blk*block_length; + unsigned int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length; - for(int i = b_start; i < b_end; ++i) { + for(unsigned int i = b_start; i < b_end; ++i) { float density_i = variables[i + VAR_DENSITY*nelr]; Float3 momentum_i; momentum_i.x = variables[i + (VAR_MOMENTUM+0)*nelr]; @@ -417,11 +416,11 @@ int main(int argc, char** argv) cout << " Reading input data, allocating arrays, initializing data, etc..." << std::flush; + auto total_start_time = chrono::steady_clock::now(); // these need to be computed the first time in order to compute time step - - float *ff_variable = alloc(NVAR); + float *ff_variable = alloc(NVAR); Float3 ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z; @@ -526,8 +525,8 @@ int main(int argc, char** argv) double copy_total = 0.0; double sf_total = 0.0; double rk_total = 0.0; + for(int i = 0; i < iterations; i++) { - auto copy_start = chrono::steady_clock::now(); cpy(old_variables, variables, nelr*NVAR); auto copy_end = chrono::steady_clock::now(); @@ -553,8 +552,10 @@ int main(int argc, char** argv) } auto rk_end = chrono::steady_clock::now(); time = chrono::duration(rk_end-rk_start).count(); - rk_times[i] = time; - rk_total += time; + if (i > 0) { + rk_times[i] = time; + rk_total += time; + } } dump(variables, nel, nelr); @@ -562,9 +563,9 @@ int main(int argc, char** argv) auto end_time = chrono::steady_clock::now(); double elapsed_time = chrono::duration(end_time-start_time).count(); double total_time = chrono::duration(end_time-total_start_time).count(); - double rk_mean = rk_total / iterations; + double rk_mean = rk_total / (iterations-1); double sum = 0.0; - for(int i = 0; i < iterations; i++) { + for(int i = 1; i < iterations; i++) { double dist = rk_times[i] - rk_mean; sum += dist * dist; } diff --git a/kitsune/experiments/euler3d/euler3d-kokkos.cpp b/kitsune/experiments/euler3d/euler3d-kokkos.cpp index 6bcdfaab0ca87c..95e04f432f13d6 100644 --- a/kitsune/experiments/euler3d/euler3d-kokkos.cpp +++ b/kitsune/experiments/euler3d/euler3d-kokkos.cpp @@ -44,8 +44,7 @@ struct Float3 { #define __restrict #endif -template -void cpy(View &dst, View &src, int N) { +void cpy(View &dst, View &src, int N) { src.sync_device(); dst.sync_device(); Kokkos::parallel_for("copy", N, KOKKOS_LAMBDA(const int &i) { @@ -170,14 +169,13 @@ void compute_step_factor(int nelr, variables.sync_device(); areas.sync_device(); step_factors.sync_device(); - step_factors.modify_device(); Kokkos::parallel_for("compute_step_factor", nelr/block_length, KOKKOS_LAMBDA(const int &blk) { int b_start = blk*block_length; int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length; - for(int i = b_start; i < b_end; i++) { + for(unsigned int i = b_start; i < b_end; i++) { float density = variables.d_view(i + VAR_DENSITY*nelr); Float3 momentum; @@ -199,6 +197,7 @@ void compute_step_factor(int nelr, (sqrtf(speed_sqd) + speed_of_sound)); } }); + step_factors.modify_device(); Kokkos::fence(); } diff --git a/kitsune/experiments/inc/kitsune-tapir.mk b/kitsune/experiments/inc/kitsune-tapir.mk index 9ce554fd322717..359d6e491c4996 100644 --- a/kitsune/experiments/inc/kitsune-tapir.mk +++ b/kitsune/experiments/inc/kitsune-tapir.mk @@ -13,7 +13,7 @@ GPU_STRIPMINE_FLAGS?= ################################## TAPIR_CUDA_FLAGS?=-ftapir=cuda \ -O$(KITSUNE_OPTLEVEL) \ - -mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \ + -mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \ -mllvm -cuabi-arch=$(CUDA_ARCH) \ -ffp-contract=fast \ -mllvm -cuabi-prefetch=true \ @@ -38,6 +38,7 @@ TAPIR_HIP_FLAGS?=-ftapir=hip \ -O$(KITSUNE_OPTLEVEL) \ -mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \ -ffp-contract=fast \ + -fno-vectorize \ -mllvm -hipabi-arch=$(AMDGPU_ARCH) \ -mllvm -hipabi-prefetch=true \ -mllvm -hipabi-streams=true \ diff --git a/llvm/lib/Transforms/Tapir/HipABI.cpp b/llvm/lib/Transforms/Tapir/HipABI.cpp index 536e5cad64e4b6..479bea731ce434 100644 --- a/llvm/lib/Transforms/Tapir/HipABI.cpp +++ b/llvm/lib/Transforms/Tapir/HipABI.cpp @@ -1742,9 +1742,10 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) { ObjFile->keep(); if (OptLevel > 0) { - if (OptLevel > 3) + + if (OptLevel > 3) // This (I think) is consistent w/ Clang behavior... OptLevel = 3; - LLVM_DEBUG(dbgs() << "\trunning kernel module optimization passes.\n"); + PipelineTuningOptions pto; pto.LoopVectorization = OptLevel > 2; pto.SLPVectorization = OptLevel > 2; @@ -1752,11 +1753,17 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) { pto.LoopInterleaving = OptLevel > 2; pto.LoopStripmine = OptLevel > 2; pto.ForgetAllSCEVInLoopUnroll = OptLevel > 2; + + // From the LLVM docs: Create the analysis managers. + // These must be declared in this order so that they are destroyed in the + // correct order due to inter-analysis-manager + // references. LoopAnalysisManager lam; FunctionAnalysisManager fam; CGSCCAnalysisManager cgam; ModuleAnalysisManager mam; - PassBuilder pb(AMDTargetMachine, pto); + + PassBuilder pb(AMDTargetMachine);//, pto); pb.registerModuleAnalyses(mam); pb.registerCGSCCAnalyses(cgam); pb.registerFunctionAnalyses(fam); @@ -1770,10 +1777,13 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) { OptimizationLevel::O3, }; OptimizationLevel optLevel = optLevels[OptLevel]; - ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel); - mpm.addPass(VerifierPass()); - LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n"); - mpm.run(KernelModule, mam); + ModulePassManager mpm0 = pb.buildModuleSimplificationPipeline(optLevels[3], ThinOrFullLTOPhase::None); + ModulePassManager mpm1 = pb.buildPerModuleDefaultPipeline(optLevels[2]); + mpm0.addPass(VerifierPass()); + mpm1.addPass(VerifierPass()); + LLVM_DEBUG(dbgs() << "\t\t* optimize module: " << KernelModule.getName() << "\n"); + mpm0.run(KernelModule, mam); + mpm1.run(KernelModule, mam); LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n"); }