From 2668be237f4c59e7b33a3e27117324db876e486f Mon Sep 17 00:00:00 2001
From: Patrick McCormick <>
Date: Mon, 27 Nov 2023 15:09:29 -0700
Subject: [PATCH] Tune HIP performance: euler3d timing, flags, HipABI passes

---
 .../experiments/euler3d/euler3d-forall.cpp    | 31 ++++++++++---------
 .../experiments/euler3d/euler3d-kokkos.cpp    |  7 ++---
 kitsune/experiments/inc/kitsune-tapir.mk      |  3 +-
 llvm/lib/Transforms/Tapir/HipABI.cpp          | 24 +++++++++-----
 4 files changed, 38 insertions(+), 27 deletions(-)
diff --git a/kitsune/experiments/euler3d/euler3d-forall.cpp b/kitsune/experiments/euler3d/euler3d-forall.cpp
index db7ecf1109baa5..8e8a15f4900927 100644
--- a/kitsune/experiments/euler3d/euler3d-forall.cpp
+++ b/kitsune/experiments/euler3d/euler3d-forall.cpp
@@ -41,9 +41,8 @@ struct Float3 {
 
 inline __attribute__((always_inline))
 void cpy(float* dst, const float* src, int N) {
-  forall(unsigned int i = 0; i < N; i++) {
+  forall(unsigned int i = 0; i < N; i++)
     dst[i] = src[i];
-  }
 }
 
 void dump(float* variables, int nel, int nelr)
@@ -137,7 +136,8 @@ float compute_pressure(float density,
                        float density_energy,
                        float speed_sqd)
 {
-  return (float(GAMMA)-float(1.0f))*(density_energy - float(0.5f)*density*speed_sqd);
+  return (float(GAMMA)-float(1.0f))*(density_energy -
+	  float(0.5f)*density*speed_sqd);
 }
 
 inline __attribute__((always_inline))
@@ -146,7 +146,6 @@ float compute_speed_of_sound(float density, float pressure)
   return sqrtf(float(GAMMA)*pressure/density);
 }
 
-//inline __attribute__((always_inline))
 void compute_step_factor(int nelr,
                          const float* __restrict variables,
                          const float* areas,
@@ -194,11 +193,11 @@ void compute_flux(int nelr,
   using namespace std;
   const float smoothing_coefficient = 0.2f;
 
-  forall(int blk = 0; blk < nelr/block_length; ++blk) {
-    int b_start = blk*block_length;
-    int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length;
+  forall(unsigned int blk = 0; blk < nelr/block_length; ++blk) {
+    unsigned int b_start = blk*block_length;
+    unsigned int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length;
 
-    for(int i = b_start; i < b_end; ++i) {
+    for(unsigned int i = b_start; i < b_end; ++i) {
       float density_i = variables[i + VAR_DENSITY*nelr];
       Float3 momentum_i;
       momentum_i.x = variables[i + (VAR_MOMENTUM+0)*nelr];
@@ -417,11 +416,11 @@ int main(int argc, char** argv)
       
   cout << "  Reading input data, allocating arrays, initializing data, etc..." 
        << std::flush;
+
   auto total_start_time = chrono::steady_clock::now();
 
   // these need to be computed the first time in order to compute time step
-       
-  float *ff_variable = alloc<float>(NVAR);
+    float *ff_variable = alloc<float>(NVAR);
   Float3 ff_flux_contribution_momentum_x,
     ff_flux_contribution_momentum_y,
     ff_flux_contribution_momentum_z;
@@ -526,8 +525,8 @@ int main(int argc, char** argv)
   double copy_total = 0.0;
   double sf_total = 0.0;
   double rk_total = 0.0;
+
   for(int i = 0; i < iterations; i++) {
-    
     auto copy_start = chrono::steady_clock::now();
     cpy(old_variables, variables, nelr*NVAR);
     auto copy_end = chrono::steady_clock::now();
@@ -553,8 +552,10 @@ int main(int argc, char** argv)
     }
     auto rk_end = chrono::steady_clock::now();
     time = chrono::duration<double>(rk_end-rk_start).count();
-    rk_times[i] = time;
-    rk_total += time;
+    if (i > 0) {
+      rk_times[i] = time;
+      rk_total += time;
+    }
   }
   
   dump(variables, nel, nelr);
@@ -562,9 +563,9 @@ int main(int argc, char** argv)
   auto end_time = chrono::steady_clock::now();
   double elapsed_time = chrono::duration<double>(end_time-start_time).count();
   double total_time = chrono::duration<double>(end_time-total_start_time).count();
-  double rk_mean = rk_total / iterations;
+  double rk_mean = rk_total / (iterations-1);
   double sum = 0.0;
-  for(int i = 0; i < iterations; i++) {
+  for(int i = 1; i < iterations; i++) {
     double dist = rk_times[i] - rk_mean;
     sum += dist * dist; 
   }
diff --git a/kitsune/experiments/euler3d/euler3d-kokkos.cpp b/kitsune/experiments/euler3d/euler3d-kokkos.cpp
index 6bcdfaab0ca87c..95e04f432f13d6 100644
--- a/kitsune/experiments/euler3d/euler3d-kokkos.cpp
+++ b/kitsune/experiments/euler3d/euler3d-kokkos.cpp
@@ -44,8 +44,7 @@ struct Float3 {
 #define __restrict
 #endif
 
-template <typename T>
-void cpy(View<T> &dst, View<T> &src, int N) {
+void cpy(View<float> &dst, View<float> &src, int N) {
   src.sync_device();
   dst.sync_device();
   Kokkos::parallel_for("copy", N, KOKKOS_LAMBDA(const int &i) {
@@ -170,14 +169,13 @@ void compute_step_factor(int nelr,
   variables.sync_device();
   areas.sync_device();
   step_factors.sync_device();
-  step_factors.modify_device();
 
   Kokkos::parallel_for("compute_step_factor", nelr/block_length,
         KOKKOS_LAMBDA(const int &blk) {
     int b_start = blk*block_length;
     int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length;
 
-    for(int i = b_start; i < b_end; i++) {
+    for(unsigned int i = b_start; i < b_end; i++) {
       float density = variables.d_view(i + VAR_DENSITY*nelr);
 
       Float3 momentum;
@@ -199,6 +197,7 @@ void compute_step_factor(int nelr,
         (sqrtf(speed_sqd) + speed_of_sound));
     }
   });
+  step_factors.modify_device();
   Kokkos::fence();
 }
 
diff --git a/kitsune/experiments/inc/kitsune-tapir.mk b/kitsune/experiments/inc/kitsune-tapir.mk
index 9ce554fd322717..359d6e491c4996 100644
--- a/kitsune/experiments/inc/kitsune-tapir.mk
+++ b/kitsune/experiments/inc/kitsune-tapir.mk
@@ -13,7 +13,7 @@ GPU_STRIPMINE_FLAGS?=
 ##################################
 TAPIR_CUDA_FLAGS?=-ftapir=cuda \
  -O$(KITSUNE_OPTLEVEL) \
-  -mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
+ -mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
  -mllvm -cuabi-arch=$(CUDA_ARCH) \
  -ffp-contract=fast \
  -mllvm -cuabi-prefetch=true \
@@ -38,6 +38,7 @@ TAPIR_HIP_FLAGS?=-ftapir=hip \
   -O$(KITSUNE_OPTLEVEL) \
   -mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
   -ffp-contract=fast \
+  -fno-vectorize \
   -mllvm -hipabi-arch=$(AMDGPU_ARCH) \
   -mllvm -hipabi-prefetch=true \
   -mllvm -hipabi-streams=true \
diff --git a/llvm/lib/Transforms/Tapir/HipABI.cpp b/llvm/lib/Transforms/Tapir/HipABI.cpp
index 536e5cad64e4b6..479bea731ce434 100644
--- a/llvm/lib/Transforms/Tapir/HipABI.cpp
+++ b/llvm/lib/Transforms/Tapir/HipABI.cpp
@@ -1742,9 +1742,10 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
   ObjFile->keep();
 
   if (OptLevel > 0) {
-    if (OptLevel > 3)
+
+    if (OptLevel > 3)  // Clamp to 3; presumably matches Clang (TODO confirm).
       OptLevel = 3;
-    LLVM_DEBUG(dbgs() << "\trunning kernel module optimization passes.\n");
+
     PipelineTuningOptions pto;
     pto.LoopVectorization = OptLevel > 2;
     pto.SLPVectorization = OptLevel > 2;
@@ -1752,11 +1753,17 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
     pto.LoopInterleaving = OptLevel > 2;
     pto.LoopStripmine = OptLevel > 2;
     pto.ForgetAllSCEVInLoopUnroll = OptLevel > 2;
+    
+    // From the LLVM docs: create the analysis managers. They must be
+    // declared in this order so that they are destroyed in the correct
+    // order, because the analysis managers hold references to one
+    // another.
     LoopAnalysisManager lam;
     FunctionAnalysisManager fam;
     CGSCCAnalysisManager cgam;
     ModuleAnalysisManager mam;
-    PassBuilder pb(AMDTargetMachine, pto);
+
+    PassBuilder pb(AMDTargetMachine);//, pto);
     pb.registerModuleAnalyses(mam);
     pb.registerCGSCCAnalyses(cgam);
     pb.registerFunctionAnalyses(fam);
@@ -1770,10 +1777,13 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
         OptimizationLevel::O3,
     };
     OptimizationLevel optLevel = optLevels[OptLevel];
-    ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
-    mpm.addPass(VerifierPass());
-    LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
-    mpm.run(KernelModule, mam);
+    ModulePassManager mpm0 = pb.buildModuleSimplificationPipeline(optLevels[3], ThinOrFullLTOPhase::None);
+    ModulePassManager mpm1 = pb.buildPerModuleDefaultPipeline(optLevels[2]);
+    mpm0.addPass(VerifierPass());
+    mpm1.addPass(VerifierPass());
+    LLVM_DEBUG(dbgs() << "\t\t* optimize module: " << KernelModule.getName() << "\n");
+    mpm0.run(KernelModule, mam);
+    mpm1.run(KernelModule, mam);
     LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n");
   }