Skip to content

Commit

Permalink
work on hip performance details.
Browse files Browse the repository at this point in the history
  • Loading branch information
Patrick McCormick committed Nov 28, 2023
1 parent 243ff11 commit 2668be2
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 27 deletions.
31 changes: 16 additions & 15 deletions kitsune/experiments/euler3d/euler3d-forall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,8 @@ struct Float3 {

// Element-wise copy: assigns src[i] to dst[i] for every index i in [0, N).
//
//   dst - destination array; written at indices 0..N-1.
//   src - source array; read at indices 0..N-1.
//   N   - number of elements to copy.
//
// NOTE(review): `forall` is not standard C++; presumably it is the Kitsune
// parallel-loop construct, so iterations may execute concurrently -- confirm.
// NOTE(review): the index `i` is unsigned while `N` is a signed int, so the
// loop test compares mixed signedness; presumably a negative N would act as a
// very large bound -- confirm callers only pass N >= 0.
// NOTE(review): two `forall` headers appear back to back below (one braced,
// one brace-less). This looks like the removed and added lines of a diff hunk
// shown without +/- markers rather than an intentional nested loop -- confirm
// which form is intended before building this text.
inline __attribute__((always_inline))
void cpy(float* dst, const float* src, int N) {
forall(unsigned int i = 0; i < N; i++) {
forall(unsigned int i = 0; i < N; i++)
dst[i] = src[i];
}
}

void dump(float* variables, int nel, int nelr)
Expand Down Expand Up @@ -137,7 +136,8 @@ float compute_pressure(float density,
float density_energy,
float speed_sqd)
{
return (float(GAMMA)-float(1.0f))*(density_energy - float(0.5f)*density*speed_sqd);
return (float(GAMMA)-float(1.0f))*(density_energy -
float(0.5f)*density*speed_sqd);
}

inline __attribute__((always_inline))
Expand All @@ -146,7 +146,6 @@ float compute_speed_of_sound(float density, float pressure)
return sqrtf(float(GAMMA)*pressure/density);
}

//inline __attribute__((always_inline))
void compute_step_factor(int nelr,
const float* __restrict variables,
const float* areas,
Expand Down Expand Up @@ -194,11 +193,11 @@ void compute_flux(int nelr,
using namespace std;
const float smoothing_coefficient = 0.2f;

forall(int blk = 0; blk < nelr/block_length; ++blk) {
int b_start = blk*block_length;
int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length;
forall(unsigned int blk = 0; blk < nelr/block_length; ++blk) {
unsigned int b_start = blk*block_length;
unsigned int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length;

for(int i = b_start; i < b_end; ++i) {
for(unsigned int i = b_start; i < b_end; ++i) {
float density_i = variables[i + VAR_DENSITY*nelr];
Float3 momentum_i;
momentum_i.x = variables[i + (VAR_MOMENTUM+0)*nelr];
Expand Down Expand Up @@ -417,11 +416,11 @@ int main(int argc, char** argv)

cout << " Reading input data, allocating arrays, initializing data, etc..."
<< std::flush;

auto total_start_time = chrono::steady_clock::now();

// these need to be computed the first time in order to compute time step

float *ff_variable = alloc<float>(NVAR);
float *ff_variable = alloc<float>(NVAR);
Float3 ff_flux_contribution_momentum_x,
ff_flux_contribution_momentum_y,
ff_flux_contribution_momentum_z;
Expand Down Expand Up @@ -526,8 +525,8 @@ int main(int argc, char** argv)
double copy_total = 0.0;
double sf_total = 0.0;
double rk_total = 0.0;

for(int i = 0; i < iterations; i++) {

auto copy_start = chrono::steady_clock::now();
cpy(old_variables, variables, nelr*NVAR);
auto copy_end = chrono::steady_clock::now();
Expand All @@ -553,18 +552,20 @@ int main(int argc, char** argv)
}
auto rk_end = chrono::steady_clock::now();
time = chrono::duration<double>(rk_end-rk_start).count();
rk_times[i] = time;
rk_total += time;
if (i > 0) {
rk_times[i] = time;
rk_total += time;
}
}

dump(variables, nel, nelr);

auto end_time = chrono::steady_clock::now();
double elapsed_time = chrono::duration<double>(end_time-start_time).count();
double total_time = chrono::duration<double>(end_time-total_start_time).count();
double rk_mean = rk_total / iterations;
double rk_mean = rk_total / (iterations-1);
double sum = 0.0;
for(int i = 0; i < iterations; i++) {
for(int i = 1; i < iterations; i++) {
double dist = rk_times[i] - rk_mean;
sum += dist * dist;
}
Expand Down
7 changes: 3 additions & 4 deletions kitsune/experiments/euler3d/euler3d-kokkos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ struct Float3 {
#define __restrict
#endif

template <typename T>
void cpy(View<T> &dst, View<T> &src, int N) {
void cpy(View<float> &dst, View<float> &src, int N) {
src.sync_device();
dst.sync_device();
Kokkos::parallel_for("copy", N, KOKKOS_LAMBDA(const int &i) {
Expand Down Expand Up @@ -170,14 +169,13 @@ void compute_step_factor(int nelr,
variables.sync_device();
areas.sync_device();
step_factors.sync_device();
step_factors.modify_device();

Kokkos::parallel_for("compute_step_factor", nelr/block_length,
KOKKOS_LAMBDA(const int &blk) {
int b_start = blk*block_length;
int b_end = (blk+1)*block_length > nelr ? nelr : (blk+1)*block_length;

for(int i = b_start; i < b_end; i++) {
for(unsigned int i = b_start; i < b_end; i++) {
float density = variables.d_view(i + VAR_DENSITY*nelr);

Float3 momentum;
Expand All @@ -199,6 +197,7 @@ void compute_step_factor(int nelr,
(sqrtf(speed_sqd) + speed_of_sound));
}
});
step_factors.modify_device();
Kokkos::fence();
}

Expand Down
3 changes: 2 additions & 1 deletion kitsune/experiments/inc/kitsune-tapir.mk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ GPU_STRIPMINE_FLAGS?=
##################################
TAPIR_CUDA_FLAGS?=-ftapir=cuda \
-O$(KITSUNE_OPTLEVEL) \
-mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
-mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
-mllvm -cuabi-arch=$(CUDA_ARCH) \
-ffp-contract=fast \
-mllvm -cuabi-prefetch=true \
Expand All @@ -38,6 +38,7 @@ TAPIR_HIP_FLAGS?=-ftapir=hip \
-O$(KITSUNE_OPTLEVEL) \
-mllvm -hipabi-opt-level=$(KITSUNE_ABI_OPTLEVEL) \
-ffp-contract=fast \
-fno-vectorize \
-mllvm -hipabi-arch=$(AMDGPU_ARCH) \
-mllvm -hipabi-prefetch=true \
-mllvm -hipabi-streams=true \
Expand Down
24 changes: 17 additions & 7 deletions llvm/lib/Transforms/Tapir/HipABI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1742,21 +1742,28 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
ObjFile->keep();

if (OptLevel > 0) {
if (OptLevel > 3)

if (OptLevel > 3) // This (I think) is consistent w/ Clang behavior...
OptLevel = 3;
LLVM_DEBUG(dbgs() << "\trunning kernel module optimization passes.\n");

PipelineTuningOptions pto;
pto.LoopVectorization = OptLevel > 2;
pto.SLPVectorization = OptLevel > 2;
pto.LoopUnrolling = OptLevel >= 2;
pto.LoopInterleaving = OptLevel > 2;
pto.LoopStripmine = OptLevel > 2;
pto.ForgetAllSCEVInLoopUnroll = OptLevel > 2;

// From the LLVM docs: Create the analysis managers.
// These must be declared in this order so that they are destroyed in the
// correct order due to inter-analysis-manager
// references.
LoopAnalysisManager lam;
FunctionAnalysisManager fam;
CGSCCAnalysisManager cgam;
ModuleAnalysisManager mam;
PassBuilder pb(AMDTargetMachine, pto);

PassBuilder pb(AMDTargetMachine);//, pto);
pb.registerModuleAnalyses(mam);
pb.registerCGSCCAnalyses(cgam);
pb.registerFunctionAnalyses(fam);
Expand All @@ -1770,10 +1777,13 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
OptimizationLevel::O3,
};
OptimizationLevel optLevel = optLevels[OptLevel];
ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
mpm.addPass(VerifierPass());
LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
mpm.run(KernelModule, mam);
ModulePassManager mpm0 = pb.buildModuleSimplificationPipeline(optLevels[3], ThinOrFullLTOPhase::None);
ModulePassManager mpm1 = pb.buildPerModuleDefaultPipeline(optLevels[2]);
mpm0.addPass(VerifierPass());
mpm1.addPass(VerifierPass());
LLVM_DEBUG(dbgs() << "\t\t* optimize module: " << KernelModule.getName() << "\n");
mpm0.run(KernelModule, mam);
mpm1.run(KernelModule, mam);
LLVM_DEBUG(dbgs() << "\t\tpasses complete.\n");
}

Expand Down

0 comments on commit 2668be2

Please sign in to comment.