Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][HIP] Add AMDGPU reflect pass to choose between safe and unsafe AMDGPU atomics #11467

Merged
merged 14 commits into from
Apr 24, 2024
6 changes: 6 additions & 0 deletions libclc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,10 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
# Disables NVVM reflection to defer to after linking
list( APPEND flags -Xclang -target-feature -Xclang +ptx72
-march=sm_86 -mllvm --nvvm-reflect-enable=false)
elseif( ARCH STREQUAL amdgcn )
# AMDGCN needs libclc to be compiled to high bc version since all atomic
# clang builtins need to be accessible
list( APPEND flags -mcpu=gfx940 -mllvm --amdgpu-oclc-reflect-enable=false )
elseif( ARCH STREQUAL x86_64)
# TODO: This is used by SYCL Native Cpu, we should define an option to set this flags
list( APPEND flags -Xclang -target-feature -Xclang +avx
Expand Down Expand Up @@ -342,6 +346,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
# simultaneously, we choose declare the builtins using the private space,
# which will also work for the generic address space.
set( supports_generic_addrspace FALSE )
elseif( ARCH STREQUAL amdgcn )
set( opt_flags -O3 --amdgpu-oclc-reflect-enable=false )
elseif( ARCH STREQUAL x86_64)
set( opt_flags )
set( supports_generic_addrspace FALSE )
Expand Down
13 changes: 4 additions & 9 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_and.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, int, i, __hip_atomic_fetch_and)
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned int, j, __hip_atomic_fetch_and)
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, long, l, __hip_atomic_fetch_and)
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned long, m, __hip_atomic_fetch_and)
#define __CLC_OP &
#define __SPIRV_BUILTIN _Z17__spirv_AtomicAnd
#define __HIP_BUILTIN __hip_atomic_fetch_and

#undef AMDGPU_ATOMIC
#undef AMDGPU_ATOMIC_IMPL
#undef AMDGPU_ARCH_GEQ
#undef AMDGPU_ARCH_BETWEEN
#undef GET_ATOMIC_SCOPE_AND_ORDER
#include "atomic_safe.def"
28 changes: 22 additions & 6 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

extern int __oclc_amdgpu_reflect(__constant char *);

#define AMDGPU_ARCH_GEQ(LOWER) __oclc_ISA_version >= LOWER
#define AMDGPU_ARCH_BETWEEN(LOWER, UPPER) \
__oclc_ISA_version >= LOWER &&__oclc_ISA_version < UPPER
Expand Down Expand Up @@ -72,14 +74,22 @@
AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, BUILTIN) \
AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, BUILTIN)

#define AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED, \
SUB1, OP) \
// Safe atomics will either choose a slow CAS atomic impl (default) or a fast
// native atomic if --amdgpu-unsafe-int-atomics is passed to LLVM.
//
// Safe atomics using CAS may be necessary if PCIe does not support atomic
// operations such as and, or, xor
#define AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED, \
SUB1, OP, USE_BUILTIN_COND, BUILTIN) \
_CLC_DEF TYPE \
FUNC_NAME##P##AS_MANGLED##TYPE_MANGLED##N5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagE##TYPE_MANGLED( \
volatile AS TYPE *p, enum Scope scope, \
enum MemorySemanticsMask semantics, TYPE val) { \
int atomic_scope = 0, memory_order = 0; \
GET_ATOMIC_SCOPE_AND_ORDER(scope, atomic_scope, semantics, memory_order) \
if (USE_BUILTIN_COND) \
return BUILTIN(p, val, memory_order, atomic_scope); \
/* CAS atomics*/ \
TYPE oldval = __hip_atomic_load(p, memory_order, atomic_scope); \
TYPE newval = 0; \
do { \
Expand All @@ -89,7 +99,13 @@
return oldval; \
}

#define AMDGPU_CAS_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP) \
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP) \
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP) \
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP)
#define AMDGPU_SAFE_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP, BUILTIN) \
AMDGPU_SAFE_ATOMIC_IMPL( \
FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP, \
__oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN) \
AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP, \
true /* local AS should always use builtin*/, \
BUILTIN) \
AMDGPU_SAFE_ATOMIC_IMPL( \
FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP, \
__oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN)
13 changes: 4 additions & 9 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_or.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, int, i, __hip_atomic_fetch_or)
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned int, j, __hip_atomic_fetch_or)
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, long, l, __hip_atomic_fetch_or)
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned long, m, __hip_atomic_fetch_or)
#define __CLC_OP |
#define __SPIRV_BUILTIN _Z16__spirv_AtomicOr
#define __HIP_BUILTIN __hip_atomic_fetch_or

#undef AMDGPU_ATOMIC
#undef AMDGPU_ATOMIC_IMPL
#undef AMDGPU_ARCH_GEQ
#undef AMDGPU_ARCH_BETWEEN
#undef GET_ATOMIC_SCOPE_AND_ORDER
#include "atomic_safe.def"
9 changes: 9 additions & 0 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_safe.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Before including, define: __SPIRV_BUILTIN, __CLC_OP, __HIP_BUILTIN
// and include atomic_helpers.h to get AMDGPU_SAFE_ATOMIC

AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, int, i, __CLC_OP, __HIP_BUILTIN)
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned int, j, __CLC_OP,
__HIP_BUILTIN)
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, long, l, __CLC_OP, __HIP_BUILTIN)
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned long, m, __CLC_OP,
__HIP_BUILTIN)
16 changes: 4 additions & 12 deletions libclc/amdgcn-amdhsa/libspirv/atomic/atomic_xor.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,8 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

#define __CLC_XOR ^
#define __CLC_OP ^
#define __SPIRV_BUILTIN _Z17__spirv_AtomicXor
#define __HIP_BUILTIN __hip_atomic_fetch_xor

AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, int, i, __CLC_XOR)
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned int, j, __CLC_XOR)
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, long, l, __CLC_XOR)
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned long, m, __CLC_XOR)

#undef __CLC_XOR
#undef AMDGPU_ATOMIC
#undef AMDGPU_ATOMIC_IMPL
#undef AMDGPU_ARCH_GEQ
#undef AMDGPU_ARCH_BETWEEN
#undef GET_ATOMIC_SCOPE_AND_ORDER
#include "atomic_safe.def"
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ struct AMDGPULowerKernelAttributesPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

struct AMDGPUOclcReflectPass : public PassInfoMixin<AMDGPUOclcReflectPass> {
public:
PreservedAnalyses run(Function &M, FunctionAnalysisManager &AM);
hdelan marked this conversation as resolved.
Show resolved Hide resolved
static bool isRequired() { return true; }
};

void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
extern char &AMDGPULowerModuleLDSLegacyPassID;

Expand Down
92 changes: 92 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUOclcReflect.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
//===- AMDGPUOclcReflect.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass searches for occurences of the AMDGPU_OCLC_REFLECT function, and
// replaces the calls with some val dependent on the operand of the func. This
// can be used to reflect across different implementations of functions at
// compile time based on a compiler flag or some other means. This pass
// currently supports use cases:
//
// 1. Choose a safe or unsafe version of atomic_xor at compile time, which can
// be chosen at compile time by setting the flag
// --amdgpu-oclc-unsafe-int-atomics=true.
//
// This pass is similar to the NVPTX pass NVVMReflect.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define AMDGPU_OCLC_REFLECT "__oclc_amdgpu_reflect"

static cl::opt<bool>
AMDGPUReflectEnabled("amdgpu-oclc-reflect-enable", cl::init(true),
frasercrmck marked this conversation as resolved.
Show resolved Hide resolved
cl::Hidden,
cl::desc("AMDGPU reflection, enabled by default"));
static cl::opt<bool> AMDGPUUnsafeIntAtomicsEnable(
"amdgpu-oclc-unsafe-int-atomics", cl::init(false), cl::Hidden,
cl::desc("Should unsafe int atomics be chosen. Disabled by default."));

PreservedAnalyses AMDGPUOclcReflectPass::run(Function &F,
FunctionAnalysisManager &AM) {
if (!AMDGPUReflectEnabled)
return PreservedAnalyses::all();

if (F.getName() == AMDGPU_OCLC_REFLECT) {
assert(F.isDeclaration() &&
"__oclc_amdgpu_reflect function should not have a body");
return PreservedAnalyses::all();
}

SmallVector<CallInst *, 4> ToRemove;

for (Instruction &I : instructions(F)) {
CallInst *Call = dyn_cast<CallInst>(&I);
hdelan marked this conversation as resolved.
Show resolved Hide resolved
if (!Call)
continue;
if (Function *Callee = Call->getCalledFunction();
!Callee || Callee->getName() != AMDGPU_OCLC_REFLECT)
continue;

assert(Call->arg_size() == 1 &&
"Wrong number of operands to __oclc_amdgpu_reflect function");

ToRemove.push_back(Call);
}

if (!ToRemove.size())
return PreservedAnalyses::all();

for (CallInst *Call : ToRemove) {
const Value *Str = Call->getArgOperand(0);
const Value *Operand = cast<Constant>(Str)->getOperand(0);
hdelan marked this conversation as resolved.
Show resolved Hide resolved
StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
hdelan marked this conversation as resolved.
Show resolved Hide resolved
hdelan marked this conversation as resolved.
Show resolved Hide resolved

if (ReflectArg == "AMDGPU_OCLC_UNSAFE_INT_ATOMICS") {
int ReflectVal = AMDGPUUnsafeIntAtomicsEnable ? 1 : 0;
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
} else {
report_fatal_error("Invalid arg passed to __oclc_amdgpu_reflect");
}
Call->eraseFromParent();
}

PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<DominatorTreeAnalysis>();
return PA;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-arguments",
AMDGPULowerKernelArgumentsPass(*this))
FUNCTION_PASS("amdgpu-lower-kernel-attributes",
AMDGPULowerKernelAttributesPass())
FUNCTION_PASS("amdgpu-oclc-reflect", AMDGPUOclcReflectPass())
FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
FPM.addPass(AMDGPUOclcReflectPass());
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUInsertSingleUseVDST.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOclcReflect.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
Expand Down
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgpu-oclc-reflect.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
; RUN: opt -S -p amdgpu-oclc-reflect %s | FileCheck %s -check-prefixes=CHECK,CHECK-SAFE-ATOMICS
hdelan marked this conversation as resolved.
Show resolved Hide resolved
; RUN: opt -S -p amdgpu-oclc-reflect -amdgpu-oclc-unsafe-int-atomics=true %s | FileCheck %s -check-prefixes=CHECK,CHECK-UNSAFE-ATOMICS

target triple = "amdgcn-amd-amdhsa"

@.str = private unnamed_addr addrspace(4) constant [31 x i8] c"AMDGPU_OCLC_UNSAFE_INT_ATOMICS\00", align 1

declare hidden i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef) local_unnamed_addr

define i32 @foo() {
; CHECK-NOT: call i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef @.str)
; CHECK-SAFE-ATOMICS: ret i32 0
; CHECK-UNSAFE-ATOMICS: ret i32 1
%call = tail call i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef @.str)
frasercrmck marked this conversation as resolved.
Show resolved Hide resolved
ret i32 %call
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// REQUIRES: hip_be
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
hdelan marked this conversation as resolved.
Show resolved Hide resolved
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE

#include <sycl/sycl.hpp>

int main() {
sycl::queue{}.single_task([=] {
int a;
sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
atomicInt(a);
atomicInt.fetch_xor(1);
atomicInt.fetch_and(1);
atomicInt.fetch_or(1);
// CHECK: __CLANG_OFFLOAD_BUNDLE____START__ sycl-amdgcn-amd-amdhsa-
// CHECK-SAFE: cmpxchg volatile
// CHECK-SAFE-NOT: atomicrmw
hdelan marked this conversation as resolved.
Show resolved Hide resolved
// CHECK-UNSAFE: atomicrmw volatile xor
// CHECK-UNSAFE: atomicrmw volatile and
// CHECK-UNSAFE: atomicrmw volatile or
// CHECK-UNSAFE-NOT: cmpxchg
// CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
});
}
Loading