Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

aarch64: Support FEAT_LSE128 and FEAT_LRCPC3 #68

Merged
merged 2 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/.cspell/project-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ isync
kuser
ldar
ldaxp
ldclrp
ldiapp
ldsetp
ldxp
lghi
libcalls
Expand Down Expand Up @@ -118,6 +121,7 @@ sreg
sstatus
stdarch
stdsimd
stilp
stlxp
stpq
stqcx
Expand All @@ -127,6 +131,7 @@ subc
subfe
subfic
subfze
swpp
syscall
sysctlbyname
systemsim
Expand Down
18 changes: 13 additions & 5 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,18 @@ fn main() {
// aarch64 macOS always support FEAT_LSE and FEAT_LSE2 because it is armv8.5-a:
// https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0/llvm/include/llvm/TargetParser/AArch64TargetParser.h#L458
let is_macos = target_os == "macos";
// aarch64_target_feature stabilized in Rust 1.61.
target_feature_if("lse", is_macos, &version, Some(61), true);
// As of rustc 1.70, target_feature "lse2" is not available on rustc side:
let mut has_lse = is_macos;
// FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
// As of rustc 1.70, target_feature "lse2"/"lse128"/"rcpc3" is not available on rustc side:
// https://github.com/rust-lang/rust/blob/1.70.0/compiler/rustc_codegen_ssa/src/target_features.rs#L58
target_feature_if("lse2", is_macos, &version, None, false);
// LLVM supports FEAT_LRCPC3 and FEAT_LSE128 on LLVM 16+:
// https://github.com/llvm/llvm-project/commit/a6aaa969f7caec58a994142f8d855861cf3a1463
// https://github.com/llvm/llvm-project/commit/7fea6f2e0e606e5339c3359568f680eaf64aa306
has_lse |= target_feature_if("lse128", false, &version, None, false);
target_feature_if("rcpc3", false, &version, None, false);
// aarch64_target_feature stabilized in Rust 1.61.
target_feature_if("lse", has_lse, &version, Some(61), true);

// As of Apple M1/M1 Pro, on Apple hardware, CAS loop-based RMW is much slower than LL/SC
// loop-based RMW: https://github.com/taiki-e/portable-atomic/pull/89
Expand Down Expand Up @@ -338,7 +345,7 @@ fn target_feature_if(
version: &Version,
stabilized: Option<u32>,
is_rustc_target_feature: bool,
) {
) -> bool {
// HACK: Currently, it seems that the only way to handle unstable target
// features on the stable is to parse the `-C target-feature` in RUSTFLAGS.
//
Expand All @@ -353,7 +360,7 @@ fn target_feature_if(
&& (version.nightly || stabilized.map_or(false, |stabilized| version.minor >= stabilized))
{
// In this case, cfg(target_feature = "...") would work, so skip emitting our own target_feature cfg.
return;
return false;
} else if let Some(rustflags) = env::var_os("CARGO_ENCODED_RUSTFLAGS") {
for mut flag in rustflags.to_string_lossy().split('\x1f') {
flag = strip_prefix(flag, "-C").unwrap_or(flag);
Expand All @@ -373,6 +380,7 @@ fn target_feature_if(
if has_target_feature {
println!("cargo:rustc-cfg=portable_atomic_target_feature=\"{}\"", name);
}
has_target_feature
}

fn target_cpu() -> Option<String> {
Expand Down
2 changes: 1 addition & 1 deletion src/imp/atomic128/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Here is the table of targets that support 128-bit atomics and the instructions u
| target_arch | load | store | CAS | RMW | note |
| ----------- | ---- | ----- | --- | --- | ---- |
| x86_64 | cmpxchg16b or vmovdqa | cmpxchg16b or vmovdqa | cmpxchg16b | cmpxchg16b | cmpxchg16b target feature required. vmovdqa requires Intel or AMD CPU with AVX. <br> Both compile-time and run-time detection are supported for cmpxchg16b. vmovdqa is currently run-time detection only. <br> Requires rustc 1.59+ when cmpxchg16b target feature is enabled at compile-time, otherwise requires rustc 1.69+ |
| aarch64 | ldxp/stxp or casp or ldp | ldxp/stxp or casp or stp | ldxp/stxp or casp | ldxp/stxp or casp | casp requires lse target feature, ldp/stp requires lse2 target feature. <br> Both compile-time and run-time detection are supported for lse. lse2 is currently compile-time detection only. <br> Requires rustc 1.59+ |
| aarch64 | ldxp/stxp or casp or ldp/ldiapp | ldxp/stxp or casp or stp/stilp/swpp | ldxp/stxp or casp | ldxp/stxp or casp/swpp/ldclrp/ldsetp | casp requires lse target feature, ldp/stp requires lse2 target feature, ldiapp/stilp requires lse2 and rcpc3 target features, swpp/ldclrp/ldsetp requires lse128 target feature. <br> Both compile-time and run-time detection are supported for lse. Others are currently compile-time detection only. <br> Requires rustc 1.59+ |
| powerpc64 | lq | stq | lqarx/stqcx. | lqarx/stqcx. | Requires target-cpu pwr8+ (powerpc64le is pwr8 by default). Both compile-time and run-time detection are supported (run-time detection is currently disabled by default). <br> Requires nightly |
| s390x | lpq | stpq | cdsg | cdsg | Requires nightly |

Expand Down
181 changes: 169 additions & 12 deletions src/imp/atomic128/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// - LDXP/STXP loop (DW LL/SC)
// - CASP (DWCAS) added as FEAT_LSE (mandatory from armv8.1-a)
// - LDP/STP (DW load/store) if FEAT_LSE2 (optional from armv8.2-a, mandatory from armv8.4-a) is available
// - LDIAPP/STILP (DW acquire-load/release-store) added as FEAT_LRCPC3 (optional from armv8.9-a/armv9.4-a) (if FEAT_LSE2 is also available)
// - LDCLRP/LDSETP/SWPP (DW RMW) added as FEAT_LSE128 (optional from armv9.4-a)
//
// If outline-atomics is not enabled and FEAT_LSE is not available at
// compile-time, we use LDXP/STXP loop.
Expand All @@ -15,8 +17,10 @@
// However, when portable_atomic_ll_sc_rmw cfg is set, use LDXP/STXP loop instead of CASP
// loop for RMW (by default, it is set on Apple hardware; see build script for details).
// If FEAT_LSE2 is available at compile-time, we use LDP/STP for load/store.
// If FEAT_LSE2 and FEAT_LRCPC3 are available at compile-time, we use LDIAPP/STILP for acquire-load/release-store.
// If FEAT_LSE128 is available at compile-time, we use LDCLRP/LDSETP/SWPP for fetch_and/fetch_or/swap/{release,seqcst}-store.
//
// Note: FEAT_LSE2 doesn't imply FEAT_LSE.
// Note: FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
//
// Note that we do not separate LL and SC into separate functions, but handle
// them within a single asm block. This is because it is theoretically possible
Expand Down Expand Up @@ -48,11 +52,14 @@
// - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
//
// Generated asm:
// - aarch64 https://godbolt.org/z/nds1nWbnq
// - aarch64 msvc https://godbolt.org/z/PTKdhbKqW
// - aarch64 (+lse) https://godbolt.org/z/5GzssfTKc
// - aarch64 msvc (+lse) https://godbolt.org/z/oYE87caM7
// - aarch64 (+lse,+lse2) https://godbolt.org/z/36dPjMbaG
// - aarch64 https://godbolt.org/z/zT5av9nMP
// - aarch64 msvc https://godbolt.org/z/b5r9ordYW
// - aarch64 (+lse) https://godbolt.org/z/6EeE94ebd
// - aarch64 msvc (+lse) https://godbolt.org/z/d3Tev7nbv
// - aarch64 (+lse,+lse2) https://godbolt.org/z/K1xhW5jP8
// - aarch64 (+lse,+lse2,+rcpc3) https://godbolt.org/z/3jzsxedq8
// - aarch64 (+lse2,+lse128) https://godbolt.org/z/jqdYaP6a3
// - aarch64 (+lse2,+lse128,+rcpc3) https://godbolt.org/z/h156b4TMv

include!("macros.rs");

Expand Down Expand Up @@ -218,7 +225,7 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 {
}
}
}
// If CPU supports FEAT_LSE2, LDP is single-copy atomic reads,
// If CPU supports FEAT_LSE2, LDP/LDIAPP is single-copy atomic reads,
// otherwise it is two single-copy atomic reads.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))]
Expand Down Expand Up @@ -247,6 +254,19 @@ unsafe fn atomic_load_ldp(src: *mut u128, order: Ordering) -> u128 {
}
match order {
Ordering::Relaxed => atomic_load_relaxed!("", readonly),
#[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))]
Ordering::Acquire => {
// SAFETY: cfg guarantee that the CPU supports FEAT_LRCPC3.
// Refs: https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDIAPP--Load-Acquire-RCpc-ordered-Pair-of-registers-
asm!(
"ldiapp {prev_lo}, {prev_hi}, [{src}]",
src = in(reg) ptr_reg!(src),
prev_hi = lateout(reg) prev_hi,
prev_lo = lateout(reg) prev_lo,
options(nostack, preserves_flags),
);
}
#[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))]
Ordering::Acquire => atomic_load_relaxed!("dmb ishld"),
Ordering::SeqCst => {
asm!(
Expand Down Expand Up @@ -343,18 +363,44 @@ unsafe fn _atomic_load_ldxp_stxp(src: *mut u128, order: Ordering) -> u128 {
#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))]
// SAFETY: the caller must uphold the safety contract.
// cfg guarantee that the CPU supports FEAT_LSE2.
unsafe {
atomic_store_stp(dst, val, order);
{
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
// SAFETY: the caller must uphold the safety contract.
// cfg guarantee that the CPU supports FEAT_LSE2 and FEAT_LSE128.
unsafe {
// Use swpp if stp requires fences.
// https://reviews.llvm.org/D143506
match order {
Ordering::Relaxed => atomic_store_stp(dst, val, order),
#[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))]
Ordering::Release => atomic_store_stp(dst, val, order),
#[cfg(not(any(
target_feature = "rcpc3",
portable_atomic_target_feature = "rcpc3",
)))]
Ordering::Release => {
_atomic_swap_swpp(dst, val, order);
}
Ordering::SeqCst => {
_atomic_swap_swpp(dst, val, order);
}
_ => unreachable!("{:?}", order),
}
}
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
// SAFETY: the caller must uphold the safety contract.
// cfg guarantee that the CPU supports FEAT_LSE2.
unsafe {
atomic_store_stp(dst, val, order);
}
}
#[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
// SAFETY: the caller must uphold the safety contract.
unsafe {
atomic_swap(dst, val, order);
}
}
// If CPU supports FEAT_LSE2, STP is single-copy atomic writes,
// If CPU supports FEAT_LSE2, STP/STILP is single-copy atomic writes,
// otherwise it is two single-copy atomic writes.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))]
Expand Down Expand Up @@ -384,6 +430,19 @@ unsafe fn atomic_store_stp(dst: *mut u128, val: u128, order: Ordering) {
}
match order {
Ordering::Relaxed => atomic_store!("", ""),
#[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))]
Ordering::Release => {
// SAFETY: cfg guarantee that the CPU supports FEAT_LRCPC3.
// Refs: https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/STILP--Store-Release-ordered-Pair-of-registers-
asm!(
"stilp {val_lo}, {val_hi}, [{dst}]",
dst = in(reg) ptr_reg!(dst),
val_lo = in(reg) val.pair.lo,
val_hi = in(reg) val.pair.hi,
options(nostack, preserves_flags),
);
}
#[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))]
Ordering::Release => atomic_store!("", "dmb ish"),
Ordering::SeqCst => atomic_store!("dmb ish", "dmb ish"),
_ => unreachable!("{:?}", order),
Expand Down Expand Up @@ -682,16 +741,50 @@ use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set,
// we use CAS-based atomic RMW.
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
#[cfg(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
))]
use _atomic_swap_casp as atomic_swap;
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
#[cfg(not(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
)))]
use _atomic_swap_ldxp_stxp as atomic_swap;
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
use _atomic_swap_swpp as atomic_swap;
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
#[inline]
unsafe fn _atomic_swap_swpp(dst: *mut u128, val: u128, order: Ordering) -> u128 {
debug_assert!(dst as usize % 16 == 0);

// SAFETY: the caller must guarantee that `dst` is valid for both writes and
// reads, 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE128.
//
// Refs:
// - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/SWPP--SWPPA--SWPPAL--SWPPL--Swap-quadword-in-memory-?lang=en
unsafe {
let val = U128 { whole: val };
let (prev_lo, prev_hi);
macro_rules! swap {
($acquire:tt, $release:tt, $fence:tt) => {
asm!(
concat!("swpp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"),
$fence,
dst = in(reg) ptr_reg!(dst),
val_lo = inout(reg) val.pair.lo => prev_lo,
val_hi = inout(reg) val.pair.hi => prev_hi,
options(nostack, preserves_flags),
)
};
}
atomic_rmw!(swap, order);
U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
}
}
// Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap.
#[cfg(any(
test,
Expand Down Expand Up @@ -1066,16 +1159,48 @@ atomic_rmw_cas_3! {
select_le_or_be!("sbc x5, x7, {val_hi}", "sbc x4, x6, {val_lo}"),
}

#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_ll_sc_3! {
_atomic_and_ldxp_stxp as atomic_and (preserves_flags),
"and {new_lo}, {prev_lo}, {val_lo}",
"and {new_hi}, {prev_hi}, {val_hi}",
}
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_cas_3! {
_atomic_and_casp as atomic_and,
"and x4, x6, {val_lo}",
"and x5, x7, {val_hi}",
}
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
#[inline]
unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 {
debug_assert!(dst as usize % 16 == 0);

// SAFETY: the caller must guarantee that `dst` is valid for both writes and
// reads, 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE128.
//
// Refs:
// - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDCLRP--LDCLRPA--LDCLRPAL--LDCLRPL--Atomic-bit-clear-on-quadword-in-memory-?lang=en
unsafe {
let val = U128 { whole: !val };
let (prev_lo, prev_hi);
macro_rules! and {
($acquire:tt, $release:tt, $fence:tt) => {
asm!(
concat!("ldclrp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"),
$fence,
dst = in(reg) ptr_reg!(dst),
val_lo = inout(reg) val.pair.lo => prev_lo,
val_hi = inout(reg) val.pair.hi => prev_hi,
options(nostack, preserves_flags),
)
};
}
atomic_rmw!(and, order);
U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
}
}

atomic_rmw_ll_sc_3! {
_atomic_nand_ldxp_stxp as atomic_nand (preserves_flags),
Expand All @@ -1092,16 +1217,48 @@ atomic_rmw_cas_3! {
"mvn x5, x5",
}

#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_ll_sc_3! {
_atomic_or_ldxp_stxp as atomic_or (preserves_flags),
"orr {new_lo}, {prev_lo}, {val_lo}",
"orr {new_hi}, {prev_hi}, {val_hi}",
}
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_cas_3! {
_atomic_or_casp as atomic_or,
"orr x4, x6, {val_lo}",
"orr x5, x7, {val_hi}",
}
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
#[inline]
unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 {
debug_assert!(dst as usize % 16 == 0);

// SAFETY: the caller must guarantee that `dst` is valid for both writes and
// reads, 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE128.
//
// Refs:
// - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDSETP--LDSETPA--LDSETPAL--LDSETPL--Atomic-bit-set-on-quadword-in-memory-?lang=en
unsafe {
let val = U128 { whole: val };
let (prev_lo, prev_hi);
macro_rules! or {
($acquire:tt, $release:tt, $fence:tt) => {
asm!(
concat!("ldsetp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"),
$fence,
dst = in(reg) ptr_reg!(dst),
val_lo = inout(reg) val.pair.lo => prev_lo,
val_hi = inout(reg) val.pair.hi => prev_hi,
options(nostack, preserves_flags),
)
};
}
atomic_rmw!(or, order);
U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
}
}

atomic_rmw_ll_sc_3! {
_atomic_xor_ldxp_stxp as atomic_xor (preserves_flags),
Expand Down
Loading