Skip to content

Commit

Permalink
x86_64: Optimize 128-bit atomic load/store on Zhaoxin CPU with AVX
Browse files Browse the repository at this point in the history
  • Loading branch information
taiki-e committed Jul 19, 2024
1 parent 58a2ebb commit 86cee8f
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 31 deletions.
2 changes: 2 additions & 0 deletions .github/.cspell/project-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ getpid
HWCAP
ifunc
includepath
Iotel
isel
iseleq
iselgt
Expand Down Expand Up @@ -173,3 +174,4 @@ xilf
xmmword
xsave
xsub
Zhaoxin
2 changes: 1 addition & 1 deletion src/imp/atomic128/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Here is the table of targets that support 128-bit atomics and the instructions u

| target_arch | load | store | CAS | RMW | note |
| ----------- | ---- | ----- | --- | --- | ---- |
| x86_64 | cmpxchg16b or vmovdqa | cmpxchg16b or vmovdqa | cmpxchg16b | cmpxchg16b | cmpxchg16b target feature required. vmovdqa requires Intel or AMD CPU with AVX. <br> Both compile-time and run-time detection are supported for cmpxchg16b. vmovdqa is currently run-time detection only. <br> Requires rustc 1.59+ |
| x86_64 | cmpxchg16b or vmovdqa | cmpxchg16b or vmovdqa | cmpxchg16b | cmpxchg16b | cmpxchg16b target feature required. vmovdqa requires Intel, AMD, or Zhaoxin CPU with AVX. <br> Both compile-time and run-time detection are supported for cmpxchg16b. vmovdqa is currently run-time detection only. <br> Requires rustc 1.59+ |
| aarch64 | ldxp/stxp or casp or ldp/ldiapp | ldxp/stxp or casp or stp/stilp/swpp | ldxp/stxp or casp | ldxp/stxp or casp/swpp/ldclrp/ldsetp | casp requires lse target feature, ldp/stp requires lse2 target feature, ldiapp/stilp requires lse2 and rcpc3 target features, swpp/ldclrp/ldsetp requires lse128 target feature. <br> Both compile-time and run-time detection are supported for lse and lse2. Others are currently compile-time detection only. <br> Requires rustc 1.59+ |
| powerpc64 | lq | stq | lqarx/stqcx. | lqarx/stqcx. | Requires target-cpu pwr8+ (powerpc64le is pwr8 by default). Both compile-time and run-time detection are supported (run-time detection is currently disabled by default). <br> Requires nightly |
| s390x | lpq | stpq | cdsg | cdsg | Requires nightly |
Expand Down
62 changes: 34 additions & 28 deletions src/imp/atomic128/detect/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,15 @@ use core::arch::x86_64::CpuidResult;
// - https://www.felixcloutier.com/x86/cpuid
// - https://en.wikipedia.org/wiki/CPUID
// - https://github.com/rust-lang/stdarch/blob/a0c30f3e3c75adcd6ee7efc94014ebcead61c507/crates/core_arch/src/x86/cpuid.rs
unsafe fn __cpuid(leaf: u32) -> CpuidResult {
#[cfg(not(target_env = "sgx"))]
fn __cpuid(leaf: u32) -> CpuidResult {
let eax;
let mut ebx;
let ecx;
let edx;
// SAFETY: the caller must guarantee that CPU supports `cpuid`.
// SAFETY: Calling `__cpuid`` is safe on all x86_64 CPUs except for SGX,
// which doesn't support `cpuid`.
// https://github.com/rust-lang/stdarch/blob/a0c30f3e3c75adcd6ee7efc94014ebcead61c507/crates/core_arch/src/x86/cpuid.rs#L102-L109
unsafe {
asm!(
"mov {ebx_tmp:r}, rbx", // save rbx which is reserved by LLVM
Expand All @@ -44,22 +47,29 @@ unsafe fn __cpuid(leaf: u32) -> CpuidResult {
}

// https://en.wikipedia.org/wiki/CPUID
const _VENDOR_ID_INTEL: [u8; 12] = *b"GenuineIntel";
const _VENDOR_ID_AMD: [u8; 12] = *b"AuthenticAMD";

unsafe fn _vendor_id() -> [u8; 12] {
const _VENDOR_ID_INTEL: [u8; 12] = *b"GenuineIntel"; // Intel
const _VENDOR_ID_INTEL2: [u8; 12] = *b"GenuineIotel"; // Intel https://github.com/InstLatx64/InstLatx64/commit/8fdd319884c67d2c6ec1ca0c595b42c1c4b8d803
const _VENDOR_ID_AMD: [u8; 12] = *b"AuthenticAMD"; // AMD
const _VENDOR_ID_ZHAOXIN: [u8; 12] = *b" Shanghai "; // Zhaoxin
fn _vendor_id() -> [u8; 12] {
// https://github.com/rust-lang/stdarch/blob/a0c30f3e3c75adcd6ee7efc94014ebcead61c507/crates/std_detect/src/detect/os/x86.rs#L40-L59
// SAFETY: the caller must guarantee that CPU supports `cpuid`.
let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0) };
let CpuidResult { ebx, ecx, edx, .. } = __cpuid(0);
let vendor_id: [[u8; 4]; 3] = [ebx.to_ne_bytes(), edx.to_ne_bytes(), ecx.to_ne_bytes()];
// SAFETY: transmute is safe because `[u8; 12]` and `[[u8; 4]; 3]` has the same layout.
unsafe { core::mem::transmute(vendor_id) }
}
fn _vendor_has_vmovdqa_atomic(vendor_id: [u8; 12]) -> bool {
// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
vendor_id == _VENDOR_ID_INTEL
|| vendor_id == _VENDOR_ID_INTEL2
|| vendor_id == _VENDOR_ID_AMD
|| vendor_id == _VENDOR_ID_ZHAOXIN
}

#[cold]
fn _detect(info: &mut CpuInfo) {
// SAFETY: Calling `__cpuid`` is safe because the CPU has `cpuid` support.
let proc_info_ecx = unsafe { __cpuid(0x0000_0001_u32).ecx };
let proc_info_ecx = __cpuid(0x0000_0001_u32).ecx;

// https://github.com/rust-lang/stdarch/blob/a0c30f3e3c75adcd6ee7efc94014ebcead61c507/crates/std_detect/src/detect/os/x86.rs#L111
if test(proc_info_ecx, 13) {
Expand All @@ -71,22 +81,18 @@ fn _detect(info: &mut CpuInfo) {
{
use core::arch::x86_64::_xgetbv;

// SAFETY: Calling `vendor_id`` is safe because the CPU has `cpuid` support.
let vendor_id = unsafe { _vendor_id() };

// VMOVDQA is atomic on Intel and AMD CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
if vendor_id == _VENDOR_ID_INTEL || vendor_id == _VENDOR_ID_AMD {
// https://github.com/rust-lang/stdarch/blob/a0c30f3e3c75adcd6ee7efc94014ebcead61c507/crates/std_detect/src/detect/os/x86.rs#L131-L224
let cpu_xsave = test(proc_info_ecx, 26);
if cpu_xsave {
let cpu_osxsave = test(proc_info_ecx, 27);
if cpu_osxsave {
// SAFETY: Calling `_xgetbv`` is safe because the CPU has `xsave` support
// and OS has set `osxsave`.
let xcr0 = unsafe { _xgetbv(0) };
let os_avx_support = xcr0 & 6 == 6;
if os_avx_support && test(proc_info_ecx, 28) {
// https://github.com/rust-lang/stdarch/blob/a0c30f3e3c75adcd6ee7efc94014ebcead61c507/crates/std_detect/src/detect/os/x86.rs#L131-L224
let cpu_xsave = test(proc_info_ecx, 26);
if cpu_xsave {
let cpu_osxsave = test(proc_info_ecx, 27);
if cpu_osxsave {
// SAFETY: Calling `_xgetbv`` is safe because the CPU has `xsave` support
// and OS has set `osxsave`.
let xcr0 = unsafe { _xgetbv(0) };
let os_avx_support = xcr0 & 6 == 6;
if os_avx_support && test(proc_info_ecx, 28) {
let vendor_id = _vendor_id();
if _vendor_has_vmovdqa_atomic(vendor_id) {
info.set(CpuInfo::HAS_VMOVDQA_ATOMIC);
}
}
Expand All @@ -112,13 +118,13 @@ mod tests {
#[cfg_attr(portable_atomic_test_outline_atomics_detect_false, ignore)]
fn test_cpuid() {
assert_eq!(std::is_x86_feature_detected!("cmpxchg16b"), detect().has_cmpxchg16b());
let vendor_id = unsafe { _vendor_id() };
let vendor_id = _vendor_id();
{
let stdout = io::stderr();
let mut stdout = stdout.lock();
let _ = writeln!(stdout, "\n vendor_id: {}", std::str::from_utf8(&vendor_id).unwrap());
}
if vendor_id == _VENDOR_ID_INTEL || vendor_id == _VENDOR_ID_AMD {
if _vendor_has_vmovdqa_atomic(vendor_id) {
assert_eq!(std::is_x86_feature_detected!("avx"), detect().has_vmovdqa_atomic());
} else {
assert!(!detect().has_vmovdqa_atomic());
Expand Down
4 changes: 2 additions & 2 deletions src/imp/atomic128/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
}
}

// VMOVDQA is atomic on Intel and AMD CPUs with AVX.
// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
//
// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
Expand All @@ -155,7 +155,7 @@ unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
// baseline and is always available, but the SSE target feature is disabled for
// use cases such as kernels and firmware that should not use vector registers.
// So, do not use vector registers unless SSE target feature is enabled.
// See also https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html.
// See also https://github.com/rust-lang/rust/blob/1.77.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
Expand Down

0 comments on commit 86cee8f

Please sign in to comment.