From 40c4cd4f682f1cb153f18d4d6a88795bafaf5667 Mon Sep 17 00:00:00 2001 From: Taiki Endo Date: Wed, 19 Apr 2023 03:12:55 +0900 Subject: [PATCH] Various cleanup - Optimize x86_64 128-bit outline-atomics. FYI, this improves performance by up to 15% in concurrent RMW/store microbenchmarks. - Optimize x86_64 128-bit load that uses cmpxchg16b. - Optimize aarch64 128-bit load that uses FEAT_LSE. - Move 128-bit atomic implementation for Miri and ThreadSanitizer to intrinsics.rs on all architectures. - Remove duplicate tests and add tests for cases where feature detection returns false. - Several minor cleanups. --- .cirrus.yml | 91 ++-- .github/workflows/ci.yml | 43 +- Cargo.toml | 1 - bench/Cargo.toml | 1 - src/imp/arm_linux.rs | 18 +- src/imp/atomic128/README.md | 13 + src/imp/atomic128/aarch64.rs | 294 ++++++----- src/imp/atomic128/detect/common.rs | 12 +- src/imp/atomic128/detect/x86_64.rs | 11 +- src/imp/atomic128/intrinsics.rs | 575 ++++++++------------- src/imp/atomic128/macros.rs | 82 +-- src/imp/atomic128/powerpc64.rs | 9 +- src/imp/atomic128/s390x.rs | 99 +--- src/imp/atomic128/x86_64.rs | 761 ++++++++++++++-------------- src/imp/core_atomic.rs | 6 +- src/imp/fallback/mod.rs | 26 +- src/imp/fallback/outline_atomics.rs | 116 +++-- src/imp/float.rs | 6 +- src/imp/interrupt/mod.rs | 2 +- src/imp/mod.rs | 56 +- src/imp/x86.rs | 2 +- src/lib.rs | 4 +- src/tests/helper.rs | 11 +- src/tests/mod.rs | 54 +- tests/helper/Cargo.toml | 4 +- tests/helper/src/lib.rs | 23 +- tools/build.sh | 12 +- tools/test.sh | 15 +- 28 files changed, 1049 insertions(+), 1298 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index b934e82b..dfbd6505 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,11 +36,11 @@ aarch64_linux_gnu_test_task: - | [ ! -f $HOME/.cargo/env ] || . $HOME/.cargo/env - set -ex - - ./tools/test.sh -vv + - ./tools/test.sh -vv -- -Z unstable-options --report-time # +lse # Graviton2 (Neoverse N1) is ARMv8.2-a and doesn't support FEAT_LSE2. # FEAT_LSE2 is tested on aarch64 macOS VM. - - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" ./tools/test.sh -vv + - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" ./tools/test.sh -vv -- -Z unstable-options --report-time aarch64_linux_musl_test_task: name: test ($TARGET) @@ -55,47 +55,36 @@ aarch64_linux_musl_test_task: - rustup toolchain add nightly --no-self-update --component rust-src && rustup default nightly test_script: - set -ex - - ./tools/test.sh -vv + - ./tools/test.sh -vv -- -Z unstable-options --report-time # -crt-static - - RUSTFLAGS="$RUSTFLAGS -C target-feature=-crt-static" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=-crt-static" ./tools/test.sh -vv + - RUSTFLAGS="$RUSTFLAGS -C target-feature=-crt-static" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=-crt-static" ./tools/test.sh -vv -- -Z unstable-options --report-time # +lse # Graviton2 (Neoverse N1) is ARMv8.2-a and doesn't support FEAT_LSE2. # FEAT_LSE2 is tested on aarch64 macOS VM. 
- - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" ./tools/test.sh -vv + - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" ./tools/test.sh -vv -- -Z unstable-options --report-time -armel_linux_test_task: +arm_linux_test_task: name: test ($TARGET) - env: - TARGET: armv5te-unknown-linux-gnueabi + matrix: + - env: + TARGET: armv5te-unknown-linux-gnueabi + DPKG_ARCH: armel + - env: + TARGET: armv7-unknown-linux-gnueabihf + DPKG_ARCH: armhf arm_container: image: rust:latest setup_script: - set -ex - lscpu - rustup toolchain add nightly --no-self-update --component rust-src && rustup default nightly - - dpkg --add-architecture armel - - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends gcc-arm-linux-gnueabi libc6-dev-armel-cross libc6:armel + - dpkg --add-architecture "$DPKG_ARCH" + - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends gcc-"${TARGET/*-unknown/arm}" libc6-dev-"$DPKG_ARCH"-cross libc6:"$DPKG_ARCH" test_script: - set -ex - export CARGO_TARGET_ARMV5TE_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc - - ./tools/test.sh --target "$TARGET" -Z doctest-xcompile -vv - -armhf_linux_test_task: - name: test ($TARGET) - env: - TARGET: armv7-unknown-linux-gnueabihf - arm_container: - image: rust:latest - setup_script: - - set -ex - - lscpu - - rustup toolchain add nightly --no-self-update --component rust-src && rustup default nightly - - dpkg --add-architecture armhf - - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf - test_script: - - set -ex - export CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc - - ./tools/test.sh --target "$TARGET" -Z doctest-xcompile -vv + - ./tools/test.sh --target "$TARGET" -Z doctest-xcompile -vv -- -Z unstable-options --report-time aarch64_macos_test_task: name: test ($TARGET) @@ -111,14 +100,12 @@ aarch64_macos_test_task: - . $HOME/.cargo/env - set -ex # macOS is +lse,+lse2 by default - - ./tools/test.sh -vv + - ./tools/test.sh -vv -- -Z unstable-options --report-time aarch64_linux_valgrind_task: name: valgrind ($TARGET) env: - CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER: valgrind -v --error-exitcode=1 --error-limit=no --leak-check=full --show-leak-kinds=all --track-origins=yes - RUSTDOCFLAGS: -D warnings --cfg valgrind - RUSTFLAGS: -D warnings --cfg valgrind + VALGRIND: valgrind TARGET: aarch64-unknown-linux-gnu arm_container: # Valgrind support ldxp/stxp on 3.19+: https://valgrind.org/docs/manual/dist.news.html @@ -126,17 +113,15 @@ aarch64_linux_valgrind_task: image: debian:bookworm setup_script: - set -ex - - lscpu - - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends ca-certificates curl gcc git libc6-dev valgrind + - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends ca-certificates curl gcc git libc6-dev valgrind moreutils - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain nightly --component rust-src test_script: - . 
$HOME/.cargo/env - set -ex - # doctests on Valgrind are very slow - - ./tools/test.sh -vv --tests + - ./tools/test.sh -vv 2>&1 | ts -i '%.s ' # +lse # As of Valgrind 3.19, Valgrind supports atomic instructions of ARMv8.0 and ARMv8.1 (FEAT_LSE). - - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" ./tools/test.sh -vv --tests + - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" ./tools/test.sh -vv 2>&1 | ts -i '%.s ' # # aarch64_linux_bench_task: # name: bench ($TARGET) @@ -152,33 +137,36 @@ aarch64_linux_valgrind_task: # - rustup toolchain add nightly --no-self-update && rustup default nightly # bench_script: # - set -ex +# - cargo bench -vv --manifest-path bench/Cargo.toml # - RUSTFLAGS="${RUSTFLAGS} -C target-feature=-outline-atomics --cfg portable_atomic_no_outline_atomics" cargo bench -vv --manifest-path bench/Cargo.toml # - RUSTFLAGS="${RUSTFLAGS} -C target-feature=+lse" cargo bench -vv --manifest-path bench/Cargo.toml -# armel_linux_bench_task: +# x86_64_linux_bench_task: # name: bench ($TARGET) # env: -# TARGET: armv5te-unknown-linux-gnueabi -# arm_container: +# TARGET: x86_64-unknown-linux-gnu +# container: # image: rust:latest # cpu: 4 # memory: 12G # setup_script: # - set -ex # - lscpu -# - rustup toolchain add nightly --no-self-update --component rust-src && rustup default nightly -# - rustup target add "$TARGET" -# - dpkg --add-architecture armel -# - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends gcc-arm-linux-gnueabi libc6-dev-armel-cross libc6:armel -# test_script: +# - rustup toolchain add nightly --no-self-update && rustup default nightly +# bench_script: # - set -ex -# - export CARGO_TARGET_ARMV5TE_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc -# - RUSTFLAGS="${RUSTFLAGS}" cargo bench --target "$TARGET" -vv --manifest-path bench/Cargo.toml +# - cargo bench -vv --manifest-path bench/Cargo.toml +# - RUSTFLAGS="${RUSTFLAGS} -C target-feature=+cmpxchg16b" cargo bench -vv --manifest-path bench/Cargo.toml -# armhf_linux_bench_task: +# arm_linux_bench_task: # name: bench ($TARGET) -# env: -# TARGET: armv7-unknown-linux-gnueabihf +# matrix: +# - env: +# TARGET: armv5te-unknown-linux-gnueabi +# DPKG_ARCH: armel +# - env: +# TARGET: armv7-unknown-linux-gnueabihf +# DPKG_ARCH: armhf # arm_container: # image: rust:latest # cpu: 4 @@ -188,10 +176,11 @@ aarch64_linux_valgrind_task: # - lscpu # - rustup toolchain add nightly --no-self-update --component rust-src && rustup default nightly # - rustup target add "$TARGET" -# - dpkg --add-architecture armhf -# - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf +# - dpkg --add-architecture "$DPKG_ARCH" +# - apt-get -o Acquire::Retries=10 -qq update && apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends gcc-"${TARGET/*-unknown/arm}" libc6-dev-"$DPKG_ARCH"-cross libc6:"$DPKG_ARCH" # test_script: # - set -ex +# - export CARGO_TARGET_ARMV5TE_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc # - export CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc # - RUSTFLAGS="${RUSTFLAGS}" cargo bench --target "$TARGET" -vv --manifest-path bench/Cargo.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 10c54f9b..95ef5cb2 100644 --- a/.github/workflows/ci.yml 
+++ b/.github/workflows/ci.yml @@ -95,8 +95,6 @@ jobs: target: armv7-unknown-linux-gnueabi - rust: nightly target: armv7-unknown-linux-gnueabihf - - rust: nightly - target: armv5te-unknown-linux-musleabi - rust: nightly target: arm-linux-androideabi - rust: nightly @@ -163,20 +161,31 @@ jobs: if: matrix.target != '' && !startsWith(matrix.target, 'i686') && !startsWith(matrix.target, 'x86_64') - run: echo "TARGET=--target=${{ matrix.target }}" >>"${GITHUB_ENV}" if: matrix.target != '' + - run: echo "REPORT_TIME=-- -Z unstable-options --report-time" >>"${GITHUB_ENV}" + if: startsWith(matrix.rust, 'nightly') # Since nightly-2022-12-23, -Z build-std + -Z randomize-layout + release mode on windows causes segfault. - run: echo "RANDOMIZE_LAYOUT=-Z randomize-layout" >>"${GITHUB_ENV}" if: startsWith(matrix.rust, 'nightly') && !startsWith(matrix.os, 'windows') - - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME + # macOS is skipped because it is +cmpxchg16b by default + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME + env: + # Note: This cfg is intended to make it easy for portable-atomic developers + # to test has_cmpxchg16b == false, has_lse == false, or __kuser_helper_version < 5 cases, + # and is not a public API. + RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} --cfg portable_atomic_test_outline_atomics_detect_false + RUSTFLAGS: ${{ env.RUSTFLAGS }} --cfg portable_atomic_test_outline_atomics_detect_false + if: (matrix.target == '' || startsWith(matrix.target, 'x86_64')) && !startsWith(matrix.os, 'macos') || startsWith(matrix.target, 'aarch64') || startsWith(matrix.target, 'armv5te') || matrix.target == 'arm-linux-androideabi' # -crt-static - - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME env: RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-feature=-crt-static RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-feature=-crt-static if: contains(matrix.target, '-musl') # +cmpxchg16b # macOS is skipped because it is +cmpxchg16b by default - - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME env: RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-feature=+cmpxchg16b RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-feature=+cmpxchg16b @@ -184,21 +193,21 @@ jobs: # +lse # As of QEMU 7.2, QEMU has not yet implemented FEAT_LSE2: https://linaro.atlassian.net/browse/QEMU-300 # FEAT_LSE2 is tested on Cirrus CI's aarch64 macOS VM. 
- - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME env: RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-feature=+lse RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-feature=+lse if: startsWith(matrix.target, 'aarch64') # pwr7 # powerpc64- (big-endian) is skipped because it is pre-pwr8 by default - - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME env: RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-cpu=pwr7 RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-cpu=pwr7 if: startsWith(matrix.target, 'powerpc64le-') # pwr8 # powerpc64le- (little-endian) is skipped because it is pwr8 by default - - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD + - run: tools/test.sh -vv $TARGET $DOCTEST_XCOMPILE $BUILD_STD $REPORT_TIME env: RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-cpu=pwr8 RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-cpu=pwr8 @@ -303,8 +312,7 @@ jobs: - run: sudo apt-get -o Acquire::Retries=10 -qq update && sudo apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends moreutils - run: echo "TARGET=--target=${{ matrix.target }}" >>"${GITHUB_ENV}" if: matrix.target != '' - - run: | - cargo miri test --workspace --all-features $EXCLUDE $TARGET 2>&1 | ts -i '%.s ' + - run: cargo miri test --workspace --all-features $EXCLUDE $TARGET 2>&1 | ts -i '%.s ' env: MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-retag-fields -Zmiri-disable-isolation RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -Z randomize-layout @@ -351,7 +359,7 @@ jobs: valgrind: env: - CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER: valgrind -v --error-exitcode=1 --error-limit=no --leak-check=full --show-leak-kinds=all --track-origins=yes + VALGRIND: valgrind runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -360,17 +368,14 @@ jobs: persist-credentials: false - name: Install Rust run: rustup toolchain add nightly --no-self-update --component rust-src && rustup default nightly + - run: sudo apt-get -o Acquire::Retries=10 -qq update && sudo apt-get -o Acquire::Retries=10 -o Dpkg::Use-Pty=0 install -y --no-install-recommends moreutils - uses: taiki-e/install-action@valgrind - # doctests on Valgrind are very slow - - run: tools/test.sh -vv --tests - env: - RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} --cfg valgrind - RUSTFLAGS: ${{ env.RUSTFLAGS }} --cfg valgrind + - run: tools/test.sh -vv 2>&1 | ts -i '%.s ' # +cmpxchg16b - - run: tools/test.sh -vv --tests + - run: tools/test.sh -vv 2>&1 | ts -i '%.s ' env: - RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-feature=+cmpxchg16b --cfg valgrind - RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-feature=+cmpxchg16b --cfg valgrind + RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-feature=+cmpxchg16b + RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-feature=+cmpxchg16b codegen: runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index 7d99d156..72e542d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,7 +67,6 @@ crossbeam-utils = "0.8" fastrand = "1" paste = "1" quickcheck = { default-features = false, git = "https://github.com/taiki-e/quickcheck.git", branch = "dev" } # https://github.com/BurntSushi/quickcheck/pull/304 + https://github.com/BurntSushi/quickcheck/pull/282 + lower MSRV -rustversion = "1" serde_test = "1" sptr = "0.3" static_assertions = "1" diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 83f51163..9b6ea215 100644 --- a/bench/Cargo.toml +++ 
b/bench/Cargo.toml @@ -19,7 +19,6 @@ crossbeam-utils = "0.8" fastrand = "1" paste = "1" quickcheck = { default-features = false, git = "https://github.com/taiki-e/quickcheck.git", branch = "dev" } # https://github.com/BurntSushi/quickcheck/pull/304 + https://github.com/BurntSushi/quickcheck/pull/282 + lower MSRV -rustversion = "1" static_assertions = "1" [[bench]] diff --git a/src/imp/arm_linux.rs b/src/imp/arm_linux.rs index 892efa24..8633ce35 100644 --- a/src/imp/arm_linux.rs +++ b/src/imp/arm_linux.rs @@ -16,7 +16,7 @@ mod fallback; #[cfg(not(portable_atomic_no_asm))] use core::arch::asm; -use core::{mem, sync::atomic::Ordering}; +use core::{cell::UnsafeCell, mem, sync::atomic::Ordering}; /// A 64-bit value represented as a pair of 32-bit values. /// @@ -56,10 +56,17 @@ fn __kuser_helper_version() -> i32 { } #[inline] fn has_kuser_cmpxchg64() -> bool { + // Note: This cfg is intended to make it easy for portable-atomic developers + // to test __kuser_helper_version < 5 cases, and is not a public API. + if cfg!(portable_atomic_test_outline_atomics_detect_false) { + return false; + } __kuser_helper_version() >= 5 } #[inline] unsafe fn __kuser_cmpxchg64(old_val: *const u64, new_val: *const u64, ptr: *mut u64) -> bool { + debug_assert!(ptr as usize % 8 == 0); + debug_assert!(has_kuser_cmpxchg64()); // SAFETY: the caller must uphold the safety contract. unsafe { let f: extern "C" fn(*const u64, *const u64, *mut u64) -> u32 = @@ -91,9 +98,6 @@ unsafe fn atomic_update_kuser_cmpxchg64(dst: *mut u64, mut f: F) -> u64 where F: FnMut(u64) -> u64, { - debug_assert!(dst as usize % 8 == 0); - debug_assert!(has_kuser_cmpxchg64()); - // SAFETY: the caller must uphold the safety contract. unsafe { loop { @@ -130,7 +134,7 @@ macro_rules! atomic_with_ifunc { if has_kuser_cmpxchg64() { kuser_cmpxchg64_fn } else { - // Use SeqCst because __kuser_cmpxchg64 is SeqCst. + // Use SeqCst because __kuser_cmpxchg64 is always SeqCst. // https://github.com/torvalds/linux/blob/v6.1/arch/arm/kernel/entry-armv.S#L918-L925 fallback::$seqcst_fallback_fn } @@ -264,7 +268,7 @@ macro_rules! atomic64 { ($atomic_type:ident, $int_type:ident, $atomic_max:ident, $atomic_min:ident) => { #[repr(C, align(8))] pub(crate) struct $atomic_type { - v: core::cell::UnsafeCell<$int_type>, + v: UnsafeCell<$int_type>, } // Send is implicitly implemented. @@ -276,7 +280,7 @@ macro_rules! atomic64 { impl $atomic_type { #[inline] pub(crate) const fn new(v: $int_type) -> Self { - Self { v: core::cell::UnsafeCell::new(v) } + Self { v: UnsafeCell::new(v) } } #[inline] diff --git a/src/imp/atomic128/README.md b/src/imp/atomic128/README.md index 486b2f30..2ffcd300 100644 --- a/src/imp/atomic128/README.md +++ b/src/imp/atomic128/README.md @@ -15,6 +15,19 @@ On compiler versions or platforms where these are not supported, the fallback im See [aarch64.rs](aarch64.rs) module-level comments for more details on the instructions used on aarch64. +## Comparison with core::intrinsics::atomic_\* (core::sync::atomic::Atomic{I,U}128) + +This directory has target-specific implementations with inline assembly ([aarch64.rs](aarch64.rs), [x86_64.rs](x86_64.rs), [powerpc64.rs](powerpc64.rs), [s390x.rs](s390x.rs)) and an implementation without inline assembly ([intrinsics.rs](intrinsics.rs)). The latter currently always needs nightly compilers and is only used for Miri and ThreadSanitizer, which do not support inline assembly. 
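The CAS-loop approach referred to in the comparison below (and used by `atomic_update` in intrinsics.rs later in this patch) is the usual compare-exchange retry loop: load the current value, compute the new value, and retry a weak compare-exchange until it succeeds. A minimal sketch of that shape, shown with the stable `AtomicU64` API purely for illustration rather than the crate's unsafe 128-bit internals:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// The same retry-loop shape as `atomic_update` in intrinsics.rs, shown with
// AtomicU64 so the sketch compiles on stable Rust.
fn fetch_max_via_cas(a: &AtomicU64, val: u64, order: Ordering) -> u64 {
    // The first load only needs the current value; the compare-exchange below
    // provides the requested ordering.
    let mut old = a.load(Ordering::Relaxed);
    loop {
        let new = old.max(val);
        // A weak CAS may fail spuriously; on failure it returns the value it
        // actually found, and we retry with that.
        match a.compare_exchange_weak(old, new, order, Ordering::Relaxed) {
            Ok(prev) => return prev,
            Err(prev) => old = prev,
        }
    }
}

fn main() {
    let a = AtomicU64::new(3);
    assert_eq!(fetch_max_via_cas(&a, 7, Ordering::AcqRel), 3);
    assert_eq!(a.load(Ordering::Relaxed), 7);
}
```

The 128-bit versions in this patch follow the same structure, only over raw `*mut u128` with the platform's 128-bit compare-exchange underneath.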
+ +Implementations with inline assembly generate assembly almost equivalent to the `core::intrinsics::atomic_*` (used in `core::sync::atomic::Atomic{I,U}128`) for many operations, but for some operations one or the other generates more efficient code. For example: + +- On x86_64, the implementation with inline assembly contains additional optimizations (e.g., [#16](https://github.com/taiki-e/portable-atomic/pull/16)) and is much faster for some operations. +- On aarch64, the implementation with inline assembly supports outline-atomics on more operating systems, and may be faster in environments where outline-atomics can improve performance. +- On powerpc64 and s390x, LLVM does not support generating some 128-bit atomic operations (see [intrinsics.rs](intrinsics.rs) module-level comments), and we use a CAS loop to implement them, so the implementation with inline assembly may be faster for those operations. +- In implementations without inline assembly, the compiler may reuse condition flags that have changed as a result of the operation, or use immediate values instead of registers, depending on the situation. + +As 128-bit atomics-related APIs stabilize in the standard library, implementations with inline assembly are planned to be updated to get the benefits of both. + ## Run-time feature detection [detect](detect) module has run-time feature detection implementations. diff --git a/src/imp/atomic128/aarch64.rs b/src/imp/atomic128/aarch64.rs index d4b4c735..ccced4d5 100644 --- a/src/imp/atomic128/aarch64.rs +++ b/src/imp/atomic128/aarch64.rs @@ -48,9 +48,9 @@ // - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit // // Generated asm: -// - aarch64 https://godbolt.org/z/qKPb1asj4 -// - aarch64 (+lse) https://godbolt.org/z/dqxj9z9Ps -// - aarch64 (+lse,+lse2) https://godbolt.org/z/x4a135Psb +// - aarch64 https://godbolt.org/z/jz7rGK8hc +// - aarch64 (+lse) https://godbolt.org/z/sK3sEa8jP +// - aarch64 (+lse,+lse2) https://godbolt.org/z/P564r3EG9 include!("macros.rs"); @@ -101,6 +101,40 @@ mod detect_macos; use core::arch::asm; use core::sync::atomic::Ordering; +#[cfg(any( + target_feature = "lse", + portable_atomic_target_feature = "lse", + all(not(portable_atomic_no_aarch64_target_feature), not(portable_atomic_no_outline_atomics)), +))] +macro_rules! debug_assert_lse { + () => { + #[cfg(all( + not(portable_atomic_no_outline_atomics), + any( + all( + target_os = "linux", + any( + target_env = "gnu", + all( + any(target_env = "musl", target_env = "ohos"), + not(target_feature = "crt-static"), + ), + ), + ), + target_os = "android", + target_os = "freebsd", + target_os = "openbsd", + target_os = "fuchsia", + target_os = "windows", + ), + ))] + #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] + { + debug_assert!(detect::detect().has_lse()); + } + }; +} + #[cfg(target_pointer_width = "32")] macro_rules! ptr_modifier { () => { @@ -170,7 +204,7 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. // cfg guarantee that the CPU supports FEAT_LSE. unsafe { - _atomic_compare_exchange_casp(src, 0, 0, order) + _atomic_load_casp(src, order) } #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] // SAFETY: the caller must uphold the safety contract.
@@ -215,6 +249,46 @@ unsafe fn atomic_load_ldp(src: *mut u128, order: Ordering) -> u128 { U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole } } +// Do not use _atomic_compare_exchange_casp because it needs extra MOV to implement load. +#[cfg(any(test, not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))))] +#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] +#[inline] +unsafe fn _atomic_load_casp(src: *mut u128, order: Ordering) -> u128 { + debug_assert!(src as usize % 16 == 0); + debug_assert_lse!(); + + // SAFETY: the caller must uphold the safety contract. + // cfg guarantee that the CPU supports FEAT_LSE. + unsafe { + let (prev_lo, prev_hi); + macro_rules! atomic_load { + ($acquire:tt, $release:tt) => { + asm!( + concat!("casp", $acquire, $release, " x4, x5, x4, x5, [{src", ptr_modifier!(), "}]"), + src = in(reg) src, + // must be allocated to even/odd register pair + inout("x4") 0_u64 => prev_lo, + inout("x5") 0_u64 => prev_hi, + options(nostack, preserves_flags), + ) + }; + } + match order { + Ordering::Relaxed => atomic_load!("", ""), + Ordering::Acquire => atomic_load!("a", ""), + Ordering::SeqCst => atomic_load!("a", "l"), + _ => unreachable!("{:?}", order), + } + U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole + } +} +#[cfg(any( + test, + all( + not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")), + not(any(target_feature = "lse", portable_atomic_target_feature = "lse")), + ), +))] #[inline] unsafe fn _atomic_load_ldxp_stxp(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); @@ -366,24 +440,26 @@ unsafe fn atomic_compare_exchange( target_feature(enable = "lse") )] unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128; - _atomic_compare_exchange_casp_relaxed + atomic_compare_exchange_casp_relaxed = _atomic_compare_exchange_casp(Ordering::Relaxed); - _atomic_compare_exchange_casp_acquire + atomic_compare_exchange_casp_acquire = _atomic_compare_exchange_casp(Ordering::Acquire); - _atomic_compare_exchange_casp_release + atomic_compare_exchange_casp_release = _atomic_compare_exchange_casp(Ordering::Release); - _atomic_compare_exchange_casp_acqrel + // AcqRel and SeqCst RMWs are equivalent. + atomic_compare_exchange_casp_acqrel = _atomic_compare_exchange_casp(Ordering::AcqRel); } fn_alias! { unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128; - _atomic_compare_exchange_ldxp_stxp_relaxed + atomic_compare_exchange_ldxp_stxp_relaxed = _atomic_compare_exchange_ldxp_stxp(Ordering::Relaxed); - _atomic_compare_exchange_ldxp_stxp_acquire + atomic_compare_exchange_ldxp_stxp_acquire = _atomic_compare_exchange_ldxp_stxp(Ordering::Acquire); - _atomic_compare_exchange_ldxp_stxp_release + atomic_compare_exchange_ldxp_stxp_release = _atomic_compare_exchange_ldxp_stxp(Ordering::Release); - _atomic_compare_exchange_ldxp_stxp_acqrel + // AcqRel and SeqCst RMWs are equivalent. 
+ atomic_compare_exchange_ldxp_stxp_acqrel = _atomic_compare_exchange_ldxp_stxp(Ordering::AcqRel); } // SAFETY: the caller must guarantee that `dst` is valid for both writes and @@ -394,27 +470,27 @@ unsafe fn atomic_compare_exchange( Ordering::Relaxed => { ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { if detect::detect().has_lse() { - _atomic_compare_exchange_casp_relaxed + atomic_compare_exchange_casp_relaxed } else { - _atomic_compare_exchange_ldxp_stxp_relaxed + atomic_compare_exchange_ldxp_stxp_relaxed } }) } Ordering::Acquire => { ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { if detect::detect().has_lse() { - _atomic_compare_exchange_casp_acquire + atomic_compare_exchange_casp_acquire } else { - _atomic_compare_exchange_ldxp_stxp_acquire + atomic_compare_exchange_ldxp_stxp_acquire } }) } Ordering::Release => { ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { if detect::detect().has_lse() { - _atomic_compare_exchange_casp_release + atomic_compare_exchange_casp_release } else { - _atomic_compare_exchange_ldxp_stxp_release + atomic_compare_exchange_ldxp_stxp_release } }) } @@ -422,9 +498,9 @@ unsafe fn atomic_compare_exchange( Ordering::AcqRel | Ordering::SeqCst => { ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> u128 { if detect::detect().has_lse() { - _atomic_compare_exchange_casp_acqrel + atomic_compare_exchange_casp_acqrel } else { - _atomic_compare_exchange_ldxp_stxp_acqrel + atomic_compare_exchange_ldxp_stxp_acqrel } }) } @@ -455,6 +531,7 @@ unsafe fn _atomic_compare_exchange_casp( order: Ordering, ) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_lse!(); // SAFETY: the caller must guarantee that `dst` is valid for both writes and // reads, 16-byte aligned, that there are no concurrent non-atomic operations, @@ -486,6 +563,7 @@ unsafe fn _atomic_compare_exchange_casp( U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole } } +#[cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse"))))] #[inline] unsafe fn _atomic_compare_exchange_ldxp_stxp( dst: *mut u128, @@ -554,31 +632,32 @@ unsafe fn _atomic_compare_exchange_ldxp_stxp( // so we always use strong CAS for now. use self::atomic_compare_exchange as atomic_compare_exchange_weak; -#[inline] -unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { - #[cfg(all( - any(target_feature = "lse", portable_atomic_target_feature = "lse"), - not(portable_atomic_ll_sc_rmw), - ))] - // SAFETY: the caller must uphold the safety contract. - // cfg guarantee that the CPU supports FEAT_LSE. - unsafe { - _atomic_swap_casp(dst, val, order) - } - #[cfg(not(all( +// If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, +// we use CAS-based atomic RMW. +#[cfg(all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), +))] +use _atomic_swap_casp as atomic_swap; +#[cfg(not(all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), +)))] +use _atomic_swap_ldxp_stxp as atomic_swap; +// Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. +#[cfg(any( + test, + all( any(target_feature = "lse", portable_atomic_target_feature = "lse"), not(portable_atomic_ll_sc_rmw), - )))] - // SAFETY: the caller must uphold the safety contract. - unsafe { - _atomic_swap_ldxp_stxp(dst, val, order) - } -} -// Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. 
+ ) +))] #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] #[inline] unsafe fn _atomic_swap_casp(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_lse!(); + // SAFETY: the caller must uphold the safety contract. // cfg guarantee that the CPU supports FEAT_LSE. unsafe { @@ -619,6 +698,13 @@ unsafe fn _atomic_swap_casp(dst: *mut u128, val: u128, order: Ordering) -> u128 } } // Do not use atomic_rmw_ll_sc_3 because it needs extra MOV to implement swap. +#[cfg(any( + test, + not(all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), + )) +))] #[inline] unsafe fn _atomic_swap_ldxp_stxp(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); @@ -658,7 +744,7 @@ unsafe fn _atomic_swap_ldxp_stxp(dst: *mut u128, val: u128, order: Ordering) -> /// - prev_lo/prev_hi pair: previous value loaded by ll (read-only for `$op`) /// - new_lo/new_hi pair: new value that will to stored by sc macro_rules! atomic_rmw_ll_sc_3 { - ($name:ident as $reexport_name:ident, options($($options:tt)*), $($op:tt)*) => { + ($name:ident as $reexport_name:ident $(($preserves_flags:tt))?, $($op:tt)*) => { // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, // we use CAS-based atomic RMW generated by atomic_rmw_cas_3! macro instead. #[cfg(not(all( @@ -666,6 +752,13 @@ macro_rules! atomic_rmw_ll_sc_3 { not(portable_atomic_ll_sc_rmw), )))] use $name as $reexport_name; + #[cfg(any( + test, + not(all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), + )) + ))] #[inline] unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); @@ -690,7 +783,7 @@ macro_rules! atomic_rmw_ll_sc_3 { new_lo = out(reg) _, new_hi = out(reg) _, r = out(reg) _, - options($($options)*), + options(nostack $(, $preserves_flags)?), ) }; } @@ -698,16 +791,6 @@ macro_rules! atomic_rmw_ll_sc_3 { U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole } } - #[cfg(test)] - paste::paste! { - // Helper to test $op separately. - unsafe fn [<$reexport_name _op>](dst: *mut u128, val: u128) -> u128 { - // SAFETY: the caller must uphold the safety contract. - unsafe { - $name(dst, val, Ordering::Relaxed) - } - } - } }; } /// Atomic RMW by CAS loop (3 arguments) @@ -726,10 +809,18 @@ macro_rules! atomic_rmw_cas_3 { not(portable_atomic_ll_sc_rmw), ))] use $name as $reexport_name; + #[cfg(any( + test, + all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), + ) + ))] #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] #[inline] unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_lse!(); // SAFETY: the caller must uphold the safety contract. // cfg guarantee that the CPU supports FEAT_LSE. unsafe { @@ -782,7 +873,7 @@ macro_rules! atomic_rmw_cas_3 { /// - prev_lo/prev_hi pair: previous value loaded by ll (read-only for `$op`) /// - new_lo/new_hi pair: new value that will to stored by sc macro_rules! 
atomic_rmw_ll_sc_2 { - ($name:ident as $reexport_name:ident, options($($options:tt)*), $($op:tt)*) => { + ($name:ident as $reexport_name:ident $(($preserves_flags:tt))?, $($op:tt)*) => { // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, // we use CAS-based atomic RMW generated by atomic_rmw_cas_2! macro instead. #[cfg(not(all( @@ -790,6 +881,13 @@ macro_rules! atomic_rmw_ll_sc_2 { not(portable_atomic_ll_sc_rmw), )))] use $name as $reexport_name; + #[cfg(any( + test, + not(all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), + )) + ))] #[inline] unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); @@ -811,7 +909,7 @@ macro_rules! atomic_rmw_ll_sc_2 { new_lo = out(reg) _, new_hi = out(reg) _, r = out(reg) _, - options($($options)*), + options(nostack $(, $preserves_flags)?), ) }; } @@ -819,16 +917,6 @@ macro_rules! atomic_rmw_ll_sc_2 { U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole } } - #[cfg(test)] - paste::paste! { - // Helper to test $op separately. - unsafe fn [<$reexport_name _op>](dst: *mut u128) -> u128 { - // SAFETY: the caller must uphold the safety contract. - unsafe { - $name(dst, Ordering::Relaxed) - } - } - } }; } /// Atomic RMW by CAS loop (2 arguments) @@ -846,10 +934,18 @@ macro_rules! atomic_rmw_cas_2 { not(portable_atomic_ll_sc_rmw), ))] use $name as $reexport_name; + #[cfg(any( + test, + all( + any(target_feature = "lse", portable_atomic_target_feature = "lse"), + not(portable_atomic_ll_sc_rmw), + ) + ))] #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] #[inline] unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_lse!(); // SAFETY: the caller must uphold the safety contract. // cfg guarantee that the CPU supports FEAT_LSE. unsafe { @@ -892,10 +988,9 @@ macro_rules! atomic_rmw_cas_2 { }; } +// Do not use `preserves_flags` because ADDS and ADCS modify the condition flags. atomic_rmw_ll_sc_3! { _atomic_add_ldxp_stxp as atomic_add, - // Do not use `preserves_flags` because ADDS and ADCS modify the condition flags. - options(nostack), concat!( "adds ", select_le_or_be!("{new_lo}, {prev_lo}, {val_lo}", "{new_hi}, {prev_hi}, {val_hi}") @@ -917,10 +1012,9 @@ atomic_rmw_cas_3! { ), } +// Do not use `preserves_flags` because SUBS and SBCS modify the condition flags. atomic_rmw_ll_sc_3! { _atomic_sub_ldxp_stxp as atomic_sub, - // Do not use `preserves_flags` because SUBS and SBCS modify the condition flags. - options(nostack), concat!( "subs ", select_le_or_be!("{new_lo}, {prev_lo}, {val_lo}", "{new_hi}, {prev_hi}, {val_hi}") @@ -943,8 +1037,7 @@ atomic_rmw_cas_3! { } atomic_rmw_ll_sc_3! { - _atomic_and_ldxp_stxp as atomic_and, - options(nostack, preserves_flags), + _atomic_and_ldxp_stxp as atomic_and (preserves_flags), "and {new_lo}, {prev_lo}, {val_lo}", "and {new_hi}, {prev_hi}, {val_hi}", } @@ -955,8 +1048,7 @@ atomic_rmw_cas_3! { } atomic_rmw_ll_sc_3! { - _atomic_nand_ldxp_stxp as atomic_nand, - options(nostack, preserves_flags), + _atomic_nand_ldxp_stxp as atomic_nand (preserves_flags), "and {new_lo}, {prev_lo}, {val_lo}", "mvn {new_lo}, {new_lo}", "and {new_hi}, {prev_hi}, {val_hi}", @@ -971,8 +1063,7 @@ atomic_rmw_cas_3! { } atomic_rmw_ll_sc_3! 
{ - _atomic_or_ldxp_stxp as atomic_or, - options(nostack, preserves_flags), + _atomic_or_ldxp_stxp as atomic_or (preserves_flags), "orr {new_lo}, {prev_lo}, {val_lo}", "orr {new_hi}, {prev_hi}, {val_hi}", } @@ -983,8 +1074,7 @@ atomic_rmw_cas_3! { } atomic_rmw_ll_sc_3! { - _atomic_xor_ldxp_stxp as atomic_xor, - options(nostack, preserves_flags), + _atomic_xor_ldxp_stxp as atomic_xor (preserves_flags), "eor {new_lo}, {prev_lo}, {val_lo}", "eor {new_hi}, {prev_hi}, {val_hi}", } @@ -995,8 +1085,7 @@ atomic_rmw_cas_3! { } atomic_rmw_ll_sc_2! { - _atomic_not_ldxp_stxp as atomic_not, - options(nostack, preserves_flags), + _atomic_not_ldxp_stxp as atomic_not (preserves_flags), "mvn {new_lo}, {prev_lo}", "mvn {new_hi}, {prev_hi}", } @@ -1006,10 +1095,9 @@ atomic_rmw_cas_2! { "mvn x5, x7", } +// Do not use `preserves_flags` because NEGS modifies the condition flags. atomic_rmw_ll_sc_2! { _atomic_neg_ldxp_stxp as atomic_neg, - // Do not use `preserves_flags` because NEGS modifies the condition flags. - options(nostack), concat!("negs ", select_le_or_be!("{new_lo}, {prev_lo}", "{new_hi}, {prev_hi}")), concat!("ngc ", select_le_or_be!("{new_hi}, {prev_hi}", "{new_lo}, {prev_lo}")), } @@ -1019,10 +1107,9 @@ atomic_rmw_cas_2! { concat!("ngc ", select_le_or_be!("x5, x7", "x4, x6")), } +// Do not use `preserves_flags` because CMP and SBCS modify the condition flags. atomic_rmw_ll_sc_3! { _atomic_max_ldxp_stxp as atomic_max, - // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. - options(nostack), select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), "csel {new_hi}, {prev_hi}, {val_hi}, lt", // select hi 64-bit @@ -1036,10 +1123,9 @@ atomic_rmw_cas_3! { "csel x4, x6, {val_lo}, lt", // select lo 64-bit } +// Do not use `preserves_flags` because CMP and SBCS modify the condition flags. atomic_rmw_ll_sc_3! { _atomic_umax_ldxp_stxp as atomic_umax, - // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. - options(nostack), select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), "csel {new_hi}, {prev_hi}, {val_hi}, lo", // select hi 64-bit @@ -1053,10 +1139,9 @@ atomic_rmw_cas_3! { "csel x4, x6, {val_lo}, lo", // select lo 64-bit } +// Do not use `preserves_flags` because CMP and SBCS modify the condition flags. atomic_rmw_ll_sc_3! { _atomic_min_ldxp_stxp as atomic_min, - // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. - options(nostack), select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), "csel {new_hi}, {prev_hi}, {val_hi}, ge", // select hi 64-bit @@ -1070,10 +1155,9 @@ atomic_rmw_cas_3! { "csel x4, x6, {val_lo}, ge", // select lo 64-bit } +// Do not use `preserves_flags` because CMP and SBCS modify the condition flags. atomic_rmw_ll_sc_3! { _atomic_umin_ldxp_stxp as atomic_umin, - // Do not use `preserves_flags` because CMP and SBCS modify the condition flags. 
- options(nostack), select_le_or_be!("cmp {val_lo}, {prev_lo}", "cmp {val_hi}, {prev_hi}"), select_le_or_be!("sbcs xzr, {val_hi}, {prev_hi}", "sbcs xzr, {val_lo}, {prev_lo}"), "csel {new_hi}, {prev_hi}, {val_hi}, hs", // select hi 64-bit @@ -1102,40 +1186,4 @@ mod tests { test_atomic_int!(i128); test_atomic_int!(u128); - - test_atomic128_op!(); -} - -#[cfg(test)] -#[allow(dead_code, clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)] -mod tests_no_outline_atomics { - use super::*; - - #[inline] - unsafe fn atomic_compare_exchange( - dst: *mut u128, - old: u128, - new: u128, - success: Ordering, - _failure: Ordering, - ) -> Result { - // SAFETY: the caller must uphold the safety contract. - let res = unsafe { _atomic_compare_exchange_ldxp_stxp(dst, old, new, success) }; - if res == old { - Ok(res) - } else { - Err(res) - } - } - - // LLVM appears to generate strong CAS for aarch64 128-bit weak CAS, - // so we always use strong CAS. - use self::atomic_compare_exchange as atomic_compare_exchange_weak; - - atomic128!(AtomicI128, i128, atomic_max, atomic_min); - atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); - - // Do not put this in the nested tests module due to glob imports refer to super::super::Atomic*. - test_atomic_int!(i128); - test_atomic_int!(u128); } diff --git a/src/imp/atomic128/detect/common.rs b/src/imp/atomic128/detect/common.rs index a23eb7c6..086d3054 100644 --- a/src/imp/atomic128/detect/common.rs +++ b/src/imp/atomic128/detect/common.rs @@ -33,7 +33,12 @@ pub(crate) fn detect() -> CpuInfo { return info; } info.set(CpuInfo::INIT); - _detect(&mut info); + // Note: This cfg is intended to make it easy for portable-atomic developers + // to test has_cmpxchg16b == false or has_lse == false cases, + // and is not a public API. + if !cfg!(portable_atomic_test_outline_atomics_detect_false) { + _detect(&mut info); + } CACHE.store(info.0, Ordering::Relaxed); info } @@ -60,7 +65,7 @@ impl CpuInfo { pub(crate) fn has_lse(self) -> bool { #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] { - // FEAT_LSE is statically available. + // FEAT_LSE is available at compile-time. true } #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] @@ -82,7 +87,7 @@ impl CpuInfo { pub(crate) fn has_cmpxchg16b(self) -> bool { #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] { - // CMPXCHG16B is statically available. + // CMPXCHG16B is available at compile-time. true } #[cfg(not(any( @@ -246,6 +251,7 @@ mod tests_common { let _ = stdout.write_all(features.as_bytes()); } + #[cfg(not(portable_atomic_test_outline_atomics_detect_false))] #[cfg(target_arch = "aarch64")] #[test] fn test_detect() { diff --git a/src/imp/atomic128/detect/x86_64.rs b/src/imp/atomic128/detect/x86_64.rs index fca3d354..d3dc6746 100644 --- a/src/imp/atomic128/detect/x86_64.rs +++ b/src/imp/atomic128/detect/x86_64.rs @@ -1,7 +1,12 @@ // Adapted from https://github.com/rust-lang/stdarch. 
#![cfg_attr( - any(not(target_feature = "sse"), miri, portable_atomic_sanitize_thread), + any( + not(target_feature = "sse"), + any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), + miri, + portable_atomic_sanitize_thread, + ), allow(dead_code) )] @@ -29,7 +34,7 @@ unsafe fn __cpuid(leaf: u32) -> CpuidResult { // rbx is reserved by LLVM "mov {ebx_tmp:r}, rbx", "cpuid", - "xchg {ebx_tmp:r}, rbx", + "xchg {ebx_tmp:r}, rbx", // restore rbx ebx_tmp = out(reg) ebx, inout("eax") leaf => eax, inout("ecx") 0 => ecx, @@ -105,8 +110,10 @@ fn _detect(info: &mut CpuInfo) { )] #[cfg(test)] mod tests { + #[cfg(not(portable_atomic_test_outline_atomics_detect_false))] use super::*; + #[cfg(not(portable_atomic_test_outline_atomics_detect_false))] #[test] // SGX doesn't support CPUID. // Miri doesn't support inline assembly. diff --git a/src/imp/atomic128/intrinsics.rs b/src/imp/atomic128/intrinsics.rs index 7dc5e985..15445dde 100644 --- a/src/imp/atomic128/intrinsics.rs +++ b/src/imp/atomic128/intrinsics.rs @@ -1,40 +1,60 @@ -// Atomic{I,U}128 implementation using core::intrinsics. +// Atomic{I,U}128 implementation without inline assembly. // -// Refs: https://github.com/rust-lang/rust/blob/1.68.0/library/core/src/sync/atomic.rs +// Note: This module is currently only enabled on Miri and ThreadSanitizer which +// do not support inline assembly. +// +// This uses `core::arch::x86_64::cmpxchg16b` on x86_64 and +// `core::intrinsics::atomic_*` on aarch64, powerpc64, and s390x. // -// On aarch64 and powerpc64, this module is currently only enabled on Miri and ThreadSanitizer -// which do not support inline assembly. (Note: on powerpc64, it requires LLVM 15+) -// On x86_64, this module is currently only enabled on benchmark. +// See README.md of this directory for performance comparison with the +// implementation with inline assembly. // -// Note that we cannot use this module on s390x because LLVM currently generates -// libcalls for operations other than load/store/cmpxchg: https://godbolt.org/z/5c9b3eYf7 +// Note: +// - This currently always needs nightly compilers. On x86_64, the stabilization +// of `core::arch::x86_64::cmpxchg16b` has been recently merged to stdarch: +// https://github.com/rust-lang/stdarch/pull/1358 +// - On powerpc64, this requires LLVM 15+ and pwr8+ (quadword-atomics LLVM target feature): +// https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445 +// - On aarch64 big-endian, LLVM (as of 15) generates broken code. +// (on cfg(miri)/cfg(sanitize) it is fine though) +// - On s390x, LLVM (as of 16) generates libcalls for operations other than load/store/cmpxchg: +// https://godbolt.org/z/5a5T4hxMh +// - On powerpc64, LLVM (as of 16) doesn't support 128-bit atomic min/max: +// https://godbolt.org/z/3rebKcbdf // -// Note that we cannot use this module on aarch64_be (big-endian) because LLVM -// currently generates broken code. 
(on cfg(miri)/cfg(sanitize) it is fine though) +// Refs: https://github.com/rust-lang/rust/blob/1.68.0/library/core/src/sync/atomic.rs + +include!("macros.rs"); + +#[allow(dead_code)] // we only use compare_exchange +#[cfg(target_arch = "x86_64")] +#[cfg(not(target_feature = "cmpxchg16b"))] +#[path = "../fallback/outline_atomics.rs"] +mod fallback; + +#[cfg(target_arch = "x86_64")] +#[cfg(not(target_feature = "cmpxchg16b"))] +#[path = "detect/x86_64.rs"] +mod detect; +use core::sync::atomic::Ordering; +#[cfg(not(target_arch = "x86_64"))] use core::{ - cell::UnsafeCell, intrinsics, - sync::atomic::Ordering::{self, AcqRel, Acquire, Relaxed, Release, SeqCst}, + sync::atomic::Ordering::{AcqRel, Acquire, Relaxed, Release, SeqCst}, }; -// On x86_64, this module is only enabled on benchmark. -macro_rules! assert_cmpxchg16b { - () => { - #[cfg(all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")))] - { - assert!(std::is_x86_feature_detected!("cmpxchg16b")); - } - }; -} - #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { - crate::utils::assert_load_ordering(order); + #[cfg(target_arch = "x86_64")] + // SAFETY: the caller must uphold the safety contract. + unsafe { + let fail_order = crate::utils::strongest_failure_ordering(order); + match atomic_compare_exchange(src, 0, 0, order, fail_order) { + Ok(v) | Err(v) => v, + } + } + #[cfg(not(target_arch = "x86_64"))] // SAFETY: the caller must uphold the safety contract. unsafe { match order { @@ -47,47 +67,25 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { } #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { - crate::utils::assert_store_ordering(order); + #[cfg(target_arch = "x86_64")] // SAFETY: the caller must uphold the safety contract. unsafe { - match order { - Release => intrinsics::atomic_store_release(dst, val), - Relaxed => intrinsics::atomic_store_relaxed(dst, val), - SeqCst => intrinsics::atomic_store_seqcst(dst, val), - _ => unreachable!("{:?}", order), - } + atomic_swap(dst, val, order); } -} - -#[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] -unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { + #[cfg(not(target_arch = "x86_64"))] // SAFETY: the caller must uphold the safety contract. 
unsafe { match order { - Acquire => intrinsics::atomic_xchg_acquire(dst, val), - Release => intrinsics::atomic_xchg_release(dst, val), - AcqRel => intrinsics::atomic_xchg_acqrel(dst, val), - Relaxed => intrinsics::atomic_xchg_relaxed(dst, val), - SeqCst => intrinsics::atomic_xchg_seqcst(dst, val), + Release => intrinsics::atomic_store_release(dst, val), + Relaxed => intrinsics::atomic_store_relaxed(dst, val), + SeqCst => intrinsics::atomic_store_seqcst(dst, val), _ => unreachable!("{:?}", order), } } } #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_compare_exchange( dst: *mut u128, old: u128, @@ -95,7 +93,53 @@ unsafe fn atomic_compare_exchange( success: Ordering, failure: Ordering, ) -> Result { - crate::utils::assert_compare_exchange_ordering(success, failure); + #[cfg(target_arch = "x86_64")] + let (val, ok) = { + #[cfg_attr(not(target_feature = "cmpxchg16b"), target_feature(enable = "cmpxchg16b"))] + #[cfg_attr(target_feature = "cmpxchg16b", inline)] + #[cfg_attr(not(target_feature = "cmpxchg16b"), inline(never))] + unsafe fn cmpxchg16b( + dst: *mut u128, + old: u128, + new: u128, + success: Ordering, + failure: Ordering, + ) -> (u128, bool) { + debug_assert!(dst as usize % 16 == 0); + #[cfg(not(target_feature = "cmpxchg16b"))] + { + debug_assert!(detect::detect().has_cmpxchg16b()); + } + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned (required by CMPXCHG16B), that there are no + // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B. + let res = unsafe { core::arch::x86_64::cmpxchg16b(dst, old, new, success, failure) }; + (res, res == old) + } + let success = crate::utils::upgrade_success_ordering(success, failure); + #[cfg(target_feature = "cmpxchg16b")] + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned, that there are no concurrent non-atomic operations, + // and cfg guarantees that CMPXCHG16B is available at compile-time. + unsafe { + cmpxchg16b(dst, old, new, success, failure) + } + #[cfg(not(target_feature = "cmpxchg16b"))] + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses. + unsafe { + ifunc!(unsafe fn( + dst: *mut u128, old: u128, new: u128, success: Ordering, failure: Ordering + ) -> (u128, bool) { + if detect::detect().has_cmpxchg16b() { + cmpxchg16b + } else { + fallback::atomic_compare_exchange + } + }) + } + }; + #[cfg(not(target_arch = "x86_64"))] // SAFETY: the caller must uphold the safety contract. let (val, ok) = unsafe { match (success, failure) { @@ -124,11 +168,10 @@ unsafe fn atomic_compare_exchange( } } +#[cfg(target_arch = "x86_64")] +use atomic_compare_exchange as atomic_compare_exchange_weak; +#[cfg(not(target_arch = "x86_64"))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_compare_exchange_weak( dst: *mut u128, old: u128, @@ -136,7 +179,6 @@ unsafe fn atomic_compare_exchange_weak( success: Ordering, failure: Ordering, ) -> Result { - crate::utils::assert_compare_exchange_ordering(success, failure); // SAFETY: the caller must uphold the safety contract. 
let (val, ok) = unsafe { match (success, failure) { @@ -165,11 +207,52 @@ unsafe fn atomic_compare_exchange_weak( } } +#[inline(always)] +unsafe fn atomic_update(dst: *mut u128, order: Ordering, mut f: F) -> u128 +where + F: FnMut(u128) -> u128, +{ + // SAFETY: the caller must uphold the safety contract. + unsafe { + // This is a private function and all instances of `f` only operate on the value + // loaded, so there is no need to synchronize the first load/failed CAS. + let mut old = atomic_load(dst, Ordering::Relaxed); + loop { + let next = f(old); + match atomic_compare_exchange_weak(dst, old, next, order, Ordering::Relaxed) { + Ok(x) => return x, + Err(x) => old = x, + } + } + } +} + +// On x86_64, we use core::arch::x86_64::cmpxchg16b instead of core::intrinsics. +// On s390x, LLVM (as of 16) generates libcalls for operations other than load/store/cmpxchg: https://godbolt.org/z/5a5T4hxMh +#[cfg(any(target_arch = "x86_64", target_arch = "s390x"))] +atomic_rmw_by_atomic_update!(); +// On powerpc64, LLVM (as of 16) doesn't support 128-bit atomic min/max: https://godbolt.org/z/3rebKcbdf +#[cfg(target_arch = "powerpc64")] +atomic_rmw_by_atomic_update!(cmp); + +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] +#[inline] +unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { + // SAFETY: the caller must uphold the safety contract. + unsafe { + match order { + Acquire => intrinsics::atomic_xchg_acquire(dst, val), + Release => intrinsics::atomic_xchg_release(dst, val), + AcqRel => intrinsics::atomic_xchg_acqrel(dst, val), + Relaxed => intrinsics::atomic_xchg_relaxed(dst, val), + SeqCst => intrinsics::atomic_xchg_seqcst(dst, val), + _ => unreachable!("{:?}", order), + } + } +} + +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_add(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { @@ -184,11 +267,8 @@ unsafe fn atomic_add(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_sub(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { @@ -203,11 +283,8 @@ unsafe fn atomic_sub(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { @@ -222,11 +299,8 @@ unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_nand(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. 
unsafe { @@ -241,11 +315,8 @@ unsafe fn atomic_nand(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { @@ -260,11 +331,8 @@ unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_xor(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { @@ -279,96 +347,41 @@ unsafe fn atomic_xor(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } -#[inline(always)] -unsafe fn atomic_update(dst: *mut u128, order: Ordering, mut f: F) -> u128 -where - F: FnMut(u128) -> u128, -{ - // SAFETY: the caller must uphold the safety contract. - unsafe { - // This is a private function and all instances of `f` only operate on the value - // loaded, so there is no need to synchronize the first load/failed CAS. - let mut old = atomic_load(dst, Ordering::Relaxed); - loop { - let next = f(old); - match atomic_compare_exchange_weak(dst, old, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => old = x, - } - } - } -} - -/// returns the max value (signed comparison) +#[cfg(not(any(target_arch = "x86_64", target_arch = "powerpc64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] -unsafe fn atomic_max(dst: *mut i128, val: i128, order: Ordering) -> i128 { - // LLVM 15 doesn't support 128-bit atomic min/max for powerpc64. - #[cfg(target_arch = "powerpc64")] - #[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)] - // SAFETY: the caller must uphold the safety contract. - unsafe { - atomic_update(dst.cast::(), order, |x| core::cmp::max(x as i128, val) as u128) as i128 - } - #[cfg(not(target_arch = "powerpc64"))] +unsafe fn atomic_max(dst: *mut u128, val: u128, order: Ordering) -> i128 { // SAFETY: the caller must uphold the safety contract. unsafe { match order { - Acquire => intrinsics::atomic_max_acquire(dst, val), - Release => intrinsics::atomic_max_release(dst, val), - AcqRel => intrinsics::atomic_max_acqrel(dst, val), - Relaxed => intrinsics::atomic_max_relaxed(dst, val), - SeqCst => intrinsics::atomic_max_seqcst(dst, val), + Acquire => intrinsics::atomic_max_acquire(dst.cast::(), val as i128), + Release => intrinsics::atomic_max_release(dst.cast::(), val as i128), + AcqRel => intrinsics::atomic_max_acqrel(dst.cast::(), val as i128), + Relaxed => intrinsics::atomic_max_relaxed(dst.cast::(), val as i128), + SeqCst => intrinsics::atomic_max_seqcst(dst.cast::(), val as i128), _ => unreachable!("{:?}", order), } } } -/// returns the min value (signed comparison) +#[cfg(not(any(target_arch = "x86_64", target_arch = "powerpc64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] -unsafe fn atomic_min(dst: *mut i128, val: i128, order: Ordering) -> i128 { - // LLVM 15 doesn't support 128-bit atomic min/max for powerpc64. 
- #[cfg(target_arch = "powerpc64")] - #[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)] - // SAFETY: the caller must uphold the safety contract. - unsafe { - atomic_update(dst.cast::(), order, |x| core::cmp::min(x as i128, val) as u128) as i128 - } - #[cfg(not(target_arch = "powerpc64"))] +unsafe fn atomic_min(dst: *mut u128, val: u128, order: Ordering) -> i128 { // SAFETY: the caller must uphold the safety contract. unsafe { match order { - Acquire => intrinsics::atomic_min_acquire(dst, val), - Release => intrinsics::atomic_min_release(dst, val), - AcqRel => intrinsics::atomic_min_acqrel(dst, val), - Relaxed => intrinsics::atomic_min_relaxed(dst, val), - SeqCst => intrinsics::atomic_min_seqcst(dst, val), + Acquire => intrinsics::atomic_min_acquire(dst.cast::(), val as i128), + Release => intrinsics::atomic_min_release(dst.cast::(), val as i128), + AcqRel => intrinsics::atomic_min_acqrel(dst.cast::(), val as i128), + Relaxed => intrinsics::atomic_min_relaxed(dst.cast::(), val as i128), + SeqCst => intrinsics::atomic_min_seqcst(dst.cast::(), val as i128), _ => unreachable!("{:?}", order), } } } -/// returns the max value (unsigned comparison) +#[cfg(not(any(target_arch = "x86_64", target_arch = "powerpc64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_umax(dst: *mut u128, val: u128, order: Ordering) -> u128 { - // LLVM 15 doesn't support 128-bit atomic min/max for powerpc64. - #[cfg(target_arch = "powerpc64")] - // SAFETY: the caller must uphold the safety contract. - unsafe { - atomic_update(dst, order, |x| core::cmp::max(x, val)) - } - #[cfg(not(target_arch = "powerpc64"))] // SAFETY: the caller must uphold the safety contract. unsafe { match order { @@ -382,20 +395,9 @@ unsafe fn atomic_umax(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } -/// returns the min value (unsigned comparison) +#[cfg(not(any(target_arch = "x86_64", target_arch = "powerpc64", target_arch = "s390x")))] #[inline] -#[cfg_attr( - all(target_arch = "x86_64", not(target_feature = "cmpxchg16b")), - target_feature(enable = "cmpxchg16b") -)] unsafe fn atomic_umin(dst: *mut u128, val: u128, order: Ordering) -> u128 { - // LLVM 15 doesn't support 128-bit atomic min/max for powerpc64. - #[cfg(target_arch = "powerpc64")] - // SAFETY: the caller must uphold the safety contract. - unsafe { - atomic_update(dst, order, |x| core::cmp::min(x, val)) - } - #[cfg(not(target_arch = "powerpc64"))] // SAFETY: the caller must uphold the safety contract. unsafe { match order { @@ -409,218 +411,43 @@ unsafe fn atomic_umin(dst: *mut u128, val: u128, order: Ordering) -> u128 { } } -macro_rules! atomic128 { - ($atomic_type:ident, $int_type:ident, $atomic_max:ident, $atomic_min:ident) => { - #[repr(C, align(16))] - pub(crate) struct $atomic_type { - v: UnsafeCell<$int_type>, - } - - // Send is implicitly implemented. - // SAFETY: any data races are prevented by atomic intrinsics. 
- unsafe impl Sync for $atomic_type {} - - impl_default_no_fetch_ops!($atomic_type, $int_type); - impl_default_bit_opts!($atomic_type, $int_type); - impl $atomic_type { - #[inline] - pub(crate) const fn new(v: $int_type) -> Self { - Self { v: UnsafeCell::new(v) } - } - - #[inline] - pub(crate) fn is_lock_free() -> bool { - Self::is_always_lock_free() - } - #[inline] - pub(crate) const fn is_always_lock_free() -> bool { - true - } - - #[inline] - pub(crate) fn get_mut(&mut self) -> &mut $int_type { - self.v.get_mut() - } - - #[inline] - pub(crate) fn into_inner(self) -> $int_type { - self.v.into_inner() - } - - #[inline] - #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] - pub(crate) fn load(&self, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_load(self.v.get().cast::(), order) as $int_type } - } - - #[inline] - #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] - pub(crate) fn store(&self, val: $int_type, order: Ordering) { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_store(self.v.get().cast::(), val as u128, order) } - } - - #[inline] - pub(crate) fn swap(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_swap(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] - pub(crate) fn compare_exchange( - &self, - current: $int_type, - new: $int_type, - success: Ordering, - failure: Ordering, - ) -> Result<$int_type, $int_type> { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { - match atomic_compare_exchange( - self.v.get().cast::(), - current as u128, - new as u128, - success, - failure, - ) { - Ok(v) => Ok(v as $int_type), - Err(v) => Err(v as $int_type), - } - } - } - - #[inline] - #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] - pub(crate) fn compare_exchange_weak( - &self, - current: $int_type, - new: $int_type, - success: Ordering, - failure: Ordering, - ) -> Result<$int_type, $int_type> { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { - match atomic_compare_exchange_weak( - self.v.get().cast::(), - current as u128, - new as u128, - success, - failure, - ) { - Ok(v) => Ok(v as $int_type), - Err(v) => Err(v as $int_type), - } - } - } - - #[inline] - pub(crate) fn fetch_add(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. 
- unsafe { atomic_add(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - pub(crate) fn fetch_sub(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_sub(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - pub(crate) fn fetch_and(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_and(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - pub(crate) fn fetch_nand(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_nand(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - pub(crate) fn fetch_or(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_or(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - pub(crate) fn fetch_xor(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { atomic_xor(self.v.get().cast::(), val as u128, order) as $int_type } - } - - #[inline] - pub(crate) fn fetch_max(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { $atomic_max(self.v.get(), val, order) } - } - - #[inline] - pub(crate) fn fetch_min(&self, val: $int_type, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. - unsafe { $atomic_min(self.v.get(), val, order) } - } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] +#[inline] +unsafe fn atomic_not(dst: *mut u128, order: Ordering) -> u128 { + // SAFETY: the caller must uphold the safety contract. + unsafe { atomic_xor(dst, core::u128::MAX, order) } +} - #[inline] - pub(crate) fn fetch_not(&self, order: Ordering) -> $int_type { - const NOT_MASK: $int_type = (0 as $int_type).wrapping_sub(1); - self.fetch_xor(NOT_MASK, order) - } - #[inline] - pub(crate) fn not(&self, order: Ordering) { - self.fetch_not(order); - } +#[cfg(not(any(target_arch = "x86_64", target_arch = "s390x")))] +#[inline] +unsafe fn atomic_neg(dst: *mut u128, order: Ordering) -> u128 { + // SAFETY: the caller must uphold the safety contract. + unsafe { atomic_update(dst, order, u128::wrapping_neg) } +} - #[inline] - pub(crate) fn fetch_neg(&self, order: Ordering) -> $int_type { - assert_cmpxchg16b!(); - // SAFETY: any data races are prevented by atomic intrinsics and the raw - // pointer passed in is valid because we got it from a reference. 
- unsafe { - atomic_update(self.v.get().cast::(), order, u128::wrapping_neg) - as $int_type - } - } - #[inline] - pub(crate) fn neg(&self, order: Ordering) { - self.fetch_neg(order); - } +#[cfg(not(target_arch = "x86_64"))] +#[inline] +const fn is_lock_free() -> bool { + IS_ALWAYS_LOCK_FREE +} +#[cfg(not(target_arch = "x86_64"))] +const IS_ALWAYS_LOCK_FREE: bool = true; - #[inline] - pub(crate) const fn as_ptr(&self) -> *mut $int_type { - self.v.get() - } - } - }; +#[cfg(target_arch = "x86_64")] +#[inline] +fn is_lock_free() -> bool { + #[cfg(target_feature = "cmpxchg16b")] + { + // CMPXCHG16B is available at compile-time. + true + } + #[cfg(not(target_feature = "cmpxchg16b"))] + { + detect::detect().has_cmpxchg16b() + } } +#[cfg(target_arch = "x86_64")] +const IS_ALWAYS_LOCK_FREE: bool = cfg!(target_feature = "cmpxchg16b"); atomic128!(AtomicI128, i128, atomic_max, atomic_min); atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); diff --git a/src/imp/atomic128/macros.rs b/src/imp/atomic128/macros.rs index 7e1fe30b..77281c36 100644 --- a/src/imp/atomic128/macros.rs +++ b/src/imp/atomic128/macros.rs @@ -201,130 +201,58 @@ macro_rules! atomic128 { }; } -#[cfg(any( - target_arch = "s390x", - all( - target_arch = "x86_64", - any( - test, - not(any( - target_feature = "cmpxchg16b", - portable_atomic_target_feature = "cmpxchg16b", - )), - any(miri, portable_atomic_sanitize_thread), - ), - ), -))] +#[cfg(any(target_arch = "powerpc64", target_arch = "s390x", target_arch = "x86_64"))] +#[cfg_attr(any(target_arch = "powerpc64", target_arch = "x86_64"), allow(unused_macros))] // only used by intrinsics.rs macro_rules! atomic_rmw_by_atomic_update { () => { - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |_| val) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_add(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |x| x.wrapping_add(val)) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_sub(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |x| x.wrapping_sub(val)) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |x| x & val) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_nand(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. 
unsafe { atomic_update(dst, order, |x| !(x & val)) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |x| x | val) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_xor(dst: *mut u128, val: u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |x| x ^ val) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_not(dst: *mut u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, |x| !x) } } - #[cfg_attr( - target_arch = "s390x", - cfg(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )) - )] #[inline] unsafe fn atomic_neg(dst: *mut u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { atomic_update(dst, order, u128::wrapping_neg) } } + atomic_rmw_by_atomic_update!(cmp); + }; + (cmp) => { #[inline] unsafe fn atomic_max(dst: *mut u128, val: u128, order: Ordering) -> u128 { #[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)] diff --git a/src/imp/atomic128/powerpc64.rs b/src/imp/atomic128/powerpc64.rs index e23eca31..af1bad8c 100644 --- a/src/imp/atomic128/powerpc64.rs +++ b/src/imp/atomic128/powerpc64.rs @@ -22,8 +22,8 @@ // - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit // // Generated asm: -// - powerpc64 (pwr8) https://godbolt.org/z/xo6EWKojK -// - powerpc64le https://godbolt.org/z/z8ToMza5e +// - powerpc64 (pwr8) https://godbolt.org/z/4aGs41dEn +// - powerpc64le https://godbolt.org/z/oE3rPoqz4 include!("macros.rs"); @@ -229,6 +229,7 @@ use atomic_compare_exchange as atomic_compare_exchange_weak; #[inline] unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + // SAFETY: the caller must uphold the safety contract. unsafe { let val = U128 { whole: val }; @@ -524,5 +525,9 @@ mod tests { #[cfg(qemu)] test_atomic_int_load_store!(u128); + // Test operation parts of LL/SC-based atomic RMW implementations separately. + // + // This allows testing more code on QEMU while avoiding the problem of some + // atomic instructions not working on QEMU. test_atomic128_op!(); } diff --git a/src/imp/atomic128/s390x.rs b/src/imp/atomic128/s390x.rs index 9ec5d073..fd02cc41 100644 --- a/src/imp/atomic128/s390x.rs +++ b/src/imp/atomic128/s390x.rs @@ -3,23 +3,19 @@ // s390x supports 128-bit atomic load/store/cmpxchg: // https://github.com/llvm/llvm-project/commit/a11f63a952664f700f076fd754476a2b9eb158cc // -// Note that LLVM currently generates libcalls for other operations: https://godbolt.org/z/5c9b3eYf7 +// Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use +// this module and use intrinsics.rs instead. 
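// A rough sketch of what the `cmp` arm above boils down to: min/max are the only RMW ops that
// need a signedness-aware comparison, so they are expressed as closures over atomic_update, with
// the signed variants doing the comparison after an i128 cast round-trip. The helpers below are
// illustrative only, not this crate's API.
fn max_signed_step(current: u128, val: u128) -> u128 {
    // Reinterpret both sides as i128 (two's complement) for the comparison, store back as u128.
    core::cmp::max(current as i128, val as i128) as u128
}
fn min_unsigned_step(current: u128, val: u128) -> u128 {
    core::cmp::min(current, val)
}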
// // Refs: // - z/Architecture Reference Summary https://www.ibm.com/support/pages/zarchitecture-reference-summary // - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit // // Generated asm: -// - s390x https://godbolt.org/z/4Ms3M8x6c +// - s390x https://godbolt.org/z/oP5bhbqce include!("macros.rs"); -#[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, -)))] -use core::arch::asm; -use core::sync::atomic::Ordering; +use core::{arch::asm, sync::atomic::Ordering}; /// A 128-bit value represented as a pair of 64-bit values. /// @@ -42,21 +38,6 @@ struct Pair { unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); - // Miri and Sanitizer do not support inline assembly. - #[cfg(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics))] - // SAFETY: the caller must uphold the safety contract. - unsafe { - match order { - Ordering::Acquire => core::intrinsics::atomic_load_acquire(src), - Ordering::Relaxed => core::intrinsics::atomic_load_relaxed(src), - Ordering::SeqCst => core::intrinsics::atomic_load_seqcst(src), - _ => unreachable!("{:?}", order), - } - } - #[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )))] // SAFETY: the caller must uphold the safety contract. unsafe { // atomic load is always SeqCst. @@ -78,21 +59,6 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); - // Miri and Sanitizer do not support inline assembly. - #[cfg(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics))] - // SAFETY: the caller must uphold the safety contract. - unsafe { - match order { - Ordering::Release => core::intrinsics::atomic_store_release(dst, val), - Ordering::Relaxed => core::intrinsics::atomic_store_relaxed(dst, val), - Ordering::SeqCst => core::intrinsics::atomic_store_seqcst(dst, val), - _ => unreachable!("{:?}", order), - } - } - #[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )))] // SAFETY: the caller must uphold the safety contract. unsafe { let val = U128 { whole: val }; @@ -134,35 +100,6 @@ unsafe fn atomic_compare_exchange( ) -> Result { debug_assert!(dst as usize % 16 == 0); - // Miri and Sanitizer do not support inline assembly. - #[cfg(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics))] - // SAFETY: the caller must uphold the safety contract. 
- let res = unsafe { - use core::sync::atomic::Ordering::{AcqRel, Acquire, Relaxed, Release, SeqCst}; - match (success, failure) { - (Relaxed, Relaxed) => core::intrinsics::atomic_cxchg_relaxed_relaxed(dst, old, new), - (Relaxed, Acquire) => core::intrinsics::atomic_cxchg_relaxed_acquire(dst, old, new), - (Relaxed, SeqCst) => core::intrinsics::atomic_cxchg_relaxed_seqcst(dst, old, new), - (Acquire, Relaxed) => core::intrinsics::atomic_cxchg_acquire_relaxed(dst, old, new), - (Acquire, Acquire) => core::intrinsics::atomic_cxchg_acquire_acquire(dst, old, new), - (Acquire, SeqCst) => core::intrinsics::atomic_cxchg_acquire_seqcst(dst, old, new), - (Release, Relaxed) => core::intrinsics::atomic_cxchg_release_relaxed(dst, old, new), - (Release, Acquire) => core::intrinsics::atomic_cxchg_release_acquire(dst, old, new), - (Release, SeqCst) => core::intrinsics::atomic_cxchg_release_seqcst(dst, old, new), - (AcqRel, Relaxed) => core::intrinsics::atomic_cxchg_acqrel_relaxed(dst, old, new), - (AcqRel, Acquire) => core::intrinsics::atomic_cxchg_acqrel_acquire(dst, old, new), - (AcqRel, SeqCst) => core::intrinsics::atomic_cxchg_acqrel_seqcst(dst, old, new), - (SeqCst, Relaxed) => core::intrinsics::atomic_cxchg_seqcst_relaxed(dst, old, new), - (SeqCst, Acquire) => core::intrinsics::atomic_cxchg_seqcst_acquire(dst, old, new), - (SeqCst, SeqCst) => core::intrinsics::atomic_cxchg_seqcst_seqcst(dst, old, new), - _ => unreachable!("{:?}, {:?}", success, failure), - } - .0 - }; - #[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )))] // SAFETY: the caller must uphold the safety contract. let res = unsafe { // atomic CAS is always SeqCst. @@ -211,11 +148,6 @@ where } } -// Miri and Sanitizer do not support inline assembly. -#[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, -)))] #[inline] unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); @@ -257,20 +189,14 @@ unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { /// - r12/r13 pair: new value that will to stored // We could use atomic_update here, but using an inline assembly allows omitting // the comparison of results and the storing/comparing of condition flags. -#[rustfmt::skip] // buggy macro formatting macro_rules! atomic_rmw_cas_3 { ($name:ident, $($op:tt)*) => { - // Miri and Sanitizer do not support inline assembly. - #[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )))] #[inline] unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); // SAFETY: the caller must uphold the safety contract. unsafe { - // atomic swap is always SeqCst. + // atomic RMW is always SeqCst. let val = U128 { whole: val }; let (mut prev_hi, mut prev_lo); asm!( @@ -302,20 +228,14 @@ macro_rules! atomic_rmw_cas_3 { /// - r12/r13 pair: new value that will to stored // We could use atomic_update here, but using an inline assembly allows omitting // the comparison of results and the storing/comparing of condition flags. -#[rustfmt::skip] // buggy macro formatting macro_rules! atomic_rmw_cas_2 { ($name:ident, $($op:tt)*) => { - // Miri and Sanitizer do not support inline assembly. 
- #[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, - )))] #[inline] unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); // SAFETY: the caller must uphold the safety contract. unsafe { - // atomic swap is always SeqCst. + // atomic RMW is always SeqCst. let (mut prev_hi, mut prev_lo); asm!( "lpq %r0, 0({dst})", @@ -401,7 +321,12 @@ atomic_rmw_cas_2! { "slbgr %r12, %r0", } -atomic_rmw_by_atomic_update!(); +// We use atomic_update for atomic min/max in all cases because +// pre-z13 doesn't seem to have a good way to implement 128-bit min/max. +// https://godbolt.org/z/53fnrET7o +// (LLVM 16's minimal supported architecture level is z10: +// https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0/llvm/lib/Target/SystemZ/SystemZProcessors.td) +atomic_rmw_by_atomic_update!(cmp); #[inline] const fn is_lock_free() -> bool { diff --git a/src/imp/atomic128/x86_64.rs b/src/imp/atomic128/x86_64.rs index 122c649f..0b6d332d 100644 --- a/src/imp/atomic128/x86_64.rs +++ b/src/imp/atomic128/x86_64.rs @@ -1,18 +1,18 @@ // Atomic{I,U}128 implementation for x86_64 using CMPXCHG16B (DWCAS). // +// Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use +// this module and use intrinsics.rs instead. +// // Refs: // - x86 and amd64 instruction reference https://www.felixcloutier.com/x86 // - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit // // Generated asm: -// - x86_64 (+cmpxchg16b) https://godbolt.org/z/44xdG776a +// - x86_64 (+cmpxchg16b) https://godbolt.org/z/KahrWeW9G include!("macros.rs"); -#[cfg(any( - test, - not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), -))] +#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))] #[path = "../fallback/outline_atomics.rs"] mod fallback; @@ -25,6 +25,27 @@ mod detect; use core::arch::asm; use core::sync::atomic::Ordering; +// Asserts that the function is called in the correct context. +macro_rules! debug_assert_cmpxchg16b { + () => { + #[cfg(not(any( + target_feature = "cmpxchg16b", + portable_atomic_target_feature = "cmpxchg16b", + )))] + { + debug_assert!(detect::detect().has_cmpxchg16b()); + } + }; +} +#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))] +#[cfg(target_feature = "sse")] +macro_rules! debug_assert_vmovdqa_atomic { + () => {{ + debug_assert_cmpxchg16b!(); + debug_assert!(detect::detect().has_vmovdqa_atomic()); + }}; +} + #[allow(unused_macros)] #[cfg(target_pointer_width = "32")] macro_rules! ptr_modifier { @@ -61,33 +82,11 @@ struct Pair { not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), target_feature(enable = "cmpxchg16b") )] -#[cfg_attr( - any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), - inline -)] -#[cfg_attr( - not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - inline(never) -)] -unsafe fn _cmpxchg16b( - dst: *mut u128, - old: u128, - new: u128, - success: Ordering, - failure: Ordering, -) -> (u128, bool) { +#[inline] +unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { debug_assert!(dst as usize % 16 == 0); + debug_assert_cmpxchg16b!(); - // Miri and Sanitizer do not support inline assembly. 
- #[cfg(any(miri, portable_atomic_sanitize_thread))] - // SAFETY: the caller must guarantee that `dst` is valid for both writes and - // reads, 16-byte aligned (required by CMPXCHG16B), that there are no - // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B. - unsafe { - let res = core::arch::x86_64::cmpxchg16b(dst, old, new, success, failure); - (res, res == old) - } - #[cfg(not(any(miri, portable_atomic_sanitize_thread)))] // SAFETY: the caller must guarantee that `dst` is valid for both writes and // reads, 16-byte aligned (required by CMPXCHG16B), that there are no // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B. @@ -102,7 +101,6 @@ unsafe fn _cmpxchg16b( // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b unsafe { // cmpxchg16b is always SeqCst. - let _ = (success, failure); let r: u8; let old = U128 { whole: old }; let new = U128 { whole: new }; @@ -114,8 +112,7 @@ unsafe fn _cmpxchg16b( "xchg {rbx_tmp}, rbx", concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), "sete r8b", - // restore rbx - "mov rbx, {rbx_tmp}", + "mov rbx, {rbx_tmp}", // restore rbx rbx_tmp = inout(reg) new.pair.lo => _, in("rcx") new.pair.hi, inout("rax") old.pair.lo => prev_lo, @@ -135,40 +132,6 @@ unsafe fn _cmpxchg16b( } } -// 128-bit atomic load by two 64-bit atomic loads. -// -// See atomic_update for details. -#[cfg(any( - test, - not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - any(miri, portable_atomic_sanitize_thread), -))] -#[inline] -unsafe fn byte_wise_atomic_load(src: *mut u128) -> u128 { - debug_assert!(src as usize % 16 == 0); - - // Miri and Sanitizer do not support inline assembly. - #[cfg(any(miri, portable_atomic_sanitize_thread))] - // SAFETY: the caller must uphold the safety contract. - unsafe { - atomic_load(src, Ordering::Relaxed) - } - #[cfg(not(any(miri, portable_atomic_sanitize_thread)))] - // SAFETY: the caller must uphold the safety contract. - unsafe { - let (prev_lo, prev_hi); - asm!( - concat!("mov {prev_lo}, qword ptr [{src", ptr_modifier!(), "}]"), - concat!("mov {prev_hi}, qword ptr [{src", ptr_modifier!(), "} + 8]"), - src = in(reg) src, - prev_lo = out(reg) prev_lo, - prev_hi = out(reg) prev_hi, - options(nostack, preserves_flags, readonly), - ); - U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole - } -} - // VMOVDQA is atomic on Intel and AMD CPUs with AVX. // See https://gcc.gnu.org/bugzilla//show_bug.cgi?id=104688 for details. // @@ -176,13 +139,17 @@ unsafe fn byte_wise_atomic_load(src: *mut u128) -> u128 { // // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled. // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html +#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))] #[cfg(target_feature = "sse")] #[target_feature(enable = "avx")] #[inline] -unsafe fn _atomic_load_vmovdqa(src: *mut u128, _order: Ordering) -> u128 { +unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 { debug_assert!(src as usize % 16 == 0); + debug_assert_vmovdqa_atomic!(); // SAFETY: the caller must uphold the safety contract. + // + // atomic load by vmovdqa is always SeqCst. 
unsafe { let out: core::arch::x86_64::__m128; asm!( @@ -194,11 +161,13 @@ unsafe fn _atomic_load_vmovdqa(src: *mut u128, _order: Ordering) -> u128 { core::mem::transmute(out) } } +#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))] #[cfg(target_feature = "sse")] #[target_feature(enable = "avx")] #[inline] -unsafe fn _atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) { +unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); + debug_assert_vmovdqa_atomic!(); // SAFETY: the caller must uphold the safety contract. unsafe { @@ -227,51 +196,127 @@ unsafe fn _atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) { } } +#[cfg(not(all( + any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), + any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")), +)))] +macro_rules! load_store_detect { + ( + vmovdqa = $vmovdqa:ident + cmpxchg16b = $cmpxchg16b:ident + fallback = $fallback:ident + ) => {{ + let cpuid = detect::detect(); + #[cfg(not(any( + target_feature = "cmpxchg16b", + portable_atomic_target_feature = "cmpxchg16b", + )))] + { + // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access. + if cpuid.has_cmpxchg16b() { + // We do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled. + #[cfg(target_feature = "sse")] + { + if cpuid.has_vmovdqa_atomic() { + $vmovdqa + } else { + $cmpxchg16b + } + } + #[cfg(not(target_feature = "sse"))] + { + $cmpxchg16b + } + } else { + fallback::$fallback + } + } + #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] + { + if cpuid.has_vmovdqa_atomic() { + $vmovdqa + } else { + $cmpxchg16b + } + } + }}; +} + #[inline] -unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { +unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled. // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html // SGX doesn't support CPUID. - // Miri and Sanitizer do not support inline assembly. - #[cfg(any( - not(target_feature = "sse"), - portable_atomic_no_outline_atomics, - target_env = "sgx", - miri, - portable_atomic_sanitize_thread, + #[cfg(all( + any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), + any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")), ))] // SAFETY: the caller must uphold the safety contract. + // cfg guarantees that CMPXCHG16B is available at compile-time. unsafe { - _atomic_load_cmpxchg16b(src, order) + // cmpxchg16b is always SeqCst. + atomic_load_cmpxchg16b(src) } - #[cfg(not(any( - not(target_feature = "sse"), - portable_atomic_no_outline_atomics, - target_env = "sgx", - miri, - portable_atomic_sanitize_thread, + #[cfg(not(all( + any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), + any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")), )))] // SAFETY: the caller must uphold the safety contract. unsafe { - ifunc!(unsafe fn(src: *mut u128, order: Ordering) -> u128 { - // Check CMPXCHG16B anyway to prevent mixing atomic and non-atomic access. 
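// A rough sketch of the decision made by load_store_detect! above, written as a plain function
// over booleans (stand-ins for detect::detect().has_cmpxchg16b() / has_vmovdqa_atomic(); the SSE
// cfg nuance is omitted). The key point is the order of the checks when CMPXCHG16B is not known
// at compile time: the CMPXCHG16B check comes first so atomic (asm) and non-atomic (fallback)
// accesses are never mixed on the same location.
fn select_load_store_impl(has_cmpxchg16b: bool, has_vmovdqa_atomic: bool) -> &'static str {
    if !has_cmpxchg16b {
        // Without DWCAS, every access must go through the fallback.
        "fallback"
    } else if has_vmovdqa_atomic {
        // AVX-capable Intel/AMD CPUs where VMOVDQA loads/stores are atomic.
        "vmovdqa"
    } else {
        "cmpxchg16b"
    }
}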
- let cpuid = detect::detect(); - if cpuid.has_cmpxchg16b() && cpuid.has_vmovdqa_atomic() { - _atomic_load_vmovdqa - } else { - _atomic_load_cmpxchg16b + ifunc!(unsafe fn(src: *mut u128) -> u128 { + load_store_detect! { + vmovdqa = atomic_load_vmovdqa + cmpxchg16b = atomic_load_cmpxchg16b + // Use SeqCst because cmpxchg16b and atomic load by vmovdqa is always SeqCst. + fallback = atomic_load_seqcst } }) } } +#[cfg_attr( + not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), + target_feature(enable = "cmpxchg16b") +)] #[inline] -unsafe fn _atomic_load_cmpxchg16b(src: *mut u128, order: Ordering) -> u128 { - let fail_order = crate::utils::strongest_failure_ordering(order); - // SAFETY: the caller must uphold the safety contract. +unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 { + debug_assert!(src as usize % 16 == 0); + debug_assert_cmpxchg16b!(); + + // SAFETY: the caller must guarantee that `src` is valid for both writes and + // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. + // cfg guarantees that the CPU supports CMPXCHG16B. + // + // See cmpxchg16b function for more. + // + // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows + // omitting the storing of condition flags and avoid use of xchg to handle rbx. unsafe { - match atomic_compare_exchange(src, 0, 0, order, fail_order) { - Ok(v) | Err(v) => v, + // cmpxchg16b is always SeqCst. + let (prev_lo, prev_hi); + macro_rules! cmpxchg16b { + ($rdi:tt) => { + asm!( + // rbx is reserved by LLVM + "mov {rbx_tmp}, rbx", + "xor rbx, rbx", // zeroed rbx + concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), + "mov rbx, {rbx_tmp}", // restore rbx + // set old/new args of cmpxchg16b to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg) + rbx_tmp = out(reg) _, + in("rcx") 0_u64, + inout("rax") 0_u64 => prev_lo, + inout("rdx") 0_u64 => prev_hi, + in($rdi) src, + // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag. + options(nostack), + ) + }; } + #[cfg(target_pointer_width = "32")] + cmpxchg16b!("edi"); + #[cfg(target_pointer_width = "64")] + cmpxchg16b!("rdi"); + U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole } } @@ -280,61 +325,50 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled. // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html // SGX doesn't support CPUID. - // Miri and Sanitizer do not support inline assembly. - #[cfg(any( - not(target_feature = "sse"), - portable_atomic_no_outline_atomics, - target_env = "sgx", - miri, - portable_atomic_sanitize_thread, + #[cfg(all( + any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), + any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")), ))] // SAFETY: the caller must uphold the safety contract. + // cfg guarantees that CMPXCHG16B is available at compile-time. unsafe { - _atomic_store_cmpxchg16b(dst, val, order); + // cmpxchg16b is always SeqCst. 
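// A rough sketch of the idea behind atomic_load_cmpxchg16b above, using the
// core::arch::x86_64::cmpxchg16b intrinsic instead of inline assembly (the intrinsic may require
// a nightly toolchain; the asm version exists to avoid the rbx shuffle and flag handling): a
// compare-exchange with expected == new == 0 either succeeds and stores 0 over an existing 0 (a
// no-op) or fails, and in both cases it returns the current value, so it acts as a 16-byte load.
// Like the asm version, this requires the location to be writable.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "cmpxchg16b")]
unsafe fn load_via_cmpxchg16b_sketch(src: *mut u128) -> u128 {
    use core::sync::atomic::Ordering::SeqCst;
    // SAFETY: the caller must guarantee that `src` is valid for reads and writes, 16-byte
    // aligned, and that the CPU supports CMPXCHG16B; cmpxchg16b is always SeqCst.
    unsafe { core::arch::x86_64::cmpxchg16b(src, 0, 0, SeqCst, SeqCst) }
}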
+ let _ = order; + atomic_store_cmpxchg16b(dst, val); } - #[cfg(not(any( - not(target_feature = "sse"), - portable_atomic_no_outline_atomics, - target_env = "sgx", - miri, - portable_atomic_sanitize_thread, + #[cfg(not(all( + any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"), + any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")), )))] // SAFETY: the caller must uphold the safety contract. unsafe { + #[cfg(target_feature = "sse")] fn_alias! { #[target_feature(enable = "avx")] unsafe fn(dst: *mut u128, val: u128); - _atomic_store_vmovdqa_relaxed = _atomic_store_vmovdqa(Ordering::Relaxed); - _atomic_store_vmovdqa_seqcst = _atomic_store_vmovdqa(Ordering::SeqCst); - } - fn_alias! { - unsafe fn(dst: *mut u128, val: u128); - _atomic_store_cmpxchg16b_relaxed = _atomic_store_cmpxchg16b(Ordering::Relaxed); - _atomic_store_cmpxchg16b_seqcst = _atomic_store_cmpxchg16b(Ordering::SeqCst); + // atomic store by vmovdqa has at least release semantics. + atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release); + atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst); } match order { // Relaxed and Release stores are equivalent in all implementations // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback). - // Due to cfg, core::arch's cmpxchg16b will never called here. + // core::arch's cmpxchg16b will never called here. Ordering::Relaxed | Ordering::Release => { ifunc!(unsafe fn(dst: *mut u128, val: u128) { - // Check CMPXCHG16B anyway to prevent mixing atomic and non-atomic access. - let cpuid = detect::detect(); - if cpuid.has_cmpxchg16b() && cpuid.has_vmovdqa_atomic() { - _atomic_store_vmovdqa_relaxed - } else { - _atomic_store_cmpxchg16b_relaxed + load_store_detect! { + vmovdqa = atomic_store_vmovdqa_non_seqcst + cmpxchg16b = atomic_store_cmpxchg16b + fallback = atomic_store_non_seqcst } }); } Ordering::SeqCst => { ifunc!(unsafe fn(dst: *mut u128, val: u128) { - // Check CMPXCHG16B anyway to prevent mixing atomic and non-atomic access. - let cpuid = detect::detect(); - if cpuid.has_cmpxchg16b() && cpuid.has_vmovdqa_atomic() { - _atomic_store_vmovdqa_seqcst - } else { - _atomic_store_cmpxchg16b_seqcst + load_store_detect! { + vmovdqa = atomic_store_vmovdqa_seqcst + cmpxchg16b = atomic_store_cmpxchg16b + fallback = atomic_store_seqcst } }); } @@ -342,11 +376,15 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { } } } -#[inline] -unsafe fn _atomic_store_cmpxchg16b(dst: *mut u128, val: u128, order: Ordering) { +#[cfg_attr( + not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), + target_feature(enable = "cmpxchg16b") +)] +unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) { // SAFETY: the caller must uphold the safety contract. unsafe { - atomic_swap(dst, val, order); + // cmpxchg16b is always SeqCst. + atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst); } } @@ -355,30 +393,26 @@ unsafe fn atomic_compare_exchange( dst: *mut u128, old: u128, new: u128, - success: Ordering, - failure: Ordering, + _success: Ordering, + _failure: Ordering, ) -> Result { - let success = crate::utils::upgrade_success_ordering(success, failure); #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] // SAFETY: the caller must guarantee that `dst` is valid for both writes and // reads, 16-byte aligned, that there are no concurrent non-atomic operations, - // and cfg guarantees that CMPXCHG16B is statically available. 
- let (res, ok) = unsafe { _cmpxchg16b(dst, old, new, success, failure) }; + // and cfg guarantees that CMPXCHG16B is available at compile-time. + let (res, ok) = unsafe { cmpxchg16b(dst, old, new) }; #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))] - let (res, ok) = { - // SAFETY: the caller must guarantee that `dst` is valid for both writes and - // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses. - unsafe { - ifunc!(unsafe fn( - dst: *mut u128, old: u128, new: u128, success: Ordering, failure: Ordering - ) -> (u128, bool) { - if detect::detect().has_cmpxchg16b() { - _cmpxchg16b - } else { - fallback::atomic_compare_exchange - } - }) - } + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses. + let (res, ok) = unsafe { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_cmpxchg16b() { + cmpxchg16b + } else { + // Use SeqCst because cmpxchg16b is always SeqCst. + fallback::atomic_compare_exchange_seqcst + } + }) }; if ok { Ok(res) @@ -389,66 +423,29 @@ unsafe fn atomic_compare_exchange( use atomic_compare_exchange as atomic_compare_exchange_weak; -#[cfg(any( - not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - any(miri, portable_atomic_sanitize_thread), -))] -#[inline(always)] -unsafe fn atomic_update(dst: *mut u128, order: Ordering, mut f: F) -> u128 -where - F: FnMut(u128) -> u128, -{ - // SAFETY: the caller must uphold the safety contract. - unsafe { - // This is based on the code generated for the first load in DW RMWs by LLVM, - // but it is interesting that they generate code that does mixed-sized atomic access. - // - // This is not single-copy atomic reads, but this is ok because subsequent - // CAS will check for consistency. - // - // byte_wise_atomic_load works the same way as seqlock's byte-wise atomic memcpy, - // so it works well even when atomic_compare_exchange_weak calls global lock-based fallback. - // - // Note that the C++20 memory model does not allow mixed-sized atomic access, - // so we must use inline assembly to implement byte_wise_atomic_load. - // (i.e., byte-wise atomic based on the standard library's atomic types - // cannot be used here). Since fallback's byte-wise atomic memcpy is per - // 64-bit on x86_64 (even on x32 ABI), it's okay to use it together with this. - let mut old = byte_wise_atomic_load(dst); - loop { - let next = f(old); - // This is a private function and all instances of `f` only operate on the value - // loaded, so there is no need to synchronize the first load/failed CAS. - match atomic_compare_exchange_weak(dst, old, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => old = x, - } - } - } -} - -// We use atomic_rmw_by_atomic_update when cmpxchg16b is not available at compile-time, or -// on Miri and Sanitizer that do not support inline assembly. 
-#[cfg(not(any( +#[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] +use atomic_swap_cmpxchg16b as atomic_swap; +#[cfg_attr( not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - any(miri, portable_atomic_sanitize_thread), -)))] + target_feature(enable = "cmpxchg16b") +)] #[inline] -unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { +unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_cmpxchg16b!(); // SAFETY: the caller must guarantee that `dst` is valid for both writes and // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. // cfg guarantees that the CPU supports CMPXCHG16B. // - // See _cmpxchg16b for more. + // See cmpxchg16b function for more. // - // We could use atomic_update here, but using an inline assembly allows omitting - // the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx. + // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows + // omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx. // // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. unsafe { - // atomic swap is always SeqCst. + // cmpxchg16b is always SeqCst. let _ = order; let val = U128 { whole: val }; let (mut prev_lo, mut prev_hi); @@ -457,14 +454,21 @@ unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { asm!( // rbx is reserved by LLVM "xchg {rbx_tmp}, rbx", - // See atomic_update + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // This is based on the code generated for the first load in DW RMWs by LLVM. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement this. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). concat!("mov rax, qword ptr [", $rdi, "]"), concat!("mov rdx, qword ptr [", $rdi, " + 8]"), "2:", concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), "jne 2b", - // restore rbx - "mov rbx, {rbx_tmp}", + "mov rbx, {rbx_tmp}", // restore rbx rbx_tmp = inout(reg) val.pair.lo => _, in("rcx") val.pair.hi, out("rax") prev_lo, @@ -490,27 +494,27 @@ unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 { /// - rsi/r8 pair: val argument (read-only for `$op`) /// - rax/rdx pair: previous value loaded (read-only for `$op`) /// - rbx/rcx pair: new value that will to stored -// We could use atomic_update here, but using an inline assembly allows omitting -// the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx. -#[rustfmt::skip] // buggy macro formatting +// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows +// omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx. macro_rules! atomic_rmw_cas_3 { - ($name:ident, $($op:tt)*) => { - // We use atomic_rmw_by_atomic_update when cmpxchg16b is not available at compile-time, or - // on Miri and Sanitizer that do not support inline assembly. 
- #[cfg(not(any( + ($name:ident as $reexport_name:ident, $($op:tt)*) => { + #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] + use $name as $reexport_name; + #[cfg_attr( not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - any(miri, portable_atomic_sanitize_thread), - )))] + target_feature(enable = "cmpxchg16b") + )] #[inline] unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_cmpxchg16b!(); // SAFETY: the caller must guarantee that `dst` is valid for both writes and // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. // cfg guarantees that the CPU supports CMPXCHG16B. // - // See _cmpxchg16b for more. + // See cmpxchg16b function for more. unsafe { - // atomic swap is always SeqCst. + // cmpxchg16b is always SeqCst. let val = U128 { whole: val }; let (mut prev_lo, mut prev_hi); macro_rules! cmpxchg16b { @@ -518,15 +522,22 @@ macro_rules! atomic_rmw_cas_3 { asm!( // rbx is reserved by LLVM "mov {rbx_tmp}, rbx", - // See atomic_update + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // This is based on the code generated for the first load in DW RMWs by LLVM. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement this. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). concat!("mov rax, qword ptr [", $rdi, "]"), concat!("mov rdx, qword ptr [", $rdi, " + 8]"), "2:", $($op)* concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), "jne 2b", - // restore rbx - "mov rbx, {rbx_tmp}", + "mov rbx, {rbx_tmp}", // restore rbx rbx_tmp = out(reg) _, out("rcx") _, out("rax") prev_lo, @@ -554,42 +565,49 @@ macro_rules! atomic_rmw_cas_3 { /// `$op` can use the following registers: /// - rax/rdx pair: previous value loaded (read-only for `$op`) /// - rbx/rcx pair: new value that will to stored -// We could use atomic_update here, but using an inline assembly allows omitting -// the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx. -#[rustfmt::skip] // buggy macro formatting +// We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows +// omitting the storing of condition flags and avoid use of xchg to handle rbx. macro_rules! atomic_rmw_cas_2 { - ($name:ident, $($op:tt)*) => { - // We use atomic_rmw_by_atomic_update when cmpxchg16b is not available at compile-time, or - // on Miri and Sanitizer that do not support inline assembly. - #[cfg(not(any( + ($name:ident as $reexport_name:ident, $($op:tt)*) => { + #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] + use $name as $reexport_name; + #[cfg_attr( not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - any(miri, portable_atomic_sanitize_thread), - )))] + target_feature(enable = "cmpxchg16b") + )] #[inline] unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + debug_assert_cmpxchg16b!(); // SAFETY: the caller must guarantee that `dst` is valid for both writes and // reads, 16-byte aligned, and that there are no concurrent non-atomic operations. // cfg guarantees that the CPU supports CMPXCHG16B. // - // See _cmpxchg16b for more. + // See cmpxchg16b function for more. 
unsafe { - // atomic swap is always SeqCst. + // cmpxchg16b is always SeqCst. let (mut prev_lo, mut prev_hi); macro_rules! cmpxchg16b { ($rdi:tt) => { asm!( // rbx is reserved by LLVM "mov {rbx_tmp}, rbx", - // See atomic_update + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // This is based on the code generated for the first load in DW RMWs by LLVM. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement this. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). concat!("mov rax, qword ptr [", $rdi, "]"), concat!("mov rdx, qword ptr [", $rdi, " + 8]"), "2:", $($op)* concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"), "jne 2b", - // restore rbx - "mov rbx, {rbx_tmp}", + "mov rbx, {rbx_tmp}", // restore rbx rbx_tmp = out(reg) _, out("rcx") _, out("rax") prev_lo, @@ -611,28 +629,28 @@ macro_rules! atomic_rmw_cas_2 { } atomic_rmw_cas_3! { - atomic_add, + atomic_add_cmpxchg16b as atomic_add, "mov rbx, rax", "add rbx, rsi", "mov rcx, rdx", "adc rcx, r8", } atomic_rmw_cas_3! { - atomic_sub, + atomic_sub_cmpxchg16b as atomic_sub, "mov rbx, rax", "sub rbx, rsi", "mov rcx, rdx", "sbb rcx, r8", } atomic_rmw_cas_3! { - atomic_and, + atomic_and_cmpxchg16b as atomic_and, "mov rbx, rax", "and rbx, rsi", "mov rcx, rdx", "and rcx, r8", } atomic_rmw_cas_3! { - atomic_nand, + atomic_nand_cmpxchg16b as atomic_nand, "mov rbx, rax", "and rbx, rsi", "not rbx", @@ -641,14 +659,14 @@ atomic_rmw_cas_3! { "not rcx", } atomic_rmw_cas_3! { - atomic_or, + atomic_or_cmpxchg16b as atomic_or, "mov rbx, rax", "or rbx, rsi", "mov rcx, rdx", "or rcx, r8", } atomic_rmw_cas_3! { - atomic_xor, + atomic_xor_cmpxchg16b as atomic_xor, "mov rbx, rax", "xor rbx, rsi", "mov rcx, rdx", @@ -656,14 +674,14 @@ atomic_rmw_cas_3! { } atomic_rmw_cas_2! { - atomic_not, + atomic_not_cmpxchg16b as atomic_not, "mov rbx, rax", "not rbx", "mov rcx, rdx", "not rcx", } atomic_rmw_cas_2! { - atomic_neg, + atomic_neg_cmpxchg16b as atomic_neg, "mov rbx, rax", "neg rbx", "mov rcx, 0", @@ -671,7 +689,7 @@ atomic_rmw_cas_2! { } atomic_rmw_cas_3! { - atomic_max, + atomic_max_cmpxchg16b as atomic_max, "cmp rsi, rax", "mov rcx, r8", "sbb rcx, rdx", @@ -681,7 +699,7 @@ atomic_rmw_cas_3! { "cmovl rbx, rax", } atomic_rmw_cas_3! { - atomic_umax, + atomic_umax_cmpxchg16b as atomic_umax, "cmp rsi, rax", "mov rcx, r8", "sbb rcx, rdx", @@ -691,7 +709,7 @@ atomic_rmw_cas_3! { "cmovb rbx, rax", } atomic_rmw_cas_3! { - atomic_min, + atomic_min_cmpxchg16b as atomic_min, "cmp rsi, rax", "mov rcx, r8", "sbb rcx, rdx", @@ -701,7 +719,7 @@ atomic_rmw_cas_3! { "cmovge rbx, rax", } atomic_rmw_cas_3! { - atomic_umin, + atomic_umin_cmpxchg16b as atomic_umin, "cmp rsi, rax", "mov rcx, r8", "sbb rcx, rdx", @@ -711,19 +729,117 @@ atomic_rmw_cas_3! { "cmovae rbx, rax", } -// We use atomic_rmw_by_atomic_update when cmpxchg16b is not available at compile-time, or -// on Miri and Sanitizer that do not support inline assembly. -#[cfg(any( - not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")), - any(miri, portable_atomic_sanitize_thread), -))] -atomic_rmw_by_atomic_update!(); +macro_rules! 
atomic_rmw_with_ifunc { + ( + unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?; + cmpxchg16b = $cmpxchg16b_fn:ident; + fallback = $seqcst_fallback_fn:ident; + ) => { + #[cfg(not(any( + target_feature = "cmpxchg16b", + portable_atomic_target_feature = "cmpxchg16b", + )))] + #[inline] + unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? { + fn_alias! { + #[cfg_attr( + not(any( + target_feature = "cmpxchg16b", + portable_atomic_target_feature = "cmpxchg16b", + )), + target_feature(enable = "cmpxchg16b") + )] + unsafe fn($($arg)*) $(-> $ret_ty)?; + // cmpxchg16b is always SeqCst. + cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst); + } + // SAFETY: the caller must uphold the safety contract. + // we only calls cmpxchg16b_fn if cmpxchg16b is available. + unsafe { + ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? { + if detect::detect().has_cmpxchg16b() { + cmpxchg16b_seqcst_fn + } else { + // Use SeqCst because cmpxchg16b is always SeqCst. + fallback::$seqcst_fallback_fn + } + }) + } + } + }; +} + +atomic_rmw_with_ifunc! { + unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_swap_cmpxchg16b; + fallback = atomic_swap_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_add_cmpxchg16b; + fallback = atomic_add_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_sub_cmpxchg16b; + fallback = atomic_sub_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_and_cmpxchg16b; + fallback = atomic_and_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_nand_cmpxchg16b; + fallback = atomic_nand_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_or_cmpxchg16b; + fallback = atomic_or_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_xor_cmpxchg16b; + fallback = atomic_xor_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_max_cmpxchg16b; + fallback = atomic_max_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_umax_cmpxchg16b; + fallback = atomic_umax_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_min_cmpxchg16b; + fallback = atomic_min_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128; + cmpxchg16b = atomic_umin_cmpxchg16b; + fallback = atomic_umin_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_not(dst: *mut u128) -> u128; + cmpxchg16b = atomic_not_cmpxchg16b; + fallback = atomic_not_seqcst; +} +atomic_rmw_with_ifunc! { + unsafe fn atomic_neg(dst: *mut u128) -> u128; + cmpxchg16b = atomic_neg_cmpxchg16b; + fallback = atomic_neg_seqcst; +} #[inline] fn is_lock_free() -> bool { #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] { - // CMPXCHG16B is statically available. + // CMPXCHG16B is available at compile-time. 
true } #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))] @@ -744,147 +860,4 @@ mod tests { test_atomic_int!(i128); test_atomic_int!(u128); - - #[test] - fn test() { - // Miri doesn't support inline assembly used in is_x86_feature_detected - #[cfg(not(miri))] - { - assert!(std::is_x86_feature_detected!("cmpxchg16b")); - } - assert!(AtomicI128::is_lock_free()); - assert!(AtomicU128::is_lock_free()); - } - - #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))] - mod quickcheck { - use core::cell::UnsafeCell; - - use test_helper::Align16; - - use super::super::*; - - ::quickcheck::quickcheck! { - fn test(x: u128, y: u128, z: u128) -> bool { - // Miri doesn't support inline assembly used in is_x86_feature_detected - #[cfg(not(miri))] - { - assert!(std::is_x86_feature_detected!("cmpxchg16b")); - } - unsafe { - let a = Align16(UnsafeCell::new(x)); - let (res, ok) = _cmpxchg16b(a.get(), y, z, Ordering::SeqCst, Ordering::SeqCst); - if x == y { - assert!(ok); - assert_eq!(res, x); - assert_eq!(*a.get(), z); - } else { - assert!(!ok); - assert_eq!(res, x); - assert_eq!(*a.get(), x); - } - } - true - } - } - } -} - -#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)] -#[cfg(test)] -mod tests_no_cmpxchg16b { - use super::*; - - #[inline(never)] - unsafe fn cmpxchg16b( - dst: *mut u128, - old: u128, - new: u128, - success: Ordering, - failure: Ordering, - ) -> (u128, bool) { - unsafe { fallback::atomic_compare_exchange(dst, old, new, success, failure) } - } - #[inline] - unsafe fn byte_wise_atomic_load(src: *mut u128) -> u128 { - debug_assert!(src as usize % 16 == 0); - - // Miri and Sanitizer do not support inline assembly. - #[cfg(any(miri, portable_atomic_sanitize_thread))] - unsafe { - atomic_load(src, Ordering::Relaxed) - } - #[cfg(not(any(miri, portable_atomic_sanitize_thread)))] - unsafe { - super::byte_wise_atomic_load(src) - } - } - - #[inline(never)] - unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { - let fail_order = crate::utils::strongest_failure_ordering(order); - unsafe { - match atomic_compare_exchange(src, 0, 0, order, fail_order) { - Ok(v) | Err(v) => v, - } - } - } - - #[inline(never)] - unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { - unsafe { - atomic_swap(dst, val, order); - } - } - - #[inline] - unsafe fn atomic_compare_exchange( - dst: *mut u128, - old: u128, - new: u128, - success: Ordering, - failure: Ordering, - ) -> Result { - let success = crate::utils::upgrade_success_ordering(success, failure); - let (res, ok) = unsafe { cmpxchg16b(dst, old, new, success, failure) }; - if ok { - Ok(res) - } else { - Err(res) - } - } - - use atomic_compare_exchange as atomic_compare_exchange_weak; - - #[inline(always)] - unsafe fn atomic_update(dst: *mut u128, order: Ordering, mut f: F) -> u128 - where - F: FnMut(u128) -> u128, - { - unsafe { - let mut old = byte_wise_atomic_load(dst); - loop { - let next = f(old); - match atomic_compare_exchange_weak(dst, old, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => old = x, - } - } - } - } - - atomic_rmw_by_atomic_update!(); - - #[inline] - const fn is_lock_free() -> bool { - IS_ALWAYS_LOCK_FREE - } - const IS_ALWAYS_LOCK_FREE: bool = false; - - atomic128!(AtomicI128, i128, atomic_max, atomic_min); - atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); - - // Do not put this in the nested tests module due to glob imports refer to super::super::Atomic*. 
- test_atomic_int!(i128); - test_atomic_int!(u128); } diff --git a/src/imp/core_atomic.rs b/src/imp/core_atomic.rs index 3f121329..1e0bb497 100644 --- a/src/imp/core_atomic.rs +++ b/src/imp/core_atomic.rs @@ -65,7 +65,7 @@ impl AtomicBool { // See also https://github.com/rust-lang/rust/pull/66705 and // https://github.com/rust-lang/rust/issues/66136#issuecomment-557867116. unsafe { - (*(self as *const Self as *const core::cell::UnsafeCell<u8>)).get() as *mut bool + (*(self as *const Self as *const UnsafeCell<u8>)).get() as *mut bool } } } @@ -159,7 +159,7 @@ impl<T> AtomicPtr<T> { // SAFETY: Self is #[repr(C)] and internally UnsafeCell<*mut T>. // See also https://github.com/rust-lang/rust/pull/66705 and // https://github.com/rust-lang/rust/issues/66136#issuecomment-557867116. - unsafe { (*(self as *const Self as *const core::cell::UnsafeCell<*mut T>)).get() } + unsafe { (*(self as *const Self as *const UnsafeCell<*mut T>)).get() } } } } @@ -269,7 +269,7 @@ macro_rules! atomic_int { // See also https://github.com/rust-lang/rust/pull/66705 and // https://github.com/rust-lang/rust/issues/66136#issuecomment-557867116. unsafe { - (*(self as *const Self as *const core::cell::UnsafeCell<$int_type>)).get() + (*(self as *const Self as *const UnsafeCell<$int_type>)).get() } } } diff --git a/src/imp/fallback/mod.rs b/src/imp/fallback/mod.rs index a2b7442f..21de0943 100644 --- a/src/imp/fallback/mod.rs +++ b/src/imp/fallback/mod.rs @@ -375,26 +375,20 @@ macro_rules! atomic { }; } +#[cfg_attr(portable_atomic_no_cfg_target_has_atomic, cfg(any(test, portable_atomic_no_atomic_64)))] +#[cfg_attr( + not(portable_atomic_no_cfg_target_has_atomic), + cfg(any(test, not(target_has_atomic = "64"))) +)] cfg_no_fast_atomic_64! { - #[cfg_attr( - portable_atomic_no_cfg_target_has_atomic, - cfg(any(test, portable_atomic_no_atomic_64)) - )] - #[cfg_attr( - not(portable_atomic_no_cfg_target_has_atomic), - cfg(any(test, not(target_has_atomic = "64"))) - )] atomic!(AtomicI64, i64, 8); } +#[cfg_attr(portable_atomic_no_cfg_target_has_atomic, cfg(any(test, portable_atomic_no_atomic_64)))] +#[cfg_attr( + not(portable_atomic_no_cfg_target_has_atomic), + cfg(any(test, not(target_has_atomic = "64"))) +)] cfg_no_fast_atomic_64!
{ - #[cfg_attr( - portable_atomic_no_cfg_target_has_atomic, - cfg(any(test, portable_atomic_no_atomic_64)) - )] - #[cfg_attr( - not(portable_atomic_no_cfg_target_has_atomic), - cfg(any(test, not(target_has_atomic = "64"))) - )] atomic!(AtomicU64, u64, 8); } diff --git a/src/imp/fallback/outline_atomics.rs b/src/imp/fallback/outline_atomics.rs index b553c2fa..df018382 100644 --- a/src/imp/fallback/outline_atomics.rs +++ b/src/imp/fallback/outline_atomics.rs @@ -9,53 +9,63 @@ use core::sync::atomic::Ordering; #[cfg(target_arch = "x86_64")] -#[allow(clippy::upper_case_acronyms)] -pub(crate) type UDW = u128; +pub(crate) type Udw = u128; #[cfg(target_arch = "x86_64")] -pub(crate) type AtomicUDW = super::super::fallback::AtomicU128; -// #[cfg(target_arch = "x86_64")] -// pub(crate) type AtomicIDW = super::super::fallback::AtomicI128; +pub(crate) type AtomicUdw = super::super::fallback::AtomicU128; +#[cfg(target_arch = "x86_64")] +pub(crate) type AtomicIdw = super::super::fallback::AtomicI128; #[cfg(target_arch = "arm")] -#[allow(clippy::upper_case_acronyms)] -pub(crate) type UDW = u64; +pub(crate) type Udw = u64; #[cfg(target_arch = "arm")] -pub(crate) type AtomicUDW = super::super::fallback::AtomicU64; +pub(crate) type AtomicUdw = super::super::fallback::AtomicU64; #[cfg(target_arch = "arm")] -pub(crate) type AtomicIDW = super::super::fallback::AtomicI64; +pub(crate) type AtomicIdw = super::super::fallback::AtomicI64; + +// Asserts that the function is called in the correct context. +macro_rules! debug_assert_outline_atomics { + () => { + #[cfg(target_arch = "x86_64")] + { + debug_assert!(!super::detect::detect().has_cmpxchg16b()); + } + #[cfg(target_arch = "arm")] + { + debug_assert!(!super::has_kuser_cmpxchg64()); + } + }; +} -#[cfg(not(target_arch = "x86_64"))] #[cold] -pub(crate) unsafe fn atomic_load(src: *mut UDW, order: Ordering) -> UDW { +pub(crate) unsafe fn atomic_load(src: *mut Udw, order: Ordering) -> Udw { + debug_assert_outline_atomics!(); #[allow(clippy::cast_ptr_alignment)] // SAFETY: the caller must uphold the safety contract. unsafe { - (*(src as *const AtomicUDW)).load(order) + (*(src as *const AtomicUdw)).load(order) } } -#[cfg(not(target_arch = "x86_64"))] fn_alias! { #[cold] - pub(crate) unsafe fn(src: *mut UDW) -> UDW; + pub(crate) unsafe fn(src: *mut Udw) -> Udw; // fallback's atomic load has at least acquire semantics. - #[cfg(not(target_arch = "arm"))] + #[cfg(not(any(target_arch = "arm", target_arch = "x86_64")))] atomic_load_non_seqcst = atomic_load(Ordering::Acquire); atomic_load_seqcst = atomic_load(Ordering::SeqCst); } -#[cfg(not(target_arch = "x86_64"))] #[cold] -pub(crate) unsafe fn atomic_store(dst: *mut UDW, val: UDW, order: Ordering) { +pub(crate) unsafe fn atomic_store(dst: *mut Udw, val: Udw, order: Ordering) { + debug_assert_outline_atomics!(); #[allow(clippy::cast_ptr_alignment)] // SAFETY: the caller must uphold the safety contract. unsafe { - (*(dst as *const AtomicUDW)).store(val, order); + (*(dst as *const AtomicUdw)).store(val, order); } } -#[cfg(not(target_arch = "x86_64"))] fn_alias! { #[cold] - pub(crate) unsafe fn(dst: *mut UDW, val: UDW); + pub(crate) unsafe fn(dst: *mut Udw, val: Udw); // fallback's atomic store has at least release semantics. #[cfg(not(target_arch = "arm"))] atomic_store_non_seqcst = atomic_store(Ordering::Release); @@ -64,27 +74,27 @@ fn_alias! 
{ #[cold] pub(crate) unsafe fn atomic_compare_exchange( - dst: *mut UDW, - old: UDW, - new: UDW, + dst: *mut Udw, + old: Udw, + new: Udw, success: Ordering, failure: Ordering, -) -> (UDW, bool) { +) -> (Udw, bool) { + debug_assert_outline_atomics!(); #[allow(clippy::cast_ptr_alignment)] // SAFETY: the caller must uphold the safety contract. unsafe { - match (*(dst as *const AtomicUDW)).compare_exchange(old, new, success, failure) { + match (*(dst as *const AtomicUdw)).compare_exchange(old, new, success, failure) { Ok(v) => (v, true), Err(v) => (v, false), } } } -#[cfg(not(target_arch = "x86_64"))] fn_alias! { #[cold] - pub(crate) unsafe fn(dst: *mut UDW, old: UDW, new: UDW) -> (UDW, bool); - // fallback's atomic RMW has at least AcqRel semantics. - #[cfg(not(target_arch = "arm"))] + pub(crate) unsafe fn(dst: *mut Udw, old: Udw, new: Udw) -> (Udw, bool); + // fallback's atomic CAS has at least AcqRel semantics. + #[cfg(not(any(target_arch = "arm", target_arch = "x86_64")))] atomic_compare_exchange_non_seqcst = atomic_compare_exchange(Ordering::AcqRel, Ordering::Acquire); atomic_compare_exchange_seqcst @@ -96,21 +106,20 @@ macro_rules! atomic_rmw_3 { $name:ident($atomic_type:ident::$method_name:ident), $non_seqcst_alias:ident, $seqcst_alias:ident ) => { - #[cfg(not(target_arch = "x86_64"))] #[cold] - pub(crate) unsafe fn $name(dst: *mut UDW, val: UDW, order: Ordering) -> UDW { + pub(crate) unsafe fn $name(dst: *mut Udw, val: Udw, order: Ordering) -> Udw { + debug_assert_outline_atomics!(); #[allow(clippy::cast_ptr_alignment)] // SAFETY: the caller must uphold the safety contract. unsafe { - (*(dst as *const $atomic_type)).$method_name(val as _, order) as UDW + (*(dst as *const $atomic_type)).$method_name(val as _, order) as Udw } } - #[cfg(not(target_arch = "x86_64"))] fn_alias! { #[cold] - pub(crate) unsafe fn(dst: *mut UDW, val: UDW) -> UDW; + pub(crate) unsafe fn(dst: *mut Udw, val: Udw) -> Udw; // fallback's atomic RMW has at least AcqRel semantics. - #[cfg(not(target_arch = "arm"))] + #[cfg(not(any(target_arch = "arm", target_arch = "x86_64")))] $non_seqcst_alias = $name(Ordering::AcqRel); $seqcst_alias = $name(Ordering::SeqCst); } @@ -121,38 +130,37 @@ macro_rules! atomic_rmw_2 { $name:ident($atomic_type:ident::$method_name:ident), $non_seqcst_alias:ident, $seqcst_alias:ident ) => { - #[cfg(not(target_arch = "x86_64"))] #[cold] - pub(crate) unsafe fn $name(dst: *mut UDW, order: Ordering) -> UDW { + pub(crate) unsafe fn $name(dst: *mut Udw, order: Ordering) -> Udw { + debug_assert_outline_atomics!(); #[allow(clippy::cast_ptr_alignment)] // SAFETY: the caller must uphold the safety contract. unsafe { - (*(dst as *const $atomic_type)).$method_name(order) as UDW + (*(dst as *const $atomic_type)).$method_name(order) as Udw } } - #[cfg(not(target_arch = "x86_64"))] fn_alias! { #[cold] - pub(crate) unsafe fn(dst: *mut UDW) -> UDW; + pub(crate) unsafe fn(dst: *mut Udw) -> Udw; // fallback's atomic RMW has at least AcqRel semantics. 
- #[cfg(not(target_arch = "arm"))] + #[cfg(not(any(target_arch = "arm", target_arch = "x86_64")))] $non_seqcst_alias = $name(Ordering::AcqRel); $seqcst_alias = $name(Ordering::SeqCst); } }; } -atomic_rmw_3!(atomic_swap(AtomicUDW::swap), atomic_swap_non_seqcst, atomic_swap_seqcst); -atomic_rmw_3!(atomic_add(AtomicUDW::fetch_add), atomic_add_non_seqcst, atomic_add_seqcst); -atomic_rmw_3!(atomic_sub(AtomicUDW::fetch_sub), atomic_sub_non_seqcst, atomic_sub_seqcst); -atomic_rmw_3!(atomic_and(AtomicUDW::fetch_and), atomic_and_non_seqcst, atomic_and_seqcst); -atomic_rmw_3!(atomic_nand(AtomicUDW::fetch_nand), atomic_nand_non_seqcst, atomic_nand_seqcst); -atomic_rmw_3!(atomic_or(AtomicUDW::fetch_or), atomic_or_non_seqcst, atomic_or_seqcst); -atomic_rmw_3!(atomic_xor(AtomicUDW::fetch_xor), atomic_xor_non_seqcst, atomic_xor_seqcst); -atomic_rmw_3!(atomic_max(AtomicIDW::fetch_max), atomic_max_non_seqcst, atomic_max_seqcst); -atomic_rmw_3!(atomic_umax(AtomicUDW::fetch_max), atomic_umax_non_seqcst, atomic_umax_seqcst); -atomic_rmw_3!(atomic_min(AtomicIDW::fetch_min), atomic_min_non_seqcst, atomic_min_seqcst); -atomic_rmw_3!(atomic_umin(AtomicUDW::fetch_min), atomic_umin_non_seqcst, atomic_umin_seqcst); +atomic_rmw_3!(atomic_swap(AtomicUdw::swap), atomic_swap_non_seqcst, atomic_swap_seqcst); +atomic_rmw_3!(atomic_add(AtomicUdw::fetch_add), atomic_add_non_seqcst, atomic_add_seqcst); +atomic_rmw_3!(atomic_sub(AtomicUdw::fetch_sub), atomic_sub_non_seqcst, atomic_sub_seqcst); +atomic_rmw_3!(atomic_and(AtomicUdw::fetch_and), atomic_and_non_seqcst, atomic_and_seqcst); +atomic_rmw_3!(atomic_nand(AtomicUdw::fetch_nand), atomic_nand_non_seqcst, atomic_nand_seqcst); +atomic_rmw_3!(atomic_or(AtomicUdw::fetch_or), atomic_or_non_seqcst, atomic_or_seqcst); +atomic_rmw_3!(atomic_xor(AtomicUdw::fetch_xor), atomic_xor_non_seqcst, atomic_xor_seqcst); +atomic_rmw_3!(atomic_max(AtomicIdw::fetch_max), atomic_max_non_seqcst, atomic_max_seqcst); +atomic_rmw_3!(atomic_umax(AtomicUdw::fetch_max), atomic_umax_non_seqcst, atomic_umax_seqcst); +atomic_rmw_3!(atomic_min(AtomicIdw::fetch_min), atomic_min_non_seqcst, atomic_min_seqcst); +atomic_rmw_3!(atomic_umin(AtomicUdw::fetch_min), atomic_umin_non_seqcst, atomic_umin_seqcst); -atomic_rmw_2!(atomic_not(AtomicUDW::fetch_not), atomic_not_non_seqcst, atomic_not_seqcst); -atomic_rmw_2!(atomic_neg(AtomicUDW::fetch_neg), atomic_neg_non_seqcst, atomic_neg_seqcst); +atomic_rmw_2!(atomic_not(AtomicUdw::fetch_not), atomic_not_non_seqcst, atomic_not_seqcst); +atomic_rmw_2!(atomic_neg(AtomicUdw::fetch_neg), atomic_neg_non_seqcst, atomic_neg_seqcst); diff --git a/src/imp/float.rs b/src/imp/float.rs index 6cfe99c1..fe790b7b 100644 --- a/src/imp/float.rs +++ b/src/imp/float.rs @@ -11,7 +11,7 @@ #![cfg(any(not(target_pointer_width = "16"), feature = "fallback"))] // See lib.rs's AtomicU32 definition -use core::sync::atomic::Ordering; +use core::{cell::UnsafeCell, sync::atomic::Ordering}; macro_rules! atomic_float { ( @@ -23,7 +23,7 @@ macro_rules! atomic_float { ) => { #[repr(C, align($align))] pub(crate) struct $atomic_type { - v: core::cell::UnsafeCell<$float_type>, + v: UnsafeCell<$float_type>, } // Send is implicitly implemented. @@ -33,7 +33,7 @@ macro_rules! 
atomic_float { impl $atomic_type { #[inline] pub(crate) const fn new(v: $float_type) -> Self { - Self { v: core::cell::UnsafeCell::new(v) } + Self { v: UnsafeCell::new(v) } } #[inline] diff --git a/src/imp/interrupt/mod.rs b/src/imp/interrupt/mod.rs index ff6d8680..1371b284 100644 --- a/src/imp/interrupt/mod.rs +++ b/src/imp/interrupt/mod.rs @@ -24,7 +24,7 @@ // interrupts [^avr2] in atomic ops by default, is considered the latter. // MSP430 as well. // -// See also README.md of this module. +// See also README.md of this directory. // // [^avr1]: https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp#LL963 // [^avr2]: https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0/llvm/test/CodeGen/AVR/atomics/load16.ll#L5 diff --git a/src/imp/mod.rs b/src/imp/mod.rs index 08e75c8e..2c44868d 100644 --- a/src/imp/mod.rs +++ b/src/imp/mod.rs @@ -20,21 +20,21 @@ )] mod core_atomic; -// Miri and Sanitizer do not support inline assembly. -#[cfg(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics))] -#[cfg(target_arch = "aarch64")] -#[path = "atomic128/intrinsics.rs"] -mod aarch64; -#[cfg(not(all( - any(miri, portable_atomic_sanitize_thread), - portable_atomic_new_atomic_intrinsics, -)))] #[cfg(any(not(portable_atomic_no_asm), portable_atomic_unstable_asm))] #[cfg(target_arch = "aarch64")] -#[path = "atomic128/aarch64.rs"] +// Use intrinsics.rs on Miri and Sanitizer that do not support inline assembly. +#[cfg_attr( + all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics), + path = "atomic128/intrinsics.rs" +)] +#[cfg_attr( + not(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics)), + path = "atomic128/aarch64.rs" +)] mod aarch64; #[cfg(any(not(portable_atomic_no_asm), portable_atomic_unstable_asm))] +#[cfg(target_arch = "x86_64")] #[cfg(any( target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b", @@ -45,33 +45,39 @@ mod aarch64; not(target_env = "sgx"), ), ))] -#[cfg(target_arch = "x86_64")] -#[path = "atomic128/x86_64.rs"] +// Use intrinsics.rs on Miri and Sanitizer that do not support inline assembly. +#[cfg_attr(any(miri, portable_atomic_sanitize_thread), path = "atomic128/intrinsics.rs")] +#[cfg_attr(not(any(miri, portable_atomic_sanitize_thread)), path = "atomic128/x86_64.rs")] mod x86_64; -// Miri and Sanitizer do not support inline assembly. -#[cfg(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_llvm_15))] #[cfg(portable_atomic_unstable_asm_experimental_arch)] -#[cfg(any( - target_feature = "quadword-atomics", - portable_atomic_target_feature = "quadword-atomics", -))] #[cfg(target_arch = "powerpc64")] -#[path = "atomic128/intrinsics.rs"] -mod powerpc64; -#[cfg(not(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_llvm_15)))] -#[cfg(portable_atomic_unstable_asm_experimental_arch)] #[cfg(any( target_feature = "quadword-atomics", portable_atomic_target_feature = "quadword-atomics", ))] -#[cfg(target_arch = "powerpc64")] -#[path = "atomic128/powerpc64.rs"] +// Use intrinsics.rs on Miri and Sanitizer that do not support inline assembly. 
+#[cfg_attr( + all(any(miri, portable_atomic_sanitize_thread), portable_atomic_llvm_15), + path = "atomic128/intrinsics.rs" +)] +#[cfg_attr( + not(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_llvm_15)), + path = "atomic128/powerpc64.rs" +)] mod powerpc64; #[cfg(portable_atomic_unstable_asm_experimental_arch)] #[cfg(target_arch = "s390x")] -#[path = "atomic128/s390x.rs"] +// Use intrinsics.rs on Miri and Sanitizer that do not support inline assembly. +#[cfg_attr( + all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics), + path = "atomic128/intrinsics.rs" +)] +#[cfg_attr( + not(all(any(miri, portable_atomic_sanitize_thread), portable_atomic_new_atomic_intrinsics)), + path = "atomic128/s390x.rs" +)] mod s390x; // Miri and Sanitizer do not support inline assembly. diff --git a/src/imp/x86.rs b/src/imp/x86.rs index 564b58fc..ba0ba3a0 100644 --- a/src/imp/x86.rs +++ b/src/imp/x86.rs @@ -117,7 +117,7 @@ macro_rules! atomic_bit_opts { // LLVM 15 only supports generating `lock bt{s,r,c}` for immediate bit offsets. // https://godbolt.org/z/dzzhr81z6 // LLVM 16 can generate `lock bt{s,r,c}` for both immediate and register bit offsets. - // https://github.com/taiki-e/portable-atomic/issues/48#issuecomment-1453473831 + // https://godbolt.org/z/7YTvsorn1 // So, use fetch_* based implementations on LLVM 16+, otherwise use asm based implementations. #[cfg(portable_atomic_llvm_16)] impl_default_bit_opts!($atomic_type, $int_type); diff --git a/src/lib.rs b/src/lib.rs index 8391585e..a484fd9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4798,7 +4798,9 @@ This is `const fn` on Rust 1.58+."), /// Returning an `*mut` pointer from a shared reference to this atomic is /// safe because the atomic types work with interior mutability. Any use of /// the returned raw pointer requires an `unsafe` block and has to uphold - /// the safety requirements: + /// the safety requirements. If there is concurrent access, note the following + /// additional safety requirements: + /// /// - If this atomic type is [lock-free](Self::is_lock_free), any concurrent /// operations on it must be atomic. 
/// - Otherwise, any concurrent operations on it must be compatible with diff --git a/src/tests/helper.rs b/src/tests/helper.rs index a756d858..1dda9048 100644 --- a/src/tests/helper.rs +++ b/src/tests/helper.rs @@ -2209,17 +2209,8 @@ pub(crate) fn test_swap_ordering(f: impl Fn(Ordering) -> T) fn skip_should_panic_test() -> bool { // Miri's panic handling is slow // MSAN false positive: https://gist.github.com/taiki-e/dd6269a8ffec46284fdc764a4849f884 - is_panic_abort() + test_helper::is_panic_abort() || cfg!(miri) || option_env!("CARGO_PROFILE_RELEASE_LTO").map_or(false, |v| v == "fat") && option_env!("MSAN_OPTIONS").is_some() } -// For -C panic=abort -Z panic_abort_tests: https://github.com/rust-lang/rust/issues/67650 -#[rustversion::since(1.60)] // cfg!(panic) requires Rust 1.60 -fn is_panic_abort() -> bool { - cfg!(panic = "abort") -} -#[rustversion::before(1.60)] // cfg!(panic) requires Rust 1.60 -fn is_panic_abort() -> bool { - false -} diff --git a/src/tests/mod.rs b/src/tests/mod.rs index a9b2d136..fa7d67c2 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -105,7 +105,6 @@ extern "C" { fn _atomic_f64_ffi_safety(_: AtomicF64); } -#[rustversion::since(1.60)] // cfg!(target_has_atomic) requires Rust 1.60 #[test] fn test_is_lock_free() { assert!(AtomicI8::is_always_lock_free()); @@ -120,30 +119,34 @@ fn test_is_lock_free() { assert!(AtomicI32::is_lock_free()); assert!(AtomicU32::is_always_lock_free()); assert!(AtomicU32::is_lock_free()); - if cfg!(all( - feature = "fallback", - not(any(miri, portable_atomic_sanitize_thread)), - any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), - target_arch = "arm", - any(target_os = "linux", target_os = "android"), - not(any(target_feature = "v6", portable_atomic_target_feature = "v6")), - not(portable_atomic_no_outline_atomics), - not(target_has_atomic = "64"), - )) { - assert!(!AtomicI64::is_always_lock_free()); - assert!(AtomicI64::is_lock_free()); - assert!(!AtomicU64::is_always_lock_free()); - assert!(AtomicU64::is_lock_free()); - } else if cfg!(target_has_atomic = "64") { - assert!(AtomicI64::is_always_lock_free()); - assert!(AtomicI64::is_lock_free()); - assert!(AtomicU64::is_always_lock_free()); - assert!(AtomicU64::is_lock_free()); - } else { - assert!(!AtomicI64::is_always_lock_free()); - assert!(!AtomicI64::is_lock_free()); - assert!(!AtomicU64::is_always_lock_free()); - assert!(!AtomicU64::is_lock_free()); + #[cfg(not(portable_atomic_no_cfg_target_has_atomic))] + { + if cfg!(all( + feature = "fallback", + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), + target_arch = "arm", + any(target_os = "linux", target_os = "android"), + not(any(target_feature = "v6", portable_atomic_target_feature = "v6")), + not(portable_atomic_no_outline_atomics), + not(target_has_atomic = "64"), + not(portable_atomic_test_outline_atomics_detect_false), + )) { + assert!(!AtomicI64::is_always_lock_free()); + assert!(AtomicI64::is_lock_free()); + assert!(!AtomicU64::is_always_lock_free()); + assert!(AtomicU64::is_lock_free()); + } else if cfg!(target_has_atomic = "64") { + assert!(AtomicI64::is_always_lock_free()); + assert!(AtomicI64::is_lock_free()); + assert!(AtomicU64::is_always_lock_free()); + assert!(AtomicU64::is_lock_free()); + } else { + assert!(!AtomicI64::is_always_lock_free()); + assert!(!AtomicI64::is_lock_free()); + assert!(!AtomicU64::is_always_lock_free()); + assert!(!AtomicU64::is_lock_free()); + } } if cfg!(any( target_arch = "aarch64", @@ -182,6 +185,7 @@ fn 
test_is_lock_free() { portable_atomic_cmpxchg16b_target_feature, not(portable_atomic_no_outline_atomics), not(target_env = "sgx"), + not(portable_atomic_test_outline_atomics_detect_false), )) && std::is_x86_feature_detected!("cmpxchg16b"); assert_eq!(AtomicI128::is_lock_free(), has_cmpxchg16b); assert_eq!(AtomicU128::is_lock_free(), has_cmpxchg16b); diff --git a/tests/helper/Cargo.toml b/tests/helper/Cargo.toml index df56cf94..61682d9a 100644 --- a/tests/helper/Cargo.toml +++ b/tests/helper/Cargo.toml @@ -8,13 +8,15 @@ publish = false [lib] doctest = false doc = false +test = false [features] -std = ["critical-section", "fs-err", "serde", "libc", "windows-sys"] +std = ["critical-section", "fs-err", "rustversion", "serde", "libc", "windows-sys"] [dependencies] critical-section = { version = "1", optional = true, features = ["restore-state-bool"] } fs-err = { version = "2", optional = true } +rustversion = { version = "1", optional = true } serde = { version = "1", optional = true } [target.'cfg(unix)'.dependencies] diff --git a/tests/helper/src/lib.rs b/tests/helper/src/lib.rs index 14c68c43..fb58d050 100644 --- a/tests/helper/src/lib.rs +++ b/tests/helper/src/lib.rs @@ -24,7 +24,7 @@ mod once_lock; #[cfg(feature = "std")] pub mod serde; -use core::{ops, sync::atomic::Ordering}; +use core::sync::atomic::Ordering; pub const LOAD_ORDERINGS: [Ordering; 3] = [Ordering::Relaxed, Ordering::Acquire, Ordering::SeqCst]; pub const STORE_ORDERINGS: [Ordering; 3] = [Ordering::Relaxed, Ordering::Release, Ordering::SeqCst]; @@ -50,15 +50,14 @@ pub const COMPARE_EXCHANGE_ORDERINGS: [(Ordering, Ordering); 15] = [ pub const FENCE_ORDERINGS: [Ordering; 4] = [Ordering::Release, Ordering::Acquire, Ordering::AcqRel, Ordering::SeqCst]; -#[derive(Debug, Clone, Copy, Default)] -#[repr(C, align(16))] -pub struct Align16(pub T); - -impl ops::Deref for Align16 { - type Target = T; - - #[inline] - fn deref(&self) -> &T { - &self.0 - } +// For -C panic=abort -Z panic_abort_tests: https://github.com/rust-lang/rust/issues/67650 +#[cfg(feature = "std")] +#[rustversion::since(1.60)] // cfg!(panic) requires Rust 1.60 +pub fn is_panic_abort() -> bool { + cfg!(panic = "abort") +} +#[cfg(feature = "std")] +#[rustversion::before(1.60)] // cfg!(panic) requires Rust 1.60 +pub fn is_panic_abort() -> bool { + false } diff --git a/tools/build.sh b/tools/build.sh index da0f299f..a63b1484 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -91,14 +91,18 @@ default_targets=( s390x-unknown-linux-gnu ) known_cfgs=( - docsrs - qemu - valgrind - rustfmt + # Public APIs portable_atomic_unsafe_assume_single_core portable_atomic_s_mode portable_atomic_disable_fiq portable_atomic_no_outline_atomics + + # Not public APIs + portable_atomic_test_outline_atomics_detect_false + docsrs + qemu + rustfmt + valgrind ) x() { diff --git a/tools/test.sh b/tools/test.sh index f529f306..63eeb83e 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -119,10 +119,23 @@ args+=( --workspace --exclude bench --exclude portable-atomic-internal-codegen ) target="${target:-"${host}"}" +target_lower="${target//-/_}" +target_lower="${target_lower//./_}" +target_upper="$(tr '[:lower:]' '[:upper:]' <<<"${target_lower}")" + +if [[ -n "${VALGRIND:-}" ]]; then + export "CARGO_TARGET_${target_upper}_RUNNER"="${VALGRIND} -v --error-exitcode=1 --error-limit=no --leak-check=full --show-leak-kinds=all --track-origins=yes" + export RUSTFLAGS="${RUSTFLAGS:-} --cfg valgrind" + export RUSTDOCFLAGS="${RUSTDOCFLAGS:-} --cfg valgrind" + # doctest on Valgrind is very slow + if [[ 
${#tests[@]} -eq 0 ]]; then + tests=(--tests) + fi +fi run() { if [[ "${RUSTFLAGS:-}" == *"-Z sanitizer="* ]] || [[ "${RUSTFLAGS:-}" == *"-Zsanitizer="* ]]; then - # debug build + doctests is slow + # doctest with debug build on Sanitizer is slow x_cargo ${pre_args[@]+"${pre_args[@]}"} test --tests "$@" else x_cargo ${pre_args[@]+"${pre_args[@]}"} test ${tests[@]+"${tests[@]}"} "$@"