From 1cbf7c791c0478d8381fe117313fca1f89c24a5a Mon Sep 17 00:00:00 2001 From: sayantn Date: Tue, 18 Jun 2024 12:29:04 +0530 Subject: [PATCH] Added runtime detection Expanded the cache size to 93 (we will need this in near future) Fixed detection of VAES, GFNI and VPCLMULQDQ Could not test with `cupid` because they do not support these yet --- crates/std_detect/src/detect/arch/x86.rs | 15 ++++++++++++ crates/std_detect/src/detect/cache.rs | 30 +++++++++++++++--------- crates/std_detect/src/detect/os/x86.rs | 30 +++++++++++++++++------- crates/std_detect/tests/x86-specific.rs | 13 +++++++++- 4 files changed, 67 insertions(+), 21 deletions(-) diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index f4f45750ed..f4c2129156 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -76,6 +76,11 @@ features! { /// * `"avx512bf16"` /// * `"avx512vp2intersect"` /// * `"avx512fp16"` + /// * `"avxvnni"` + /// * `"avxifma"` + /// * `"avxneconvert"` + /// * `"avxvnniint8"` + /// * `"avxvnniint16"` /// * `"f16c"` /// * `"fma"` /// * `"bmi1"` @@ -172,6 +177,16 @@ features! 
{ /// AVX-512 P2INTERSECT @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16"; /// AVX-512 FP16 (FLOAT16 instructions) + @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma"; + /// AVX-IFMA (Integer Fused Multiply Add) + @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert"; + /// AVX-NE-CONVERT (Exceptionless Convert) + @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni"; + /// AVX-VNNI (Vector Neural Network Instructions) + @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16"; + /// AVX-VNNI_INT16 (VNNI with 16-bit Integers) + @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8"; + /// AVX-VNNI_INT8 (VNNI with 8-bit integers) @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c"; /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats) @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma"; diff --git a/crates/std_detect/src/detect/cache.rs b/crates/std_detect/src/detect/cache.rs index 478d4ee72f..182513d883 100644 --- a/crates/std_detect/src/detect/cache.rs +++ b/crates/std_detect/src/detect/cache.rs @@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize; /// Sets the `bit` of `x`. #[inline] -const fn set_bit(x: u64, bit: u32) -> u64 { +const fn set_bit(x: u128, bit: u32) -> u128 { x | 1 << bit } /// Tests the `bit` of `x`. #[inline] -const fn test_bit(x: u64, bit: u32) -> bool { +const fn test_bit(x: u128, bit: u32) -> bool { x & (1 << bit) != 0 } /// Unset the `bit of `x`. #[inline] -const fn unset_bit(x: u64, bit: u32) -> u64 { +const fn unset_bit(x: u128, bit: u32) -> u128 { x & !(1 << bit) } /// Maximum number of features that can be cached. 
-const CACHE_CAPACITY: u32 = 62; +const CACHE_CAPACITY: u32 = 93; /// This type is used to initialize the cache // The derived `Default` implementation will initialize the field to zero, // which is what we want. #[derive(Copy, Clone, Default)] -pub(crate) struct Initializer(u64); +pub(crate) struct Initializer(u128); // NOTE: the `debug_assert!` would catch that we do not add more Features than // the one fitting our cache. @@ -71,10 +71,15 @@ impl Initializer { } /// This global variable is a cache of the features supported by the CPU. -// Note: on x64, we only use the first slot -static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()]; - -/// Feature cache with capacity for `size_of::() * 8 - 1` features. +// Note: the third slot is only used in x86 +// Another slot can be added if needed without any change to `Initializer` +static CACHE: [Cache; 3] = [ + Cache::uninitialized(), + Cache::uninitialized(), + Cache::uninitialized(), +]; + +/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features. /// /// Note: 0 is used to represent an uninitialized cache, and (at least) the most /// significant bit is set on any cache which has been initialized. @@ -102,7 +107,7 @@ impl Cache { if cached == 0 { None } else { - Some(test_bit(cached as u64, bit)) + Some(test_bit(cached as u128, bit)) } } @@ -173,6 +178,7 @@ cfg_if::cfg_if! 
{ fn do_initialize(value: Initializer) { CACHE[0].initialize((value.0) as usize & Cache::MASK); CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK); + CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK); } // We only have to detect features once, and it's fairly costly, so hint to LLVM @@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer { pub(crate) fn test(bit: u32) -> bool { let (relative_bit, idx) = if bit < Cache::CAPACITY { (bit, 0) - } else { + } else if bit < 2 * Cache::CAPACITY { (bit - Cache::CAPACITY, 1) + } else { + (bit - 2 * Cache::CAPACITY, 2) }; CACHE[idx] .test(relative_bit) diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs index 3e55baa7d8..027df571f8 100644 --- a/crates/std_detect/src/detect/os/x86.rs +++ b/crates/std_detect/src/detect/os/x86.rs @@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer { extended_features_ecx, extended_features_edx, extended_features_eax_leaf_1, + extended_features_edx_leaf_1, ) = if max_basic_leaf >= 7 { let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) }; - let CpuidResult { eax: eax_1, .. } = - unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) }; - (ebx, ecx, edx, eax_1) + let CpuidResult { + eax: eax_1, + edx: edx_1, + .. 
+ } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) }; + (ebx, ecx, edx, eax_1, edx_1) } else { - (0, 0, 0, 0) // CPUID does not support "Extended Features" + (0, 0, 0, 0, 0) // CPUID does not support "Extended Features" }; // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported @@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer { enable(proc_info_edx, 26, Feature::sse2); enable(extended_features_ebx, 29, Feature::sha); + enable(extended_features_ecx, 8, Feature::gfni); + enable(extended_features_ecx, 9, Feature::vaes); + enable(extended_features_ecx, 10, Feature::vpclmulqdq); + enable(extended_features_ebx, 3, Feature::bmi1); enable(extended_features_ebx, 8, Feature::bmi2); @@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer { let xcr0 = unsafe { _xgetbv(0) }; // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`: let os_avx_support = xcr0 & 6 == 6; - // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`: - let os_avx512_support = xcr0 & 224 == 224; + // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`: + let os_avx512_support = xcr0 & 0xe0 == 0xe0; // Only if the OS and the CPU support saving/restoring the AVX // registers we enable `xsave` support: @@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer { enable(proc_info_ecx, 28, Feature::avx); enable(extended_features_ebx, 5, Feature::avx2); + // "Short" versions of AVX512 instructions + enable(extended_features_eax_leaf_1, 4, Feature::avxvnni); + enable(extended_features_eax_leaf_1, 23, Feature::avxifma); + enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8); + enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert); + enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16); + // For AVX-512 the OS also needs to support saving/restoring // the extended state, only then we enable AVX-512 support: if os_avx512_support { @@ -216,9 +231,6 @@ pub(crate) fn 
detect_features() -> cache::Initializer { enable(extended_features_ebx, 31, Feature::avx512vl); enable(extended_features_ecx, 1, Feature::avx512vbmi); enable(extended_features_ecx, 6, Feature::avx512vbmi2); - enable(extended_features_ecx, 8, Feature::gfni); - enable(extended_features_ecx, 9, Feature::vaes); - enable(extended_features_ecx, 10, Feature::vpclmulqdq); enable(extended_features_ecx, 11, Feature::avx512vnni); enable(extended_features_ecx, 12, Feature::avx512bitalg); enable(extended_features_ecx, 14, Feature::avx512vpopcntdq); diff --git a/crates/std_detect/tests/x86-specific.rs b/crates/std_detect/tests/x86-specific.rs index ae7f677ed4..d2b2675716 100644 --- a/crates/std_detect/tests/x86-specific.rs +++ b/crates/std_detect/tests/x86-specific.rs @@ -1,6 +1,6 @@ #![cfg(any(target_arch = "x86", target_arch = "x86_64"))] #![allow(internal_features)] -#![feature(stdarch_internal)] +#![feature(stdarch_internal, avx512_target_feature)] extern crate cupid; #[macro_use] @@ -68,6 +68,17 @@ fn dump() { println!("adx: {:?}", is_x86_feature_detected!("adx")); println!("rtm: {:?}", is_x86_feature_detected!("rtm")); println!("movbe: {:?}", is_x86_feature_detected!("movbe")); + println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni")); + println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8")); + println!( + "avxneconvert: {:?}", + is_x86_feature_detected!("avxneconvert") + ); + println!("avxifma: {:?}", is_x86_feature_detected!("avxifma")); + println!( + "avxvnniint16: {:?}", + is_x86_feature_detected!("avxvnniint16") + ); } #[cfg(feature = "std_detect_env_override")]