From cdedf4334edcab2630cd3c0fc5cde04bbc070860 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 7 Apr 2023 15:51:29 -0700 Subject: [PATCH] x64: Lower SIMD requirement to SSE4.1 from SSE4.2 Cranelift only has one instruction SIMD which depends on SSE4.2 so this commit adds a lowering rule for `pcmpgtq` which doesn't use SSE4.2 and enables lowering the baseline requirement for SIMD support from SSE4.2 to SSE4.1. The `has_sse42` setting is no longer enabled by default for Cranelift. Additionally `enable_simd` no longer requires `has_sse42` on x64. Finally the fuzz-generator for Wasmtime codegen settings now enables flipping the `has_sse42` setting instead of unconditionally setting it to `true`. The specific lowering for `pcmpgtq` is copied from LLVM's lowering of this instruction. --- cranelift/codegen/meta/src/isa/x86.rs | 2 +- cranelift/codegen/src/isa/x64/inst.isle | 49 ++++++++++++++++++- cranelift/codegen/src/isa/x64/lower/isle.rs | 5 ++ cranelift/codegen/src/isa/x64/mod.rs | 9 +--- .../filetests/isa/x64/simd-cmp-avx.clif | 2 +- .../filetests/runtests/simd-icmp-sgt.clif | 3 +- cranelift/native/src/lib.rs | 1 - .../src/generators/codegen_settings.rs | 6 +-- 8 files changed, 62 insertions(+), 15 deletions(-) diff --git a/cranelift/codegen/meta/src/isa/x86.rs b/cranelift/codegen/meta/src/isa/x86.rs index 8fc6cd27bf35..e385388e3082 100644 --- a/cranelift/codegen/meta/src/isa/x86.rs +++ b/cranelift/codegen/meta/src/isa/x86.rs @@ -38,7 +38,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup { "has_sse42", "Has support for SSE4.2.", "SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]", - true, + false, ); let has_avx = settings.add_bool( "has_avx", diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index bde21d48dbb7..97540660b823 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1642,6 +1642,9 @@ (decl use_sse41 (bool) Type) (extern extractor infallible use_sse41 use_sse41) +(decl pure use_sse42 () bool) +(extern constructor use_sse42 use_sse42) + (decl pure use_avx_simd () bool) (extern constructor use_avx_simd use_avx_simd) @@ -4214,7 +4217,51 @@ (rule (x64_pcmpgt $I8X16 x y) (x64_pcmpgtb x y)) (rule (x64_pcmpgt $I16X8 x y) (x64_pcmpgtw x y)) (rule (x64_pcmpgt $I32X4 x y) (x64_pcmpgtd x y)) -(rule (x64_pcmpgt $I64X2 x y) (x64_pcmpgtq x y)) + +;; SSE4.2 gives a single-instruction for this lowering, but prior to that it's a +;; bit more complicated. +(rule 1 (x64_pcmpgt $I64X2 x y) + (if-let $true (use_sse42)) + (x64_pcmpgtq x y)) + +;; Without SSE4.2 a 64-bit comparison is expanded to a number of instructions. +;; The basic idea is to delegate to a 32-bit comparison and work with the +;; results from there. The comparison to execute is: +;; +;; [ xhi ][ xlo ] > [ yhi ][ ylo ] +;; +;; If xhi != yhi, then the result is whatever the result of that comparison is. +;; If xhi == yhi, then the result is the unsigned comparison of xlo/ylo since +;; the 64-bit value is positive. To achieve this as part of the same comparison +;; the upper bit of `xlo` and `ylo` is flipped to change the sign when compared +;; as a 32-bit signed number. The result here is then: +;; +;; * if xlo and yhi had the same upper bit, then the unsigned comparison should +;; be the same as comparing the flipped versions as signed. +;; * if xlo had an upper bit of 0 and ylo had an upper bit of 1, then xlo > ylo +;; is false. When flipping the bits xlo becomes negative and ylo becomes +;; positive when compared as 32-bits, so the result is the same. +;; * if xlo had an upper bit of 1 and ylo had an upper bit of 0, then xlo > ylo +;; is true. When flipping the bits xlo becomes positive and ylo becomes +;; negative when compared as 32-bits, so the result is the same. +;; +;; Given all that the sequence here is to flip the upper bits of xlo and ylo, +;; then compare the masked results for equality and for gt. If the upper 32-bits +;; are not equal then the gt result for the upper bits is used. If the upper +;; 32-bits are equal then the lower 32-bits comparison is used instead. +(rule 0 (x64_pcmpgt $I64X2 x y) + (let ( + (mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00000000_80000000_00000000_80000000))) + (x_masked Xmm (x64_pxor mask x)) + (y_masked Xmm (x64_pxor mask y)) + (cmp32 Xmm (x64_pcmpgtd x_masked y_masked)) + (low_halves_gt Xmm (x64_pshufd cmp32 0xa0)) + (high_halves_gt Xmm (x64_pshufd cmp32 0xf5)) + (cmp_eq Xmm (x64_pcmpeqd x_masked y_masked)) + (high_halves_eq Xmm (x64_pshufd cmp_eq 0xf5)) + (low_gt_and_high_eq Xmm (x64_pand low_halves_gt high_halves_eq)) + ) + (x64_por low_gt_and_high_eq high_halves_gt))) (decl x64_pcmpgtb (Xmm XmmMem) Xmm) (rule 0 (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y)) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index bf328b5268cd..36302e39c555 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -229,6 +229,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { self.backend.x64_flags.use_sse41() } + #[inline] + fn use_sse42(&mut self) -> bool { + self.backend.x64_flags.use_sse42() + } + #[inline] fn imm8_from_value(&mut self, val: Value) -> Option { let inst = self.lower_ctx.dfg().value_def(val).inst()?; diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs index 029c7f9f5754..b363dda6f7e7 100644 --- a/cranelift/codegen/src/isa/x64/mod.rs +++ b/cranelift/codegen/src/isa/x64/mod.rs @@ -222,13 +222,9 @@ fn isa_constructor( // Check for compatibility between flags and ISA level // requested. In particular, SIMD support requires SSE4.2. if shared_flags.enable_simd() { - if !isa_flags.has_sse3() - || !isa_flags.has_ssse3() - || !isa_flags.has_sse41() - || !isa_flags.has_sse42() - { + if !isa_flags.has_sse3() || !isa_flags.has_ssse3() || !isa_flags.has_sse41() { return Err(CodegenError::Unsupported( - "SIMD support requires SSE3, SSSE3, SSE4.1, and SSE4.2 on x86_64.".into(), + "SIMD support requires SSE3, SSSE3, and SSE4.1 on x86_64.".into(), )); } } @@ -253,7 +249,6 @@ mod test { isa_builder.set("has_sse3", "false").unwrap(); isa_builder.set("has_ssse3", "false").unwrap(); isa_builder.set("has_sse41", "false").unwrap(); - isa_builder.set("has_sse42", "false").unwrap(); assert!(matches!( isa_builder.finish(shared_flags), Err(CodegenError::Unsupported(_)), diff --git a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif index 9c69ac019660..93d0cffe8677 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif @@ -1,6 +1,6 @@ test compile precise-output set enable_simd -target x86_64 has_avx +target x86_64 sse42 has_avx function %i8x16_eq(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif index ed074fe68a4e..6d9cbc8e5e10 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif @@ -2,7 +2,8 @@ test interpret test run set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs index 3425621002a4..5a4418160d83 100644 --- a/cranelift/native/src/lib.rs +++ b/cranelift/native/src/lib.rs @@ -80,7 +80,6 @@ pub fn infer_native_flags(isa_builder: &mut dyn Configurable) -> Result<(), &'st isa_builder.set("has_sse3", "false").unwrap(); isa_builder.set("has_ssse3", "false").unwrap(); isa_builder.set("has_sse41", "false").unwrap(); - isa_builder.set("has_sse42", "false").unwrap(); if std::is_x86_feature_detected!("sse3") { isa_builder.enable("has_sse3").unwrap(); diff --git a/crates/fuzzing/src/generators/codegen_settings.rs b/crates/fuzzing/src/generators/codegen_settings.rs index 30484d009fa1..9f2841d91bf6 100644 --- a/crates/fuzzing/src/generators/codegen_settings.rs +++ b/crates/fuzzing/src/generators/codegen_settings.rs @@ -35,7 +35,7 @@ impl CodegenSettings { } } - /// Features such as sse4.2 are unconditionally enabled on the x86_64 target + /// Features such as sse3 are unconditionally enabled on the x86_64 target /// because they are hard required for SIMD, but when SIMD is disabled, for /// example, we support disabling these features. /// @@ -57,7 +57,7 @@ impl CodegenSettings { // to have test case failures unrelated to codegen setting input // that fail on one architecture to fail on other architectures as // well. - let new_flags = ["has_sse3", "has_ssse3", "has_sse41", "has_sse42"] + let new_flags = ["has_sse3", "has_ssse3", "has_sse41"] .into_iter() .map(|name| Ok((name, u.arbitrary()?))) .collect::>>()?; @@ -147,8 +147,8 @@ impl<'a> Arbitrary<'a> for CodegenSettings { std:"sse3" => clif:"has_sse3" ratio: 1 in 1, std:"ssse3" => clif:"has_ssse3" ratio: 1 in 1, std:"sse4.1" => clif:"has_sse41" ratio: 1 in 1, - std:"sse4.2" => clif:"has_sse42" ratio: 1 in 1, + std:"sse4.2" => clif:"has_sse42", std:"popcnt" => clif:"has_popcnt", std:"avx" => clif:"has_avx", std:"avx2" => clif:"has_avx2",