From cdedf4334edcab2630cd3c0fc5cde04bbc070860 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Fri, 7 Apr 2023 15:51:29 -0700
Subject: [PATCH] x64: Lower SIMD requirement to SSE4.1 from SSE4.2

Cranelift only has one instruction SIMD which depends on SSE4.2 so this
commit adds a lowering rule for `pcmpgtq` which doesn't use SSE4.2 and
enables lowering the baseline requirement for SIMD support from SSE4.2
to SSE4.1.

The `has_sse42` setting is no longer enabled by default for Cranelift.
Additionally `enable_simd` no longer requires `has_sse42` on x64.
Finally the fuzz-generator for Wasmtime codegen settings now enables
flipping the `has_sse42` setting instead of unconditionally setting it
to `true`.

The specific lowering for `pcmpgtq` is copied from LLVM's lowering of
this instruction.
---
 cranelift/codegen/meta/src/isa/x86.rs         |  2 +-
 cranelift/codegen/src/isa/x64/inst.isle       | 49 ++++++++++++++++++-
 cranelift/codegen/src/isa/x64/lower/isle.rs   |  5 ++
 cranelift/codegen/src/isa/x64/mod.rs          |  9 +---
 .../filetests/isa/x64/simd-cmp-avx.clif       |  2 +-
 .../filetests/runtests/simd-icmp-sgt.clif     |  3 +-
 cranelift/native/src/lib.rs                   |  1 -
 .../src/generators/codegen_settings.rs        |  6 +--
 8 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86.rs b/cranelift/codegen/meta/src/isa/x86.rs
index 8fc6cd27bf35..e385388e3082 100644
--- a/cranelift/codegen/meta/src/isa/x86.rs
+++ b/cranelift/codegen/meta/src/isa/x86.rs
@@ -38,7 +38,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
         "has_sse42",
         "Has support for SSE4.2.",
         "SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]",
-        true,
+        false,
     );
     let has_avx = settings.add_bool(
         "has_avx",
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index bde21d48dbb7..97540660b823 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1642,6 +1642,9 @@
 (decl use_sse41 (bool) Type)
 (extern extractor infallible use_sse41 use_sse41)
 
+(decl pure use_sse42 () bool)
+(extern constructor use_sse42 use_sse42)
+
 (decl pure use_avx_simd () bool)
 (extern constructor use_avx_simd use_avx_simd)
 
@@ -4214,7 +4217,51 @@
 (rule (x64_pcmpgt $I8X16 x y) (x64_pcmpgtb x y))
 (rule (x64_pcmpgt $I16X8 x y) (x64_pcmpgtw x y))
 (rule (x64_pcmpgt $I32X4 x y) (x64_pcmpgtd x y))
-(rule (x64_pcmpgt $I64X2 x y) (x64_pcmpgtq x y))
+
+;; SSE4.2 gives a single-instruction for this lowering, but prior to that it's a
+;; bit more complicated.
+(rule 1 (x64_pcmpgt $I64X2 x y)
+        (if-let $true (use_sse42))
+        (x64_pcmpgtq x y))
+
+;; Without SSE4.2 a 64-bit comparison is expanded to a number of instructions.
+;; The basic idea is to delegate to a 32-bit comparison and work with the
+;; results from there. The comparison to execute is:
+;;
+;;    [ xhi ][ xlo ] > [ yhi ][ ylo ]
+;;
+;; If xhi != yhi, then the result is whatever the result of that comparison is.
+;; If xhi == yhi, then the result is the unsigned comparison of xlo/ylo since
+;; the 64-bit value is positive. To achieve this as part of the same comparison
+;; the upper bit of `xlo` and `ylo` is flipped to change the sign when compared
+;; as a 32-bit signed number. The result here is then:
+;;
+;; * if xlo and yhi had the same upper bit, then the unsigned comparison should
+;;   be the same as comparing the flipped versions as signed.
+;; * if xlo had an upper bit of 0 and ylo had an upper bit of 1, then xlo > ylo
+;;   is false. When flipping the bits xlo becomes negative and ylo becomes
+;;   positive when compared as 32-bits, so the result is the same.
+;; * if xlo had an upper bit of 1 and ylo had an upper bit of 0, then xlo > ylo
+;;   is true. When flipping the bits xlo becomes positive and ylo becomes
+;;   negative when compared as 32-bits, so the result is the same.
+;;
+;; Given all that the sequence here is to flip the upper bits of xlo and ylo,
+;; then compare the masked results for equality and for gt. If the upper 32-bits
+;; are not equal then the gt result for the upper bits is used. If the upper
+;; 32-bits are equal then the lower 32-bits comparison is used instead.
+(rule 0 (x64_pcmpgt $I64X2 x y)
+        (let (
+            (mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00000000_80000000_00000000_80000000)))
+            (x_masked           Xmm (x64_pxor mask x))
+            (y_masked           Xmm (x64_pxor mask y))
+            (cmp32              Xmm (x64_pcmpgtd x_masked y_masked))
+            (low_halves_gt      Xmm (x64_pshufd cmp32 0xa0))
+            (high_halves_gt     Xmm (x64_pshufd cmp32 0xf5))
+            (cmp_eq             Xmm (x64_pcmpeqd x_masked y_masked))
+            (high_halves_eq     Xmm (x64_pshufd cmp_eq 0xf5))
+            (low_gt_and_high_eq Xmm (x64_pand low_halves_gt high_halves_eq))
+          )
+          (x64_por low_gt_and_high_eq high_halves_gt)))
 
 (decl x64_pcmpgtb (Xmm XmmMem) Xmm)
 (rule 0 (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y))
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index bf328b5268cd..36302e39c555 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -229,6 +229,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         self.backend.x64_flags.use_sse41()
     }
 
+    #[inline]
+    fn use_sse42(&mut self) -> bool {
+        self.backend.x64_flags.use_sse42()
+    }
+
     #[inline]
     fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 029c7f9f5754..b363dda6f7e7 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -222,13 +222,9 @@ fn isa_constructor(
     // Check for compatibility between flags and ISA level
     // requested. In particular, SIMD support requires SSE4.2.
     if shared_flags.enable_simd() {
-        if !isa_flags.has_sse3()
-            || !isa_flags.has_ssse3()
-            || !isa_flags.has_sse41()
-            || !isa_flags.has_sse42()
-        {
+        if !isa_flags.has_sse3() || !isa_flags.has_ssse3() || !isa_flags.has_sse41() {
             return Err(CodegenError::Unsupported(
-                "SIMD support requires SSE3, SSSE3, SSE4.1, and SSE4.2 on x86_64.".into(),
+                "SIMD support requires SSE3, SSSE3, and SSE4.1 on x86_64.".into(),
             ));
         }
     }
@@ -253,7 +249,6 @@ mod test {
         isa_builder.set("has_sse3", "false").unwrap();
         isa_builder.set("has_ssse3", "false").unwrap();
         isa_builder.set("has_sse41", "false").unwrap();
-        isa_builder.set("has_sse42", "false").unwrap();
         assert!(matches!(
             isa_builder.finish(shared_flags),
             Err(CodegenError::Unsupported(_)),
diff --git a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif
index 9c69ac019660..93d0cffe8677 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif
@@ -1,6 +1,6 @@
 test compile precise-output
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 
 function %i8x16_eq(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif
index ed074fe68a4e..6d9cbc8e5e10 100644
--- a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif
@@ -2,7 +2,8 @@ test interpret
 test run
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
 
diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs
index 3425621002a4..5a4418160d83 100644
--- a/cranelift/native/src/lib.rs
+++ b/cranelift/native/src/lib.rs
@@ -80,7 +80,6 @@ pub fn infer_native_flags(isa_builder: &mut dyn Configurable) -> Result<(), &'st
         isa_builder.set("has_sse3", "false").unwrap();
         isa_builder.set("has_ssse3", "false").unwrap();
         isa_builder.set("has_sse41", "false").unwrap();
-        isa_builder.set("has_sse42", "false").unwrap();
 
         if std::is_x86_feature_detected!("sse3") {
             isa_builder.enable("has_sse3").unwrap();
diff --git a/crates/fuzzing/src/generators/codegen_settings.rs b/crates/fuzzing/src/generators/codegen_settings.rs
index 30484d009fa1..9f2841d91bf6 100644
--- a/crates/fuzzing/src/generators/codegen_settings.rs
+++ b/crates/fuzzing/src/generators/codegen_settings.rs
@@ -35,7 +35,7 @@ impl CodegenSettings {
         }
     }
 
-    /// Features such as sse4.2 are unconditionally enabled on the x86_64 target
+    /// Features such as sse3 are unconditionally enabled on the x86_64 target
     /// because they are hard required for SIMD, but when SIMD is disabled, for
     /// example, we support disabling these features.
     ///
@@ -57,7 +57,7 @@ impl CodegenSettings {
             // to have test case failures unrelated to codegen setting input
             // that fail on one architecture to fail on other architectures as
             // well.
-            let new_flags = ["has_sse3", "has_ssse3", "has_sse41", "has_sse42"]
+            let new_flags = ["has_sse3", "has_ssse3", "has_sse41"]
                 .into_iter()
                 .map(|name| Ok((name, u.arbitrary()?)))
                 .collect::<arbitrary::Result<HashMap<_, bool>>>()?;
@@ -147,8 +147,8 @@ impl<'a> Arbitrary<'a> for CodegenSettings {
                     std:"sse3" => clif:"has_sse3" ratio: 1 in 1,
                     std:"ssse3" => clif:"has_ssse3" ratio: 1 in 1,
                     std:"sse4.1" => clif:"has_sse41" ratio: 1 in 1,
-                    std:"sse4.2" => clif:"has_sse42" ratio: 1 in 1,
 
+                    std:"sse4.2" => clif:"has_sse42",
                     std:"popcnt" => clif:"has_popcnt",
                     std:"avx" => clif:"has_avx",
                     std:"avx2" => clif:"has_avx2",