Implement a few minor optimizations around 128-bit integers (#9136)

* Implement a few minor optimizations around 128-bit integers

  This commit implements a few minor changes for `i128` in both the egraph optimizations and lowerings for x64. The optimization pass will now transform `iconcat` into a `uextend` or `sextend` where appropriate. The x64 backend then pattern-matches this to produce slightly more optimal machine code. Additionally, the x64 backend now handles memory/immediate operands a bit better when the argument to a 128-bit operation is an `iconcat`.

* Update test expectations

* Match iadd lowering rules for isub
bytecodealliance · Aug 16, 2024 · 69b005f · 69b005f
1 parent 3f5c21b
commit 69b005f
Show file tree

Hide file tree

Showing 5 changed files with 351 additions and 84 deletions.
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
@@ -101,15 +101,27 @@
 (rule 1 (lower (has_type $I128 (iadd x y)))
       ;; Get the high/low registers for `x`.
       (let ((x_regs ValueRegs x)
-            (x_lo Gpr (value_regs_get_gpr x_regs 0))
-            (x_hi Gpr (value_regs_get_gpr x_regs 1)))
-        ;; Get the high/low registers for `y`.
-        (let ((y_regs ValueRegs y)
-              (y_lo Gpr (value_regs_get_gpr y_regs 0))
-              (y_hi Gpr (value_regs_get_gpr y_regs 1)))
-          ;; Do an add followed by an add-with-carry.
-          (with_flags (x64_add_with_flags_paired $I64 x_lo y_lo)
-                      (x64_adc_paired $I64 x_hi y_hi)))))
+            (y_regs ValueRegs y))
+        ;; Hand the lo (index 0) and hi (index 1) halves of both operands to
+        ;; the shared `iadd128` helper, which emits the add / add-with-carry.
+        (iadd128
+          (value_regs_get_gpr x_regs 0)
+          (value_regs_get_gpr x_regs 1)
+          (value_regs_get_gpr y_regs 0)
+          (value_regs_get_gpr y_regs 1))))
+;; When the rhs is an `iconcat`, pass its two halves straight to `iadd128`
+;; (which accepts `GprMemImm` for them); only the lhs is unpacked from its
+;; register pair.
+(rule 2 (lower (has_type $I128 (iadd x (iconcat y_lo y_hi))))
+        (let ((x_regs ValueRegs x))
+          (iadd128 (value_regs_get_gpr x_regs 0) (value_regs_get_gpr x_regs 1)
+                   y_lo y_hi)))
+;; When the rhs is an i64 zero-extended to i128, its low half is `y` itself and
+;; its high half is the constant 0, which is passed as an immediate.
+(rule 3 (lower (has_type $I128 (iadd x (uextend y @ (value_type $I64)))))
+        (let ((x_regs ValueRegs x))
+          (iadd128 (value_regs_get_gpr x_regs 0) (value_regs_get_gpr x_regs 1)
+                   y (RegMemImm.Imm 0))))
+
+;; Shared lowering for 128-bit addition once both operands have been split into
+;; 64-bit halves. Arguments are, in order: lhs lo, lhs hi, rhs lo, rhs hi.
+;; The low halves go through a flag-producing add and the high halves through
+;; an add-with-carry; the two are paired via `with_flags`.
+(decl iadd128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs)
+(rule (iadd128 lhs_lo lhs_hi rhs_lo rhs_hi)
+      (with_flags
+        (x64_add_with_flags_paired $I64 lhs_lo rhs_lo)
+        (x64_adc_paired $I64 lhs_hi rhs_hi)))
 
 ;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -245,15 +257,27 @@
 (rule 1 (lower (has_type $I128 (isub x y)))
       ;; Get the high/low registers for `x`.
       (let ((x_regs ValueRegs x)
-            (x_lo Gpr (value_regs_get_gpr x_regs 0))
-            (x_hi Gpr (value_regs_get_gpr x_regs 1)))
-        ;; Get the high/low registers for `y`.
-        (let ((y_regs ValueRegs y)
-              (y_lo Gpr (value_regs_get_gpr y_regs 0))
-              (y_hi Gpr (value_regs_get_gpr y_regs 1)))
-          ;; Do a sub followed by an sub-with-borrow.
-          (with_flags (x64_sub_with_flags_paired $I64 x_lo y_lo)
-                      (x64_sbb_paired $I64 x_hi y_hi)))))
+            (y_regs ValueRegs y))
+        ;; Hand the lo (index 0) and hi (index 1) halves of both operands to
+        ;; the shared `isub128` helper, which emits the sub / sub-with-borrow.
+        (isub128
+          (value_regs_get_gpr x_regs 0)
+          (value_regs_get_gpr x_regs 1)
+          (value_regs_get_gpr y_regs 0)
+          (value_regs_get_gpr y_regs 1))))
+;; When the rhs is an `iconcat`, pass its two halves straight to `isub128`
+;; (which accepts `GprMemImm` for them); only the lhs is unpacked from its
+;; register pair.
+(rule 2 (lower (has_type $I128 (isub x (iconcat y_lo y_hi))))
+        (let ((x_regs ValueRegs x))
+          (isub128 (value_regs_get_gpr x_regs 0) (value_regs_get_gpr x_regs 1)
+                   y_lo y_hi)))
+;; When the rhs is an i64 zero-extended to i128, its low half is `y` itself and
+;; its high half is the constant 0, which is passed as an immediate.
+(rule 3 (lower (has_type $I128 (isub x (uextend y @ (value_type $I64)))))
+        (let ((x_regs ValueRegs x))
+          (isub128 (value_regs_get_gpr x_regs 0) (value_regs_get_gpr x_regs 1)
+                   y (RegMemImm.Imm 0))))
+
+;; Shared lowering for 128-bit subtraction once both operands have been split
+;; into 64-bit halves. Arguments are, in order: lhs lo, lhs hi, rhs lo, rhs hi.
+;; The low halves go through a flag-producing sub and the high halves through
+;; a sub-with-borrow; the two are paired via `with_flags`.
+(decl isub128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs)
+(rule (isub128 lhs_lo lhs_hi rhs_lo rhs_hi)
+      (with_flags
+        (x64_sub_with_flags_paired $I64 lhs_lo rhs_lo)
+        (x64_sbb_paired $I64 lhs_hi rhs_hi)))
 
 ;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -999,6 +1023,22 @@
 
 ;; `i128`.
 
+;; General case: put both operands into register pairs and forward their
+;; lo (index 0) / hi (index 1) halves to `imul128`.
+(rule 2 (lower (has_type $I128 (imul x y)))
+      (let ((lhs ValueRegs x)
+            (rhs ValueRegs y))
+        (imul128 (value_regs_get_gpr lhs 0)
+                 (value_regs_get_gpr lhs 1)
+                 (value_regs_get_gpr rhs 0)
+                 (value_regs_get_gpr rhs 1))))
+
+;; Both operands are `iconcat`s: feed their halves to `imul128` directly
+;; rather than unpacking 128-bit register pairs.
+(rule 4 (lower (has_type $I128 (imul (iconcat lhs_lo lhs_hi)
+                                     (iconcat rhs_lo rhs_hi))))
+        (imul128 lhs_lo lhs_hi rhs_lo rhs_hi))
+
+;; Helper for lowering 128-bit multiplication with the 64-bit halves of the
+;; lhs/rhs already split. The first two arguments are lo/hi for the lhs and the
+;; second two are lo/hi for the rhs.
+;;
 ;; mul:
 ;;   dst_lo = lhs_lo * rhs_lo
 ;;   dst_hi = umulhi(lhs_lo, rhs_lo) +
@@ -1012,16 +1052,10 @@
 ;;   dst_lo:hi_lolo = mulhi_u x_lo, y_lo
 ;;   dst_hi = add hilo_hilo, hi_lolo
 ;;   return (dst_lo, dst_hi)
-(rule 2 (lower (has_type $I128 (imul x y)))
+(decl imul128 (Gpr Gpr GprMem GprMem) ValueRegs)
+(rule (imul128 x_lo x_hi y_lo y_hi)
       ;; Put `x` into registers and unpack its hi/lo halves.
-      (let ((x_regs ValueRegs x)
-            (x_lo Gpr (value_regs_get_gpr x_regs 0))
-            (x_hi Gpr (value_regs_get_gpr x_regs 1))
-            ;; Put `y` into registers and unpack its hi/lo halves.
-            (y_regs ValueRegs y)
-            (y_lo Gpr (value_regs_get_gpr y_regs 0))
-            (y_hi Gpr (value_regs_get_gpr y_regs 1))
-            ;; lo_hi = mul x_lo, y_hi
+      (let (;; lo_hi = mul x_lo, y_hi
             (lo_hi Gpr (x64_imul $I64 x_lo y_hi))
             ;; hi_lo = mul x_hi, y_lo
             (hi_lo Gpr (x64_imul $I64 x_hi y_lo))
@@ -1035,6 +1069,17 @@
             (dst_hi Gpr (x64_add $I64 hilo_hilo hi_lolo)))
         (value_gprs dst_lo dst_hi)))
 
+;; The `mul` and `imul` instructions on x64 are defined as taking 64-bit
+;; operands and producing a 128-bit result, which exactly matches the semantics
+;; of widening 64-bit inputs to 128-bit and then multiplying them. That means
+;; that these cases can get some simpler codegen.
+;; Both operands zero-extended from i64: a single `x64_mul` with the flag set
+;; to `$false` (presumably selecting the unsigned `mul` form).
+(rule 5 (lower (has_type $I128 (imul (uextend lhs @ (value_type $I64))
+                                     (uextend rhs @ (value_type $I64)))))
+        (x64_mul $I64 $false lhs rhs))
+;; Both operands sign-extended from i64: a single `x64_mul` with the flag set
+;; to `$true` (presumably selecting the signed `imul` form).
+(rule 5 (lower (has_type $I128 (imul (sextend lhs @ (value_type $I64))
+                                     (sextend rhs @ (value_type $I64)))))
+        (x64_mul $I64 $true lhs rhs))
+
 ;; SSE.
 
 ;; (No i8x16 multiply.)

diff --git a/cranelift/codegen/src/opts/extends.isle b/cranelift/codegen/src/opts/extends.isle
@@ -89,3 +89,7 @@
 (rule (simplify (ireduce ty (bor  _ x y))) (bor  ty (ireduce ty x) (ireduce ty y)))
 (rule (simplify (ireduce ty (bxor _ x y))) (bxor ty (ireduce ty x) (ireduce ty y)))
 (rule (simplify (ireduce ty (band _ x y))) (band ty (ireduce ty x) (ireduce ty y)))
+
+;; Try to transform an `iconcat` that produces an i128 into either an `sextend`
+;; or a `uextend` of its low half.
+;;
+;; A constant-zero high half means the result is the low half zero-extended.
+(rule (simplify (iconcat $I128 x (iconst_u _ 0))) (uextend $I128 x))
+;; A high half equal to the low half arithmetically shifted right by 63 is
+;; rewritten to a sign-extension of the low half.
+(rule (simplify (iconcat $I128 x (sshr _ x (iconst_u _ 63)))) (sextend $I128 x))
diff --git a/cranelift/filetests/filetests/egraph/extends.clif b/cranelift/filetests/filetests/egraph/extends.clif
@@ -227,3 +227,23 @@ block0(v0: i16):
 
 ; check: v5 = bnot v0
 ; check: return v5
+
+; An `iconcat` whose high half is the constant 0 is expected to be rewritten
+; into a `uextend` of the low half.
+function %concat_zero(i64) -> i128 {
+block0(v0: i64):
+    v1 = iconst.i64 0
+    v2 = iconcat v0, v1
+    return v2
+}
+
+; check: v3 = uextend.i128 v0
+; check: return v3
+
+; An `iconcat` whose high half is the low half shifted right (signed) by 63 is
+; expected to be rewritten into a `sextend` of the low half.
+function %sext128(i64) -> i128 {
+block0(v0: i64):
+    v1 = sshr_imm v0, 63
+    v2 = iconcat v0, v1
+    return v2
+}
+
+; check: v4 = sextend.i128 v0
+; check: return v4