Skip to content

Commit

Permalink
cranelift: Port ishl SIMD lowerings to ISLE (#3686)
Browse files Browse the repository at this point in the history
  • Loading branch information
fitzgen authored Jan 13, 2022
1 parent 13f17db commit a7dba81
Show file tree
Hide file tree
Showing 11 changed files with 698 additions and 296 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle d95510fad2e2473c
src/prelude.isle 7b911d3b894ae17
src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
11 changes: 6 additions & 5 deletions cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

70 changes: 70 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
(MovsxRmR (ext_mode ExtMode)
(src RegMem)
(dst WritableReg))
(Mov64MR (src SyntheticAmode)
(dst WritableReg))
(Cmove (size OperandSize)
(cc CC)
(consequent RegMem)
Expand All @@ -70,6 +72,8 @@
(Not (size OperandSize)
(src Reg)
(dst WritableReg))
(LoadEffectiveAddress (addr SyntheticAmode)
(dst WritableReg))
))

(type OperandSize extern
Expand Down Expand Up @@ -318,6 +322,17 @@

;; An x64 addressing mode, possibly involving virtual registers or
;; not-yet-resolved constants; declared externally in Rust.
(type SyntheticAmode extern (enum))

;; Convert a `SyntheticAmode` into a `RegMem` memory operand.
(decl synthetic_amode_to_reg_mem (SyntheticAmode) RegMem)
(extern constructor synthetic_amode_to_reg_mem synthetic_amode_to_reg_mem)

;; A concrete x64 addressing mode; declared externally in Rust.
(type Amode extern (enum))

;; Construct an immediate+base+scaled-index `Amode`, i.e.
;; `imm + base + (index << shift)`.
;; NOTE(review): argument order/semantics come from the external Rust
;; constructor -- confirm against the x64 backend's `Amode` definition.
(decl amode_imm_reg_reg_shift (u32 Reg Reg u8) Amode)
(extern constructor amode_imm_reg_reg_shift amode_imm_reg_reg_shift)

;; Wrap a concrete `Amode` as a `SyntheticAmode`.
(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)

(type ShiftKind extern
(enum ShiftLeft
ShiftRightLogical
Expand Down Expand Up @@ -438,6 +453,11 @@

;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; How (or whether) a loaded value should be extended; declared
;; externally in Rust. `None` means no extension is performed.
(type ExtKind extern
(enum None
SignExtend
ZeroExtend))

;; ISLE-local choice between sign and zero extension.
(type ExtendKind (enum Sign Zero))

;; x64 extension modes naming source and destination widths, e.g.
;; `BL` = byte-to-long; declared externally in Rust.
(type ExtMode extern (enum BL BQ WL WQ LQ))
Expand Down Expand Up @@ -549,6 +569,40 @@
(RegMem.Reg r)
(OperandSize.Size32))))

;;;; Helpers for Emitting Loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load a value into a register.
;;
;; The rules below dispatch on `Type`: narrow integers (with sign
;; extension), 64-bit integers, scalar floats, and 128-bit vectors each
;; get the appropriate x64 load instruction. `ExtKind` is only consulted
;; for the narrow-integer case; the other rules ignore it.
(decl x64_load (Type SyntheticAmode ExtKind) Reg)

;; Sign-extending load for integer types that fit in 32 bits: `movsx`.
;; NOTE(review): the `8` passed to `ext_mode` is presumably the
;; destination size -- confirm its units against the `ext_mode` helper.
(rule (x64_load (fits_in_32 ty) addr (ExtKind.SignExtend))
(movsx ty
(ext_mode (ty_bytes ty) 8)
(synthetic_amode_to_reg_mem addr)))

;; Plain 64-bit integer load: `mov` from memory into a fresh register.
(rule (x64_load $I64 addr _ext_kind)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.Mov64MR addr dst))))
(writable_reg_to_reg dst)))

;; Scalar f32 load: `movss`.
(rule (x64_load $F32 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movss)
(synthetic_amode_to_reg_mem addr)))

;; Scalar f64 load: `movsd`.
(rule (x64_load $F64 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movsd)
(synthetic_amode_to_reg_mem addr)))

;; f32x4 vector load: unaligned `movups`.
(rule (x64_load $F32X4 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movups)
(synthetic_amode_to_reg_mem addr)))

;; f64x2 vector load: unaligned `movupd`.
(rule (x64_load $F64X2 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movupd)
(synthetic_amode_to_reg_mem addr)))

;; Any other multi-lane (vector) type: unaligned integer `movdqu`.
(rule (x64_load (multi_lane _bits _lanes) addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movdqu)
(synthetic_amode_to_reg_mem addr)))

;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
Expand Down Expand Up @@ -1236,6 +1290,16 @@
dst))))
(writable_reg_to_reg dst)))

;; Helper for creating `psllw` instructions (packed 16x8 logical
;; shift-left).
(decl psllw (Reg RegMemImm) Reg)
(rule (psllw src1 src2)
(xmm_rmi_reg (SseOpcode.Psllw) src1 src2))

;; Helper for creating `pslld` instructions (packed 32x4 logical
;; shift-left).
(decl pslld (Reg RegMemImm) Reg)
(rule (pslld src1 src2)
(xmm_rmi_reg (SseOpcode.Pslld) src1 src2))

;; Helper for creating `psllq` instructions.
(decl psllq (Reg RegMemImm) Reg)
(rule (psllq src1 src2)
Expand Down Expand Up @@ -1353,3 +1417,9 @@
(size OperandSize (operand_size_of_type_32_64 ty))
(_ Unit (emit (MInst.Not size src dst))))
(writable_reg_to_reg dst)))

;; Helper for creating `lea` (load effective address) instructions:
;; computes the address denoted by `addr` into a fresh 64-bit register
;; without performing a memory access.
(decl lea (SyntheticAmode) Reg)
(rule (lea addr)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst))))
(writable_reg_to_reg dst)))
6 changes: 6 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3053,6 +3053,12 @@ impl MachInst for Inst {
}

fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
log::trace!(
"Inst::gen_move {:?} -> {:?} (type: {:?})",
src_reg,
dst_reg.to_reg(),
ty
);
let rc_dst = dst_reg.to_reg().get_class();
let rc_src = src_reg.get_class();
// If this isn't true, we have gone way off the rails.
Expand Down
61 changes: 61 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,67 @@
(let ((amt_ Reg (lo_reg amt)))
(shl_i128 (put_in_regs src) amt_)))

;; SSE.

;; Since the x86 instruction set does not have any 8x16 shift instructions (even
;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
;; instructions. The basic idea, whether the amount to shift by is an immediate
;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
(rule (lower (has_type $I8X16 (ishl src amt)))
(let ((src_ Reg (put_in_reg src))
(amt_gpr RegMemImm (put_in_reg_mem_imm amt))
;; The shift amount must live in an XMM register to be usable as a
;; `psllw` operand.
(amt_xmm RegMemImm (reg_mem_imm_to_xmm amt_gpr))
;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
;; correct for half of the lanes; the others must be fixed up with
;; the mask below.
(unmasked Reg (psllw src_ amt_xmm))
;; Note that the mask lookup uses the original GPR form of the
;; amount, not the XMM copy.
(mask_addr SyntheticAmode (ishl_i8x16_mask amt_gpr))
(mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
(value_reg (sse_and $I8X16 unmasked (RegMem.Reg mask)))))

;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)

;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(rule (ishl_i8x16_mask (RegMemImm.Imm amt))
(ishl_i8x16_mask_for_const amt))

;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We do this using `lea` to
;; find the base address of the mask table and then complex addressing to
;; offset to the right mask: `base_address + amt << 4` (each mask entry is
;; presumably 16 bytes, hence the shift by 4 -- confirm against the mask
;; table layout).
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
(base_mask_addr Reg (lea mask_table))
(mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
(amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
base_mask_addr
mask_offset
0))))
;; A shift amount in memory is first loaded into a register, then handled by
;; the register rule above.
(rule (ishl_i8x16_mask (RegMemImm.Mem amt))
(ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

;; Get the address of the mask for a constant 8x16 shift amount.
;; Implemented externally in Rust.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)

;; Get the address of the mask table for a dynamic 8x16 shift amount.
;; Implemented externally in Rust.
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)

;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.

;; `ishl.i16x8` lowers to `psllw`.
(rule (lower (has_type $I16X8 (ishl src amt)))
(value_reg (psllw (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
;; `ishl.i32x4` lowers to `pslld`.
(rule (lower (has_type $I32X4 (ishl src amt)))
(value_reg (pslld (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
;; `ishl.i64x2` lowers to `psllq`.
(rule (lower (has_type $I64X2 (ishl src amt)))
(value_reg (psllq (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
Expand Down
29 changes: 6 additions & 23 deletions cranelift/codegen/src/isa/x64/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1539,9 +1539,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Bnot
| Opcode::Bitselect
| Opcode::Vselect
| Opcode::Sshr => implemented_in_isle(ctx),
| Opcode::Sshr
| Opcode::Ishl => implemented_in_isle(ctx),

Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

Expand All @@ -1557,7 +1558,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// This implementation uses the last two encoding methods.
let (size, lhs) = match dst_ty {
types::I8 | types::I16 => match op {
Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
Opcode::Ushr => (
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
Expand Down Expand Up @@ -1589,7 +1589,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical,
Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight,
Expand All @@ -1608,7 +1607,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);

match op {
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
Opcode::Ushr | Opcode::Rotl => {
implemented_in_isle(ctx);
}
Opcode::Rotr => {
Expand Down Expand Up @@ -1643,7 +1642,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
_ => unreachable!(),
}
} else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
} else if dst_ty == types::I8X16 && op == Opcode::Ushr {
// Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
// like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
// whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
Expand Down Expand Up @@ -1671,7 +1670,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
// the others must be fixed up with the mask below.
let shift_opcode = match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
};
Expand All @@ -1695,20 +1693,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
const SHL_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];

let mask = match op {
Opcode::Ishl => &SHL_MASKS,
Opcode::Ushr => &USHR_MASKS,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
};
Expand Down Expand Up @@ -1775,17 +1761,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let sse_op = match dst_ty {
types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I64X2 => match op {
Opcode::Ishl => SseOpcode::Psllq,
Opcode::Ushr => SseOpcode::Psrlq,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
Expand Down
Loading

0 comments on commit a7dba81

Please sign in to comment.