bytecodealliance · cfallin · Aug 29, 2022 · Aug 27, 2022 · Aug 26, 2022
@@ -952,11 +952,19 @@
 
 ;; Helper for calculating the `ScalarSize` corresponding to a type
 (decl scalar_size (Type) ScalarSize)
+
 (rule (scalar_size $I8) (ScalarSize.Size8))
 (rule (scalar_size $I16) (ScalarSize.Size16))
 (rule (scalar_size $I32) (ScalarSize.Size32))
 (rule (scalar_size $I64) (ScalarSize.Size64))
 (rule (scalar_size $I128) (ScalarSize.Size128))
+
+;; Boolean types map to the `ScalarSize` of their bit width.
+(rule (scalar_size $B8) (ScalarSize.Size8))
+(rule (scalar_size $B16) (ScalarSize.Size16))
+(rule (scalar_size $B32) (ScalarSize.Size32))
+(rule (scalar_size $B64) (ScalarSize.Size64))
+(rule (scalar_size $B128) (ScalarSize.Size128))
+
 (rule (scalar_size $F32) (ScalarSize.Size32))
 (rule (scalar_size $F64) (ScalarSize.Size64))
 
@@ -1452,6 +1460,9 @@
 (decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt)
 (extern constructor lshl_from_imm64 lshl_from_imm64)
 
+;; Like `lshl_from_imm64`, but takes the shift amount as a plain `u64`.
+(decl pure lshl_from_u64 (Type u64) ShiftOpAndAmt)
+(extern constructor lshl_from_u64 lshl_from_u64)
+
 (decl integral_ty (Type) Type)
 (extern extractor integral_ty integral_ty)
 
@@ -1704,6 +1715,14 @@
          (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2)
          dst)))
 
+;; Helper for emitting `adds` instructions, setting flags in ambient
+;; state. Used only for `iadd_ifcout`.
+;;
+;; Only the register holding the sum is returned; the flags written by
+;; the instruction are not represented in the result. `ty` selects the
+;; operand size of the add, while the destination temporary is always
+;; allocated as `$I64`.
+(decl add_with_flags (Type Reg Reg) Reg)
+(rule (add_with_flags ty src1 src2)
+      (let ((dst WritableReg (temp_writable_reg $I64))
+            (_ Unit (emit (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2))))
+        dst))
+
 ;; Helper for emitting `adc` instructions.
 (decl adc_paired (Type Reg Reg) ConsumesFlags)
 (rule (adc_paired ty src1 src2)
@@ -1927,6 +1946,13 @@
             (_ Unit (emit (MInst.VecExtend op dst src high_half size))))
         dst))
 
+;; Helper for emitting `MInst.VecExtract` instructions.
+;;
+;; Emits one `VecExtract` of `src1` and `src2` with the `u8` operand
+;; `idx` into a fresh `$I8X16` temporary and returns that temporary.
+(decl vec_extract (Reg Reg u8) Reg)
+(rule (vec_extract src1 src2 idx)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.VecExtract dst src1 src2 idx))))
+        dst))
+
 ;; Helper for emitting `MInst.LoadAcquire` instructions.
 (decl load_acquire (Type Reg) Reg)
 (rule (load_acquire ty addr)
@@ -2118,6 +2144,10 @@
 (decl addp (Reg Reg VectorSize) Reg)
 (rule (addp x y size) (vec_rrr (VecALUOp.Addp) x y size))
 
+;; Helper for generating vector `zip1` instructions; forwards to
+;; `vec_rrr` with `VecALUOp.Zip1`.
+(decl zip1 (Reg Reg VectorSize) Reg)
+(rule (zip1 x y size) (vec_rrr (VecALUOp.Zip1) x y size))
+
 ;; Helper for generating vector `abs` instructions.
 (decl vec_abs (Reg VectorSize) Reg)
 (rule (vec_abs x size) (vec_misc (VecMisc2.Abs) x size))
@@ -2826,3 +2856,24 @@
 
 (decl gen_call_indirect (SigRef Value ValueSlice) InstOutput)
 (extern constructor gen_call_indirect gen_call_indirect)
+
+;; Helpers for pinned register manipulation.
+
+;; The pinned register as a `WritableReg`. Implemented externally (in
+;; the Rust ISLE context) rather than by an ISLE rule.
+(decl writable_pinned_reg () WritableReg)
+(extern constructor writable_pinned_reg writable_pinned_reg)
+
+;; The pinned register as a plain `Reg`, obtained from
+;; `writable_pinned_reg`.
+;; NOTE(review): the rule body yields a `WritableReg` where a `Reg` is
+;; declared; presumably this relies on an implicit conversion declared
+;; elsewhere -- confirm.
+(decl pinned_reg () Reg)
+(rule (pinned_reg) (writable_pinned_reg))
+
+;; Builds (but does not itself emit) a side effect that moves `val`
+;; into the pinned register using an `$I64` `gen_move`.
+(decl write_pinned_reg (Reg) SideEffectNoResult)
+(rule (write_pinned_reg val)
+      (let ((dst WritableReg (writable_pinned_reg)))
+        (SideEffectNoResult.Inst (gen_move $I64 dst val))))
+
+;; Helpers for stackslot effective address generation.
+
+;; Emits the instruction built by `abi_stackslot_addr` for `stack_slot`
+;; at `offset`, targeting a fresh `$I64` temporary, and returns that
+;; temporary.
+(decl compute_stack_addr (StackSlot Offset32) Reg)
+(rule (compute_stack_addr stack_slot offset)
+      (let ((dst WritableReg (temp_writable_reg $I64))
+           (_ Unit (emit (abi_stackslot_addr dst stack_slot offset))))
+        dst))
@@ -2030,3 +2030,212 @@
 ;; N.B.: the Ret itself is generated by the ABI.
 (rule (lower (return args))
       (lower_return (range 0 (value_slice_len args)) args))
+
+;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Reading the pinned register lowers directly to the `pinned_reg`
+;; helper.
+(rule (lower (get_pinned_reg))
+      (pinned_reg))
+
+;; Writing it is lowered as a side effect built by `write_pinned_reg`;
+;; the instruction has no result value.
+(rule (lower (set_pinned_reg val))
+      (side_effect (write_pinned_reg val)))
+
+;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The rules in this section cover same-width scalar int <-> float
+;; casts: float-to-int reads lane 0 with `mov_from_vec`, and
+;; int-to-float uses `mov_to_fpu`, each with the matching `ScalarSize`.
+
+;; $F32 -> $I32
+(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
+      (mov_from_vec src 0 (ScalarSize.Size32)))
+
+;; $I32 -> $F32
+(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
+      (mov_to_fpu src (ScalarSize.Size32)))
+
+;; $F64 -> $I64
+(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
+      (mov_from_vec src 0 (ScalarSize.Size64)))
+
+;; $I64 -> $F64
+(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
+      (mov_to_fpu src (ScalarSize.Size64)))
+
+;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `raw_bitcast` is lowered to its input value unchanged, for any type.
+(rule (lower (raw_bitcast val))
+      val)
+
+;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; extractlane with lane 0 can pass through the value unchanged; upper
+;; bits are undefined when a narrower type is in a wider register.
+(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
+      val)
+
+;; Integer or boolean result: move lane `lane` out with `mov_from_vec`,
+;; sized by the scalar result type.
+(rule (lower (has_type (ty_int_bool ty)
+                       (extractlane val
+                                    (u8_from_uimm8 lane))))
+      (mov_from_vec val lane (scalar_size ty)))
+
+;; Scalar float result from an arbitrary lane: use `fpu_move_from_vec`,
+;; sized by the type of the source vector.
+;; NOTE(review): this pattern also matches lane 0, which the first rule
+;; above handles; presumably the more specific rule takes precedence --
+;; confirm.
+(rule (lower (has_type (ty_scalar_float ty)
+                       (extractlane val @ (value_type vty)
+                                    (u8_from_uimm8 lane))))
+      (fpu_move_from_vec val lane (vector_size vty)))
+
+;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Integer or boolean scalar: insert `val` into lane `lane` of `vec`
+;; with `mov_to_vec`, sized by the vector type.
+(rule (lower (insertlane vec @ (value_type vty)
+                         val @ (value_type (ty_int_bool _))
+                         (u8_from_uimm8 lane)))
+      (mov_to_vec vec val lane (vector_size vty)))
+
+;; Scalar float: use `mov_vec_elem` with indices `lane` and 0.
+;; NOTE(review): presumably `lane` is the destination lane in `vec` and
+;; 0 the source lane in `val` -- confirm against `mov_vec_elem`.
+(rule (lower (insertlane vec @ (value_type vty)
+                         val @ (value_type (ty_scalar_float _))
+                         (u8_from_uimm8 lane)))
+      (mov_vec_elem vec val lane 0 (vector_size vty)))
+
+;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `copy` is lowered to its operand unchanged.
+(rule (lower (copy x))
+      x)
+
+;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Delegates to the `compute_stack_addr` helper, which returns the
+;; register holding the computed address.
+(rule (lower (stack_addr stack_slot offset))
+      (compute_stack_addr stack_slot offset))
+
+;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The three vector sequences below (for `$I8X16`, `$I16X8` and
+;; `$I32X4`) use one integer temporary and two vector temporaries.
+;; The shift is done early so as to give the register allocator the
+;; possibility of using the same reg for the shift result and the
+;; source vector in the case that this is the last use of the source.
+;; See https://github.com/WebAssembly/simd/pull/201 for the background
+;; and derivation of these sequences. Alternative sequences are
+;; discussed in https://github.com/bytecodealliance/wasmtime/issues/2296,
+;; although they are not used here.
+
+;; 16 byte lanes: the result is read out of lane 0 as a 16-bit scalar.
+(rule (lower (vhigh_bits vec @ (value_type $I8X16)))
+      (let (
+            ;; Replicate the MSB of each of the 16 byte lanes across
+            ;; the whole lane (sshr is an arithmetic right shift).
+            (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16)))
+            ;; Bitwise-and with a mask
+            ;; `0x80402010_08040201_80402010_08040201` to get the bit
+            ;; in the proper location for each group of 8 lanes.
+            (anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
+            ;; Produce a version of `anded` with upper 8 lanes and
+            ;; lower 8 lanes swapped.
+            (anded_swapped Reg (vec_extract anded anded 8))
+            ;; Zip together the two; with the above this produces the lane permutation:
+            ;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
+            (zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16)))
+            ;; Add 16-bit lanes together ("add across vector"), so we
+            ;; get, in the low 16 bits, 15+14+...+8 in the high byte
+            ;; and 7+6+...+0 in the low byte. This effectively puts
+            ;; the 16 MSBs together, giving our results.
+            ;;
+            ;; N.B.: `Size16x8` is not a typo!
+            (result Reg (addv zipped (VectorSize.Size16x8))))
+        (mov_from_vec result 0 (ScalarSize.Size16))))
+
+(rule (lower (vhigh_bits vec @ (value_type $I16X8)))
+      (let (
+            ;; Replicate the MSB of each of the 8 16-bit lanes across
+            ;; the whole lane (sshr is an arithmetic right shift).
+            (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8)))
+            ;; Bitwise-and with a mask
+            ;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the
+            ;; bit in the proper location for each of the 8 lanes.
+            (anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
+            ;; Add lanes together to get the 8 MSBs in the low byte.
+            (result Reg (addv anded (VectorSize.Size16x8))))
+        (mov_from_vec result 0 (ScalarSize.Size16))))
+
+(rule (lower (vhigh_bits vec @ (value_type $I32X4)))
+      (let (
+            ;; Replicate the MSB of each of the 4 32-bit lanes across
+            ;; the whole lane (sshr is an arithmetic right shift).
+            (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4)))
+            ;; Bitwise-and with a mask
+            ;; `0x00000008_00000004_00000002_00000001` to get the bit
+            ;; in the proper location for each of the 4 lanes.
+            (anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
+            ;; Add lanes together to get the 4 MSBs in the low 4 bits.
+            (result Reg (addv anded (VectorSize.Size32x4))))
+        (mov_from_vec result 0 (ScalarSize.Size32))))
+
+(rule (lower (vhigh_bits vec @ (value_type $I64X2)))
+      (let (
+            ;; Move each 64-bit lane out to an integer register and
+            ;; shift it right by 63 so that its MSB lands in bit 0;
+            ;; then add the two, with the upper lane's bit shifted
+            ;; left by one so that it ends up in bit 1.
+            (upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
+            (lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
+            (upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63)))
+            (lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63))))
+        (add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1))))
+
+;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; This is a two-output instruction that is needed for the
+;; legalizer's explicit heap-check sequence, among possible other
+;; uses. Its second output is a flags output only ever meant to
+;; check for overflow using the
+;; `backend.unsigned_add_overflow_condition()` condition.
+;;
+;; Note that the CLIF validation will ensure that no flag-setting
+;; operation comes between this IaddIfcout and its use (e.g., a
+;; Trapif). Thus, we can rely on implicit communication through the
+;; processor flags rather than explicitly generating flags into a
+;; register. We simply use the variant of the add instruction that
+;; sets flags (`adds`) here.
+;;
+;; Note that the second output (the flags) need not be generated,
+;; because flags are never materialized into a register; the only
+;; instructions that can use a value of type `iflags` or `fflags`
+;; will look directly for the flags-producing instruction (which can
+;; always be found, by construction) and merge it.
+;;
+;; The rule below therefore computes the sum with `add_with_flags`
+;; (an add using the flag-setting `AddS` opcode) as the first output
+;; and supplies `invalid_reg` as a placeholder for the flags output.
+
+(rule (lower (has_type (ty_int ty)
+                       (iadd_ifcout a b)))
+      (output_pair
+       (add_with_flags ty a b)
+       (invalid_reg)))
+
+;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO.
+
+;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Two steps: widen `val` with `Sxtl` (with the `high_half` argument of
+;; `vec_extend` set to `$false`) to 64-bit elements, then convert the
+;; widened vector with `Scvtf` at `Size64x2`.
+(rule (lower (has_type $F64X2 (fcvt_low_from_sint val)))
+      (let ((extended Reg (vec_extend (VecExtendOp.Sxtl) val $false (ScalarSize.Size64)))
+            (converted Reg (vec_misc (VecMisc2.Scvtf) extended (VectorSize.Size64x2))))
+        converted))
+
+;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Lowered to a single `vec_rr_long` with `VecRRLongOp.Fcvtl32`.
+;; NOTE(review): the trailing `$false` is presumably the high-half
+;; selector (i.e. operate on the low half) -- confirm against
+;; `vec_rr_long`.
+(rule (lower (fvpromote_low val))
+      (vec_rr_long (VecRRLongOp.Fcvtl32) val $false))
+
+;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: requires icmp/fcmp first.
+
+;;; Rules for `selectif` / `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: requires icmp/fcmp first.
+
+;;; Rules for `trueif` / `trueff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: requires icmp/fcmp first.
+
+;;; Rules for `brz`/`brnz`/`brif`/`brff`/`bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: requires icmp/fcmp first.
+
+;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO.
+
+;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO.
@@ -128,7 +128,11 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
     }
 
     fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option<ShiftOpAndAmt> {
-        let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?;
+        // Reinterpret the immediate's bits as `u64` and share the rest of
+        // the logic with `lshl_from_u64`.
+        self.lshl_from_u64(ty, n.bits() as u64)
+    }
+
+    fn lshl_from_u64(&mut self, ty: Type, n: u64) -> Option<ShiftOpAndAmt> {
+        let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?;
         let shiftee_bits = ty_bits(ty);
         if shiftee_bits <= std::u8::MAX as usize {
             let shiftimm = shiftimm.mask(shiftee_bits as u8);
@@ -722,4 +726,8 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
             );
         }
     }
+
+    /// Returns `PINNED_REG` wrapped as a writable register via
+    /// `writable_xreg`; backs the ISLE `writable_pinned_reg` constructor.
+    fn writable_pinned_reg(&mut self) -> WritableReg {
+        super::regs::writable_xreg(super::regs::PINNED_REG)
+    }
 }