Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AArch64: port misc ops to ISLE. #4796

Merged
merged 2 commits into from
Aug 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -952,11 +952,19 @@

;; Helper for calculating the `ScalarSize` corresponding to a type
(decl scalar_size (Type) ScalarSize)

(rule (scalar_size $I8) (ScalarSize.Size8))
(rule (scalar_size $I16) (ScalarSize.Size16))
(rule (scalar_size $I32) (ScalarSize.Size32))
(rule (scalar_size $I64) (ScalarSize.Size64))
(rule (scalar_size $I128) (ScalarSize.Size128))

(rule (scalar_size $B8) (ScalarSize.Size8))
(rule (scalar_size $B16) (ScalarSize.Size16))
(rule (scalar_size $B32) (ScalarSize.Size32))
(rule (scalar_size $B64) (ScalarSize.Size64))
(rule (scalar_size $B128) (ScalarSize.Size128))

(rule (scalar_size $F32) (ScalarSize.Size32))
(rule (scalar_size $F64) (ScalarSize.Size64))

Expand Down Expand Up @@ -1452,6 +1460,9 @@
(decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt)
(extern constructor lshl_from_imm64 lshl_from_imm64)

(decl pure lshl_from_u64 (Type u64) ShiftOpAndAmt)
(extern constructor lshl_from_u64 lshl_from_u64)

(decl integral_ty (Type) Type)
(extern extractor integral_ty integral_ty)

Expand Down Expand Up @@ -1704,6 +1715,14 @@
(MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2)
dst)))

;; Helper for emitting `adds` instructions, setting flags in ambient
;; state. Used only for `iadd_ifcout`.
(decl add_with_flags (Type Reg Reg) Reg)
(rule (add_with_flags ty src1 src2)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.AluRRR (ALUOp.AddS) (operand_size ty) dst src1 src2))))
dst))

;; Helper for emitting `adc` instructions.
(decl adc_paired (Type Reg Reg) ConsumesFlags)
(rule (adc_paired ty src1 src2)
Expand Down Expand Up @@ -1927,6 +1946,13 @@
(_ Unit (emit (MInst.VecExtend op dst src high_half size))))
dst))

;; Helper for emitting `MInst.VecExtract` instructions.
(decl vec_extract (Reg Reg u8) Reg)
(rule (vec_extract src1 src2 idx)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecExtract dst src1 src2 idx))))
dst))

;; Helper for emitting `MInst.LoadAcquire` instructions.
(decl load_acquire (Type Reg) Reg)
(rule (load_acquire ty addr)
Expand Down Expand Up @@ -2118,6 +2144,10 @@
(decl addp (Reg Reg VectorSize) Reg)
(rule (addp x y size) (vec_rrr (VecALUOp.Addp) x y size))

;; Helper for generating `zip1` instructions.
(decl zip1 (Reg Reg VectorSize) Reg)
(rule (zip1 x y size) (vec_rrr (VecALUOp.Zip1) x y size))

;; Helper for generating vector `abs` instructions.
(decl vec_abs (Reg VectorSize) Reg)
(rule (vec_abs x size) (vec_misc (VecMisc2.Abs) x size))
Expand Down Expand Up @@ -2826,3 +2856,24 @@

(decl gen_call_indirect (SigRef Value ValueSlice) InstOutput)
(extern constructor gen_call_indirect gen_call_indirect)

;; Helpers for pinned register manipulation.

(decl writable_pinned_reg () WritableReg)
(extern constructor writable_pinned_reg writable_pinned_reg)

(decl pinned_reg () Reg)
(rule (pinned_reg) (writable_pinned_reg))

(decl write_pinned_reg (Reg) SideEffectNoResult)
(rule (write_pinned_reg val)
(let ((dst WritableReg (writable_pinned_reg)))
(SideEffectNoResult.Inst (gen_move $I64 dst val))))

;; Helpers for stackslot effective address generation.

(decl compute_stack_addr (StackSlot Offset32) Reg)
(rule (compute_stack_addr stack_slot offset)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (abi_stackslot_addr dst stack_slot offset))))
dst))
209 changes: 209 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -2030,3 +2030,212 @@
;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return args))
(lower_return (range 0 (value_slice_len args)) args))

;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (get_pinned_reg))
(pinned_reg))

(rule (lower (set_pinned_reg val))
(side_effect (write_pinned_reg val)))

;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
(mov_from_vec src 0 (ScalarSize.Size32)))

(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
(mov_to_fpu src (ScalarSize.Size32)))

(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
(mov_from_vec src 0 (ScalarSize.Size64)))

(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
(mov_to_fpu src (ScalarSize.Size64)))

;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (raw_bitcast val))
val)

;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; extractlane with lane 0 can pass through the value unchanged; upper
;; bits are undefined when a narrower type is in a wider register.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
val)

(rule (lower (has_type (ty_int_bool ty)
(extractlane val
(u8_from_uimm8 lane))))
(mov_from_vec val lane (scalar_size ty)))

(rule (lower (has_type (ty_scalar_float ty)
(extractlane val @ (value_type vty)
(u8_from_uimm8 lane))))
(fpu_move_from_vec val lane (vector_size vty)))

;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_int_bool _))
(u8_from_uimm8 lane)))
(mov_to_vec vec val lane (vector_size vty)))

(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_scalar_float _))
(u8_from_uimm8 lane)))
(mov_vec_elem vec val lane 0 (vector_size vty)))

;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (copy x))
x)

;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (stack_addr stack_slot offset))
(compute_stack_addr stack_slot offset))

;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; All three sequences use one integer temporary and two vector
;; temporaries. The shift is done early so as to give the register
;; allocator the possibility of using the same reg for `tmp_v1` and
;; `src_v` in the case that this is the last use of `src_v`. See
;; https://github.com/WebAssembly/simd/pull/201 for the background and
;; derivation of these sequences. Alternative sequences are discussed
;; in https://github.com/bytecodealliance/wasmtime/issues/2296,
;; although they are not used here.

(rule (lower (vhigh_bits vec @ (value_type $I8X16)))
(let (
;; Replicate the MSB of each of the 16 byte lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16)))
;; Bitwise-and with a mask
;; `0x80402010_08040201_80402010_08040201` to get the bit
;; in the proper location for each group of 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
;; Produce a version of `anded` with upper 8 lanes and
;; lower 8 lanes swapped.
(anded_swapped Reg (vec_extract anded anded 8))
;; Zip together the two; with the above this produces the lane permutation:
;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
(zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16)))
;; Add 16-bit lanes together ("add across vector"), so we
;; get, in the low 16 bits, 15+14+...+8 in the high byte
;; and 7+6+...+0 in the low byte. This effectively puts
;; the 16 MSBs together, giving our results.
;;
;; N.B.: `Size16x8` is not a typo!
(result Reg (addv zipped (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))

(rule (lower (vhigh_bits vec @ (value_type $I16X8)))
(let (
;; Replicate the MSB of each of the 8 16-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8)))
;; Bitwise-and with a mask
;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the
;; bit in the proper location for each group of 4 lanes.
(anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
;; Add lanes together to get the 8 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))

(rule (lower (vhigh_bits vec @ (value_type $I32X4)))
(let (
;; Replicate the MSB of each of the 4 32-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4)))
;; Bitwise-and with a mask
;; `0x00000008_00000004_00000002_00000001` to get the bit
;; in the proper location for each group of 4 lanes.
(anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
;; Add lanes together to get the 4 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size32x4))))
(mov_from_vec result 0 (ScalarSize.Size32))))

(rule (lower (vhigh_bits vec @ (value_type $I64X2)))
(let (
;; Grab the MSB out of each of the lanes, right-shift to
;; LSB, and add with a left-shift of upper lane's MSB back
;; to bit 1. the whole lane (sshr is an arithmetic right
;; shift).
(upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
(lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
(upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63)))
(lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63))))
(add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1))))

;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; This is a two-output instruction that is needed for the
;; legalizer's explicit heap-check sequence, among possible other
;; uses. Its second output is a flags output only ever meant to
;; check for overflow using the
;; `backend.unsigned_add_overflow_condition()` condition.
;;
;; Note that the CLIF validation will ensure that no flag-setting
;; operation comes between this IaddIfcout and its use (e.g., a
;; Trapif). Thus, we can rely on implicit communication through the
;; processor flags rather than explicitly generating flags into a
;; register. We simply use the variant of the add instruction that
;; sets flags (`adds`) here.
;;
;; Note that the second output (the flags) need not be generated,
;; because flags are never materialized into a register; the only
;; instructions that can use a value of type `iflags` or `fflags`
;; will look directly for the flags-producing instruction (which can
;; always be found, by construction) and merge it.
;;
;; Now handle the iadd as above, except use an AddS opcode that sets
;; flags.

(rule (lower (has_type (ty_int ty)
(iadd_ifcout a b)))
(output_pair
(add_with_flags ty a b)
(invalid_reg)))

;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO.

;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F64X2 (fcvt_low_from_sint val)))
(let ((extended Reg (vec_extend (VecExtendOp.Sxtl) val $false (ScalarSize.Size64)))
(converted Reg (vec_misc (VecMisc2.Scvtf) extended (VectorSize.Size64x2))))
converted))

;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (fvpromote_low val))
(vec_rr_long (VecRRLongOp.Fcvtl32) val $false))

;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: requires icmp/fcmp first.

;;; Rules for `selectif` / `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: requires icmp/fcmp first.

;;; Rules for `trueif` / `trueff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: requires icmp/fcmp first.

;;; Rules for `brz`/`brnz`/`brif`/`brff`/`bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: requires icmp/fcmp first.

;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO.

;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO.
10 changes: 9 additions & 1 deletion cranelift/codegen/src/isa/aarch64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
}

fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?;
self.lshl_from_u64(ty, n.bits() as u64)
}

fn lshl_from_u64(&mut self, ty: Type, n: u64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?;
let shiftee_bits = ty_bits(ty);
if shiftee_bits <= std::u8::MAX as usize {
let shiftimm = shiftimm.mask(shiftee_bits as u8);
Expand Down Expand Up @@ -722,4 +726,8 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
);
}
}

fn writable_pinned_reg(&mut self) -> WritableReg {
super::regs::writable_xreg(super::regs::PINNED_REG)
}
}
Loading