x64: Lower extractlane, scalar_to_vector, and splat in ISLE (#4780)
Lower extractlane, scalar_to_vector, and splat in ISLE.

This PR also makes some changes to the SinkableLoad API:
* change the return type of sink_load to RegMem, since more helper functions are available for dealing with RegMem
* add reg_mem_to_reg_mem_imm and register it as an automatic conversion; a sketch of how the conversion is used follows below
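
As a minimal sketch of what the new conversion enables (the rule name below is hypothetical; sink_load, gpr_mem_imm_new, and reg_mem_to_reg_mem_imm are the helpers this commit touches or relies on), a constructor can now pass the RegMem returned by sink_load where a RegMemImm is expected and let ISLE apply reg_mem_to_reg_mem_imm implicitly. The updated sink_load_to_gpr_mem_imm helper in inst.isle below relies on exactly this:

;; Sketch only; `example_sink_load_to_gpr_mem_imm` is a hypothetical name.
;; `sink_load` now returns a `RegMem`, while `gpr_mem_imm_new` expects a
;; `RegMemImm`; the newly registered automatic conversion bridges the gap,
;; so no explicit call to `reg_mem_to_reg_mem_imm` is needed.
(decl example_sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
(rule (example_sink_load_to_gpr_mem_imm load)
      (gpr_mem_imm_new (sink_load load)))
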
elliottt authored Aug 25, 2022
1 parent d3c463a commit 9386409
Showing 10 changed files with 285 additions and 251 deletions.
45 changes: 44 additions & 1 deletion cranelift/codegen/src/isa/x64/inst.isle
@@ -777,6 +777,13 @@
(Reg (reg Reg))
(Mem (addr SyntheticAmode))))

;; Convert a RegMem to a RegMemImm.
(decl reg_mem_to_reg_mem_imm (RegMem) RegMemImm)
(rule (reg_mem_to_reg_mem_imm (RegMem.Reg reg))
(RegMemImm.Reg reg))
(rule (reg_mem_to_reg_mem_imm (RegMem.Mem addr))
(RegMemImm.Mem addr))

;; Put the given clif value into a `RegMem` operand.
;;
;; Asserts that the value fits into a single register, and doesn't require
@@ -1456,13 +1463,17 @@
;; This is a side-effectful operation that notifies the context that the
;; instruction that produced the `SinkableLoad` has been sunk into another
;; instruction, and no longer needs to be lowered.
(decl sink_load (SinkableLoad) RegMemImm)
(decl sink_load (SinkableLoad) RegMem)
(extern constructor sink_load sink_load)

(decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
(rule (sink_load_to_gpr_mem_imm load)
(gpr_mem_imm_new (sink_load load)))

(decl sink_load_to_xmm_mem (SinkableLoad) XmmMem)
(rule (sink_load_to_xmm_mem load)
(reg_mem_to_xmm_mem (sink_load load)))

;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(type ExtKind extern
@@ -1534,6 +1545,13 @@
(let ((r WritableXmm (temp_writable_xmm)))
(x64_pcmpeqd r r)))

;; Helper for creating XmmUninitializedValue instructions.
(decl xmm_uninit_value () Xmm)
(rule (xmm_uninit_value)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmUninitializedValue dst))))
dst))

;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
(decl make_i64x2_from_lanes (GprMem GprMem) Xmm)
(rule (make_i64x2_from_lanes lo hi)
@@ -2828,6 +2846,30 @@
(rule (x64_psrad src1 src2)
(xmm_rmi_xmm (SseOpcode.Psrad) src1 src2))

;; Helper for creating `pextrb` instructions.
(decl x64_pextrb (Type Xmm u8) Gpr)
(rule (x64_pextrb ty src lane)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrb)
dst
src
dst
lane
(operand_size_of_type_32_64 (lane_type ty))))))
dst))

;; Helper for creating `pextrw` instructions.
(decl x64_pextrw (Type Xmm u8) Gpr)
(rule (x64_pextrw ty src lane)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrw)
dst
src
dst
lane
(operand_size_of_type_32_64 (lane_type ty))))))
dst))

;; Helper for creating `pextrd` instructions.
(decl x64_pextrd (Type Xmm u8) Gpr)
(rule (x64_pextrd ty src lane)
@@ -3707,6 +3749,7 @@
(convert WritableGpr Gpr writable_gpr_to_gpr)
(convert RegMemImm GprMemImm gpr_mem_imm_new)
(convert RegMem GprMem reg_mem_to_gpr_mem)
(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)
(convert Reg GprMem reg_to_gpr_mem)
(convert Reg GprMemImm reg_to_gpr_mem_imm)
(convert WritableGpr WritableReg writable_gpr_to_reg)
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -78,6 +78,17 @@ impl Inst {
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}

// TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmR {
op,
src: XmmMem::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
}

#[test]
18 changes: 0 additions & 18 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -263,17 +263,6 @@ impl Inst {
Inst::MovRR { size, src, dst }
}

// TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmR {
op,
src: XmmMem::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}

pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
debug_assert!(dst.to_reg().class() == RegClass::Float);
debug_assert!(ty.is_vector() && ty.bits() == 128);
@@ -316,13 +305,6 @@
}
}

pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUninitializedValue {
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}

pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
debug_assert!(src.class() == RegClass::Float);
Inst::XmmMovRM {
96 changes: 96 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -3547,3 +3547,99 @@
mask
(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
(x64_pshufb src mask)))

;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Remove the extractlane instruction, leaving the float where it is. The upper
;; bits will remain unchanged; for correctness, this relies on Cranelift type
;; checking to avoid using those bits.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
val)

;; Cases 2-4 for an F32X4
(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty))
(u8_from_uimm8 lane))))
(x64_pshufd val lane (OperandSize.Size32)))

;; This is the only remaining case for F64X2
(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty))
(u8_from_uimm8 1))))
;; 0xee == 0b11_10_11_10
(x64_pshufd val 0xee (OperandSize.Size32)))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
(x64_pextrb ty val lane))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
(x64_pextrw ty val lane))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
(x64_pextrd ty val lane))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane)))
(x64_pextrd ty val lane))

;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Case 1: when moving a scalar float, we simply move from one XMM register
;; to another, expecting the register allocator to elide this. Here we
;; assume that the upper bits of a scalar float have not been munged with
;; (the same assumption the old backend makes).
(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
src)

;; Case 2: when moving a scalar value of any other type, use MOVD to zero
;; the upper lanes.
(rule (lower (scalar_to_vector src @ (value_type ty)))
(bitcast_gpr_to_xmm ty src))

;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
;; MOVSS/MOVSD instruction.
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
(x64_movss_load (sink_load_to_xmm_mem src)))
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
(x64_movsd_load (sink_load_to_xmm_mem src)))

;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (splat src)))
(let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
(zeros Xmm (x64_pxor vec vec)))
;; Shuffle the lowest byte lane to all other lanes.
(x64_pshufb vec zeros)))

(rule (lower (has_type (multi_lane 16 8) (splat src)))
(let (;; Force the input into a register so that we don't create a
;; VCodeConstant.
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
(vec Xmm (vec_insert_lane $I16X8 vec src 1)))
;; Shuffle the lowest two lanes to all other lanes.
(x64_pshufd vec 0 (OperandSize.Size32))))

(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
(lower_splat_32x4 $F32X4 src))

(rule (lower (has_type (multi_lane 32 4) (splat src)))
(lower_splat_32x4 $I32X4 src))

(decl lower_splat_32x4 (Type Value) Xmm)
(rule (lower_splat_32x4 ty src)
(let ((src RegMem src)
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
;; Shuffle the lowest lane to all other lanes.
(x64_pshufd vec 0 (OperandSize.Size32))))

(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
(lower_splat_64x2 $F64X2 src))

(rule (lower (has_type (multi_lane 64 2) (splat src)))
(lower_splat_64x2 $I64X2 src))

(decl lower_splat_64x2 (Type Value) Xmm)
(rule (lower_splat_64x2 ty src)
(let (;; Force the input into a register so that we don't create a
;; VCodeConstant.
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
(vec_insert_lane ty vec src 1)))