x64: Lower extractlane, scalar_to_vector, and splat in ISLE #4780

Merged · 6 commits · Aug 25, 2022
45 changes: 44 additions & 1 deletion cranelift/codegen/src/isa/x64/inst.isle
@@ -777,6 +777,13 @@
      (Reg (reg Reg))
      (Mem (addr SyntheticAmode))))

;; Convert a RegMem to a RegMemImm.
(decl reg_mem_to_reg_mem_imm (RegMem) RegMemImm)
(rule (reg_mem_to_reg_mem_imm (RegMem.Reg reg))
      (RegMemImm.Reg reg))
(rule (reg_mem_to_reg_mem_imm (RegMem.Mem addr))
      (RegMemImm.Mem addr))
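;; (This helper backs the `(convert RegMem RegMemImm ...)` automatic
;; conversion registered at the bottom of this file, letting rules pass a
;; `RegMem` wherever a `RegMemImm` is expected without an explicit call.)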

;; Put the given clif value into a `RegMem` operand.
;;
;; Asserts that the value fits into a single register, and doesn't require
Expand Down Expand Up @@ -1456,13 +1463,17 @@
;; This is a side-effectful operation that notifies the context that the
;; instruction that produced the `SinkableLoad` has been sunk into another
;; instruction, and no longer needs to be lowered.
-(decl sink_load (SinkableLoad) RegMemImm)
+(decl sink_load (SinkableLoad) RegMem)
(extern constructor sink_load sink_load)

(decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
(rule (sink_load_to_gpr_mem_imm load)
      (gpr_mem_imm_new (sink_load load)))

(decl sink_load_to_xmm_mem (SinkableLoad) XmmMem)
(rule (sink_load_to_xmm_mem load)
      (reg_mem_to_xmm_mem (sink_load load)))
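;; For example, the `scalar_to_vector` rules in lower.isle (below) use this
;; to fold a matched `sinkable_load` directly into the memory operand of a
;; MOVSS/MOVSD:
;;
;;   (rule (lower (scalar_to_vector (and (sinkable_load src) ...)))
;;         (x64_movss_load (sink_load_to_xmm_mem src)))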

;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(type ExtKind extern
Expand Down Expand Up @@ -1534,6 +1545,13 @@
      (let ((r WritableXmm (temp_writable_xmm)))
        (x64_pcmpeqd r r)))

;; Helper for creating XmmUninitializedValue instructions.
(decl xmm_uninit_value () Xmm)
(rule (xmm_uninit_value)
      (let ((dst WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmUninitializedValue dst))))
        dst))
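;; (`MInst.XmmUninitializedValue` is a pseudo-instruction: it emits no
;; machine code and only defines the register for the register allocator.
;; The assumption is that every consumer, e.g. the `splat` lowerings below,
;; fully overwrites the register before reading it.)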

;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
(decl make_i64x2_from_lanes (GprMem GprMem) Xmm)
(rule (make_i64x2_from_lanes lo hi)
@@ -2828,6 +2846,30 @@
(rule (x64_psrad src1 src2)
      (xmm_rmi_xmm (SseOpcode.Psrad) src1 src2))

;; Helper for creating `pextrb` instructions.
(decl x64_pextrb (Type Xmm u8) Gpr)
(rule (x64_pextrb ty src lane)
      (let ((dst WritableGpr (temp_writable_gpr))
            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrb)
                                           dst
                                           src
                                           dst
                                           lane
                                           (operand_size_of_type_32_64 (lane_type ty))))))
        dst))

;; Helper for creating `pextrw` instructions.
(decl x64_pextrw (Type Xmm u8) Gpr)
(rule (x64_pextrw ty src lane)
      (let ((dst WritableGpr (temp_writable_gpr))
            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrw)
                                           dst
                                           src
                                           dst
                                           lane
                                           (operand_size_of_type_32_64 (lane_type ty))))))
        dst))
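;; (Note: PEXTRW with a GPR destination is SSE2, while PEXTRB and
;; PEXTRD/PEXTRQ are SSE4.1 instructions; these helpers assume the
;; backend's SSE4.1 baseline.)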

;; Helper for creating `pextrd` instructions.
(decl x64_pextrd (Type Xmm u8) Gpr)
(rule (x64_pextrd ty src lane)
@@ -3707,6 +3749,7 @@
(convert WritableGpr Gpr writable_gpr_to_gpr)
(convert RegMemImm GprMemImm gpr_mem_imm_new)
(convert RegMem GprMem reg_mem_to_gpr_mem)
(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)
(convert Reg GprMem reg_to_gpr_mem)
(convert Reg GprMemImm reg_to_gpr_mem_imm)
(convert WritableGpr WritableReg writable_gpr_to_reg)
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -78,6 +78,17 @@ impl Inst {
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }

    // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
    fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
        src.assert_regclass_is(RegClass::Float);
        debug_assert!(dst.to_reg().class() == RegClass::Float);
        Inst::XmmUnaryRmR {
            op,
            src: XmmMem::new(src).unwrap(),
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }
}

#[test]
Expand Down
18 changes: 0 additions & 18 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -263,17 +263,6 @@ impl Inst {
        Inst::MovRR { size, src, dst }
    }

    // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
    pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
        src.assert_regclass_is(RegClass::Float);
        debug_assert!(dst.to_reg().class() == RegClass::Float);
        Inst::XmmUnaryRmR {
            op,
            src: XmmMem::new(src).unwrap(),
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }

    pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
        debug_assert!(dst.to_reg().class() == RegClass::Float);
        debug_assert!(ty.is_vector() && ty.bits() == 128);
@@ -316,13 +305,6 @@
        }
    }

    pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
        debug_assert!(dst.to_reg().class() == RegClass::Float);
        Inst::XmmUninitializedValue {
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }

    pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
        debug_assert!(src.class() == RegClass::Float);
        Inst::XmmMovRM {
96 changes: 96 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -3547,3 +3547,99 @@
mask
(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
(x64_pshufb src mask)))

;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Remove the extractlane instruction, leaving the float where it is. The upper
;; bits will remain unchanged; for correctness, this relies on Cranelift type
;; checking to avoid using those bits.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
      val)

;; Cases 2-4 for an F32X4: extract lanes 1-3 by shuffling the desired lane
;; into the lowest position (lane 0 is handled by the rule above).
(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty))
                                          (u8_from_uimm8 lane))))
      (x64_pshufd val lane (OperandSize.Size32)))
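;; (Only the low two bits of the PSHUFD immediate select the dword that
;; lands in lane 0, which is all a scalar $F32 result reads; whatever is
;; shuffled into the upper lanes is ignored, per the type-checking
;; assumption above.)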

;; This is the only remaining case for F64X2: extracting the upper lane.
(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty))
                                          (u8_from_uimm8 1))))
      ;; 0xee == 0b11_10_11_10: copy the upper two dwords (the high f64)
      ;; into the lower two.
      (x64_pshufd val 0xee (OperandSize.Size32)))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
      (x64_pextrb ty val lane))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
      (x64_pextrw ty val lane))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
      (x64_pextrd ty val lane))

(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane)))
      (x64_pextrd ty val lane))
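;; (In the `(multi_lane 64 2)` case, `operand_size_of_type_32_64` inside
;; the `x64_pextrd` helper presumably selects a 64-bit operand size, so
;; the emitted encoding is effectively PEXTRQ.)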

;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Case 1: when moving a scalar float, we simply move from one XMM register
;; to another, expecting the register allocator to elide this. Here we
;; assume that the upper bits of a scalar float have not been munged
;; (the same assumption the old backend makes).
(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
      src)

;; Case 2: when moving a scalar value of any other type, use MOVD to zero
;; the upper lanes.
(rule (lower (scalar_to_vector src @ (value_type ty)))
      (bitcast_gpr_to_xmm ty src))

;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a
;; single MOVSS/MOVSD instruction.
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
      (x64_movss_load (sink_load_to_xmm_mem src)))
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
      (x64_movsd_load (sink_load_to_xmm_mem src)))
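;; (MOVSS/MOVSD with a memory-source operand zero the upper lanes of the
;; destination register, which is exactly the semantics `scalar_to_vector`
;; requires, so no extra masking is needed.)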

;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (splat src)))
      (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
            (zeros Xmm (x64_pxor vec vec)))
        ;; Shuffle the lowest byte lane to all other lanes.
        (x64_pshufb vec zeros)))
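;; (The PSHUFB mask is all zeros: `vec PXOR vec` yields zero regardless of
;; `vec`'s contents, so every destination byte selects byte 0 of `vec`,
;; broadcasting the inserted value.)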

(rule (lower (has_type (multi_lane 16 8) (splat src)))
      (let (;; Force the input into a register so that we don't create a
            ;; VCodeConstant.
            (src RegMem (RegMem.Reg src))
            (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
            (vec Xmm (vec_insert_lane $I16X8 vec src 1)))
        ;; Shuffle the lowest two lanes to all other lanes.
        (x64_pshufd vec 0 (OperandSize.Size32))))
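;; (After the two 16-bit insertions, the lowest 32-bit lane holds two copies
;; of the value; PSHUFD with immediate 0 then broadcasts that dword to all
;; four dword lanes.)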

(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
      (lower_splat_32x4 $F32X4 src))

(rule (lower (has_type (multi_lane 32 4) (splat src)))
      (lower_splat_32x4 $I32X4 src))

(decl lower_splat_32x4 (Type Value) Xmm)
(rule (lower_splat_32x4 ty src)
      (let ((src RegMem src)
            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
        ;; Shuffle the lowest lane to all other lanes.
        (x64_pshufd vec 0 (OperandSize.Size32))))

(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
      (lower_splat_64x2 $F64X2 src))

(rule (lower (has_type (multi_lane 64 2) (splat src)))
      (lower_splat_64x2 $I64X2 src))

(decl lower_splat_64x2 (Type Value) Xmm)
(rule (lower_splat_64x2 ty src)
      (let (;; Force the input into a register so that we don't create a
            ;; VCodeConstant.
            (src RegMem (RegMem.Reg src))
            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
        (vec_insert_lane ty vec src 1)))
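;; (No final shuffle is needed in the 64x2 case: the two insertions already
;; fill both lanes with the value.)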