diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index fae522c74626..0f491c42e121 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -777,6 +777,13 @@
     (Reg (reg Reg))
     (Mem (addr SyntheticAmode))))
 
+;; Convert a RegMem to a RegMemImm.
+(decl reg_mem_to_reg_mem_imm (RegMem) RegMemImm)
+(rule (reg_mem_to_reg_mem_imm (RegMem.Reg reg))
+      (RegMemImm.Reg reg))
+(rule (reg_mem_to_reg_mem_imm (RegMem.Mem addr))
+      (RegMemImm.Mem addr))
+
 ;; Put the given clif value into a `RegMem` operand.
 ;;
 ;; Asserts that the value fits into a single register, and doesn't require
@@ -1456,13 +1463,17 @@
 ;; This is a side-effectful operation that notifies the context that the
 ;; instruction that produced the `SinkableImm` has been sunk into another
 ;; instruction, and no longer needs to be lowered.
-(decl sink_load (SinkableLoad) RegMemImm)
+(decl sink_load (SinkableLoad) RegMem)
 (extern constructor sink_load sink_load)
 
 (decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
 (rule (sink_load_to_gpr_mem_imm load)
       (gpr_mem_imm_new (sink_load load)))
 
+(decl sink_load_to_xmm_mem (SinkableLoad) XmmMem)
+(rule (sink_load_to_xmm_mem load)
+      (reg_mem_to_xmm_mem (sink_load load)))
+
 ;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (type ExtKind extern
@@ -1534,6 +1545,13 @@
   (let ((r WritableXmm (temp_writable_xmm)))
     (x64_pcmpeqd r r)))
 
+;; Helper for creating XmmUninitializedValue instructions.
+(decl xmm_uninit_value () Xmm)
+(rule (xmm_uninit_value)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmUninitializedValue dst))))
+        dst))
+
 ;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
 (decl make_i64x2_from_lanes (GprMem GprMem) Xmm)
 (rule (make_i64x2_from_lanes lo hi)
@@ -2828,6 +2846,30 @@
 (rule (x64_psrad src1 src2)
       (xmm_rmi_xmm (SseOpcode.Psrad) src1 src2))
 
+;; Helper for creating `pextrb` instructions.
+(decl x64_pextrb (Type Xmm u8) Gpr)
+(rule (x64_pextrb ty src lane)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrb)
+                                           dst
+                                           src
+                                           dst
+                                           lane
+                                           (operand_size_of_type_32_64 (lane_type ty))))))
+        dst))
+
+;; Helper for creating `pextrw` instructions.
+(decl x64_pextrw (Type Xmm u8) Gpr)
+(rule (x64_pextrw ty src lane)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrw)
+                                           dst
+                                           src
+                                           dst
+                                           lane
+                                           (operand_size_of_type_32_64 (lane_type ty))))))
+        dst))
+
 ;; Helper for creating `pextrd` instructions.
 (decl x64_pextrd (Type Xmm u8) Gpr)
 (rule (x64_pextrd ty src lane)
@@ -3707,6 +3749,7 @@
 (convert WritableGpr Gpr writable_gpr_to_gpr)
 (convert RegMemImm GprMemImm gpr_mem_imm_new)
 (convert RegMem GprMem reg_mem_to_gpr_mem)
+(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)
 (convert Reg GprMem reg_to_gpr_mem)
 (convert Reg GprMemImm reg_to_gpr_mem_imm)
 (convert WritableGpr WritableReg writable_gpr_to_reg)
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index f23547e8c540..26b897d3b4f2 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -78,6 +78,17 @@ impl Inst {
             dst: WritableXmm::from_writable_reg(dst).unwrap(),
         }
     }
+
+    // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
+    fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmUnaryRmR {
+            op,
+            src: XmmMem::new(src).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
 }
 
 #[test]
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 36a1ea9e729f..4278cb192a4f 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -263,17 +263,6 @@ impl Inst {
         Inst::MovRR { size, src, dst }
     }
 
-    // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
-    pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
-        src.assert_regclass_is(RegClass::Float);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmUnaryRmR {
-            op,
-            src: XmmMem::new(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
         debug_assert!(dst.to_reg().class() == RegClass::Float);
         debug_assert!(ty.is_vector() && ty.bits() == 128);
@@ -316,13 +305,6 @@ impl Inst {
         }
     }
 
-    pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmUninitializedValue {
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
         debug_assert!(src.class() == RegClass::Float);
         Inst::XmmMovRM {
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index bbb93eb0310e..3e863633c007 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3547,3 +3547,99 @@
                         mask
                         (x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
     (x64_pshufb src mask)))
+
+;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Remove the extractlane instruction, leaving the float where it is. The upper
+;; bits will remain unchanged; for correctness, this relies on Cranelift type
+;; checking to avoid using those bits.
+(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
+      val)
+
+;; Cases 2-4 for an F32X4
+(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty))
+                                          (u8_from_uimm8 lane))))
+      (x64_pshufd val lane (OperandSize.Size32)))
+
+;; This is the only remaining case for F64X2
+(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty))
+                                          (u8_from_uimm8 1))))
+      ;; 0xee == 0b11_10_11_10
+      (x64_pshufd val 0xee (OperandSize.Size32)))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
+      (x64_pextrb ty val lane))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
+      (x64_pextrw ty val lane))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
+      (x64_pextrd ty val lane))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane)))
+      (x64_pextrd ty val lane))
+
+;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Case 1: when moving a scalar float, we simply move from one XMM register
+;; to another, expecting the register allocator to elide this. Here we
+;; assume that the upper bits of a scalar float have not been munged with
+;; (the same assumption the old backend makes).
+(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
+      src)
+
+;; Case 2: when moving a scalar value of any other type, use MOVD to zero
+;; the upper lanes.
+(rule (lower (scalar_to_vector src @ (value_type ty)))
+      (bitcast_gpr_to_xmm ty src))
+
+;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
+;; MOVSS/MOVSD instruction.
+(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
+      (x64_movss_load (sink_load_to_xmm_mem src)))
+(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
+      (x64_movsd_load (sink_load_to_xmm_mem src)))
+
+;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type (multi_lane 8 16) (splat src)))
+      (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
+            (zeros Xmm (x64_pxor vec vec)))
+        ;; Shuffle the lowest byte lane to all other lanes.
+        (x64_pshufb vec zeros)))
+
+(rule (lower (has_type (multi_lane 16 8) (splat src)))
+      (let (;; Force the input into a register so that we don't create a
+            ;; VCodeConstant.
+            (src RegMem (RegMem.Reg src))
+            (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
+            (vec Xmm (vec_insert_lane $I16X8 vec src 1)))
+        ;; Shuffle the lowest two lanes to all other lanes.
+        (x64_pshufd vec 0 (OperandSize.Size32))))
+
+(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
+      (lower_splat_32x4 $F32X4 src))
+
+(rule (lower (has_type (multi_lane 32 4) (splat src)))
+      (lower_splat_32x4 $I32X4 src))
+
+(decl lower_splat_32x4 (Type Value) Xmm)
+(rule (lower_splat_32x4 ty src)
+      (let ((src RegMem src)
+            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
+        ;; Shuffle the lowest lane to all other lanes.
+        (x64_pshufd vec 0 (OperandSize.Size32))))
+
+(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
+      (lower_splat_64x2 $F64X2 src))
+
+(rule (lower (has_type (multi_lane 64 2) (splat src)))
+      (lower_splat_64x2 $I64X2 src))
+
+(decl lower_splat_64x2 (Type Value) Xmm)
+(rule (lower_splat_64x2 ty src)
+      (let (;; Force the input into a register so that we don't create a
+            ;; VCodeConstant.
+            (src RegMem (RegMem.Reg src))
+            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
+        (vec_insert_lane ty vec src 1)))
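Editor's note: purely as an illustration (not part of this patch), CLIF input of the following shape would exercise the new `splat` and `scalar_to_vector` rules above. This is a sketch only; no precise-output expectations are given, since the exact register choices depend on the register allocator.

test compile
target x86_64

function %splat_i16(i16) -> i16x8 {
block0(v0: i16):
    v1 = splat.i16x8 v0
    return v1
}

function %scalar_to_vector_i32(i32) -> i32x4 {
block0(v0: i32):
    v1 = scalar_to_vector.i32x4 v0
    return v1
}

Per the rules above, the `splat.i16x8` case should take the two-pinsrw-then-pshufd path, and `scalar_to_vector.i32x4` should take the MOVD path (Case 2).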
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 5a335da67aa7..20bd49356af4 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3,7 +3,7 @@
 // ISLE integration glue.
 pub(super) mod isle;
 
-use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
+use crate::ir::{types, ExternalName, Inst as IRInst, LibCall, Opcode, Type};
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
@@ -160,100 +160,6 @@ fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
         .constant
 }
 
-/// Emit an instruction to insert a value `src` into a lane of `dst`.
-fn emit_insert_lane(ctx: &mut Lower<Inst>, src: RegMem, dst: Writable<Reg>, lane: u8, ty: Type) {
-    if !ty.is_float() {
-        let (sse_op, size) = match ty.lane_bits() {
-            8 => (SseOpcode::Pinsrb, OperandSize::Size32),
-            16 => (SseOpcode::Pinsrw, OperandSize::Size32),
-            32 => (SseOpcode::Pinsrd, OperandSize::Size32),
-            64 => (SseOpcode::Pinsrd, OperandSize::Size64),
-            _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
-        };
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
-    } else if ty == types::F32 {
-        let sse_op = SseOpcode::Insertps;
-        // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
-        // shifted into bits 5:6).
-        let lane = 0b00_00_00_00 | lane << 4;
-        ctx.emit(Inst::xmm_rm_r_imm(
-            sse_op,
-            src,
-            dst,
-            lane,
-            OperandSize::Size32,
-        ));
-    } else if ty == types::F64 {
-        let sse_op = match lane {
-            // Move the lowest quadword in replacement to vector without changing
-            // the upper bits.
-            0 => SseOpcode::Movsd,
-            // Move the low 64 bits of replacement vector to the high 64 bits of the
-            // vector.
-            1 => SseOpcode::Movlhps,
-            _ => unreachable!(),
-        };
-        // Here we use the `xmm_rm_r` encoding because it correctly tells the register
-        // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
-        // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
-        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
-    } else {
-        panic!("unable to emit insertlane for type: {}", ty)
-    }
-}
-
-/// Emit an instruction to extract a lane of `src` into `dst`.
-fn emit_extract_lane(ctx: &mut Lower<Inst>, src: Reg, dst: Writable<Reg>, lane: u8, ty: Type) {
-    if !ty.is_float() {
-        let (sse_op, size) = match ty.lane_bits() {
-            8 => (SseOpcode::Pextrb, OperandSize::Size32),
-            16 => (SseOpcode::Pextrw, OperandSize::Size32),
-            32 => (SseOpcode::Pextrd, OperandSize::Size32),
-            64 => (SseOpcode::Pextrd, OperandSize::Size64),
-            _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
-        };
-        let src = RegMem::reg(src);
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
-    } else if ty == types::F32 || ty == types::F64 {
-        if lane == 0 {
-            // Remove the extractlane instruction, leaving the float where it is. The upper
-            // bits will remain unchanged; for correctness, this relies on Cranelift type
-            // checking to avoid using those bits.
-            ctx.emit(Inst::gen_move(dst, src, ty));
-        } else {
-            // Otherwise, shuffle the bits in `lane` to the lowest lane.
-            let sse_op = SseOpcode::Pshufd;
-            let mask = match ty {
-                // Move the value at `lane` to lane 0, copying existing value at lane 0 to
-                // other lanes. Again, this relies on Cranelift type checking to avoid
-                // using those bits.
-                types::F32 => {
-                    assert!(lane > 0 && lane < 4);
-                    0b00_00_00_00 | lane
-                }
-                // Move the value at `lane` 1 (we know it must be 1 because of the `if`
-                // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
-                // checking assumption also applies here.
-                types::F64 => {
-                    assert!(lane == 1);
-                    0b11_10_11_10
-                }
-                _ => unreachable!(),
-            };
-            let src = RegMem::reg(src);
-            ctx.emit(Inst::xmm_rm_r_imm(
-                sse_op,
-                src,
-                dst,
-                mask,
-                OperandSize::Size32,
-            ));
-        }
-    } else {
-        panic!("unable to emit extractlane for type: {}", ty)
-    }
-}
-
 fn emit_vm_call(
     ctx: &mut Lower<Inst>,
     flags: &Flags,
@@ -586,132 +492,15 @@ fn lower_insn_to_regs(
         | Opcode::RawBitcast
        | Opcode::Insertlane
        | Opcode::Shuffle
-        | Opcode::Swizzle => {
+        | Opcode::Swizzle
+        | Opcode::Extractlane
+        | Opcode::ScalarToVector
+        | Opcode::Splat => {
            implemented_in_isle(ctx);
        }
 
         Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
 
-        Opcode::Extractlane => {
-            // The instruction format maps to variables like: %dst = extractlane %src, %lane
-            let ty = ty.unwrap();
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src_ty = ctx.input_ty(insn, 0);
-            assert_eq!(src_ty.bits(), 128);
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
-                *imm
-            } else {
-                unreachable!();
-            };
-            debug_assert!(lane < src_ty.lane_count() as u8);
-
-            emit_extract_lane(ctx, src, dst, lane, ty);
-        }
-
-        Opcode::ScalarToVector => {
-            // When moving a scalar value to a vector register, we must be handle several
-            // situations:
-            // 1. a scalar float is already in an XMM register, so we simply move it
-            // 2. a scalar of any other type resides in a GPR register: MOVD moves the bits to an
-            //    XMM register and zeroes the upper bits
-            // 3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
-            //    the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
-            //    MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
-            //    unused load.
-            let src = input_to_reg_mem(ctx, inputs[0]);
-            let src_ty = ctx.input_ty(insn, 0);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let dst_ty = ty.unwrap();
-            assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
-            match src {
-                RegMem::Reg { reg } => {
-                    if src_ty.is_float() {
-                        // Case 1: when moving a scalar float, we simply move from one XMM register
-                        // to another, expecting the register allocator to elide this. Here we
-                        // assume that the upper bits of a scalar float have not been munged with
-                        // (the same assumption the old backend makes).
-                        ctx.emit(Inst::gen_move(dst, reg, dst_ty));
-                    } else {
-                        // Case 2: when moving a scalar value of any other type, use MOVD to zero
-                        // the upper lanes.
-                        let src_size = match src_ty.bits() {
-                            32 => OperandSize::Size32,
-                            64 => OperandSize::Size64,
-                            _ => unimplemented!("invalid source size for type: {}", src_ty),
-                        };
-                        ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
-                    }
-                }
-                RegMem::Mem { .. } => {
-                    // Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
-                    // MOVSS/MOVSD instruction.
-                    let opcode = match src_ty.bits() {
-                        32 => SseOpcode::Movss,
-                        64 => SseOpcode::Movsd,
-                        _ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
-                    };
-                    ctx.emit(Inst::xmm_mov(opcode, src, dst));
-                }
-            }
-        }
-
-        Opcode::Splat => {
-            let ty = ty.unwrap();
-            assert_eq!(ty.bits(), 128);
-            let src_ty = ctx.input_ty(insn, 0);
-            assert!(src_ty.bits() < 128);
-
-            let src = input_to_reg_mem(ctx, inputs[0]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            // We know that splat will overwrite all of the lanes of `dst` but it takes several
-            // instructions to do so. Because of the multiple instructions, there is no good way to
-            // declare `dst` a `def` except with the following pseudo-instruction.
-            ctx.emit(Inst::xmm_uninit_value(dst));
-
-            // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
-            // and VPBROADCAST*.
-            match ty.lane_bits() {
-                8 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
-                    // Initialize a register with all 0s.
-                    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
-                    // Shuffle the lowest byte lane to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
-                }
-                16 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
-                    // Shuffle the lowest two lanes to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        SseOpcode::Pshufd,
-                        RegMem::from(dst),
-                        dst,
-                        0,
-                        OperandSize::Size32,
-                    ))
-                }
-                32 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
-                    // Shuffle the lowest lane to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        SseOpcode::Pshufd,
-                        RegMem::from(dst),
-                        dst,
-                        0,
-                        OperandSize::Size32,
-                    ))
-                }
-                64 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
-                }
-                _ => panic!("Invalid type to splat: {}", ty),
-            }
-        }
-
         Opcode::VanyTrue => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src_ty = ctx.input_ty(insn, 0);
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index 36f9c1dda222..15daf707a9ba 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -306,10 +306,10 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         None
     }
 
-    fn sink_load(&mut self, load: &SinkableLoad) -> RegMemImm {
+    fn sink_load(&mut self, load: &SinkableLoad) -> RegMem {
         self.lower_ctx.sink_inst(load.inst);
         let addr = lower_to_amode(self.lower_ctx, load.addr_input, load.offset);
-        RegMemImm::Mem {
+        RegMem::Mem {
             addr: SyntheticAmode::Real(addr),
         }
     }
diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
index 4d9fe9fe20dd..a6a3a39657f4 100644
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -298,6 +298,24 @@ macro_rules! isle_prelude_methods {
isle_prelude_methods { } } + #[inline] + fn ty_32(&mut self, ty: Type) -> Option { + if ty.bits() == 32 { + Some(ty) + } else { + None + } + } + + #[inline] + fn ty_64(&mut self, ty: Type) -> Option { + if ty.bits() == 64 { + Some(ty) + } else { + None + } + } + #[inline] fn ty_32_or_64(&mut self, ty: Type) -> Option { if ty.bits() == 32 || ty.bits() == 64 { diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index f5caeb94b783..5eb7a6c5dd0f 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -328,6 +328,14 @@ (decl fits_in_64 (Type) Type) (extern extractor fits_in_64 fits_in_64) +;; An extractor that only matches types that fit in exactly 32 bits. +(decl ty_32 (Type) Type) +(extern extractor ty_32 ty_32) + +;; An extractor that only matches types that fit in exactly 64 bits. +(decl ty_64 (Type) Type) +(extern extractor ty_64 ty_64) + ;; A pure constructor that only matches scalar booleans, integers, and ;; references that can fit in 64 bits. (decl pure ty_int_bool_ref_scalar_64 (Type) Type) diff --git a/cranelift/filetests/filetests/isa/x64/extractlane.clif b/cranelift/filetests/filetests/isa/x64/extractlane.clif new file mode 100644 index 000000000000..448e61abeafc --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/extractlane.clif @@ -0,0 +1,87 @@ +test compile precise-output +target x86_64 + +function %f1(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane v0, 1 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrb $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane v0, 1 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrw $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane v0, 1 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrd $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane v0, 1 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrd.w $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f5(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane v0, 1 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f6(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane v0, 1 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $238, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index 1dd0dbc29a6f..7b1607737f60 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -74,8 +74,8 @@ block0(v0: i8): ; block0: ; uninit %xmm0 ; pinsrb $0, %xmm0, %rdi, %xmm0 -; pxor %xmm6, %xmm6, %xmm6 -; pshufb %xmm0, %xmm6, %xmm0 +; pxor %xmm7, %xmm7, %xmm7 +; pshufb %xmm0, %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -90,11 +90,11 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movl $65535, %eax -; uninit %xmm0 -; pinsrw $0, %xmm0, %rax, %xmm0 -; pinsrw $1, %xmm0, %rax, %xmm0 -; pshufd $0, %xmm0, %xmm0 +; movl $65535, %edi +; uninit %xmm5 +; pinsrw $0, %xmm5, %rdi, %xmm5 +; pinsrw $1, %xmm5, %rdi, %xmm5 +; pshufd $0, %xmm5, %xmm0 ; movq %rbp, %rsp ; 
 ; popq %rbp
 ; ret
@@ -108,9 +108,9 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; uninit %xmm0
-; pinsrd $0, %xmm0, %rdi, %xmm0
-; pshufd $0, %xmm0, %xmm0
+; uninit %xmm4
+; pinsrd $0, %xmm4, %rdi, %xmm4
+; pshufd $0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -124,11 +124,11 @@ block0(v0: f64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm4
+; movdqa %xmm0, %xmm6
 ; uninit %xmm0
-; movdqa %xmm4, %xmm5
-; movsd %xmm0, %xmm5, %xmm0
-; movlhps %xmm0, %xmm5, %xmm0
+; movdqa %xmm6, %xmm7
+; movsd %xmm0, %xmm7, %xmm0
+; movlhps %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
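Editor's note: the new extractlane.clif tests cover lane 1 of each lane type; a possible follow-up, sketched here only and not part of this patch, would also exercise the lane-0 float rule, which simply forwards its input and relies on Cranelift type checking to ignore the upper bits. Expected output is omitted since it depends on register allocation.

function %f7(f32x4) -> f32 {
block0(v0: f32x4):
    v1 = extractlane v0, 0
    return v1
}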