diff --git a/build.rs b/build.rs index 682187eaf17c..5f9b18ebf803 100644 --- a/build.rs +++ b/build.rs @@ -210,7 +210,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "issue4807", "issue_3327_bnot_lowering", "load_splat_out_of_bounds", - "replace_lane_preserve", "simd_align", "simd_bit_shift", "simd_bitwise", @@ -246,10 +245,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_int_to_int_extend", "simd_lane", "simd_load", - "simd_load16_lane", - "simd_load32_lane", - "simd_load64_lane", - "simd_load8_lane", "simd_load_extend", "simd_load_zero", "simd_splat", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 9a4f99b8e36c..6b639ad311b9 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -328,6 +328,7 @@ (vd WritableReg) (vs2 Reg) (vs1 Reg) + (mask VecOpMasking) (vstate VState)) (VecAluRRImm5 @@ -335,18 +336,21 @@ (vd WritableReg) (vs2 Reg) (imm Imm5) + (mask VecOpMasking) (vstate VState)) (VecAluRR (op VecAluOpRR) (vd WritableReg) (vs Reg) + (mask VecOpMasking) (vstate VState)) (VecAluRImm5 (op VecAluOpRImm5) (vd WritableReg) (imm Imm5) + (mask VecOpMasking) (vstate VState)) (VecSetState @@ -358,6 +362,7 @@ (to WritableReg) (from VecAMode) (flags MemFlags) + (mask VecOpMasking) (vstate VState)) (VecStore @@ -365,6 +370,7 @@ (to VecAMode) (from Reg) (flags MemFlags) + (mask VecOpMasking) (vstate VState)) )) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index d4c03c1ea053..2f9d9bd486d2 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -5,7 +5,6 @@ use crate::ir::RelSourceLoc; use crate::ir::TrapCode; use crate::isa::riscv64::inst::*; use crate::isa::riscv64::inst::{zero_reg, AluOPRRR}; -use crate::isa::riscv64::lower::isle::generated_code::VecOpMasking; use crate::machinst::{AllocationConsumer, Reg, Writable}; use cranelift_control::ControlPlane; use regalloc2::Allocation; @@ -2804,32 +2803,58 @@ impl MachInstEmit for Inst { sink.bind_label(label_done, &mut state.ctrl_plane); } &Inst::VecAluRRR { - op, vd, vs1, vs2, .. + op, + vd, + vs1, + vs2, + ref mask, + .. } => { let vs1 = allocs.next(vs1); let vs2 = allocs.next(vs2); let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu(op, vd, vs1, vs2, VecOpMasking::Disabled)); + sink.put4(encode_valu(op, vd, vs1, vs2, mask)); } &Inst::VecAluRRImm5 { - op, vd, imm, vs2, .. + op, + vd, + imm, + vs2, + ref mask, + .. } => { let vs2 = allocs.next(vs2); let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu_imm(op, vd, imm, vs2, VecOpMasking::Disabled)); + sink.put4(encode_valu_imm(op, vd, imm, vs2, mask)); } - &Inst::VecAluRR { op, vd, vs, .. } => { + &Inst::VecAluRR { + op, + vd, + vs, + ref mask, + .. + } => { let vs = allocs.next(vs); let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu_rr(op, vd, vs, VecOpMasking::Disabled)); + sink.put4(encode_valu_rr(op, vd, vs, mask)); } - &Inst::VecAluRImm5 { op, vd, imm, .. } => { + &Inst::VecAluRImm5 { + op, + vd, + imm, + ref mask, + .. 
+ } => { let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu_r_imm(op, vd, imm, VecOpMasking::Disabled)); + sink.put4(encode_valu_r_imm(op, vd, imm, mask)); } &Inst::VecSetState { rd, ref vstate } => { let rd = allocs.next_writable(rd); @@ -2849,11 +2874,13 @@ impl MachInstEmit for Inst { eew, to, ref from, + ref mask, flags, .. } => { let from = from.clone().with_allocs(&mut allocs); let to = allocs.next_writable(to); + let mask = mask.with_allocs(&mut allocs); // Vector Loads don't support immediate offsets, so we need to load it into a register. let addr = match from { @@ -2889,8 +2916,7 @@ impl MachInstEmit for Inst { eew, addr, from.lumop(), - // We don't implement masking yet. - VecOpMasking::Disabled, + mask, from.mop(), from.nf(), )); @@ -2900,11 +2926,13 @@ impl MachInstEmit for Inst { eew, ref to, from, + ref mask, flags, .. } => { let to = to.clone().with_allocs(&mut allocs); let from = allocs.next(from); + let mask = mask.with_allocs(&mut allocs); // Vector Stores don't support immediate offsets, so we need to load it into a register. let addr = match to { @@ -2940,8 +2968,7 @@ impl MachInstEmit for Inst { eew, addr, to.sumop(), - // We don't implement masking yet. - VecOpMasking::Disabled, + mask, to.mop(), to.nf(), )); diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 6324484eebbb..659dcb0fa0cf 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -4,7 +4,7 @@ #![allow(dead_code)] #![allow(non_camel_case_types)] -use super::lower::isle::generated_code::{VecAMode, VecElementWidth}; +use super::lower::isle::generated_code::{VecAMode, VecElementWidth, VecOpMasking}; use crate::binemit::{Addend, CodeOffset, Reloc}; pub use crate::ir::condcodes::IntCC; use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64}; @@ -301,6 +301,7 @@ impl Inst { to: into_reg, from: VecAMode::UnitStride { base: mem }, flags, + mask: VecOpMasking::Disabled, vstate: VState::from_type(ty), } } else { @@ -321,6 +322,7 @@ impl Inst { to: VecAMode::UnitStride { base: mem }, from: from_reg, flags, + mask: VecOpMasking::Disabled, vstate: VState::from_type(ty), } } else { @@ -335,6 +337,19 @@ impl Inst { } //============================================================================= + +fn vec_mask_operands VReg>( + mask: &VecOpMasking, + collector: &mut OperandCollector<'_, F>, +) { + match mask { + VecOpMasking::Enabled { reg } => { + collector.reg_fixed_use(*reg, pv_reg(0).into()); + } + VecOpMasking::Disabled => {} + } +} + fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) { match inst { &Inst::Nop0 => {} @@ -625,7 +640,12 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan // no need let reg alloc know. } &Inst::VecAluRRR { - op, vd, vs1, vs2, .. + op, + vd, + vs1, + vs2, + ref mask, + .. } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); debug_assert_eq!(vs2.class(), RegClass::Vector); @@ -634,40 +654,64 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(vs1); collector.reg_use(vs2); collector.reg_def(vd); + vec_mask_operands(mask, collector); } - &Inst::VecAluRRImm5 { vd, vs2, .. } => { + &Inst::VecAluRRImm5 { + vd, vs2, ref mask, .. 
+ } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); debug_assert_eq!(vs2.class(), RegClass::Vector); collector.reg_use(vs2); collector.reg_def(vd); + vec_mask_operands(mask, collector); } - &Inst::VecAluRR { op, vd, vs, .. } => { + &Inst::VecAluRR { + op, + vd, + vs, + ref mask, + .. + } => { debug_assert_eq!(vd.to_reg().class(), op.dst_regclass()); debug_assert_eq!(vs.class(), op.src_regclass()); collector.reg_use(vs); collector.reg_def(vd); + vec_mask_operands(mask, collector); } - &Inst::VecAluRImm5 { vd, .. } => { + &Inst::VecAluRImm5 { vd, ref mask, .. } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); collector.reg_def(vd); + vec_mask_operands(mask, collector); } &Inst::VecSetState { rd, .. } => { collector.reg_def(rd); } - &Inst::VecLoad { to, ref from, .. } => { + &Inst::VecLoad { + to, + ref from, + ref mask, + .. + } => { if let Some(r) = from.get_allocatable_register() { collector.reg_use(r); } collector.reg_def(to); + vec_mask_operands(mask, collector); } - &Inst::VecStore { ref to, from, .. } => { + &Inst::VecStore { + ref to, + from, + ref mask, + .. + } => { if let Some(r) = to.get_allocatable_register() { collector.reg_use(r); } collector.reg_use(from); + vec_mask_operands(mask, collector); } } } @@ -876,6 +920,13 @@ impl Inst { } }; + let format_mask = |mask: &VecOpMasking, allocs: &mut AllocationConsumer<'_>| -> String { + match mask { + VecOpMasking::Enabled { reg } => format!(",{}.t", format_reg(*reg, allocs)), + VecOpMasking::Disabled => format!(""), + } + }; + let format_regs = |regs: &[Reg], allocs: &mut AllocationConsumer<'_>| -> String { let mut x = if regs.len() > 1 { String::from("[") @@ -1572,22 +1623,24 @@ impl Inst { vd, vs1, vs2, + ref mask, ref vstate, } => { let vs1_s = format_reg(vs1, allocs); let vs2_s = format_reg(vs2, allocs); let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); // Note: vs2 and vs1 here are opposite to the standard scalar ordering. // This is noted in Section 10.1 of the RISC-V Vector spec. match (op, vs2, vs1) { (VecAluOpRRR::VrsubVX, _, vs1) if vs1 == zero_reg() => { - format!("vneg.v {},{} {}", vd_s, vs2_s, vstate) + format!("vneg.v {vd_s},{vs2_s}{mask} {vstate}") } (VecAluOpRRR::VfsgnjnVV, vs2, vs1) if vs2 == vs1 => { - format!("vfneg.v {},{} {}", vd_s, vs2_s, vstate) + format!("vfneg.v {vd_s},{vs2_s}{mask} {vstate}") } - _ => format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate), + _ => format!("{op} {vd_s},{vs2_s},{vs1_s}{mask} {vstate}"), } } &Inst::VecAluRRImm5 { @@ -1595,10 +1648,12 @@ impl Inst { vd, imm, vs2, + ref mask, ref vstate, } => { let vs2_s = format_reg(vs2, allocs); let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); // Some opcodes interpret the immediate as unsigned, lets show the // correct number here. 
@@ -1608,28 +1663,32 @@ impl Inst { format!("{}", imm) }; - format!("{} {},{},{} {}", op, vd_s, vs2_s, imm_s, vstate) + format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}") } &Inst::VecAluRR { op, vd, vs, + ref mask, ref vstate, } => { let vs_s = format_reg(vs, allocs); let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); - format!("{} {},{} {}", op, vd_s, vs_s, vstate) + format!("{op} {vd_s},{vs_s}{mask} {vstate}") } &Inst::VecAluRImm5 { op, vd, imm, + ref mask, ref vstate, } => { let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); - format!("{} {},{} {}", op, vd_s, imm, vstate) + format!("{op} {vd_s},{imm}{mask} {vstate}") } &Inst::VecSetState { rd, ref vstate } => { let rd_s = format_reg(rd.to_reg(), allocs); @@ -1640,23 +1699,29 @@ impl Inst { eew, to, from, + ref mask, ref vstate, .. } => { let base = format_vec_amode(from, allocs); let vd = format_reg(to.to_reg(), allocs); - format!("vl{}.v {},{} {}", eew, vd, base, vstate) + let mask = format_mask(mask, allocs); + + format!("vl{eew}.v {vd},{base}{mask} {vstate}") } Inst::VecStore { eew, to, from, + ref mask, ref vstate, .. } => { let dst = format_vec_amode(to, allocs); let vs3 = format_reg(*from, allocs); - format!("vs{}.v {},{} {}", eew, vs3, dst, vstate) + let mask = format_mask(mask, allocs); + + format!("vs{eew}.v {vs3},{dst}{mask} {vstate}") } } } diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 8d21ef9ebe57..48d5192efaaf 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -234,10 +234,19 @@ impl VecOpCategory { impl VecOpMasking { pub fn encode(&self) -> u32 { match self { - VecOpMasking::Enabled => 0, + VecOpMasking::Enabled { .. 
} => 0, VecOpMasking::Disabled => 1, } } + + pub(crate) fn with_allocs(&self, allocs: &mut AllocationConsumer<'_>) -> Self { + match self { + VecOpMasking::Enabled { reg } => VecOpMasking::Enabled { + reg: allocs.next(*reg), + }, + VecOpMasking::Disabled => VecOpMasking::Disabled, + } + } } impl VecAluOpRRR { @@ -268,6 +277,7 @@ impl VecAluOpRRR { VecAluOpRRR::VxorVV => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, + VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 0b010111, VecAluOpRRR::VfdivVV | VecAluOpRRR::VfdivVF => 0b100000, VecAluOpRRR::VfrdivVF => 0b100001, VecAluOpRRR::VfsgnjnVV => 0b001001, @@ -280,14 +290,16 @@ impl VecAluOpRRR { | VecAluOpRRR::VsubVV | VecAluOpRRR::VandVV | VecAluOpRRR::VorVV - | VecAluOpRRR::VxorVV => VecOpCategory::OPIVV, + | VecAluOpRRR::VxorVV + | VecAluOpRRR::VmergeVVM => VecOpCategory::OPIVV, VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => { VecOpCategory::OPMVV } VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX - | VecAluOpRRR::VslidedownVX => VecOpCategory::OPIVX, + | VecAluOpRRR::VslidedownVX + | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX, VecAluOpRRR::VfaddVV | VecAluOpRRR::VfsubVV | VecAluOpRRR::VfmulVV @@ -298,7 +310,8 @@ impl VecAluOpRRR { | VecAluOpRRR::VfrsubVF | VecAluOpRRR::VfmulVF | VecAluOpRRR::VfdivVF - | VecAluOpRRR::VfrdivVF => VecOpCategory::OPFVF, + | VecAluOpRRR::VfrdivVF + | VecAluOpRRR::VfmergeVFM => VecOpCategory::OPFVF, } } @@ -315,10 +328,15 @@ impl VecAluOpRRR { impl fmt::Display for VecAluOpRRR { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let suffix_length = match self { + VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 3, + _ => 2, + }; + let mut s = format!("{self:?}"); s.make_ascii_lowercase(); - let (opcode, category) = s.split_at(s.len() - 2); - f.write_str(&format!("{}.{}", opcode, category)) + let (opcode, category) = s.split_at(s.len() - suffix_length); + f.write_str(&format!("{opcode}.{category}")) } } @@ -337,31 +355,38 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, VecAluOpRRImm5::VslidedownVI => 0b001111, + VecAluOpRRImm5::VmergeVIM => 0b010111, } } pub fn category(&self) -> VecOpCategory { match self { - VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VslidedownVI => { - VecOpCategory::OPIVI - } + VecAluOpRRImm5::VaddVI + | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VslidedownVI + | VecAluOpRRImm5::VmergeVIM => VecOpCategory::OPIVI, } } pub fn imm_is_unsigned(&self) -> bool { match self { VecAluOpRRImm5::VslidedownVI => true, - VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI => false, + VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VmergeVIM => false, } } } impl fmt::Display for VecAluOpRRImm5 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let suffix_length = match self { + VecAluOpRRImm5::VmergeVIM => 3, + _ => 2, + }; + let mut s = format!("{self:?}"); s.make_ascii_lowercase(); - let (opcode, category) = s.split_at(s.len() - 2); - f.write_str(&format!("{}.{}", opcode, category)) + let (opcode, category) = s.split_at(s.len() - suffix_length); + f.write_str(&format!("{opcode}.{category}")) } } @@ -421,12 +446,12 @@ impl VecAluOpRR { /// other way around. As far as I can tell only vmv.v.* are backwards. 
pub fn vs_is_vs2_encoded(&self) -> bool { match self { + VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => true, VecAluOpRR::VmvSX - | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF - | VecAluOpRR::VfmvFS - | VecAluOpRR::VfsqrtV => true, - VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => false, + | VecAluOpRR::VmvVV + | VecAluOpRR::VmvVX + | VecAluOpRR::VfmvVF => false, } } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index dcd8c2bdeb86..64294d2276fa 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -77,10 +77,16 @@ ;; When masked, the instruction will only operate on the elements that are dictated by ;; the mask register. Currently this is always fixed to v0. (type VecOpMasking (enum - (Enabled) + (Enabled (reg Reg)) (Disabled) )) +(decl pure masked (Reg) VecOpMasking) +(rule (masked reg) (VecOpMasking.Enabled reg)) + +(decl pure unmasked () VecOpMasking) +(rule (unmasked) (VecOpMasking.Disabled)) + ;; Register to Register ALU Ops (type VecAluOpRRR (enum ;; Vector-Vector Opcodes @@ -97,6 +103,7 @@ (VfmulVV) (VfdivVV) (VfsgnjnVV) + (VmergeVVM) ;; Vector-Scalar Opcodes (VaddVX) @@ -109,6 +116,8 @@ (VfmulVF) (VfdivVF) (VfrdivVF) + (VmergeVXM) + (VfmergeVFM) )) ;; Register-Imm ALU Ops @@ -117,6 +126,7 @@ (VaddVI) (VrsubVI) (VslidedownVI) + (VmergeVIM) )) ;; Imm only ALU Ops @@ -202,232 +212,289 @@ ;; See Section 10.1 of the RISC-V Vector Extension Specification. ;; Helper for emitting `MInst.VecAluRRR` instructions. -(decl vec_alu_rrr (VecAluOpRRR Reg Reg VState) Reg) -(rule (vec_alu_rrr op vs2 vs1 vstate) +(decl vec_alu_rrr (VecAluOpRRR Reg Reg VecOpMasking VState) Reg) +(rule (vec_alu_rrr op vs2 vs1 mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 vstate)))) + (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 mask vstate)))) vd)) ;; Helper for emitting `MInst.VecAluRRImm5` instructions. -(decl vec_alu_rr_imm5 (VecAluOpRRImm5 Reg Imm5 VState) Reg) -(rule (vec_alu_rr_imm5 op vs2 imm vstate) +(decl vec_alu_rr_imm5 (VecAluOpRRImm5 Reg Imm5 VecOpMasking VState) Reg) +(rule (vec_alu_rr_imm5 op vs2 imm mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecAluRRImm5 op vd vs2 imm vstate)))) + (_ Unit (emit (MInst.VecAluRRImm5 op vd vs2 imm mask vstate)))) vd)) ;; Helper for emitting `MInst.VecAluRRImm5` instructions where the immediate ;; is zero extended instead of sign extended. -(decl vec_alu_rr_uimm5 (VecAluOpRRImm5 Reg UImm5 VState) Reg) -(rule (vec_alu_rr_uimm5 op vs2 imm vstate) - (vec_alu_rr_imm5 op vs2 (uimm5_bitcast_to_imm5 imm) vstate)) +(decl vec_alu_rr_uimm5 (VecAluOpRRImm5 Reg UImm5 VecOpMasking VState) Reg) +(rule (vec_alu_rr_uimm5 op vs2 imm mask vstate) + (vec_alu_rr_imm5 op vs2 (uimm5_bitcast_to_imm5 imm) mask vstate)) ;; Helper for emitting `MInst.VecAluRRImm5` instructions that use the Imm5 as ;; auxiliary encoding space. -(decl vec_alu_rr (VecAluOpRR Reg VState) Reg) -(rule (vec_alu_rr op vs vstate) +(decl vec_alu_rr (VecAluOpRR Reg VecOpMasking VState) Reg) +(rule (vec_alu_rr op vs mask vstate) (let ((vd WritableReg (temp_writable_reg (vec_alu_rr_dst_type op))) - (_ Unit (emit (MInst.VecAluRR op vd vs vstate)))) + (_ Unit (emit (MInst.VecAluRR op vd vs mask vstate)))) vd)) ;; Helper for emitting `MInst.VecAluRImm5` instructions. 
-(decl vec_alu_r_imm5 (VecAluOpRImm5 Imm5 VState) Reg) -(rule (vec_alu_r_imm5 op imm vstate) +(decl vec_alu_r_imm5 (VecAluOpRImm5 Imm5 VecOpMasking VState) Reg) +(rule (vec_alu_r_imm5 op imm mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecAluRImm5 op vd imm vstate)))) + (_ Unit (emit (MInst.VecAluRImm5 op vd imm mask vstate)))) vd)) ;; Helper for emitting `MInst.VecLoad` instructions. -(decl vec_load (VecElementWidth VecAMode MemFlags VState) Reg) -(rule (vec_load eew from flags vstate) +(decl vec_load (VecElementWidth VecAMode MemFlags VecOpMasking VState) Reg) +(rule (vec_load eew from flags mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecLoad eew vd from flags vstate)))) + (_ Unit (emit (MInst.VecLoad eew vd from flags mask vstate)))) vd)) ;; Helper for emitting `MInst.VecStore` instructions. -(decl vec_store (VecElementWidth VecAMode Reg MemFlags VState) InstOutput) -(rule (vec_store eew to from flags vstate) +(decl vec_store (VecElementWidth VecAMode Reg MemFlags VecOpMasking VState) InstOutput) +(rule (vec_store eew to from flags mask vstate) (side_effect - (SideEffectNoResult.Inst (MInst.VecStore eew to from flags vstate)))) + (SideEffectNoResult.Inst (MInst.VecStore eew to from flags mask vstate)))) ;; Helper for emitting the `vadd.vv` instruction. -(decl rv_vadd_vv (Reg Reg VState) Reg) -(rule (rv_vadd_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 vstate)) +(decl rv_vadd_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vadd.vx` instruction. -(decl rv_vadd_vx (Reg Reg VState) Reg) -(rule (rv_vadd_vx vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 vstate)) +(decl rv_vadd_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vadd_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vadd.vi` instruction. -(decl rv_vadd_vi (Reg Imm5 VState) Reg) -(rule (rv_vadd_vi vs2 imm vstate) - (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm vstate)) +(decl rv_vadd_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vadd_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm mask vstate)) ;; Helper for emitting the `vsub.vv` instruction. -(decl rv_vsub_vv (Reg Reg VState) Reg) -(rule (rv_vsub_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 vstate)) +(decl rv_vsub_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vsub.vx` instruction. -(decl rv_vsub_vx (Reg Reg VState) Reg) -(rule (rv_vsub_vx vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 vstate)) +(decl rv_vsub_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vrsub.vx` instruction. -(decl rv_vrsub_vx (Reg Reg VState) Reg) -(rule (rv_vrsub_vx vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 vstate)) +(decl rv_vrsub_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vrsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vneg.v` pseudo-instruction. 
-(decl rv_vneg_v (Reg VState) Reg) -(rule (rv_vneg_v vs2 vstate) - (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) vstate)) +(decl rv_vneg_v (Reg VecOpMasking VState) Reg) +(rule (rv_vneg_v vs2 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) mask vstate)) ;; Helper for emitting the `vrsub.vi` instruction. -(decl rv_vrsub_vi (Reg Imm5 VState) Reg) -(rule (rv_vrsub_vi vs2 imm vstate) - (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm vstate)) +(decl rv_vrsub_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vrsub_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm mask vstate)) ;; Helper for emitting the `vmul.vv` instruction. -(decl rv_vmul_vv (Reg Reg VState) Reg) -(rule (rv_vmul_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 vstate)) +(decl rv_vmul_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vmulh.vv` instruction. -(decl rv_vmulh_vv (Reg Reg VState) Reg) -(rule (rv_vmulh_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 vstate)) +(decl rv_vmulh_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vmulh_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vmulhu.vv` instruction. -(decl rv_vmulhu_vv (Reg Reg VState) Reg) -(rule (rv_vmulhu_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 vstate)) +(decl rv_vmulhu_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vmulhu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vand.vv` instruction. -(decl rv_vand_vv (Reg Reg VState) Reg) -(rule (rv_vand_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 vstate)) +(decl rv_vand_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vand_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vor.vv` instruction. -(decl rv_vor_vv (Reg Reg VState) Reg) -(rule (rv_vor_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 vstate)) +(decl rv_vor_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vor_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vxor.vv` instruction. -(decl rv_vxor_vv (Reg Reg VState) Reg) -(rule (rv_vxor_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 vstate)) +(decl rv_vxor_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vxor_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfadd.vv` instruction. -(decl rv_vfadd_vv (Reg Reg VState) Reg) -(rule (rv_vfadd_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfaddVV) vs2 vs1 vstate)) +(decl rv_vfadd_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfadd.vf` instruction. -(decl rv_vfadd_vf (Reg Reg VState) Reg) -(rule (rv_vfadd_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfaddVF) vs2 vs1 vstate)) +(decl rv_vfadd_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfadd_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfsub.vv` instruction. 
-(decl rv_vfsub_vv (Reg Reg VState) Reg) -(rule (rv_vfsub_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfsubVV) vs2 vs1 vstate)) +(decl rv_vfsub_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfsub.vf` instruction. -(decl rv_vfsub_vf (Reg Reg VState) Reg) -(rule (rv_vfsub_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfsubVF) vs2 vs1 vstate)) +(decl rv_vfsub_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfsub_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfrsub.vf` instruction. -(decl rv_vfrsub_vf (Reg Reg VState) Reg) -(rule (rv_vfrsub_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfrsubVF) vs2 vs1 vstate)) +(decl rv_vfrsub_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfrsub_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfrsubVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfmul.vv` instruction. -(decl rv_vfmul_vv (Reg Reg VState) Reg) -(rule (rv_vfmul_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfmulVV) vs2 vs1 vstate)) +(decl rv_vfmul_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmulVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfmul.vf` instruction. -(decl rv_vfmul_vf (Reg Reg VState) Reg) -(rule (rv_vfmul_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfmulVF) vs2 vs1 vstate)) +(decl rv_vfmul_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfmul_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmulVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfdiv.vv` instruction. -(decl rv_vfdiv_vv (Reg Reg VState) Reg) -(rule (rv_vfdiv_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfdivVV) vs2 vs1 vstate)) +(decl rv_vfdiv_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfdiv_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfdivVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfdiv.vf` instruction. -(decl rv_vfdiv_vf (Reg Reg VState) Reg) -(rule (rv_vfdiv_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfdivVF) vs2 vs1 vstate)) +(decl rv_vfdiv_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfdiv_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfdivVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfrdiv.vf` instruction. -(decl rv_vfrdiv_vf (Reg Reg VState) Reg) -(rule (rv_vfrdiv_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfrdivVF) vs2 vs1 vstate)) +(decl rv_vfrdiv_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfrdiv_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfrdivVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfsgnjn.vv` ("Floating Point Sign Injection Negated") instruction. ;; The output of this instruction is `vs2` with the negated sign bit from `vs1` -(decl rv_vfsgnjn_vv (Reg Reg VState) Reg) -(rule (rv_vfsgnjn_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfsgnjnVV) vs2 vs1 vstate)) +(decl rv_vfsgnjn_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfsgnjn_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsgnjnVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfneg.v` instruction. ;; This instruction is a mnemonic for `vfsgnjn.vv vd, vs, vs` -(decl rv_vfneg_v (Reg VState) Reg) -(rule (rv_vfneg_v vs vstate) (rv_vfsgnjn_vv vs vs vstate)) +(decl rv_vfneg_v (Reg VecOpMasking VState) Reg) +(rule (rv_vfneg_v vs mask vstate) (rv_vfsgnjn_vv vs vs mask vstate)) ;; Helper for emitting the `vfsqrt.v` instruction. 
 ;; This instruction splats the F regsiter into all elements of the destination vector.
-(decl rv_vfsqrt_v (Reg VState) Reg)
-(rule (rv_vfsqrt_v vs vstate)
-  (vec_alu_rr (VecAluOpRR.VfsqrtV) vs vstate))
+(decl rv_vfsqrt_v (Reg VecOpMasking VState) Reg)
+(rule (rv_vfsqrt_v vs mask vstate)
+  (vec_alu_rr (VecAluOpRR.VfsqrtV) vs mask vstate))
 
 ;; Helper for emitting the `vslidedown.vx` instruction.
 ;; `vslidedown` moves all elements in the vector down by n elements.
 ;; The top most elements are up to the tail policy.
-(decl rv_vslidedown_vx (Reg Reg VState) Reg)
-(rule (rv_vslidedown_vx vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.VslidedownVX) vs2 vs1 vstate))
+(decl rv_vslidedown_vx (Reg Reg VecOpMasking VState) Reg)
+(rule (rv_vslidedown_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VslidedownVX) vs2 vs1 mask vstate))
 
 ;; Helper for emitting the `vslidedown.vi` instruction.
 ;; Unlike other `vi` instructions the immediate is zero extended.
-(decl rv_vslidedown_vi (Reg UImm5 VState) Reg)
-(rule (rv_vslidedown_vi vs2 imm vstate)
-  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm vstate))
+(decl rv_vslidedown_vi (Reg UImm5 VecOpMasking VState) Reg)
+(rule (rv_vslidedown_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm mask vstate))
 
 ;; Helper for emitting the `vmv.x.s` instruction.
 ;; This instruction copies the first element of the source vector to the destination X register.
+;; Masked versions of this instruction are not supported.
 (decl rv_vmv_xs (Reg VState) Reg)
 (rule (rv_vmv_xs vs vstate)
-  (vec_alu_rr (VecAluOpRR.VmvXS) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VmvXS) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vfmv.f.s` instruction.
 ;; This instruction copies the first element of the source vector to the destination F register.
+;; Masked versions of this instruction are not supported.
 (decl rv_vfmv_fs (Reg VState) Reg)
 (rule (rv_vfmv_fs vs vstate)
-  (vec_alu_rr (VecAluOpRR.VfmvFS) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VfmvFS) vs (unmasked) vstate))
+
+;; Helper for emitting the `vmv.s.x` instruction.
+;; This instruction copies the source X register into the first element of the destination vector.
+;; Masked versions of this instruction are not supported.
+(decl rv_vmv_sx (Reg VState) Reg)
+(rule (rv_vmv_sx vs vstate)
+  (vec_alu_rr (VecAluOpRR.VmvSX) vs (unmasked) vstate))
+
+;; Helper for emitting the `vfmv.s.f` instruction.
+;; This instruction copies the source F register into the first element of the destination vector.
+;; Masked versions of this instruction are not supported.
+(decl rv_vfmv_sf (Reg VState) Reg)
+(rule (rv_vfmv_sf vs vstate)
+  (vec_alu_rr (VecAluOpRR.VfmvSF) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vmv.v.x` instruction.
 ;; This instruction splats the X regsiter into all elements of the destination vector.
+;; Masked versions of this instruction are called `vmerge`
 (decl rv_vmv_vx (Reg VState) Reg)
 (rule (rv_vmv_vx vs vstate)
-  (vec_alu_rr (VecAluOpRR.VmvVX) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VmvVX) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vfmv.v.f` instruction.
 ;; This instruction splats the F regsiter into all elements of the destination vector.
+;; Masked versions of this instruction are called `vmerge`
 (decl rv_vfmv_vf (Reg VState) Reg)
 (rule (rv_vfmv_vf vs vstate)
-  (vec_alu_rr (VecAluOpRR.VfmvVF) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VfmvVF) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vmv.v.i` instruction.
 ;; This instruction splat's the immediate value into all elements of the destination vector.
+;; Masked versions of this instruction are called `vmerge` (decl rv_vmv_vi (Imm5 VState) Reg) (rule (rv_vmv_vi imm vstate) - (vec_alu_r_imm5 (VecAluOpRImm5.VmvVI) imm vstate)) + (vec_alu_r_imm5 (VecAluOpRImm5.VmvVI) imm (unmasked) vstate)) + +;; Helper for emitting the `vmerge.vvm` instruction. +;; This instruction merges the elements of the two source vectors into the destination vector +;; based on a mask. Elements are taken from the first source vector if the mask bit is clear, +;; and from the second source vector if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? vs1[i] : vs2[i] +(decl rv_vmerge_vvm (Reg Reg Reg VState) Reg) +(rule (rv_vmerge_vvm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmergeVVM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vmerge.vxm` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the X +;; register if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? x[rs1] : vs2[i] +(decl rv_vmerge_vxm (Reg Reg Reg VState) Reg) +(rule (rv_vmerge_vxm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmergeVXM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vfmerge.vfm` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the F +;; register if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? f[rs1] : vs2[i] +(decl rv_vfmerge_vfm (Reg Reg Reg VState) Reg) +(rule (rv_vfmerge_vfm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmergeVFM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vmerge.vim` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the +;; immediate value if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? imm : vs2[i] +(decl rv_vmerge_vim (Reg Imm5 Reg VState) Reg) +(rule (rv_vmerge_vim vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmergeVIM) vs2 imm (masked mask) vstate)) + ;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -447,8 +514,20 @@ ;; in index 0, and then use the appropriate `vmv` instruction. ;; If the index fits into a 5-bit immediate, we can emit a `vslidedown.vi`. (rule 1 (gen_extractlane (ty_vec_fits_in_register ty) src (uimm5_from_u8 idx)) - (gen_extractlane ty (rv_vslidedown_vi src idx ty) 0)) + (gen_extractlane ty (rv_vslidedown_vi src idx (unmasked) ty) 0)) ;; Otherwise lower it into an X register. (rule 0 (gen_extractlane (ty_vec_fits_in_register ty) src idx) - (gen_extractlane ty (rv_vslidedown_vx src (imm $I64 idx) ty) 0)) + (gen_extractlane ty (rv_vslidedown_vx src (imm $I64 idx) (unmasked) ty) 0)) + + +;; Build a vector mask from a u64 +;; TODO: We should merge this with the `vconst` rules, and take advantage of +;; the other existing `vconst` rules. One example is using `vmv.v.i` which +;; can represent some of these masks. +(decl gen_vec_mask (u64) Reg) + +;; Materialize the mask into an X register, and move it into the bottom of +;; the vector register. 
+(rule (gen_vec_mask mask) + (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) \ No newline at end of file diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index d28e30344d60..ef72b3568dd1 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -16,6 +16,7 @@ (element_width_from_type ty) (VecAMode.UnitStride (gen_const_amode (const_to_vconst n))) (mem_flags_trusted) + (unmasked) ty)) ;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -108,19 +109,19 @@ ;; SIMD Vectors (rule 8 (lower (has_type (ty_vec_fits_in_register ty) (iadd x y))) - (rv_vadd_vv x y ty)) + (rv_vadd_vv x y (unmasked) ty)) (rule 9 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (splat y)))) - (rv_vadd_vx x y ty)) + (rv_vadd_vx x y (unmasked) ty)) (rule 10 (lower (has_type (ty_vec_fits_in_register ty) (iadd (splat x) y))) - (rv_vadd_vx y x ty)) + (rv_vadd_vx y x (unmasked) ty)) (rule 11 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (replicated_imm5 y)))) - (rv_vadd_vi x y ty)) + (rv_vadd_vi x y (unmasked) ty)) (rule 12 (lower (has_type (ty_vec_fits_in_register ty) (iadd (replicated_imm5 x) y))) - (rv_vadd_vi y x ty)) + (rv_vadd_vi y x (unmasked) ty)) ;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;; (rule @@ -144,16 +145,16 @@ ;; SIMD Vectors (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (isub x y))) - (rv_vsub_vv x y ty)) + (rv_vsub_vv x y (unmasked) ty)) (rule 4 (lower (has_type (ty_vec_fits_in_register ty) (isub x (splat y)))) - (rv_vsub_vx x y ty)) + (rv_vsub_vx x y (unmasked) ty)) (rule 5 (lower (has_type (ty_vec_fits_in_register ty) (isub (splat x) y))) - (rv_vrsub_vx y x ty)) + (rv_vrsub_vx y x (unmasked) ty)) (rule 6 (lower (has_type (ty_vec_fits_in_register ty) (isub (replicated_imm5 x) y))) - (rv_vrsub_vi y x ty)) + (rv_vrsub_vi y x (unmasked) ty)) ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -162,7 +163,7 @@ (neg ty val)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (ineg x))) - (rv_vneg_v x ty)) + (rv_vneg_v x (unmasked) ty)) ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -201,21 +202,21 @@ (value_regs dst_lo dst_hi))) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (imul x y))) - (rv_vmul_vv x y ty)) + (rv_vmul_vv x y (unmasked) ty)) ;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y))) (lower_smlhi ty (ext_int_if_need $true x ty) (ext_int_if_need $true y ty))) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x y))) - (rv_vmulh_vv x y ty)) + (rv_vmulh_vv x y (unmasked) ty)) ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (umulhi x y))) (lower_umlhi ty (ext_int_if_need $false x ty) (ext_int_if_need $false y ty))) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x y))) - (rv_vmulhu_vv x y ty)) + (rv_vmulhu_vv x y (unmasked) ty)) ;;;; Rules for `div` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -325,7 +326,7 @@ (rule 7 (lower (has_type (ty_vec_fits_in_register ty) (band x y))) - (rv_vand_vv x y ty)) + (rv_vand_vv x y (unmasked) ty)) ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -372,7 +373,7 @@ (value_regs low high))) (rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bor x y))) - (rv_vor_vv x y 
ty)) + (rv_vor_vv x y (unmasked) ty)) ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) @@ -395,7 +396,7 @@ (lower_float_binary (AluOPRRR.Xor) x y $F64)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (bxor x y))) - (rv_vxor_vv x y ty)) + (rv_vxor_vv x y (unmasked) ty)) ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (bnot x))) @@ -588,7 +589,7 @@ (rv_fneg ty x)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fneg x))) - (rv_vfneg_v x ty)) + (rv_vfneg_v x (unmasked) ty)) ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (fcopysign x y))) @@ -604,7 +605,7 @@ (rv_fsqrt ty x)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqrt x))) - (rv_vfsqrt_v x ty)) + (rv_vfsqrt_v x (unmasked) ty)) ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 @@ -719,13 +720,13 @@ (rv_fadd ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fadd x y))) - (rv_vfadd_vv x y ty)) + (rv_vfadd_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fadd x (splat y)))) - (rv_vfadd_vf x y ty)) + (rv_vfadd_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fadd (splat x) y))) - (rv_vfadd_vf y x ty)) + (rv_vfadd_vf y x (unmasked) ty)) ;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -733,26 +734,26 @@ (rv_fsub ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fsub x y))) - (rv_vfsub_vv x y ty)) + (rv_vfsub_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fsub x (splat y)))) - (rv_vfsub_vf x y ty)) + (rv_vfsub_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fsub (splat x) y))) - (rv_vfrsub_vf y x ty)) + (rv_vfrsub_vf y x (unmasked) ty)) ;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_scalar_float ty) (fmul x y))) (rv_fmul ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmul x y))) - (rv_vfmul_vv x y ty)) + (rv_vfmul_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fmul x (splat y)))) - (rv_vfmul_vf x y ty)) + (rv_vfmul_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fmul (splat x) y))) - (rv_vfmul_vf y x ty)) + (rv_vfmul_vf y x (unmasked) ty)) ;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -760,13 +761,13 @@ (rv_fdiv ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fdiv x y))) - (rv_vfdiv_vv x y ty)) + (rv_vfdiv_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fdiv x (splat y)))) - (rv_vfdiv_vf x y ty)) + (rv_vfdiv_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fdiv (splat x) y))) - (rv_vfrdiv_vf y x ty)) + (rv_vfrdiv_vf y x (unmasked) ty)) ;;;; Rules for `fmin/fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -915,7 +916,7 @@ (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (load flags p @ (value_type (ty_addr64 _)) offset))) (let ((eew VecElementWidth (element_width_from_type ty))) - (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags ty))) + (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags (unmasked) ty))) ;;;;; Rules for `istore8`;;;;;;;;; (rule @@ -944,7 +945,7 @@ (rule 2 (lower (store flags x @ 
(value_type (ty_vec_fits_in_register ty)) p @ (value_type (ty_addr64 _)) offset)) (let ((eew VecElementWidth (element_width_from_type ty))) - (vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags ty))) + (vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags (unmasked) ty))) (decl gen_icmp (IntCC ValueRegs ValueRegs Type) Reg) (rule @@ -1088,6 +1089,32 @@ (rule (lower (extractlane x @ (value_type ty) (u8_from_uimm8 idx))) (gen_extractlane ty x idx)) +;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We can insert a lane by using a masked splat from an X register. +;; Build a mask that is only enabled in the lane we want to insert. +;; Then use a masked splat (vmerge) to insert the value. +(rule 0 (lower (insertlane vec @ (value_type (ty_vec_fits_in_register ty)) + val @ (value_type (ty_int _)) + (u8_from_uimm8 lane))) + (let ((mask Reg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vmerge_vxm vec val mask ty))) + +;; Similar to above, but using the float variants of the instructions. +(rule 1 (lower (insertlane vec @ (value_type (ty_vec_fits_in_register ty)) + val @ (value_type (ty_scalar_float _)) + (u8_from_uimm8 lane))) + (let ((mask Reg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vfmerge_vfm vec val mask ty))) + +;; If we are inserting from an Imm5 const we can use the immediate +;; variant of vmerge. +(rule 2 (lower (insertlane vec @ (value_type (ty_vec_fits_in_register ty)) + (iconst (u64_from_imm64 (imm5_from_u64 imm))) + (u8_from_uimm8 lane))) + (let ((mask Reg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vmerge_vim vec imm mask ty))) + ;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type ty (splat n @ (value_type (ty_scalar_float _))))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif new file mode 100644 index 000000000000..5e4899512711 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif @@ -0,0 +1,530 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %insertlane_15(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane v0, v1, 15 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; lui a2,8 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; lui a2, 8 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_5(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane v0, v1, 5 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,32 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 
+; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 0x20 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_2(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane v0, v1, 2 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,4 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 4 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,1 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_0_in_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,1 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_1_in_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; 
sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,2 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 2 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_3_in_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,1 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_1_in_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,2 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 2 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 10 + v2 = insertlane v0, v1, 15 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; lui a1,8 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,10,v0.t #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; lui a1, 8 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; 
.byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x34, 0x15, 0x5c +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_5(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -2 + v2 = insertlane v0, v1, 5 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a1,32 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,-2,v0.t #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a1, zero, 0x20 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x34, 0x1f, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_2(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = insertlane v0, v1, 2 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a1,4 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,15,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a1, zero, 4 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb4, 0x17, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -9 + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a1,1 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,-9,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a1, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0xb4, 0x1b, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-insert-extract-lane.clif b/cranelift/filetests/filetests/runtests/simd-insert-extract-lane.clif new file mode 100644 index 000000000000..f2203889aa47 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-insert-extract-lane.clif @@ -0,0 +1,23 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 
+target x86_64 sse42 has_avx +target riscv64 has_v + +function %insertlane_preserves_upper_bits(f64) -> i64 fast { +block0(v5: f64): + v3 = vconst.i8x16 0x0000000000000001ffffffffffffffff + v6 = bitcast.f64x2 little v3 + v7 = insertlane v6, v5, 0 + v8 = bitcast.i64x2 little v7 + v9 = extractlane v8, 1 + return v9 +} +; run: %insertlane_preserves_upper_bits(0x0.0) == 1 +; run: %insertlane_preserves_upper_bits(0x9.0) == 1 +; run: %insertlane_preserves_upper_bits(+Inf) == 1 \ No newline at end of file diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif index 56ae6dedde2c..d69e7a08ebd6 100644 --- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %insertlane_i8x16_0(i8x16, i8) -> i8x16 { block0(v0: i8x16, v1: i8): @@ -166,3 +167,36 @@ block0(v0: f64x2, v1: f64): return v4 } ; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0] + + +function %insertlane_const_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 10 + v2 = insertlane v0, v1, 15 + return v2 +} +; run: %insertlane_const_15([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 10] + +function %insertlane_const_5(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -2 + v2 = insertlane v0, v1, 5 + return v2 +} +; run: %insertlane_const_5([1 1 1 1 1 1 1 1]) == [1 1 1 1 1 -2 1 1] + +function %insertlane_const_2(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = insertlane v0, v1, 2 + return v2 +} +; run: %insertlane_const_2([1 1 1 1]) == [1 1 15 1] + +function %insertlane_const_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -9 + v2 = insertlane v0, v1, 0 + return v2 +} +; run: %insertlane_const_0([1 1]) == [-9 1] \ No newline at end of file