diff --git a/build.rs b/build.rs index 682187eaf17c..5f9b18ebf803 100644 --- a/build.rs +++ b/build.rs @@ -210,7 +210,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "issue4807", "issue_3327_bnot_lowering", "load_splat_out_of_bounds", - "replace_lane_preserve", "simd_align", "simd_bit_shift", "simd_bitwise", @@ -246,10 +245,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_int_to_int_extend", "simd_lane", "simd_load", - "simd_load16_lane", - "simd_load32_lane", - "simd_load64_lane", - "simd_load8_lane", "simd_load_extend", "simd_load_zero", "simd_splat", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 9a4f99b8e36c..6b639ad311b9 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -328,6 +328,7 @@ (vd WritableReg) (vs2 Reg) (vs1 Reg) + (mask VecOpMasking) (vstate VState)) (VecAluRRImm5 @@ -335,18 +336,21 @@ (vd WritableReg) (vs2 Reg) (imm Imm5) + (mask VecOpMasking) (vstate VState)) (VecAluRR (op VecAluOpRR) (vd WritableReg) (vs Reg) + (mask VecOpMasking) (vstate VState)) (VecAluRImm5 (op VecAluOpRImm5) (vd WritableReg) (imm Imm5) + (mask VecOpMasking) (vstate VState)) (VecSetState @@ -358,6 +362,7 @@ (to WritableReg) (from VecAMode) (flags MemFlags) + (mask VecOpMasking) (vstate VState)) (VecStore @@ -365,6 +370,7 @@ (to VecAMode) (from Reg) (flags MemFlags) + (mask VecOpMasking) (vstate VState)) )) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index d4c03c1ea053..2f9d9bd486d2 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -5,7 +5,6 @@ use crate::ir::RelSourceLoc; use crate::ir::TrapCode; use crate::isa::riscv64::inst::*; use crate::isa::riscv64::inst::{zero_reg, AluOPRRR}; -use crate::isa::riscv64::lower::isle::generated_code::VecOpMasking; use crate::machinst::{AllocationConsumer, Reg, Writable}; use cranelift_control::ControlPlane; use regalloc2::Allocation; @@ -2804,32 +2803,58 @@ impl MachInstEmit for Inst { sink.bind_label(label_done, &mut state.ctrl_plane); } &Inst::VecAluRRR { - op, vd, vs1, vs2, .. + op, + vd, + vs1, + vs2, + ref mask, + .. } => { let vs1 = allocs.next(vs1); let vs2 = allocs.next(vs2); let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu(op, vd, vs1, vs2, VecOpMasking::Disabled)); + sink.put4(encode_valu(op, vd, vs1, vs2, mask)); } &Inst::VecAluRRImm5 { - op, vd, imm, vs2, .. + op, + vd, + imm, + vs2, + ref mask, + .. } => { let vs2 = allocs.next(vs2); let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu_imm(op, vd, imm, vs2, VecOpMasking::Disabled)); + sink.put4(encode_valu_imm(op, vd, imm, vs2, mask)); } - &Inst::VecAluRR { op, vd, vs, .. } => { + &Inst::VecAluRR { + op, + vd, + vs, + ref mask, + .. + } => { let vs = allocs.next(vs); let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu_rr(op, vd, vs, VecOpMasking::Disabled)); + sink.put4(encode_valu_rr(op, vd, vs, mask)); } - &Inst::VecAluRImm5 { op, vd, imm, .. } => { + &Inst::VecAluRImm5 { + op, + vd, + imm, + ref mask, + .. 
+ } => { let vd = allocs.next_writable(vd); + let mask = mask.with_allocs(&mut allocs); - sink.put4(encode_valu_r_imm(op, vd, imm, VecOpMasking::Disabled)); + sink.put4(encode_valu_r_imm(op, vd, imm, mask)); } &Inst::VecSetState { rd, ref vstate } => { let rd = allocs.next_writable(rd); @@ -2849,11 +2874,13 @@ impl MachInstEmit for Inst { eew, to, ref from, + ref mask, flags, .. } => { let from = from.clone().with_allocs(&mut allocs); let to = allocs.next_writable(to); + let mask = mask.with_allocs(&mut allocs); // Vector Loads don't support immediate offsets, so we need to load it into a register. let addr = match from { @@ -2889,8 +2916,7 @@ impl MachInstEmit for Inst { eew, addr, from.lumop(), - // We don't implement masking yet. - VecOpMasking::Disabled, + mask, from.mop(), from.nf(), )); @@ -2900,11 +2926,13 @@ impl MachInstEmit for Inst { eew, ref to, from, + ref mask, flags, .. } => { let to = to.clone().with_allocs(&mut allocs); let from = allocs.next(from); + let mask = mask.with_allocs(&mut allocs); // Vector Stores don't support immediate offsets, so we need to load it into a register. let addr = match to { @@ -2940,8 +2968,7 @@ impl MachInstEmit for Inst { eew, addr, to.sumop(), - // We don't implement masking yet. - VecOpMasking::Disabled, + mask, to.mop(), to.nf(), )); diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 6324484eebbb..659dcb0fa0cf 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -4,7 +4,7 @@ #![allow(dead_code)] #![allow(non_camel_case_types)] -use super::lower::isle::generated_code::{VecAMode, VecElementWidth}; +use super::lower::isle::generated_code::{VecAMode, VecElementWidth, VecOpMasking}; use crate::binemit::{Addend, CodeOffset, Reloc}; pub use crate::ir::condcodes::IntCC; use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64}; @@ -301,6 +301,7 @@ impl Inst { to: into_reg, from: VecAMode::UnitStride { base: mem }, flags, + mask: VecOpMasking::Disabled, vstate: VState::from_type(ty), } } else { @@ -321,6 +322,7 @@ impl Inst { to: VecAMode::UnitStride { base: mem }, from: from_reg, flags, + mask: VecOpMasking::Disabled, vstate: VState::from_type(ty), } } else { @@ -335,6 +337,19 @@ impl Inst { } //============================================================================= + +fn vec_mask_operands VReg>( + mask: &VecOpMasking, + collector: &mut OperandCollector<'_, F>, +) { + match mask { + VecOpMasking::Enabled { reg } => { + collector.reg_fixed_use(*reg, pv_reg(0).into()); + } + VecOpMasking::Disabled => {} + } +} + fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) { match inst { &Inst::Nop0 => {} @@ -625,7 +640,12 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan // no need let reg alloc know. } &Inst::VecAluRRR { - op, vd, vs1, vs2, .. + op, + vd, + vs1, + vs2, + ref mask, + .. } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); debug_assert_eq!(vs2.class(), RegClass::Vector); @@ -634,40 +654,64 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(vs1); collector.reg_use(vs2); collector.reg_def(vd); + vec_mask_operands(mask, collector); } - &Inst::VecAluRRImm5 { vd, vs2, .. } => { + &Inst::VecAluRRImm5 { + vd, vs2, ref mask, .. 
+ } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); debug_assert_eq!(vs2.class(), RegClass::Vector); collector.reg_use(vs2); collector.reg_def(vd); + vec_mask_operands(mask, collector); } - &Inst::VecAluRR { op, vd, vs, .. } => { + &Inst::VecAluRR { + op, + vd, + vs, + ref mask, + .. + } => { debug_assert_eq!(vd.to_reg().class(), op.dst_regclass()); debug_assert_eq!(vs.class(), op.src_regclass()); collector.reg_use(vs); collector.reg_def(vd); + vec_mask_operands(mask, collector); } - &Inst::VecAluRImm5 { vd, .. } => { + &Inst::VecAluRImm5 { vd, ref mask, .. } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); collector.reg_def(vd); + vec_mask_operands(mask, collector); } &Inst::VecSetState { rd, .. } => { collector.reg_def(rd); } - &Inst::VecLoad { to, ref from, .. } => { + &Inst::VecLoad { + to, + ref from, + ref mask, + .. + } => { if let Some(r) = from.get_allocatable_register() { collector.reg_use(r); } collector.reg_def(to); + vec_mask_operands(mask, collector); } - &Inst::VecStore { ref to, from, .. } => { + &Inst::VecStore { + ref to, + from, + ref mask, + .. + } => { if let Some(r) = to.get_allocatable_register() { collector.reg_use(r); } collector.reg_use(from); + vec_mask_operands(mask, collector); } } } @@ -876,6 +920,13 @@ impl Inst { } }; + let format_mask = |mask: &VecOpMasking, allocs: &mut AllocationConsumer<'_>| -> String { + match mask { + VecOpMasking::Enabled { reg } => format!(",{}.t", format_reg(*reg, allocs)), + VecOpMasking::Disabled => format!(""), + } + }; + let format_regs = |regs: &[Reg], allocs: &mut AllocationConsumer<'_>| -> String { let mut x = if regs.len() > 1 { String::from("[") @@ -1572,22 +1623,24 @@ impl Inst { vd, vs1, vs2, + ref mask, ref vstate, } => { let vs1_s = format_reg(vs1, allocs); let vs2_s = format_reg(vs2, allocs); let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); // Note: vs2 and vs1 here are opposite to the standard scalar ordering. // This is noted in Section 10.1 of the RISC-V Vector spec. match (op, vs2, vs1) { (VecAluOpRRR::VrsubVX, _, vs1) if vs1 == zero_reg() => { - format!("vneg.v {},{} {}", vd_s, vs2_s, vstate) + format!("vneg.v {vd_s},{vs2_s}{mask} {vstate}") } (VecAluOpRRR::VfsgnjnVV, vs2, vs1) if vs2 == vs1 => { - format!("vfneg.v {},{} {}", vd_s, vs2_s, vstate) + format!("vfneg.v {vd_s},{vs2_s}{mask} {vstate}") } - _ => format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate), + _ => format!("{op} {vd_s},{vs2_s},{vs1_s}{mask} {vstate}"), } } &Inst::VecAluRRImm5 { @@ -1595,10 +1648,12 @@ impl Inst { vd, imm, vs2, + ref mask, ref vstate, } => { let vs2_s = format_reg(vs2, allocs); let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); // Some opcodes interpret the immediate as unsigned, lets show the // correct number here. 
@@ -1608,28 +1663,32 @@ impl Inst { format!("{}", imm) }; - format!("{} {},{},{} {}", op, vd_s, vs2_s, imm_s, vstate) + format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}") } &Inst::VecAluRR { op, vd, vs, + ref mask, ref vstate, } => { let vs_s = format_reg(vs, allocs); let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); - format!("{} {},{} {}", op, vd_s, vs_s, vstate) + format!("{op} {vd_s},{vs_s}{mask} {vstate}") } &Inst::VecAluRImm5 { op, vd, imm, + ref mask, ref vstate, } => { let vd_s = format_reg(vd.to_reg(), allocs); + let mask = format_mask(mask, allocs); - format!("{} {},{} {}", op, vd_s, imm, vstate) + format!("{op} {vd_s},{imm}{mask} {vstate}") } &Inst::VecSetState { rd, ref vstate } => { let rd_s = format_reg(rd.to_reg(), allocs); @@ -1640,23 +1699,29 @@ impl Inst { eew, to, from, + ref mask, ref vstate, .. } => { let base = format_vec_amode(from, allocs); let vd = format_reg(to.to_reg(), allocs); - format!("vl{}.v {},{} {}", eew, vd, base, vstate) + let mask = format_mask(mask, allocs); + + format!("vl{eew}.v {vd},{base}{mask} {vstate}") } Inst::VecStore { eew, to, from, + ref mask, ref vstate, .. } => { let dst = format_vec_amode(to, allocs); let vs3 = format_reg(*from, allocs); - format!("vs{}.v {},{} {}", eew, vs3, dst, vstate) + let mask = format_mask(mask, allocs); + + format!("vs{eew}.v {vs3},{dst}{mask} {vstate}") } } } diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 8d21ef9ebe57..48d5192efaaf 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -234,10 +234,19 @@ impl VecOpCategory { impl VecOpMasking { pub fn encode(&self) -> u32 { match self { - VecOpMasking::Enabled => 0, + VecOpMasking::Enabled { .. 
} => 0, VecOpMasking::Disabled => 1, } } + + pub(crate) fn with_allocs(&self, allocs: &mut AllocationConsumer<'_>) -> Self { + match self { + VecOpMasking::Enabled { reg } => VecOpMasking::Enabled { + reg: allocs.next(*reg), + }, + VecOpMasking::Disabled => VecOpMasking::Disabled, + } + } } impl VecAluOpRRR { @@ -268,6 +277,7 @@ impl VecAluOpRRR { VecAluOpRRR::VxorVV => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, + VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 0b010111, VecAluOpRRR::VfdivVV | VecAluOpRRR::VfdivVF => 0b100000, VecAluOpRRR::VfrdivVF => 0b100001, VecAluOpRRR::VfsgnjnVV => 0b001001, @@ -280,14 +290,16 @@ impl VecAluOpRRR { | VecAluOpRRR::VsubVV | VecAluOpRRR::VandVV | VecAluOpRRR::VorVV - | VecAluOpRRR::VxorVV => VecOpCategory::OPIVV, + | VecAluOpRRR::VxorVV + | VecAluOpRRR::VmergeVVM => VecOpCategory::OPIVV, VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => { VecOpCategory::OPMVV } VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX - | VecAluOpRRR::VslidedownVX => VecOpCategory::OPIVX, + | VecAluOpRRR::VslidedownVX + | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX, VecAluOpRRR::VfaddVV | VecAluOpRRR::VfsubVV | VecAluOpRRR::VfmulVV @@ -298,7 +310,8 @@ impl VecAluOpRRR { | VecAluOpRRR::VfrsubVF | VecAluOpRRR::VfmulVF | VecAluOpRRR::VfdivVF - | VecAluOpRRR::VfrdivVF => VecOpCategory::OPFVF, + | VecAluOpRRR::VfrdivVF + | VecAluOpRRR::VfmergeVFM => VecOpCategory::OPFVF, } } @@ -315,10 +328,15 @@ impl VecAluOpRRR { impl fmt::Display for VecAluOpRRR { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let suffix_length = match self { + VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 3, + _ => 2, + }; + let mut s = format!("{self:?}"); s.make_ascii_lowercase(); - let (opcode, category) = s.split_at(s.len() - 2); - f.write_str(&format!("{}.{}", opcode, category)) + let (opcode, category) = s.split_at(s.len() - suffix_length); + f.write_str(&format!("{opcode}.{category}")) } } @@ -337,31 +355,38 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, VecAluOpRRImm5::VslidedownVI => 0b001111, + VecAluOpRRImm5::VmergeVIM => 0b010111, } } pub fn category(&self) -> VecOpCategory { match self { - VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VslidedownVI => { - VecOpCategory::OPIVI - } + VecAluOpRRImm5::VaddVI + | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VslidedownVI + | VecAluOpRRImm5::VmergeVIM => VecOpCategory::OPIVI, } } pub fn imm_is_unsigned(&self) -> bool { match self { VecAluOpRRImm5::VslidedownVI => true, - VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI => false, + VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VmergeVIM => false, } } } impl fmt::Display for VecAluOpRRImm5 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let suffix_length = match self { + VecAluOpRRImm5::VmergeVIM => 3, + _ => 2, + }; + let mut s = format!("{self:?}"); s.make_ascii_lowercase(); - let (opcode, category) = s.split_at(s.len() - 2); - f.write_str(&format!("{}.{}", opcode, category)) + let (opcode, category) = s.split_at(s.len() - suffix_length); + f.write_str(&format!("{opcode}.{category}")) } } @@ -421,12 +446,12 @@ impl VecAluOpRR { /// other way around. As far as I can tell only vmv.v.* are backwards. 
pub fn vs_is_vs2_encoded(&self) -> bool { match self { + VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => true, VecAluOpRR::VmvSX - | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF - | VecAluOpRR::VfmvFS - | VecAluOpRR::VfsqrtV => true, - VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => false, + | VecAluOpRR::VmvVV + | VecAluOpRR::VmvVX + | VecAluOpRR::VfmvVF => false, } } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index dcd8c2bdeb86..64294d2276fa 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -77,10 +77,16 @@ ;; When masked, the instruction will only operate on the elements that are dictated by ;; the mask register. Currently this is always fixed to v0. (type VecOpMasking (enum - (Enabled) + (Enabled (reg Reg)) (Disabled) )) +(decl pure masked (Reg) VecOpMasking) +(rule (masked reg) (VecOpMasking.Enabled reg)) + +(decl pure unmasked () VecOpMasking) +(rule (unmasked) (VecOpMasking.Disabled)) + ;; Register to Register ALU Ops (type VecAluOpRRR (enum ;; Vector-Vector Opcodes @@ -97,6 +103,7 @@ (VfmulVV) (VfdivVV) (VfsgnjnVV) + (VmergeVVM) ;; Vector-Scalar Opcodes (VaddVX) @@ -109,6 +116,8 @@ (VfmulVF) (VfdivVF) (VfrdivVF) + (VmergeVXM) + (VfmergeVFM) )) ;; Register-Imm ALU Ops @@ -117,6 +126,7 @@ (VaddVI) (VrsubVI) (VslidedownVI) + (VmergeVIM) )) ;; Imm only ALU Ops @@ -202,232 +212,289 @@ ;; See Section 10.1 of the RISC-V Vector Extension Specification. ;; Helper for emitting `MInst.VecAluRRR` instructions. -(decl vec_alu_rrr (VecAluOpRRR Reg Reg VState) Reg) -(rule (vec_alu_rrr op vs2 vs1 vstate) +(decl vec_alu_rrr (VecAluOpRRR Reg Reg VecOpMasking VState) Reg) +(rule (vec_alu_rrr op vs2 vs1 mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 vstate)))) + (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 mask vstate)))) vd)) ;; Helper for emitting `MInst.VecAluRRImm5` instructions. -(decl vec_alu_rr_imm5 (VecAluOpRRImm5 Reg Imm5 VState) Reg) -(rule (vec_alu_rr_imm5 op vs2 imm vstate) +(decl vec_alu_rr_imm5 (VecAluOpRRImm5 Reg Imm5 VecOpMasking VState) Reg) +(rule (vec_alu_rr_imm5 op vs2 imm mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecAluRRImm5 op vd vs2 imm vstate)))) + (_ Unit (emit (MInst.VecAluRRImm5 op vd vs2 imm mask vstate)))) vd)) ;; Helper for emitting `MInst.VecAluRRImm5` instructions where the immediate ;; is zero extended instead of sign extended. -(decl vec_alu_rr_uimm5 (VecAluOpRRImm5 Reg UImm5 VState) Reg) -(rule (vec_alu_rr_uimm5 op vs2 imm vstate) - (vec_alu_rr_imm5 op vs2 (uimm5_bitcast_to_imm5 imm) vstate)) +(decl vec_alu_rr_uimm5 (VecAluOpRRImm5 Reg UImm5 VecOpMasking VState) Reg) +(rule (vec_alu_rr_uimm5 op vs2 imm mask vstate) + (vec_alu_rr_imm5 op vs2 (uimm5_bitcast_to_imm5 imm) mask vstate)) ;; Helper for emitting `MInst.VecAluRRImm5` instructions that use the Imm5 as ;; auxiliary encoding space. -(decl vec_alu_rr (VecAluOpRR Reg VState) Reg) -(rule (vec_alu_rr op vs vstate) +(decl vec_alu_rr (VecAluOpRR Reg VecOpMasking VState) Reg) +(rule (vec_alu_rr op vs mask vstate) (let ((vd WritableReg (temp_writable_reg (vec_alu_rr_dst_type op))) - (_ Unit (emit (MInst.VecAluRR op vd vs vstate)))) + (_ Unit (emit (MInst.VecAluRR op vd vs mask vstate)))) vd)) ;; Helper for emitting `MInst.VecAluRImm5` instructions. 
-(decl vec_alu_r_imm5 (VecAluOpRImm5 Imm5 VState) Reg) -(rule (vec_alu_r_imm5 op imm vstate) +(decl vec_alu_r_imm5 (VecAluOpRImm5 Imm5 VecOpMasking VState) Reg) +(rule (vec_alu_r_imm5 op imm mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecAluRImm5 op vd imm vstate)))) + (_ Unit (emit (MInst.VecAluRImm5 op vd imm mask vstate)))) vd)) ;; Helper for emitting `MInst.VecLoad` instructions. -(decl vec_load (VecElementWidth VecAMode MemFlags VState) Reg) -(rule (vec_load eew from flags vstate) +(decl vec_load (VecElementWidth VecAMode MemFlags VecOpMasking VState) Reg) +(rule (vec_load eew from flags mask vstate) (let ((vd WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecLoad eew vd from flags vstate)))) + (_ Unit (emit (MInst.VecLoad eew vd from flags mask vstate)))) vd)) ;; Helper for emitting `MInst.VecStore` instructions. -(decl vec_store (VecElementWidth VecAMode Reg MemFlags VState) InstOutput) -(rule (vec_store eew to from flags vstate) +(decl vec_store (VecElementWidth VecAMode Reg MemFlags VecOpMasking VState) InstOutput) +(rule (vec_store eew to from flags mask vstate) (side_effect - (SideEffectNoResult.Inst (MInst.VecStore eew to from flags vstate)))) + (SideEffectNoResult.Inst (MInst.VecStore eew to from flags mask vstate)))) ;; Helper for emitting the `vadd.vv` instruction. -(decl rv_vadd_vv (Reg Reg VState) Reg) -(rule (rv_vadd_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 vstate)) +(decl rv_vadd_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vadd.vx` instruction. -(decl rv_vadd_vx (Reg Reg VState) Reg) -(rule (rv_vadd_vx vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 vstate)) +(decl rv_vadd_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vadd_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vadd.vi` instruction. -(decl rv_vadd_vi (Reg Imm5 VState) Reg) -(rule (rv_vadd_vi vs2 imm vstate) - (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm vstate)) +(decl rv_vadd_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vadd_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm mask vstate)) ;; Helper for emitting the `vsub.vv` instruction. -(decl rv_vsub_vv (Reg Reg VState) Reg) -(rule (rv_vsub_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 vstate)) +(decl rv_vsub_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vsub.vx` instruction. -(decl rv_vsub_vx (Reg Reg VState) Reg) -(rule (rv_vsub_vx vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 vstate)) +(decl rv_vsub_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vrsub.vx` instruction. -(decl rv_vrsub_vx (Reg Reg VState) Reg) -(rule (rv_vrsub_vx vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 vstate)) +(decl rv_vrsub_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vrsub_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vneg.v` pseudo-instruction. 
-(decl rv_vneg_v (Reg VState) Reg) -(rule (rv_vneg_v vs2 vstate) - (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) vstate)) +(decl rv_vneg_v (Reg VecOpMasking VState) Reg) +(rule (rv_vneg_v vs2 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) mask vstate)) ;; Helper for emitting the `vrsub.vi` instruction. -(decl rv_vrsub_vi (Reg Imm5 VState) Reg) -(rule (rv_vrsub_vi vs2 imm vstate) - (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm vstate)) +(decl rv_vrsub_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vrsub_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm mask vstate)) ;; Helper for emitting the `vmul.vv` instruction. -(decl rv_vmul_vv (Reg Reg VState) Reg) -(rule (rv_vmul_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 vstate)) +(decl rv_vmul_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vmulh.vv` instruction. -(decl rv_vmulh_vv (Reg Reg VState) Reg) -(rule (rv_vmulh_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 vstate)) +(decl rv_vmulh_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vmulh_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vmulhu.vv` instruction. -(decl rv_vmulhu_vv (Reg Reg VState) Reg) -(rule (rv_vmulhu_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 vstate)) +(decl rv_vmulhu_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vmulhu_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vand.vv` instruction. -(decl rv_vand_vv (Reg Reg VState) Reg) -(rule (rv_vand_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 vstate)) +(decl rv_vand_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vand_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vor.vv` instruction. -(decl rv_vor_vv (Reg Reg VState) Reg) -(rule (rv_vor_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 vstate)) +(decl rv_vor_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vor_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vxor.vv` instruction. -(decl rv_vxor_vv (Reg Reg VState) Reg) -(rule (rv_vxor_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 vstate)) +(decl rv_vxor_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vxor_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfadd.vv` instruction. -(decl rv_vfadd_vv (Reg Reg VState) Reg) -(rule (rv_vfadd_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfaddVV) vs2 vs1 vstate)) +(decl rv_vfadd_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfadd_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfadd.vf` instruction. -(decl rv_vfadd_vf (Reg Reg VState) Reg) -(rule (rv_vfadd_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfaddVF) vs2 vs1 vstate)) +(decl rv_vfadd_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfadd_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfsub.vv` instruction. 
-(decl rv_vfsub_vv (Reg Reg VState) Reg) -(rule (rv_vfsub_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfsubVV) vs2 vs1 vstate)) +(decl rv_vfsub_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfsub_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfsub.vf` instruction. -(decl rv_vfsub_vf (Reg Reg VState) Reg) -(rule (rv_vfsub_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfsubVF) vs2 vs1 vstate)) +(decl rv_vfsub_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfsub_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfrsub.vf` instruction. -(decl rv_vfrsub_vf (Reg Reg VState) Reg) -(rule (rv_vfrsub_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfrsubVF) vs2 vs1 vstate)) +(decl rv_vfrsub_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfrsub_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfrsubVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfmul.vv` instruction. -(decl rv_vfmul_vv (Reg Reg VState) Reg) -(rule (rv_vfmul_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfmulVV) vs2 vs1 vstate)) +(decl rv_vfmul_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmulVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfmul.vf` instruction. -(decl rv_vfmul_vf (Reg Reg VState) Reg) -(rule (rv_vfmul_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfmulVF) vs2 vs1 vstate)) +(decl rv_vfmul_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfmul_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmulVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfdiv.vv` instruction. -(decl rv_vfdiv_vv (Reg Reg VState) Reg) -(rule (rv_vfdiv_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfdivVV) vs2 vs1 vstate)) +(decl rv_vfdiv_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfdiv_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfdivVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfdiv.vf` instruction. -(decl rv_vfdiv_vf (Reg Reg VState) Reg) -(rule (rv_vfdiv_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfdivVF) vs2 vs1 vstate)) +(decl rv_vfdiv_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfdiv_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfdivVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfrdiv.vf` instruction. -(decl rv_vfrdiv_vf (Reg Reg VState) Reg) -(rule (rv_vfrdiv_vf vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfrdivVF) vs2 vs1 vstate)) +(decl rv_vfrdiv_vf (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfrdiv_vf vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfrdivVF) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfsgnjn.vv` ("Floating Point Sign Injection Negated") instruction. ;; The output of this instruction is `vs2` with the negated sign bit from `vs1` -(decl rv_vfsgnjn_vv (Reg Reg VState) Reg) -(rule (rv_vfsgnjn_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.VfsgnjnVV) vs2 vs1 vstate)) +(decl rv_vfsgnjn_vv (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vfsgnjn_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfsgnjnVV) vs2 vs1 mask vstate)) ;; Helper for emitting the `vfneg.v` instruction. ;; This instruction is a mnemonic for `vfsgnjn.vv vd, vs, vs` -(decl rv_vfneg_v (Reg VState) Reg) -(rule (rv_vfneg_v vs vstate) (rv_vfsgnjn_vv vs vs vstate)) +(decl rv_vfneg_v (Reg VecOpMasking VState) Reg) +(rule (rv_vfneg_v vs mask vstate) (rv_vfsgnjn_vv vs vs mask vstate)) ;; Helper for emitting the `vfsqrt.v` instruction. 
 ;; This instruction splats the F regsiter into all elements of the destination vector.
-(decl rv_vfsqrt_v (Reg VState) Reg)
-(rule (rv_vfsqrt_v vs vstate)
-  (vec_alu_rr (VecAluOpRR.VfsqrtV) vs vstate))
+(decl rv_vfsqrt_v (Reg VecOpMasking VState) Reg)
+(rule (rv_vfsqrt_v vs mask vstate)
+  (vec_alu_rr (VecAluOpRR.VfsqrtV) vs mask vstate))
 
 ;; Helper for emitting the `vslidedown.vx` instruction.
 ;; `vslidedown` moves all elements in the vector down by n elements.
 ;; The top most elements are up to the tail policy.
-(decl rv_vslidedown_vx (Reg Reg VState) Reg)
-(rule (rv_vslidedown_vx vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.VslidedownVX) vs2 vs1 vstate))
+(decl rv_vslidedown_vx (Reg Reg VecOpMasking VState) Reg)
+(rule (rv_vslidedown_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VslidedownVX) vs2 vs1 mask vstate))
 
 ;; Helper for emitting the `vslidedown.vi` instruction.
 ;; Unlike other `vi` instructions the immediate is zero extended.
-(decl rv_vslidedown_vi (Reg UImm5 VState) Reg)
-(rule (rv_vslidedown_vi vs2 imm vstate)
-  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm vstate))
+(decl rv_vslidedown_vi (Reg UImm5 VecOpMasking VState) Reg)
+(rule (rv_vslidedown_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm mask vstate))
 
 ;; Helper for emitting the `vmv.x.s` instruction.
 ;; This instruction copies the first element of the source vector to the destination X register.
+;; Masked versions of this instruction are not supported.
 (decl rv_vmv_xs (Reg VState) Reg)
 (rule (rv_vmv_xs vs vstate)
-  (vec_alu_rr (VecAluOpRR.VmvXS) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VmvXS) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vfmv.f.s` instruction.
 ;; This instruction copies the first element of the source vector to the destination F register.
+;; Masked versions of this instruction are not supported.
 (decl rv_vfmv_fs (Reg VState) Reg)
 (rule (rv_vfmv_fs vs vstate)
-  (vec_alu_rr (VecAluOpRR.VfmvFS) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VfmvFS) vs (unmasked) vstate))
+
+;; Helper for emitting the `vmv.s.x` instruction.
+;; This instruction copies the source X register into the first element of the destination vector.
+;; Masked versions of this instruction are not supported.
+(decl rv_vmv_sx (Reg VState) Reg)
+(rule (rv_vmv_sx vs vstate)
+  (vec_alu_rr (VecAluOpRR.VmvSX) vs (unmasked) vstate))
+
+;; Helper for emitting the `vfmv.s.f` instruction.
+;; This instruction copies the source F register into the first element of the destination vector.
+;; Masked versions of this instruction are not supported.
+(decl rv_vfmv_sf (Reg VState) Reg)
+(rule (rv_vfmv_sf vs vstate)
+  (vec_alu_rr (VecAluOpRR.VfmvSF) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vmv.v.x` instruction.
 ;; This instruction splats the X regsiter into all elements of the destination vector.
+;; Masked versions of this instruction are called `vmerge`
 (decl rv_vmv_vx (Reg VState) Reg)
 (rule (rv_vmv_vx vs vstate)
-  (vec_alu_rr (VecAluOpRR.VmvVX) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VmvVX) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vfmv.v.f` instruction.
 ;; This instruction splats the F regsiter into all elements of the destination vector.
+;; Masked versions of this instruction are called `vmerge`
 (decl rv_vfmv_vf (Reg VState) Reg)
 (rule (rv_vfmv_vf vs vstate)
-  (vec_alu_rr (VecAluOpRR.VfmvVF) vs vstate))
+  (vec_alu_rr (VecAluOpRR.VfmvVF) vs (unmasked) vstate))
 
 ;; Helper for emitting the `vmv.v.i` instruction.
 ;; This instruction splat's the immediate value into all elements of the destination vector.
+;; Masked versions of this instruction are called `vmerge` (decl rv_vmv_vi (Imm5 VState) Reg) (rule (rv_vmv_vi imm vstate) - (vec_alu_r_imm5 (VecAluOpRImm5.VmvVI) imm vstate)) + (vec_alu_r_imm5 (VecAluOpRImm5.VmvVI) imm (unmasked) vstate)) + +;; Helper for emitting the `vmerge.vvm` instruction. +;; This instruction merges the elements of the two source vectors into the destination vector +;; based on a mask. Elements are taken from the first source vector if the mask bit is clear, +;; and from the second source vector if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? vs1[i] : vs2[i] +(decl rv_vmerge_vvm (Reg Reg Reg VState) Reg) +(rule (rv_vmerge_vvm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmergeVVM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vmerge.vxm` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the X +;; register if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? x[rs1] : vs2[i] +(decl rv_vmerge_vxm (Reg Reg Reg VState) Reg) +(rule (rv_vmerge_vxm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmergeVXM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vfmerge.vfm` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the F +;; register if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? f[rs1] : vs2[i] +(decl rv_vfmerge_vfm (Reg Reg Reg VState) Reg) +(rule (rv_vfmerge_vfm vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VfmergeVFM) vs2 vs1 (masked mask) vstate)) + +;; Helper for emitting the `vmerge.vim` instruction. +;; Elements are taken from the first source vector if the mask bit is clear, and from the +;; immediate value if the mask bit is set. This instruction is always masked. +;; +;; vd[i] = v0.mask[i] ? imm : vs2[i] +(decl rv_vmerge_vim (Reg Imm5 Reg VState) Reg) +(rule (rv_vmerge_vim vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VmergeVIM) vs2 imm (masked mask) vstate)) + ;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -447,8 +514,20 @@ ;; in index 0, and then use the appropriate `vmv` instruction. ;; If the index fits into a 5-bit immediate, we can emit a `vslidedown.vi`. (rule 1 (gen_extractlane (ty_vec_fits_in_register ty) src (uimm5_from_u8 idx)) - (gen_extractlane ty (rv_vslidedown_vi src idx ty) 0)) + (gen_extractlane ty (rv_vslidedown_vi src idx (unmasked) ty) 0)) ;; Otherwise lower it into an X register. (rule 0 (gen_extractlane (ty_vec_fits_in_register ty) src idx) - (gen_extractlane ty (rv_vslidedown_vx src (imm $I64 idx) ty) 0)) + (gen_extractlane ty (rv_vslidedown_vx src (imm $I64 idx) (unmasked) ty) 0)) + + +;; Build a vector mask from a u64 +;; TODO: We should merge this with the `vconst` rules, and take advantage of +;; the other existing `vconst` rules. One example is using `vmv.v.i` which +;; can represent some of these masks. +(decl gen_vec_mask (u64) Reg) + +;; Materialize the mask into an X register, and move it into the bottom of +;; the vector register. 
+(rule (gen_vec_mask mask) + (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) \ No newline at end of file diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index d28e30344d60..ef72b3568dd1 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -16,6 +16,7 @@ (element_width_from_type ty) (VecAMode.UnitStride (gen_const_amode (const_to_vconst n))) (mem_flags_trusted) + (unmasked) ty)) ;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -108,19 +109,19 @@ ;; SIMD Vectors (rule 8 (lower (has_type (ty_vec_fits_in_register ty) (iadd x y))) - (rv_vadd_vv x y ty)) + (rv_vadd_vv x y (unmasked) ty)) (rule 9 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (splat y)))) - (rv_vadd_vx x y ty)) + (rv_vadd_vx x y (unmasked) ty)) (rule 10 (lower (has_type (ty_vec_fits_in_register ty) (iadd (splat x) y))) - (rv_vadd_vx y x ty)) + (rv_vadd_vx y x (unmasked) ty)) (rule 11 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (replicated_imm5 y)))) - (rv_vadd_vi x y ty)) + (rv_vadd_vi x y (unmasked) ty)) (rule 12 (lower (has_type (ty_vec_fits_in_register ty) (iadd (replicated_imm5 x) y))) - (rv_vadd_vi y x ty)) + (rv_vadd_vi y x (unmasked) ty)) ;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;; (rule @@ -144,16 +145,16 @@ ;; SIMD Vectors (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (isub x y))) - (rv_vsub_vv x y ty)) + (rv_vsub_vv x y (unmasked) ty)) (rule 4 (lower (has_type (ty_vec_fits_in_register ty) (isub x (splat y)))) - (rv_vsub_vx x y ty)) + (rv_vsub_vx x y (unmasked) ty)) (rule 5 (lower (has_type (ty_vec_fits_in_register ty) (isub (splat x) y))) - (rv_vrsub_vx y x ty)) + (rv_vrsub_vx y x (unmasked) ty)) (rule 6 (lower (has_type (ty_vec_fits_in_register ty) (isub (replicated_imm5 x) y))) - (rv_vrsub_vi y x ty)) + (rv_vrsub_vi y x (unmasked) ty)) ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -162,7 +163,7 @@ (neg ty val)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (ineg x))) - (rv_vneg_v x ty)) + (rv_vneg_v x (unmasked) ty)) ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -201,21 +202,21 @@ (value_regs dst_lo dst_hi))) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (imul x y))) - (rv_vmul_vv x y ty)) + (rv_vmul_vv x y (unmasked) ty)) ;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y))) (lower_smlhi ty (ext_int_if_need $true x ty) (ext_int_if_need $true y ty))) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x y))) - (rv_vmulh_vv x y ty)) + (rv_vmulh_vv x y (unmasked) ty)) ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (umulhi x y))) (lower_umlhi ty (ext_int_if_need $false x ty) (ext_int_if_need $false y ty))) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x y))) - (rv_vmulhu_vv x y ty)) + (rv_vmulhu_vv x y (unmasked) ty)) ;;;; Rules for `div` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -325,7 +326,7 @@ (rule 7 (lower (has_type (ty_vec_fits_in_register ty) (band x y))) - (rv_vand_vv x y ty)) + (rv_vand_vv x y (unmasked) ty)) ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -372,7 +373,7 @@ (value_regs low high))) (rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bor x y))) - (rv_vor_vv x y 
ty)) + (rv_vor_vv x y (unmasked) ty)) ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) @@ -395,7 +396,7 @@ (lower_float_binary (AluOPRRR.Xor) x y $F64)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (bxor x y))) - (rv_vxor_vv x y ty)) + (rv_vxor_vv x y (unmasked) ty)) ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (bnot x))) @@ -588,7 +589,7 @@ (rv_fneg ty x)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fneg x))) - (rv_vfneg_v x ty)) + (rv_vfneg_v x (unmasked) ty)) ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (fcopysign x y))) @@ -604,7 +605,7 @@ (rv_fsqrt ty x)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqrt x))) - (rv_vfsqrt_v x ty)) + (rv_vfsqrt_v x (unmasked) ty)) ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 @@ -719,13 +720,13 @@ (rv_fadd ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fadd x y))) - (rv_vfadd_vv x y ty)) + (rv_vfadd_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fadd x (splat y)))) - (rv_vfadd_vf x y ty)) + (rv_vfadd_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fadd (splat x) y))) - (rv_vfadd_vf y x ty)) + (rv_vfadd_vf y x (unmasked) ty)) ;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -733,26 +734,26 @@ (rv_fsub ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fsub x y))) - (rv_vfsub_vv x y ty)) + (rv_vfsub_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fsub x (splat y)))) - (rv_vfsub_vf x y ty)) + (rv_vfsub_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fsub (splat x) y))) - (rv_vfrsub_vf y x ty)) + (rv_vfrsub_vf y x (unmasked) ty)) ;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_scalar_float ty) (fmul x y))) (rv_fmul ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmul x y))) - (rv_vfmul_vv x y ty)) + (rv_vfmul_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fmul x (splat y)))) - (rv_vfmul_vf x y ty)) + (rv_vfmul_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fmul (splat x) y))) - (rv_vfmul_vf y x ty)) + (rv_vfmul_vf y x (unmasked) ty)) ;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -760,13 +761,13 @@ (rv_fdiv ty x y)) (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fdiv x y))) - (rv_vfdiv_vv x y ty)) + (rv_vfdiv_vv x y (unmasked) ty)) (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fdiv x (splat y)))) - (rv_vfdiv_vf x y ty)) + (rv_vfdiv_vf x y (unmasked) ty)) (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fdiv (splat x) y))) - (rv_vfrdiv_vf y x ty)) + (rv_vfrdiv_vf y x (unmasked) ty)) ;;;; Rules for `fmin/fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -915,7 +916,7 @@ (rule 2 (lower (has_type (ty_vec_fits_in_register ty) (load flags p @ (value_type (ty_addr64 _)) offset))) (let ((eew VecElementWidth (element_width_from_type ty))) - (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags ty))) + (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags (unmasked) ty))) ;;;;; Rules for `istore8`;;;;;;;;; (rule @@ -944,7 +945,7 @@ (rule 2 (lower (store flags x @ 
(value_type (ty_vec_fits_in_register ty)) p @ (value_type (ty_addr64 _)) offset)) (let ((eew VecElementWidth (element_width_from_type ty))) - (vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags ty))) + (vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags (unmasked) ty))) (decl gen_icmp (IntCC ValueRegs ValueRegs Type) Reg) (rule @@ -1088,6 +1089,32 @@ (rule (lower (extractlane x @ (value_type ty) (u8_from_uimm8 idx))) (gen_extractlane ty x idx)) +;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We can insert a lane by using a masked splat from an X register. +;; Build a mask that is only enabled in the lane we want to insert. +;; Then use a masked splat (vmerge) to insert the value. +(rule 0 (lower (insertlane vec @ (value_type (ty_vec_fits_in_register ty)) + val @ (value_type (ty_int _)) + (u8_from_uimm8 lane))) + (let ((mask Reg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vmerge_vxm vec val mask ty))) + +;; Similar to above, but using the float variants of the instructions. +(rule 1 (lower (insertlane vec @ (value_type (ty_vec_fits_in_register ty)) + val @ (value_type (ty_scalar_float _)) + (u8_from_uimm8 lane))) + (let ((mask Reg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vfmerge_vfm vec val mask ty))) + +;; If we are inserting from an Imm5 const we can use the immediate +;; variant of vmerge. +(rule 2 (lower (insertlane vec @ (value_type (ty_vec_fits_in_register ty)) + (iconst (u64_from_imm64 (imm5_from_u64 imm))) + (u8_from_uimm8 lane))) + (let ((mask Reg (gen_vec_mask (u64_shl 1 lane)))) + (rv_vmerge_vim vec imm mask ty))) + ;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type ty (splat n @ (value_type (ty_scalar_float _))))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif new file mode 100644 index 000000000000..5e4899512711 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif @@ -0,0 +1,530 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %insertlane_15(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane v0, v1, 15 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; lui a2,8 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; lui a2, 8 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_5(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane v0, v1, 5 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,32 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 
+; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 0x20 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_2(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane v0, v1, 2 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,4 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 4 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,1 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v9,v1,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_0_in_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,1 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_1_in_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; 
sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,2 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 2 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_3_in_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,1 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_1_in_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a2,2 +; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a2, zero, 2 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 10 + v2 = insertlane v0, v1, 15 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; lui a1,8 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,10,v0.t #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; lui a1, 8 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; 
.byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x34, 0x15, 0x5c +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_5(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -2 + v2 = insertlane v0, v1, 5 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a1,32 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,-2,v0.t #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a1, zero, 0x20 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x34, 0x1f, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_2(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = insertlane v0, v1, 2 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a1,4 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,15,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a1, zero, 4 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb4, 0x17, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %insertlane_const_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -9 + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; li a1,1 +; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v8,v1,-9,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi a1, zero, 1 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0xb4, 0x1b, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-insert-extract-lane.clif b/cranelift/filetests/filetests/runtests/simd-insert-extract-lane.clif new file mode 100644 index 000000000000..f2203889aa47 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-insert-extract-lane.clif @@ -0,0 +1,23 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 
+target x86_64 sse42 has_avx +target riscv64 has_v + +function %insertlane_preserves_upper_bits(f64) -> i64 fast { +block0(v5: f64): + v3 = vconst.i8x16 0x0000000000000001ffffffffffffffff + v6 = bitcast.f64x2 little v3 + v7 = insertlane v6, v5, 0 + v8 = bitcast.i64x2 little v7 + v9 = extractlane v8, 1 + return v9 +} +; run: %insertlane_preserves_upper_bits(0x0.0) == 1 +; run: %insertlane_preserves_upper_bits(0x9.0) == 1 +; run: %insertlane_preserves_upper_bits(+Inf) == 1 \ No newline at end of file diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif index 56ae6dedde2c..d69e7a08ebd6 100644 --- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %insertlane_i8x16_0(i8x16, i8) -> i8x16 { block0(v0: i8x16, v1: i8): @@ -166,3 +167,36 @@ block0(v0: f64x2, v1: f64): return v4 } ; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0] + + +function %insertlane_const_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 10 + v2 = insertlane v0, v1, 15 + return v2 +} +; run: %insertlane_const_15([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 10] + +function %insertlane_const_5(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -2 + v2 = insertlane v0, v1, 5 + return v2 +} +; run: %insertlane_const_5([1 1 1 1 1 1 1 1]) == [1 1 1 1 1 -2 1 1] + +function %insertlane_const_2(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = insertlane v0, v1, 2 + return v2 +} +; run: %insertlane_const_2([1 1 1 1]) == [1 1 15 1] + +function %insertlane_const_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -9 + v2 = insertlane v0, v1, 0 + return v2 +} +; run: %insertlane_const_0([1 1]) == [-9 1] \ No newline at end of file