diff --git a/cranelift/codegen/meta/src/cdsl/formats.rs b/cranelift/codegen/meta/src/cdsl/formats.rs index 876fb7702f3f..c627e0269bd6 100644 --- a/cranelift/codegen/meta/src/cdsl/formats.rs +++ b/cranelift/codegen/meta/src/cdsl/formats.rs @@ -116,6 +116,15 @@ impl InstructionFormatBuilder { self } + pub fn imm_with_name(mut self, name: &'static str, operand_kind: &OperandKind) -> Self { + let field = FormatField { + kind: operand_kind.clone(), + member: name, + }; + self.0.imm_fields.push(field); + self + } + pub fn typevar_operand(mut self, operand_index: usize) -> Self { assert!(self.0.typevar_operand.is_none()); assert!(operand_index < self.0.num_value_operands); diff --git a/cranelift/codegen/meta/src/shared/formats.rs b/cranelift/codegen/meta/src/shared/formats.rs index 057e03bd3910..8c54c1916911 100644 --- a/cranelift/codegen/meta/src/shared/formats.rs +++ b/cranelift/codegen/meta/src/shared/formats.rs @@ -202,7 +202,8 @@ impl Formats { heap_addr: Builder::new("HeapAddr") .imm(&entities.heap) .value() - .imm(&imm.uimm32) + .imm_with_name("offset", &imm.uimm32) + .imm_with_name("size", &imm.uimm8) .build(), // Accessing a WebAssembly table. diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs old mode 100644 new mode 100755 index c1b9f4e9fad4..4f74ac18c9d6 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -1128,26 +1128,30 @@ pub(crate) fn define( ); let H = &Operand::new("H", &entities.heap); - let p = &Operand::new("p", HeapOffset); - let Size = &Operand::new("Size", &imm.uimm32).with_doc("Size in bytes"); + let index = &Operand::new("index", HeapOffset); + let Offset = &Operand::new("Offset", &imm.uimm32).with_doc("Static offset immediate in bytes"); + let Size = &Operand::new("Size", &imm.uimm8).with_doc("Static size immediate in bytes"); ig.push( Inst::new( "heap_addr", r#" - Bounds check and compute absolute address of heap memory. + Bounds check and compute absolute address of ``index + Offset`` in heap memory. - Verify that the offset range ``p .. p + Size - 1`` is in bounds for the - heap H, and generate an absolute address that is safe to dereference. + Verify that the range ``index .. index + Offset + Size`` is in bounds for the + heap ``H``, and generate an absolute address that is safe to dereference. - 1. If ``p + Size`` is not greater than the heap bound, return an - absolute address corresponding to a byte offset of ``p`` from the + 1. If ``index + Offset + Size`` is less than or equal to the heap bound, return an + absolute address corresponding to a byte offset of ``index + Offset`` from the heap's base address. - 2. If ``p + Size`` is greater than the heap bound, generate a trap. + + 2. If ``index + Offset + Size`` is greater than the heap bound, return the + ``NULL`` pointer or any other address that is guaranteed to generate a trap + when accessed. 
"#, &formats.heap_addr, ) - .operands_in(vec![H, p, Size]) + .operands_in(vec![H, index, Offset, Size]) .operands_out(vec![addr]), ); diff --git a/cranelift/codegen/src/legalizer/heap.rs b/cranelift/codegen/src/legalizer/heap.rs index 34ef3b34def0..d51a6244eb04 100644 --- a/cranelift/codegen/src/legalizer/heap.rs +++ b/cranelift/codegen/src/legalizer/heap.rs @@ -6,7 +6,7 @@ use crate::cursor::{Cursor, FuncCursor}; use crate::flowgraph::ControlFlowGraph; use crate::ir::condcodes::IntCC; -use crate::ir::immediates::Uimm32; +use crate::ir::immediates::{Uimm32, Uimm8}; use crate::ir::{self, InstBuilder, RelSourceLoc}; use crate::isa::TargetIsa; @@ -17,16 +17,18 @@ pub fn expand_heap_addr( cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa, heap: ir::Heap, - offset: ir::Value, - access_size: Uimm32, + index_operand: ir::Value, + offset_immediate: Uimm32, + access_size: Uimm8, ) { match func.heaps[heap].style { ir::HeapStyle::Dynamic { bound_gv } => dynamic_addr( isa, inst, heap, - offset, - u64::from(access_size), + index_operand, + u32::from(offset_immediate), + u8::from(access_size), bound_gv, func, ), @@ -34,8 +36,9 @@ pub fn expand_heap_addr( isa, inst, heap, - offset, - u64::from(access_size), + index_operand, + u32::from(offset_immediate), + u8::from(access_size), bound.into(), func, cfg, @@ -48,35 +51,40 @@ fn dynamic_addr( isa: &dyn TargetIsa, inst: ir::Inst, heap: ir::Heap, - offset: ir::Value, - access_size: u64, + index: ir::Value, + offset: u32, + access_size: u8, bound_gv: ir::GlobalValue, func: &mut ir::Function, ) { - let offset_ty = func.dfg.value_type(offset); + let index_ty = func.dfg.value_type(index); let addr_ty = func.dfg.value_type(func.dfg.first_result(inst)); let min_size = func.heaps[heap].min_size.into(); let mut pos = FuncCursor::new(func).at_inst(inst); pos.use_srcloc(inst); - let offset = cast_offset_to_pointer_ty(offset, offset_ty, addr_ty, &mut pos); + let index = cast_index_to_pointer_ty(index, index_ty, addr_ty, &mut pos); - // Start with the bounds check. Trap if `offset + access_size > bound`. + // Start with the bounds check. Trap if `index + offset + access_size > bound`. let bound = pos.ins().global_value(addr_ty, bound_gv); - let (cc, lhs, bound) = if access_size == 1 { - // `offset > bound - 1` is the same as `offset >= bound`. - (IntCC::UnsignedGreaterThanOrEqual, offset, bound) - } else if access_size <= min_size { - // We know that bound >= min_size, so here we can compare `offset > bound - access_size` - // without wrapping. - let adj_bound = pos.ins().iadd_imm(bound, -(access_size as i64)); - (IntCC::UnsignedGreaterThan, offset, adj_bound) + let (cc, lhs, bound) = if offset == 0 && access_size == 1 { + // `index > bound - 1` is the same as `index >= bound`. + (IntCC::UnsignedGreaterThanOrEqual, index, bound) + } else if offset_plus_size(offset, access_size) <= min_size { + // We know that `bound >= min_size`, so here we can compare `offset > + // bound - (offset + access_size)` without wrapping. + let adj_bound = pos + .ins() + .iadd_imm(bound, -(offset_plus_size(offset, access_size) as i64)); + (IntCC::UnsignedGreaterThan, index, adj_bound) } else { // We need an overflow check for the adjusted offset. 
- let access_size_val = pos.ins().iconst(addr_ty, access_size as i64); + let access_size_val = pos + .ins() + .iconst(addr_ty, offset_plus_size(offset, access_size) as i64); let adj_offset = pos.ins() - .uadd_overflow_trap(offset, access_size_val, ir::TrapCode::HeapOutOfBounds); + .uadd_overflow_trap(index, access_size_val, ir::TrapCode::HeapOutOfBounds); (IntCC::UnsignedGreaterThan, adj_offset, bound) }; let oob = pos.ins().icmp(cc, lhs, bound); @@ -93,6 +101,7 @@ fn dynamic_addr( inst, heap, addr_ty, + index, offset, pos.func, spectre_oob_comparison, @@ -104,26 +113,27 @@ fn static_addr( isa: &dyn TargetIsa, inst: ir::Inst, heap: ir::Heap, - mut offset: ir::Value, - access_size: u64, + index: ir::Value, + offset: u32, + access_size: u8, bound: u64, func: &mut ir::Function, cfg: &mut ControlFlowGraph, ) { - let offset_ty = func.dfg.value_type(offset); + let index_ty = func.dfg.value_type(index); let addr_ty = func.dfg.value_type(func.dfg.first_result(inst)); let mut pos = FuncCursor::new(func).at_inst(inst); pos.use_srcloc(inst); - // The goal here is to trap if `offset + access_size > bound`. + // The goal here is to trap if `index + offset + access_size > bound`. // - // This first case is a trivial case where we can easily trap. - if access_size > bound { + // This first case is a trivial case where we can statically trap. + if offset_plus_size(offset, access_size) > bound { // This will simply always trap since `offset >= 0`. pos.ins().trap(ir::TrapCode::HeapOutOfBounds); pos.func.dfg.replace(inst).iconst(addr_ty, 0); - // Split Block, as the trap is a terminator instruction. + // Split the block, as the trap is a terminator instruction. let curr_block = pos.current_block().expect("Cursor is not in a block"); let new_block = pos.func.dfg.make_block(); pos.insert_block(new_block); @@ -132,29 +142,29 @@ fn static_addr( return; } - // After the trivial case is done we're now mostly interested in trapping - // if `offset > bound - access_size`. We know `bound - access_size` here is - // non-negative from the above comparison. + // After the trivial case is done we're now mostly interested in trapping if + // `index > bound - offset - access_size`. We know `bound - offset - + // access_size` here is non-negative from the above comparison. // - // If we can know `bound - access_size >= 4GB` then with a 32-bit offset - // we're guaranteed: + // If we can know `bound - offset - access_size >= 4GB` then with a 32-bit + // offset we're guaranteed: // - // bound - access_size >= 4GB > offset + // bound - offset - access_size >= 4GB > index // - // or, in other words, `offset < bound - access_size`, meaning we can't trap - // for any value of `offset`. + // or, in other words, `index < bound - offset - access_size`, meaning we + // can't trap for any value of `index`. // // With that we have an optimization here where with 32-bit offsets and // `bound - access_size >= 4GB` we can omit a bounds check. - let limit = bound - access_size; + let limit = bound - offset as u64 - access_size as u64; let mut spectre_oob_comparison = None; - offset = cast_offset_to_pointer_ty(offset, offset_ty, addr_ty, &mut pos); - if offset_ty != ir::types::I32 || limit < 0xffff_ffff { - // Here we want to test the condition `offset > limit` and if that's + let index = cast_index_to_pointer_ty(index, index_ty, addr_ty, &mut pos); + if index_ty != ir::types::I32 || limit < 0xffff_ffff { + // Here we want to test the condition `index > limit` and if that's // true then this is an out-of-bounds access and needs to trap. 
For ARM // and other RISC architectures it's easier to test against an immediate // that's even instead of odd, so if `limit` is odd then we instead test - // for `offset >= limit + 1`. + // for `index >= limit + 1`. // // The thinking behind this is that: // @@ -164,10 +174,10 @@ fn static_addr( // should mean that `A >= B + 1` is an equivalent check for `A > B` let (cc, lhs, limit_imm) = if limit & 1 == 1 { let limit = limit as i64 + 1; - (IntCC::UnsignedGreaterThanOrEqual, offset, limit) + (IntCC::UnsignedGreaterThanOrEqual, index, limit) } else { let limit = limit as i64; - (IntCC::UnsignedGreaterThan, offset, limit) + (IntCC::UnsignedGreaterThan, index, limit) }; let oob = pos.ins().icmp_imm(cc, lhs, limit_imm); pos.ins().trapnz(oob, ir::TrapCode::HeapOutOfBounds); @@ -182,29 +192,30 @@ fn static_addr( inst, heap, addr_ty, + index, offset, pos.func, spectre_oob_comparison, ); } -fn cast_offset_to_pointer_ty( - offset: ir::Value, - offset_ty: ir::Type, +fn cast_index_to_pointer_ty( + index: ir::Value, + index_ty: ir::Type, addr_ty: ir::Type, pos: &mut FuncCursor, ) -> ir::Value { - if offset_ty == addr_ty { - return offset; + if index_ty == addr_ty { + return index; } // Note that using 64-bit heaps on a 32-bit host is not currently supported, // would require at least a bounds check here to ensure that the truncation // from 64-to-32 bits doesn't lose any upper bits. For now though we're // mostly interested in the 32-bit-heaps-on-64-bit-hosts cast. - assert!(offset_ty.bits() < addr_ty.bits()); + assert!(index_ty.bits() < addr_ty.bits()); - // Convert `offset` to `addr_ty`. - let extended_offset = pos.ins().uextend(addr_ty, offset); + // Convert `index` to `addr_ty`. + let extended_index = pos.ins().uextend(addr_ty, index); // Add debug value-label alias so that debuginfo can name the extended // value as the address @@ -213,9 +224,9 @@ fn cast_offset_to_pointer_ty( pos.func .stencil .dfg - .add_value_label_alias(extended_offset, loc, offset); + .add_value_label_alias(extended_index, loc, index); - extended_offset + extended_index } /// Emit code for the base address computation of a `heap_addr` instruction. @@ -224,7 +235,8 @@ fn compute_addr( inst: ir::Inst, heap: ir::Heap, addr_ty: ir::Type, - offset: ir::Value, + index: ir::Value, + offset: u32, func: &mut ir::Function, // If we are performing Spectre mitigation with conditional selects, the // values to compare and the condition code that indicates an out-of bounds @@ -232,7 +244,7 @@ fn compute_addr( // speculatively safe address (a zero / null pointer) instead. spectre_oob_comparison: Option<(IntCC, ir::Value, ir::Value)>, ) { - debug_assert_eq!(func.dfg.value_type(offset), addr_ty); + debug_assert_eq!(func.dfg.value_type(index), addr_ty); let mut pos = FuncCursor::new(func).at_inst(inst); pos.use_srcloc(inst); @@ -245,14 +257,33 @@ fn compute_addr( }; if let Some((cc, a, b)) = spectre_oob_comparison { - let final_addr = pos.ins().iadd(base, offset); + let final_base = pos.ins().iadd(base, index); + // NB: The addition of the offset immediate must happen *before* the + // `select_spectre_guard`. If it happens after, then we potentially are + // letting speculative execution read the whole first 4GiB of memory. 
+ let final_addr = if offset == 0 { + final_base + } else { + pos.ins().iadd_imm(final_base, offset as i64) + }; let zero = pos.ins().iconst(addr_ty, 0); let cmp = pos.ins().icmp(cc, a, b); pos.func .dfg .replace(inst) .select_spectre_guard(cmp, zero, final_addr); + } else if offset == 0 { + pos.func.dfg.replace(inst).iadd(base, index); } else { - pos.func.dfg.replace(inst).iadd(base, offset); + let final_base = pos.ins().iadd(base, index); + pos.func + .dfg + .replace(inst) + .iadd_imm(final_base, offset as i64); } } + +fn offset_plus_size(offset: u32, size: u8) -> u64 { + // Cannot overflow because we are widening to `u64`. + offset as u64 + size as u64 +} diff --git a/cranelift/codegen/src/legalizer/mod.rs b/cranelift/codegen/src/legalizer/mod.rs index acb0c437a751..96eccb2079c9 100644 --- a/cranelift/codegen/src/legalizer/mod.rs +++ b/cranelift/codegen/src/legalizer/mod.rs @@ -72,8 +72,9 @@ pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa: opcode: ir::Opcode::HeapAddr, heap, arg, - imm, - } => expand_heap_addr(inst, &mut pos.func, cfg, isa, heap, arg, imm), + offset, + size, + } => expand_heap_addr(inst, &mut pos.func, cfg, isa, heap, arg, offset, size), InstructionData::StackLoad { opcode: ir::Opcode::StackLoad, stack_slot, diff --git a/cranelift/codegen/src/write.rs b/cranelift/codegen/src/write.rs index 8e4ae92437af..725ca46c4945 100644 --- a/cranelift/codegen/src/write.rs +++ b/cranelift/codegen/src/write.rs @@ -476,7 +476,13 @@ pub fn write_operands(w: &mut dyn Write, dfg: &DataFlowGraph, inst: Inst) -> fmt dynamic_stack_slot, .. } => write!(w, " {}, {}", arg, dynamic_stack_slot), - HeapAddr { heap, arg, imm, .. } => write!(w, " {}, {}, {}", heap, arg, imm), + HeapAddr { + heap, + arg, + offset, + size, + .. + } => write!(w, " {}, {}, {}, {}", heap, arg, offset, size), TableAddr { table, arg, .. } => write!(w, " {}, {}", table, arg), Load { flags, arg, offset, .. diff --git a/cranelift/filetests/filetests/alias/extends.clif b/cranelift/filetests/filetests/alias/extends.clif index d6bbf7d4a837..30d5cc03553f 100644 --- a/cranelift/filetests/filetests/alias/extends.clif +++ b/cranelift/filetests/filetests/alias/extends.clif @@ -9,9 +9,9 @@ function %f0(i64 vmctx, i32) -> i32, i32, i32, i64, i64, i64 { gv0 = vmctx gv1 = load.i64 notrap readonly aligned gv0+8 heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32 - + block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 12, 0 ;; Initial load. This will not be reused by anything below, even ;; though it does access the same address. 
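Reviewer note: the legalized bounds check above reduces to trapping when `index + offset + size > bound`, where `offset + size` is computed in `u64` so that sum can never wrap; only the addition of the dynamic `index` needs an explicit overflow trap. A minimal standalone sketch of the condition in plain Rust (illustrative only, not part of the patch; the function name is made up):

fn is_out_of_bounds(index: u64, offset: u32, size: u8, bound: u64) -> bool {
    // Widening to `u64` means `offset + size` cannot overflow,
    // mirroring `offset_plus_size` in the patch.
    let offset_plus_size = offset as u64 + size as u64;
    match index.checked_add(offset_plus_size) {
        // In bounds only if the end of the access stays at or below `bound`.
        Some(end) => end > bound,
        // Wrapping of the adjusted index is also out of bounds; the
        // legalizer handles this case with `uadd_overflow_trap`.
        None => true,
    }
}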
diff --git a/cranelift/filetests/filetests/alias/fence.clif b/cranelift/filetests/filetests/alias/fence.clif index 3202dbfcd750..c5b55ccc63b1 100644 --- a/cranelift/filetests/filetests/alias/fence.clif +++ b/cranelift/filetests/filetests/alias/fence.clif @@ -9,9 +9,9 @@ function %f0(i64 vmctx, i32) -> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 gv0 = vmctx gv1 = load.i64 notrap readonly aligned gv0+8 heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32 - + block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 12, 0 v3 = load.i32 v2+8 v4 = load.i32 vmctx v0+16 @@ -39,7 +39,7 @@ block0(v0: i64, v1: i32): v11 = atomic_load.i32 v0 v12 = load.i32 vmctx v0+16 - ; check: v12 = load.i32 vmctx v0+16 + ; check: v12 = load.i32 vmctx v0+16 return v3, v4, v5, v6, v7, v8, v9, v10, v11, v12 } diff --git a/cranelift/filetests/filetests/alias/multiple-blocks.clif b/cranelift/filetests/filetests/alias/multiple-blocks.clif index 3812c8911fbb..4ce7488b0a31 100644 --- a/cranelift/filetests/filetests/alias/multiple-blocks.clif +++ b/cranelift/filetests/filetests/alias/multiple-blocks.clif @@ -11,7 +11,7 @@ function %f0(i64 vmctx, i32) -> i32 { block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 12, 0 v3 = load.i32 v2+8 brz v2, block1 jump block2 diff --git a/cranelift/filetests/filetests/alias/partial-redundancy.clif b/cranelift/filetests/filetests/alias/partial-redundancy.clif index e869d262f1b5..3c2926ed617b 100644 --- a/cranelift/filetests/filetests/alias/partial-redundancy.clif +++ b/cranelift/filetests/filetests/alias/partial-redundancy.clif @@ -16,17 +16,17 @@ block0(v0: i64, v1: i32): jump block2 block1: - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 68, 0 v3 = load.i32 v2+64 jump block3(v3) block2: - v4 = heap_addr.i64 heap0, v1, 0 + v4 = heap_addr.i64 heap0, v1, 132, 0 v5 = load.i32 v4+128 jump block3(v5) block3(v6: i32): - v7 = heap_addr.i64 heap0, v1, 0 + v7 = heap_addr.i64 heap0, v1, 68, 0 v8 = load.i32 v7+64 ;; load should survive: ; check: v8 = load.i32 v7+64 diff --git a/cranelift/filetests/filetests/alias/simple-alias.clif b/cranelift/filetests/filetests/alias/simple-alias.clif index 9b559bc3e571..f1109c8379e8 100644 --- a/cranelift/filetests/filetests/alias/simple-alias.clif +++ b/cranelift/filetests/filetests/alias/simple-alias.clif @@ -13,13 +13,13 @@ function %f0(i64 vmctx, i32) -> i32, i32, i32, i32 { fn0 = %g(i64 vmctx) block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 12, 0 v3 = load.i32 v2+8 ;; This should reuse the load above. - v4 = heap_addr.i64 heap0, v1, 0 + v4 = heap_addr.i64 heap0, v1, 12, 0 v5 = load.i32 v4+8 ; check: v5 -> v3 - + call fn0(v0) ;; The second load is redundant wrt the first, but the call above @@ -27,7 +27,7 @@ block0(v0: i64, v1: i32): v6 = load.i32 v4+8 v7 = load.i32 v4+8 ; check: v7 -> v6 - + return v3, v5, v6, v7 } @@ -42,13 +42,13 @@ function %f1(i64 vmctx, i32) -> i32 { fn0 = %g(i64 vmctx) block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 12, 0 store.i32 v1, v2+8 ;; This load should pick up the store above. 
- v3 = heap_addr.i64 heap0, v1, 0 + v3 = heap_addr.i64 heap0, v1, 12, 0 v4 = load.i32 v3+8 ; check: v4 -> v1 - + return v4 } diff --git a/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif b/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif index f6b69d7c0c92..2f893bcd7122 100644 --- a/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif +++ b/cranelift/filetests/filetests/isa/aarch64/heap_addr.clif @@ -9,7 +9,7 @@ function %dynamic_heap_check(i64 vmctx, i32) -> i64 { heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 0, 0 return v2 } @@ -34,7 +34,7 @@ function %static_heap_check(i64 vmctx, i32) -> i64 { heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 0, 0 return v2 } @@ -52,3 +52,59 @@ block0(v0: i64, v1: i32): ; block2: ; udf #0xc11f + +function %dynamic_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; block0: +; mov w11, w1 +; ldr x10, [x0] +; movz x9, #24 +; adds x11, x11, x9 +; b.lo 8 ; udf +; subs xzr, x11, x10 +; b.ls label1 ; b label2 +; block1: +; add x13, x0, x1, UXTW +; add x13, x13, #16 +; movz x12, #0 +; subs xzr, x11, x10 +; csel x0, x12, x13, hi +; csdb +; ret +; block2: +; udf #0xc11f + +function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; block0: +; mov w9, w1 +; movz x10, #65512 +; subs xzr, x9, x10 +; b.ls label1 ; b label2 +; block1: +; add x11, x0, x1, UXTW +; add x11, x11, #16 +; movz x10, #65512 +; movz x12, #0 +; subs xzr, x9, x10 +; csel x0, x12, x11, hi +; csdb +; ret +; block2: +; udf #0xc11f + diff --git a/cranelift/filetests/filetests/isa/riscv64/heap-addr.clif b/cranelift/filetests/filetests/isa/riscv64/heap-addr.clif index 140add4a6d8b..d90da8f22c45 100644 --- a/cranelift/filetests/filetests/isa/riscv64/heap-addr.clif +++ b/cranelift/filetests/filetests/isa/riscv64/heap-addr.clif @@ -8,7 +8,7 @@ function %dynamic_heap_check(i64 vmctx, i32) -> i64 { heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 0, 0 return v2 } @@ -32,7 +32,7 @@ function %static_heap_check(i64 vmctx, i32) -> i64 { heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 0, 0 return v2 } @@ -51,3 +51,59 @@ block0(v0: i64, v1: i32): ; block2: ; udf##trap_code=heap_oob +function %dynamic_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; block0: +; uext.w t1,a1 +; ld t0,0(a0) +; li t3,24 +; add t2,t1,t3 +; ult a1,t2,t1##ty=i64 +; trap_if a1,heap_oob +; ule a1,t2,t0##ty=i64 +; bne a1,zero,taken(label1),not_taken(label2) +; block1: +; add a0,a0,t1 +; addi a0,a0,16 +; ugt t1,t2,t0##ty=i64 +; li a1,0 +; selectif_spectre_guard a0,a1,a0##test=t1 +; ret +; block2: 
+; udf##trap_code=heap_oob + +function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; block0: +; uext.w t3,a1 +; lui a7,16 +; addi a7,a7,4072 +; ule t0,t3,a7##ty=i64 +; bne t0,zero,taken(label1),not_taken(label2) +; block1: +; add t0,a0,t3 +; addi t0,t0,16 +; lui t4,16 +; addi t4,t4,4072 +; ugt t1,t3,t4##ty=i64 +; li a0,0 +; selectif_spectre_guard a0,a0,t0##test=t1 +; ret +; block2: +; udf##trap_code=heap_oob + diff --git a/cranelift/filetests/filetests/isa/s390x/heap_addr.clif b/cranelift/filetests/filetests/isa/s390x/heap_addr.clif index acde9132509c..4dc22f499f3e 100644 --- a/cranelift/filetests/filetests/isa/s390x/heap_addr.clif +++ b/cranelift/filetests/filetests/isa/s390x/heap_addr.clif @@ -7,7 +7,7 @@ function %dynamic_heap_check(i64 vmctx, i32) -> i64 { heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 0, 0 return v2 } @@ -32,7 +32,7 @@ function %static_heap_check(i64 vmctx, i32) -> i64 { heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v2 = heap_addr.i64 heap0, v1, 0 + v2 = heap_addr.i64 heap0, v1, 0, 0 return v2 } @@ -49,3 +49,56 @@ block0(v0: i64, v1: i32): ; block2: ; trap +function %dynamic_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; stmg %r7, %r15, 56(%r15) +; block0: +; llgfr %r7, %r3 +; lg %r4, 0(%r2) +; lghi %r5, 24 +; algfr %r5, %r3 +; jle 6 ; trap +; clgr %r5, %r4 +; jgnh label1 ; jg label2 +; block1: +; agrk %r3, %r2, %r7 +; aghik %r2, %r3, 16 +; lghi %r3, 0 +; clgr %r5, %r4 +; locgrh %r2, %r3 +; lmg %r7, %r15, 56(%r15) +; br %r14 +; block2: +; trap + +function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; block0: +; llgfr %r5, %r3 +; clgfi %r5, 65512 +; jgnh label1 ; jg label2 +; block1: +; agrk %r3, %r2, %r5 +; aghik %r2, %r3, 16 +; lghi %r3, 0 +; clgfi %r5, 65512 +; locgrh %r2, %r3 +; br %r14 +; block2: +; trap + diff --git a/cranelift/filetests/filetests/isa/x64/heap-no-spectre.clif b/cranelift/filetests/filetests/isa/x64/heap-no-spectre.clif index 652742df8be7..ca1595001553 100644 --- a/cranelift/filetests/filetests/isa/x64/heap-no-spectre.clif +++ b/cranelift/filetests/filetests/isa/x64/heap-no-spectre.clif @@ -12,7 +12,7 @@ function %f(i32, i64 vmctx) -> i64 { heap0 = dynamic gv1, bound gv2, offset_guard 0x1000, index_type i32 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 0x8000 + v2 = heap_addr.i64 heap0, v0, 0x8000, 0 return v2 } @@ -20,14 +20,15 @@ block0(v0: i32, v1: i64): ; movq %rsp, %rbp ; block0: ; movl %edi, %eax -; movq 8(%rsi), %r9 -; movq %rax, %r10 -; addq %r10, $32768, %r10 +; movq 8(%rsi), %r10 +; movq %rax, %r11 +; addq %r11, $32768, %r11 ; jnb ; ud2 heap_oob ; -; cmpq %r9, %r10 +; cmpq %r10, %r11 ; jbe label1; j label2 ; block1: ; addq %rax, 0(%rsi), %rax +; addq %rax, $32768, %rax ; movq %rbp, %rsp ; popq %rbp ; ret @@ -43,7 +44,7 @@ function %f(i64 vmctx, i32) -> i64 system_v { heap0 = static gv1, bound 
0x1000, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v10 = heap_addr.i64 heap0, v1, 0 + v10 = heap_addr.i64 heap0, v1, 0, 0 return v10 } @@ -70,7 +71,7 @@ function %f(i64 vmctx, i32) -> i64 system_v { heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32 block0(v0: i64, v1: i32): - v10 = heap_addr.i64 heap0, v1, 0 + v10 = heap_addr.i64 heap0, v1, 0, 0 return v10 } diff --git a/cranelift/filetests/filetests/isa/x64/heap.clif b/cranelift/filetests/filetests/isa/x64/heap.clif index b1eaaf15ff75..87444682ac46 100644 --- a/cranelift/filetests/filetests/isa/x64/heap.clif +++ b/cranelift/filetests/filetests/isa/x64/heap.clif @@ -25,7 +25,7 @@ function %f(i32, i64 vmctx) -> i64 { heap0 = dynamic gv1, bound gv2, offset_guard 0x1000, index_type i32 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 0x8000 + v2 = heap_addr.i64 heap0, v0, 0x8000, 0 return v2 } @@ -33,16 +33,17 @@ block0(v0: i32, v1: i64): ; movq %rsp, %rbp ; block0: ; movl %edi, %eax -; movq 8(%rsi), %r11 +; movq 8(%rsi), %rdx ; movq %rax, %rdi ; addq %rdi, $32768, %rdi ; jnb ; ud2 heap_oob ; -; cmpq %r11, %rdi +; cmpq %rdx, %rdi ; jbe label1; j label2 ; block1: ; addq %rax, 0(%rsi), %rax +; addq %rax, $32768, %rax ; xorq %rcx, %rcx, %rcx -; cmpq %r11, %rdi +; cmpq %rdx, %rdi ; cmovnbeq %rcx, %rax, %rax ; movq %rbp, %rsp ; popq %rbp @@ -60,7 +61,7 @@ function %f(i64 vmctx, i32) -> i64 system_v { heap0 = static gv1, bound 0x1000, offset_guard 0x1000, index_type i32 block0(v0: i64, v1: i32): - v10 = heap_addr.i64 heap0, v1, 0 + v10 = heap_addr.i64 heap0, v1, 0, 0 return v10 } @@ -91,7 +92,7 @@ function %f(i64 vmctx, i32) -> i64 system_v { heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32 block0(v0: i64, v1: i32): - v10 = heap_addr.i64 heap0, v1, 0 + v10 = heap_addr.i64 heap0, v1, 0, 0 return v10 } @@ -104,3 +105,66 @@ block0(v0: i64, v1: i32): ; popq %rbp ; ret +function %dynamic_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + heap0 = dynamic gv0, bound gv1, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl %esi, %edi +; movq %rax, %rcx +; movq 0(%rcx), %rsi +; movq %rdi, %rdx +; addq %rdx, $24, %rdx +; jnb ; ud2 heap_oob ; +; cmpq %rsi, %rdx +; jbe label1; j label2 +; block1: +; movq %rcx, %rax +; addq %rax, %rdi, %rax +; addq %rax, $16, %rax +; xorq %rcx, %rcx, %rcx +; cmpq %rsi, %rdx +; cmovnbeq %rcx, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; ud2 heap_oob + +function %static_heap_check_with_offset(i64 vmctx, i32) -> i64 { + gv0 = vmctx + heap0 = static gv0, bound 0x1_0000, offset_guard 0x1000, index_type i32 + +block0(v0: i64, v1: i32): + v2 = heap_addr.i64 heap0, v1, 16, 8 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movl %esi, %r10d +; cmpq $65512, %r10 +; jbe label1; j label2 +; block1: +; movq %rdi, %rax +; addq %rax, %r10, %rax +; addq %rax, $16, %rax +; xorq %r11, %r11, %r11 +; cmpq $65512, %r10 +; cmovnbeq %r11, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; ud2 heap_oob + diff --git a/cranelift/filetests/filetests/licm/load_readonly_notrap.clif b/cranelift/filetests/filetests/licm/load_readonly_notrap.clif index 011b5833d5e3..27b72cfda922 100644 --- a/cranelift/filetests/filetests/licm/load_readonly_notrap.clif +++ b/cranelift/filetests/filetests/licm/load_readonly_notrap.clif @@ -16,7 +16,7 @@ 
block0(v0: i32, v1: i64): block1(v2: i32, v3: i64): v4 = iconst.i32 1 - v5 = heap_addr.i64 heap0, v4, 1 + v5 = heap_addr.i64 heap0, v4, 0, 4 v6 = load.i32 notrap aligned readonly v5 v7 = iadd v2, v6 brz v2, block3(v2) @@ -37,7 +37,7 @@ block3(v9: i32): ; nextln: ; nextln: block0(v0: i32, v1: i64): ; nextln: v4 = iconst.i32 1 -; nextln: v5 = heap_addr.i64 heap0, v4, 1 +; nextln: v5 = heap_addr.i64 heap0, v4, 0, 4 ; nextln: v6 = load.i32 notrap aligned readonly v5 ; nextln: jump block1(v0, v1) ; nextln: diff --git a/cranelift/filetests/filetests/licm/reject_load_notrap.clif b/cranelift/filetests/filetests/licm/reject_load_notrap.clif index 6236d0d1efab..3ca5b1c5df8d 100644 --- a/cranelift/filetests/filetests/licm/reject_load_notrap.clif +++ b/cranelift/filetests/filetests/licm/reject_load_notrap.clif @@ -14,7 +14,7 @@ function %hoist_load(i32, i64 vmctx) -> i32 { block0(v0: i32, v1: i64): v4 = iconst.i32 1 - v5 = heap_addr.i64 heap0, v4, 1 + v5 = heap_addr.i64 heap0, v4, 0, 4 jump block1(v0, v1) block1(v2: i32, v3: i64): @@ -32,25 +32,25 @@ block3(v9: i32): } ; sameln: function %hoist_load(i32, i64 vmctx) -> i32 fast { -; nextln: gv0 = vmctx -; nextln: gv1 = load.i64 notrap aligned readonly gv0 -; nextln: heap0 = static gv1, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000, index_type i32 +; nextln: gv0 = vmctx +; nextln: gv1 = load.i64 notrap aligned readonly gv0 +; nextln: heap0 = static gv1, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000, index_type i32 ; nextln: ; nextln: block0(v0: i32, v1: i64): -; nextln: v4 = iconst.i32 1 -; nextln: v5 = heap_addr.i64 heap0, v4, 1 -; nextln: jump block1(v0, v1) +; nextln: v4 = iconst.i32 1 +; nextln: v5 = heap_addr.i64 heap0, v4, 0, 4 ; v4 = 1 +; nextln: jump block1(v0, v1) ; nextln: ; nextln: block1(v2: i32, v3: i64): -; nextln: v6 = load.i32 notrap aligned v5 -; nextln: v7 = iadd v2, v6 -; nextln: brz v2, block3(v2) -; nextln: jump block2 +; nextln: v6 = load.i32 notrap aligned v5 +; nextln: v7 = iadd v2, v6 +; nextln: brz v2, block3(v2) +; nextln: jump block2 ; nextln: ; nextln: block2: -; nextln: v8 = isub.i32 v2, v4 -; nextln: jump block1(v8, v3) +; nextln: v8 = isub.i32 v2, v4 ; v4 = 1 +; nextln: jump block1(v8, v3) ; nextln: ; nextln: block3(v9: i32): -; nextln: return v9 +; nextln: return v9 ; nextln: } diff --git a/cranelift/filetests/filetests/licm/reject_load_readonly.clif b/cranelift/filetests/filetests/licm/reject_load_readonly.clif index c94ace259124..a180847cd779 100644 --- a/cranelift/filetests/filetests/licm/reject_load_readonly.clif +++ b/cranelift/filetests/filetests/licm/reject_load_readonly.clif @@ -17,7 +17,7 @@ block0(v0: i32, v1: i64): block1(v2: i32, v3: i64): v4 = iconst.i32 1 - v5 = heap_addr.i64 heap0, v4, 1 + v5 = heap_addr.i64 heap0, v4, 0, 4 v6 = load.i32 aligned readonly v5 v7 = iadd v2, v6 brz v2, block3(v2) @@ -38,7 +38,7 @@ block3(v9: i32): ; nextln: ; nextln: block0(v0: i32, v1: i64): ; nextln: v4 = iconst.i32 1 -; nextln: v5 = heap_addr.i64 heap0, v4, 1 +; nextln: v5 = heap_addr.i64 heap0, v4, 0, 4 ; nextln: jump block1(v0, v1) ; nextln: ; nextln: block1(v2: i32, v3: i64): diff --git a/cranelift/filetests/filetests/parser/memory.clif b/cranelift/filetests/filetests/parser/memory.clif index abe059c0fb7c..31f8589bea8f 100644 --- a/cranelift/filetests/filetests/parser/memory.clif +++ b/cranelift/filetests/filetests/parser/memory.clif @@ -60,8 +60,8 @@ function %sheap(i32, i64 vmctx) -> i64 { ; check: heap1 = static gv5, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 
0x8000_0000 ; check: heap2 = static gv5, min 0, bound 0x0001_0000, offset_guard 4096 block0(v1: i32, v2: i64): - v3 = heap_addr.i64 heap1, v1, 0 - ; check: v3 = heap_addr.i64 heap1, v1, 0 + v3 = heap_addr.i64 heap1, v1, 0, 0 + ; check: v3 = heap_addr.i64 heap1, v1, 0, 0 return v3 } @@ -76,7 +76,7 @@ function %dheap(i32, i64 vmctx) -> i64 { ; check: heap1 = dynamic gv5, min 0x0001_0000, bound gv6, offset_guard 0x8000_0000 ; check: heap2 = dynamic gv5, min 0, bound gv6, offset_guard 4096 block0(v1: i32, v2: i64): - v3 = heap_addr.i64 heap2, v1, 0 - ; check: v3 = heap_addr.i64 heap2, v1, 0 + v3 = heap_addr.i64 heap2, v1, 0, 0 + ; check: v3 = heap_addr.i64 heap2, v1, 0, 0 return v3 } diff --git a/cranelift/filetests/filetests/runtests/conversions-load-store.clif b/cranelift/filetests/filetests/runtests/conversions-load-store.clif index c30aa19b0df5..78abe5ba6748 100644 --- a/cranelift/filetests/filetests/runtests/conversions-load-store.clif +++ b/cranelift/filetests/filetests/runtests/conversions-load-store.clif @@ -11,7 +11,7 @@ function %fpromote_f32_f64(i64 vmctx, i64, f32) -> f64 { heap0 = static gv1, min 0x10, bound 0x10, offset_guard 0x0, index_type i64 block0(v0: i64, v1: i64, v2: f32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.f32 v2, v3 v4 = load.f32 v3 v5 = fpromote.f64 v4 @@ -31,7 +31,7 @@ function %fdemote_test(i64 vmctx, i64, f64) -> f32 { heap0 = static gv1, min 0x10, bound 0x10, offset_guard 0x0, index_type i64 block0(v0: i64, v1: i64, v2: f64): - v3 = heap_addr.i64 heap0, v1, 8 + v3 = heap_addr.i64 heap0, v1, 0, 8 store.f64 v2, v3 v4 = load.f64 v3 v5 = fdemote.f32 v4 @@ -51,7 +51,7 @@ function %fvdemote_test(i64 vmctx, i64, f64x2) -> f32x4 { heap0 = static gv1, min 0x20, bound 0x20, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: f64x2): - v3 = heap_addr.i64 heap0, v1, 16 + v3 = heap_addr.i64 heap0, v1, 0, 16 store.f64x2 v2, v3 v4 = load.f64x2 v3 v5 = fvdemote v4 @@ -72,7 +72,7 @@ function %fvpromote_low_test(i64 vmctx, i64, f32x4) -> f64x2 { heap0 = static gv1, min 0x20, bound 0x20, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: f32x4): - v3 = heap_addr.i64 heap0, v1, 16 + v3 = heap_addr.i64 heap0, v1, 0, 16 store.f32x4 v2, v3 v4 = load.f32x4 v3 v5 = fvpromote_low v4 diff --git a/cranelift/filetests/filetests/runtests/global_value.clif b/cranelift/filetests/filetests/runtests/global_value.clif index e9514b1d6bd2..e8caf14805f6 100644 --- a/cranelift/filetests/filetests/runtests/global_value.clif +++ b/cranelift/filetests/filetests/runtests/global_value.clif @@ -12,7 +12,7 @@ function %store_load(i64 vmctx, i64, i32) -> i32 { heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i32): - v3 = heap_addr.i64 heap0, v1, 0 + v3 = heap_addr.i64 heap0, v1, 0, 0 store.i32 v2, v3 v4 = global_value.i64 gv1 diff --git a/cranelift/filetests/filetests/runtests/heap.clif b/cranelift/filetests/filetests/runtests/heap.clif index e956dcff3868..3e7bd41649ac 100644 --- a/cranelift/filetests/filetests/runtests/heap.clif +++ b/cranelift/filetests/filetests/runtests/heap.clif @@ -11,7 +11,7 @@ function %static_heap_i64(i64 vmctx, i64, i32) -> i32 { heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 @@ -29,7 +29,7 @@ function %static_heap_i32(i64 vmctx, i32, i32) -> i32 { heap0 = static gv1, min 0x1000, 
bound 0x1_0000_0000, offset_guard 0, index_type i32 block0(v0: i64, v1: i32, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 @@ -47,7 +47,7 @@ function %heap_no_min(i64 vmctx, i32, i32) -> i32 { heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0, index_type i32 block0(v0: i64, v1: i32, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 @@ -66,7 +66,7 @@ function %dynamic_i64(i64 vmctx, i64, i32) -> i32 { heap0 = dynamic gv1, bound gv2, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 @@ -85,7 +85,7 @@ function %dynamic_i32(i64 vmctx, i32, i32) -> i32 { heap0 = dynamic gv1, bound gv2, offset_guard 0, index_type i32 block0(v0: i64, v1: i32, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 @@ -110,11 +110,11 @@ block0(v0: i64, v1: i32, v2: i32): v4 = iconst.i32 0 ; Store lhs in heap0 - v5 = heap_addr.i64 heap0, v3, 4 + v5 = heap_addr.i64 heap0, v3, 0, 4 store.i32 v1, v5 ; Store rhs in heap1 - v6 = heap_addr.i64 heap1, v4, 4 + v6 = heap_addr.i64 heap1, v4, 0, 4 store.i32 v2, v6 @@ -146,11 +146,11 @@ block0(v0: i64, v1: i32, v2: i32): v4 = iconst.i64 0 ; Store lhs in heap0 - v5 = heap_addr.i64 heap0, v3, 4 + v5 = heap_addr.i64 heap0, v3, 0, 4 store.i32 v1, v5 ; Store rhs in heap1 - v6 = heap_addr.i64 heap1, v4, 4 + v6 = heap_addr.i64 heap1, v4, 0, 4 store.i32 v2, v6 @@ -172,7 +172,7 @@ function %unaligned_access(i64 vmctx, i64, i32) -> i32 { heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 @@ -196,7 +196,7 @@ function %iadd_imm(i64 vmctx, i32) -> i32 { block0(v0: i64, v1: i32): v2 = iconst.i64 0 - v3 = heap_addr.i64 heap0, v2, 4 + v3 = heap_addr.i64 heap0, v2, 0, 4 store.i32 v1, v3 v4 = load.i32 v3 return v4 @@ -211,7 +211,7 @@ function %heap_limit_i64(i64 vmctx, i64, i32) -> i32 { heap0 = static gv1, min 0, bound 0x8, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 store.i32 v2, v3 v4 = load.i32 v3 return v4 diff --git a/cranelift/filetests/filetests/runtests/load-op-store.clif b/cranelift/filetests/filetests/runtests/load-op-store.clif index 0d7ba86c4882..d2dfb12a4130 100644 --- a/cranelift/filetests/filetests/runtests/load-op-store.clif +++ b/cranelift/filetests/filetests/runtests/load-op-store.clif @@ -2,7 +2,7 @@ test run target x86_64 target s390x target aarch64 -target riscv64 +target riscv64 function %load_op_store_iadd_i64(i64 vmctx, i64, i64) -> i64 { @@ -11,7 +11,7 @@ function %load_op_store_iadd_i64(i64 vmctx, i64, i64) -> i64 { heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i64): - v3 = heap_addr.i64 heap0, v1, 8 + v3 = heap_addr.i64 heap0, v1, 0, 8 v4 = iconst.i64 42 store.i64 v4, v3 v5 = load.i64 v3 @@ -30,7 +30,7 @@ function %load_op_store_iadd_i32(i64 vmctx, i64, i32) -> i32 { heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i32): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 v4 = iconst.i32 42 
store.i32 v4, v3 v5 = load.i32 v3 @@ -49,7 +49,7 @@ function %load_op_store_iadd_i8(i64 vmctx, i64, i8) -> i8 { heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i8): - v3 = heap_addr.i64 heap0, v1, 4 + v3 = heap_addr.i64 heap0, v1, 0, 4 v4 = iconst.i8 42 store.i8 v4, v3 v5 = load.i8 v3 @@ -68,7 +68,7 @@ function %load_op_store_iadd_isub_iand_ior_ixor_i64(i64 vmctx, i64, i64) -> i64 heap0 = static gv1, min 0x1000, bound 0x1_0000_0000, offset_guard 0, index_type i64 block0(v0: i64, v1: i64, v2: i64): - v3 = heap_addr.i64 heap0, v1, 8 + v3 = heap_addr.i64 heap0, v1, 0, 8 store.i64 v2, v3 v4 = load.i64 v3 v5 = iconst.i64 1 diff --git a/cranelift/filetests/filetests/runtests/table_addr.clif b/cranelift/filetests/filetests/runtests/table_addr.clif index f77d356e367d..186dcb1a89c0 100644 --- a/cranelift/filetests/filetests/runtests/table_addr.clif +++ b/cranelift/filetests/filetests/runtests/table_addr.clif @@ -128,7 +128,7 @@ block0(v0: i64, v1: i64, v2: i64, v3: i64): ; v1 - heap offset (bytes) ; v2 - table offset (elements) ; v3 - store/load value - v4 = heap_addr.i64 heap0, v1, 0 + v4 = heap_addr.i64 heap0, v1, 0, 0 v5 = table_addr.i64 table0, v2, +2 ; Store via heap, load via table diff --git a/cranelift/filetests/filetests/simple_gvn/readonly.clif b/cranelift/filetests/filetests/simple_gvn/readonly.clif index 93ede4a5b8aa..b28da609a7b0 100644 --- a/cranelift/filetests/filetests/simple_gvn/readonly.clif +++ b/cranelift/filetests/filetests/simple_gvn/readonly.clif @@ -9,8 +9,8 @@ function %eliminate_redundant_global_loads(i32, i64 vmctx) { heap0 = static gv1, min 0x1_0000, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 - v3 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 + v3 = heap_addr.i64 heap0, v0, 0, 1 v4 = iconst.i32 0 store.i32 notrap aligned v4, v2 @@ -18,7 +18,7 @@ block0(v0: i32, v1: i64): return } -; check: v2 = heap_addr.i64 heap0, v0, 1 +; check: v2 = heap_addr.i64 heap0, v0, 0, 1 ; check: v3 -> v2 ; check: v4 = iconst.i32 0 ; check: store notrap aligned v4, v2 diff --git a/cranelift/filetests/filetests/simple_preopt/replace_branching_instructions_and_cfg_predecessors.clif b/cranelift/filetests/filetests/simple_preopt/replace_branching_instructions_and_cfg_predecessors.clif index 493896f0d751..e5adefca0015 100644 --- a/cranelift/filetests/filetests/simple_preopt/replace_branching_instructions_and_cfg_predecessors.clif +++ b/cranelift/filetests/filetests/simple_preopt/replace_branching_instructions_and_cfg_predecessors.clif @@ -7,7 +7,7 @@ function u0:2(i64 , i64) { heap0 = static gv1 block0(v0: i64, v1: i64): v16 = iconst.i32 6 - v17 = heap_addr.i64 heap0, v16, 1 + v17 = heap_addr.i64 heap0, v16, 0, 1 v18 = load.i32 v17 v19 = iconst.i32 4 v20 = icmp ne v18, v19 diff --git a/cranelift/filetests/filetests/verifier/heap.clif b/cranelift/filetests/filetests/verifier/heap.clif index 2a73f4ee8f01..b46779e23359 100644 --- a/cranelift/filetests/filetests/verifier/heap.clif +++ b/cranelift/filetests/filetests/verifier/heap.clif @@ -40,6 +40,6 @@ function %heap_addr_index_type(i64 vmctx, i64) { heap0 = static gv0, offset_guard 0x1000, bound 0x1_0000, index_type i32 block0(v0: i64, v1: i64): - v2 = heap_addr.i64 heap0, v1, 0; error: index type i64 differs from heap index type i32 + v2 = heap_addr.i64 heap0, v1, 0, 0; error: index type i64 differs from heap index type i32 return } diff --git 
a/cranelift/filetests/filetests/wasm/f32-memory64.clif b/cranelift/filetests/filetests/wasm/f32-memory64.clif index 9985898b7947..8f6d5e44322c 100644 --- a/cranelift/filetests/filetests/wasm/f32-memory64.clif +++ b/cranelift/filetests/filetests/wasm/f32-memory64.clif @@ -11,7 +11,7 @@ function %f32_load(i32, i64 vmctx) -> f32 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = load.f32 v2 return v3 } @@ -21,7 +21,7 @@ function %f32_store(f32, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: f32, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 store v0, v3 return } diff --git a/cranelift/filetests/filetests/wasm/f64-memory64.clif b/cranelift/filetests/filetests/wasm/f64-memory64.clif index f55a73fb8711..2805be18ef07 100644 --- a/cranelift/filetests/filetests/wasm/f64-memory64.clif +++ b/cranelift/filetests/filetests/wasm/f64-memory64.clif @@ -11,7 +11,7 @@ function %f64_load(i32, i64 vmctx) -> f64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = load.f64 v2 return v3 } @@ -21,7 +21,7 @@ function %f64_store(f64, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: f64, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 store v0, v3 return } diff --git a/cranelift/filetests/filetests/wasm/i32-memory64.clif b/cranelift/filetests/filetests/wasm/i32-memory64.clif index 7fcf0316c2ec..f4a89f1da2f1 100644 --- a/cranelift/filetests/filetests/wasm/i32-memory64.clif +++ b/cranelift/filetests/filetests/wasm/i32-memory64.clif @@ -11,7 +11,7 @@ function %i32_load(i32, i64 vmctx) -> i32 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = load.i32 v2 return v3 } @@ -21,7 +21,7 @@ function %i32_store(i32, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 store v0, v3 return } @@ -31,7 +31,7 @@ function %i32_load8_s(i32, i64 vmctx) -> i32 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = sload8.i32 v2 return v3 } @@ -41,7 +41,7 @@ function %i32_load8_u(i32, i64 vmctx) -> i32 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = uload8.i32 v2 return v3 } @@ -51,7 +51,7 @@ function %i32_store8(i32, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 istore8 v0, v3 return } @@ -61,7 +61,7 @@ function %i32_load16_s(i32, i64 vmctx) -> i32 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 
0, 1 v3 = sload16.i32 v2 return v3 } @@ -71,7 +71,7 @@ function %i32_load16_u(i32, i64 vmctx) -> i32 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = uload16.i32 v2 return v3 } @@ -81,8 +81,7 @@ function %i32_store16(i32, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 istore16 v0, v3 return } - diff --git a/cranelift/filetests/filetests/wasm/i64-memory64.clif b/cranelift/filetests/filetests/wasm/i64-memory64.clif index 7f76ccd86e5d..64c3baecc64f 100644 --- a/cranelift/filetests/filetests/wasm/i64-memory64.clif +++ b/cranelift/filetests/filetests/wasm/i64-memory64.clif @@ -11,7 +11,7 @@ function %i64_load(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = load.i64 v2 return v3 } @@ -21,7 +21,7 @@ function %i64_store(i64, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i64, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 store v0, v3 return } @@ -31,7 +31,7 @@ function %i64_load8_s(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = sload8.i64 v2 return v3 } @@ -41,7 +41,7 @@ function %i64_load8_u(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = uload8.i64 v2 return v3 } @@ -51,7 +51,7 @@ function %i64_store8(i64, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i64, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 istore8 v0, v3 return } @@ -61,7 +61,7 @@ function %i64_load16_s(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = sload16.i64 v2 return v3 } @@ -71,7 +71,7 @@ function %i64_load16_u(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = uload16.i64 v2 return v3 } @@ -81,7 +81,7 @@ function %i64_store16(i64, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i64, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 istore16 v0, v3 return } @@ -91,7 +91,7 @@ function %i64_load32_s(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = sload32.i64 v2 return v3 } @@ -101,7 +101,7 @@ function %i64_load32_u(i32, i64 vmctx) -> i64 { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: 
i32, v1: i64): - v2 = heap_addr.i64 heap0, v0, 1 + v2 = heap_addr.i64 heap0, v0, 0, 1 v3 = uload32.i64 v2 return v3 } @@ -111,7 +111,7 @@ function %i64_store32(i64, i32, i64 vmctx) { heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, offset_guard 0x8000_0000 block0(v0: i64, v1: i32, v2: i64): - v3 = heap_addr.i64 heap0, v1, 1 + v3 = heap_addr.i64 heap0, v1, 0, 1 istore32 v0, v3 return } diff --git a/cranelift/filetests/src/test_compile.rs b/cranelift/filetests/src/test_compile.rs index 4f8fe10840b0..cfc180ba9ded 100644 --- a/cranelift/filetests/src/test_compile.rs +++ b/cranelift/filetests/src/test_compile.rs @@ -130,7 +130,10 @@ fn update_test(output: &[&str], context: &Context) -> Result<()> { // but after we hit a real line then we push all remaining lines. let mut in_next_function = false; for line in old_test { - if !in_next_function && (line.trim().is_empty() || line.starts_with(";")) { + if !in_next_function + && (line.trim().is_empty() + || (line.starts_with(";") && !line.starts_with(";;"))) + { continue; } in_next_function = true; diff --git a/cranelift/filetests/src/test_licm.rs b/cranelift/filetests/src/test_licm.rs index 2ca245055a74..b02bac1e74c6 100644 --- a/cranelift/filetests/src/test_licm.rs +++ b/cranelift/filetests/src/test_licm.rs @@ -45,6 +45,7 @@ impl SubTest for TestLICM { .map_err(|e| crate::pretty_anyhow_error(&comp_ctx.func, Into::into(e)))?; let text = comp_ctx.func.display().to_string(); + log::debug!("Post-LICM CLIF:\n{}", text); run_filecheck(&text, context) } } diff --git a/cranelift/interpreter/src/interpreter.rs b/cranelift/interpreter/src/interpreter.rs index eb1af7b4e312..cc58cf29ba99 100644 --- a/cranelift/interpreter/src/interpreter.rs +++ b/cranelift/interpreter/src/interpreter.rs @@ -1011,7 +1011,7 @@ mod tests { block0(v0: i64): v1 = iconst.i64 0 v2 = iconst.i64 123 - v3 = heap_addr.i64 heap0, v1, 8 + v3 = heap_addr.i64 heap0, v1, 0, 8 store.i64 v2, v3 v4 = load.i64 v3 v5 = icmp eq v2, v4 diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index c158f699e822..fcba98cdf7bf 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -119,10 +119,6 @@ where } // 32-bit InstructionData::UnaryIeee32 { imm, .. } => DataValue::from(imm), - InstructionData::HeapAddr { imm, .. } => { - let imm: u32 = imm.into(); - DataValue::from(imm as i32) // Note the switch from unsigned to signed. - } InstructionData::Load { offset, .. } | InstructionData::Store { offset, .. } | InstructionData::StackLoad { offset, .. } @@ -489,19 +485,27 @@ where Opcode::SymbolValue => unimplemented!("SymbolValue"), Opcode::TlsValue => unimplemented!("TlsValue"), Opcode::HeapAddr => { - if let InstructionData::HeapAddr { heap, .. } = inst { + if let InstructionData::HeapAddr { + heap, + offset: imm_offset, + size, + .. + } = inst + { let addr_ty = inst_context.controlling_type().unwrap(); - let offset = arg(0)?.into_int()? as u64; - let load_size = imm().into_int()? as u64; + let dyn_offset = arg(0)?.into_int()? as u64; assign_or_memtrap({ AddressSize::try_from(addr_ty).and_then(|addr_size| { // Attempt to build an address at the maximum possible offset // for this load. If address generation fails we know it's out of bounds. 
- let bound_offset = (offset + load_size).saturating_sub(1); + let bound_offset = + (dyn_offset + u64::from(u32::from(imm_offset)) + u64::from(size)) + .saturating_sub(1); state.heap_address(addr_size, heap, bound_offset)?; // Build the actual address - let addr = state.heap_address(addr_size, heap, offset)?; + let mut addr = state.heap_address(addr_size, heap, dyn_offset)?; + addr.offset += u64::from(u32::from(imm_offset)); let dv = DataValue::try_from(addr)?; Ok(dv.into()) }) diff --git a/cranelift/reader/src/parser.rs b/cranelift/reader/src/parser.rs index 912282bae4f6..ff85acec72e7 100644 --- a/cranelift/reader/src/parser.rs +++ b/cranelift/reader/src/parser.rs @@ -2965,12 +2965,15 @@ impl<'a> Parser<'a> { self.match_token(Token::Comma, "expected ',' between operands")?; let arg = self.match_value("expected SSA value heap address")?; self.match_token(Token::Comma, "expected ',' between operands")?; - let imm = self.match_uimm32("expected 32-bit integer size")?; + let offset = self.match_uimm32("expected 32-bit integer offset")?; + self.match_token(Token::Comma, "expected ',' between operands")?; + let size = self.match_uimm8("expected 8-bit integer size")?; InstructionData::HeapAddr { opcode, heap, arg, - imm, + offset, + size, } } InstructionFormat::TableAddr => { diff --git a/cranelift/src/clif-util.rs b/cranelift/src/clif-util.rs old mode 100755 new mode 100644 diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 86d54ffc0328..28478b342fea 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -91,7 +91,6 @@ use cranelift_codegen::packed_option::ReservedValue; use cranelift_frontend::{FunctionBuilder, Variable}; use itertools::Itertools; use smallvec::SmallVec; -use std::cmp; use std::convert::TryFrom; use std::vec::Vec; use wasmparser::{FuncValidator, MemArg, Operator, WasmModuleResources}; @@ -697,33 +696,33 @@ pub fn translate_operator( translate_load(memarg, ir::Opcode::Load, I8X16, builder, state, environ)?; } Operator::V128Load8x8S { memarg } => { - let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; - let loaded = builder.ins().sload8x8(flags, base, offset); + let (flags, base) = prepare_addr(memarg, 8, builder, state, environ)?; + let loaded = builder.ins().sload8x8(flags, base, 0); state.push1(loaded); } Operator::V128Load8x8U { memarg } => { - let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; - let loaded = builder.ins().uload8x8(flags, base, offset); + let (flags, base) = prepare_addr(memarg, 8, builder, state, environ)?; + let loaded = builder.ins().uload8x8(flags, base, 0); state.push1(loaded); } Operator::V128Load16x4S { memarg } => { - let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; - let loaded = builder.ins().sload16x4(flags, base, offset); + let (flags, base) = prepare_addr(memarg, 8, builder, state, environ)?; + let loaded = builder.ins().sload16x4(flags, base, 0); state.push1(loaded); } Operator::V128Load16x4U { memarg } => { - let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; - let loaded = builder.ins().uload16x4(flags, base, offset); + let (flags, base) = prepare_addr(memarg, 8, builder, state, environ)?; + let loaded = builder.ins().uload16x4(flags, base, 0); state.push1(loaded); } Operator::V128Load32x2S { memarg } => { - let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; - let loaded = builder.ins().sload32x2(flags, base, offset); 
+ let (flags, base) = prepare_addr(memarg, 8, builder, state, environ)?; + let loaded = builder.ins().sload32x2(flags, base, 0); state.push1(loaded); } Operator::V128Load32x2U { memarg } => { - let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; - let loaded = builder.ins().uload32x2(flags, base, offset); + let (flags, base) = prepare_addr(memarg, 8, builder, state, environ)?; + let loaded = builder.ins().uload32x2(flags, base, 0); state.push1(loaded); } /****************************** Store instructions *********************************** @@ -1067,8 +1066,13 @@ pub fn translate_operator( let heap = state.get_heap(builder.func, memarg.memory, environ)?; let timeout = state.pop1(); // 64 (fixed) let expected = state.pop1(); // 32 or 64 (per the `Ixx` in `IxxAtomicWait`) - let (_flags, addr) = - prepare_atomic_addr(memarg, implied_ty.bytes(), builder, state, environ)?; + let (_flags, addr) = prepare_atomic_addr( + memarg, + u8::try_from(implied_ty.bytes()).unwrap(), + builder, + state, + environ, + )?; assert!(builder.func.dfg.value_type(expected) == implied_ty); // `fn translate_atomic_wait` can inspect the type of `expected` to figure out what // code it needs to generate, if it wants. @@ -2171,21 +2175,20 @@ fn translate_unreachable_operator( /// This function is a generalized helper for validating that a wasm-supplied /// heap address is in-bounds. /// -/// This function takes a litany of parameters and requires that the address to -/// be verified is at the top of the stack in `state`. This will generate -/// necessary IR to validate that the heap address is correctly in-bounds, and -/// various parameters are returned describing the valid heap address if -/// execution reaches that point. +/// This function takes a litany of parameters and requires that the *Wasm* +/// address to be verified is at the top of the stack in `state`. This will +/// generate necessary IR to validate that the heap address is correctly +/// in-bounds, and various parameters are returned describing the valid *native* +/// heap address if execution reaches that point. fn prepare_addr( memarg: &MemArg, - access_size: u32, + access_size: u8, builder: &mut FunctionBuilder, state: &mut FuncTranslationState, environ: &mut FE, -) -> WasmResult<(MemFlags, Value, Offset32)> { +) -> WasmResult<(MemFlags, Value)> { let addr = state.pop1(); let heap = state.get_heap(builder.func, memarg.memory, environ)?; - let offset_guard_size: u64 = builder.func.heaps[heap].offset_guard_size.into(); // How exactly the bounds check is performed here and what it's performed // on is a bit tricky. Generally we want to rely on access violations (e.g. @@ -2244,10 +2247,9 @@ fn prepare_addr( // hit like so: // // * For wasm32, wasmtime defaults to 4gb "static" memories with 2gb guard - // regions. This means our `adjusted_offset` is 1 for all offsets <=2gb. - // This hits the optimized case for `heap_addr` on static memories 4gb in - // size in cranelift's legalization of `heap_addr`, eliding the bounds - // check entirely. + // regions. This means that for all offsets <=2gb, we hit the optimized + // case for `heap_addr` on static memories 4gb in size in cranelift's + // legalization of `heap_addr`, eliding the bounds check entirely. 
 //
 // * For wasm64 offsets <=2gb will generate a single `heap_addr`
 //   instruction, but at this time all heaps are "dynamic" which means that
@@ -2258,43 +2260,17 @@
 // offsets in `memarg` are <=2gb, which means we get the fast path of one
 // `heap_addr` instruction plus a hardcoded i32-offset in memory-related
 // instructions.
-    let adjusted_offset = if offset_guard_size == 0 {
-        // Why saturating? see (1) above
-        memarg.offset.saturating_add(u64::from(access_size))
-    } else {
-        // Why is there rounding here? see (2) above
-        assert!(access_size < 1024);
-        cmp::max(memarg.offset / offset_guard_size * offset_guard_size, 1)
-    };
-
-    debug_assert!(adjusted_offset > 0); // want to bounds check at least 1 byte
-    let (addr, offset) = match u32::try_from(adjusted_offset) {
-        // If our adjusted offset fits within a u32, then we can place the
-        // entire offset into the offset of the `heap_addr` instruction. After
-        // the `heap_addr` instruction, though, we need to factor the the offset
-        // into the returned address. This is either an immediate to later
-        // memory instructions if the offset further fits within `i32`, or a
-        // manual add instruction otherwise.
-        //
-        // Note that native instructions take a signed offset hence the switch
-        // to i32. Note also the lack of overflow checking in the offset
-        // addition, which should be ok since if `heap_addr` passed we're
-        // guaranteed that this won't overflow.
-        Ok(adjusted_offset) => {
-            let base = builder
+    let addr = match u32::try_from(memarg.offset) {
+        // If our offset fits within a u32, then we can place it into the
+        // offset immediate of the `heap_addr` instruction.
+        Ok(offset) => {
+            builder
                 .ins()
-                .heap_addr(environ.pointer_type(), heap, addr, adjusted_offset);
-            match i32::try_from(memarg.offset) {
-                Ok(val) => (base, val),
-                Err(_) => {
-                    let adj = builder.ins().iadd_imm(base, memarg.offset as i64);
-                    (adj, 0)
-                }
-            }
+                .heap_addr(environ.pointer_type(), heap, addr, offset, access_size)
         }
-        // If the adjusted offset doesn't fit within a u32, then we can't pass
-        // the adjust sized to `heap_addr` raw.
+        // If the offset doesn't fit within a u32, then we can't pass it
+        // directly into `heap_addr`.
         //
         // One reasonable question you might ask is "why not?". There's no
         // fundamental reason why `heap_addr` *must* take a 32-bit offset. The
@@ -2313,8 +2289,6 @@
         //
         // Once we have the effective address, offset already folded in, then
         // `heap_addr` is used to verify that the address is indeed in-bounds.
-        // The access size of the `heap_addr` is what we were passed in from
-        // above.
         //
         // Note that this is generating what's likely to be at least two
        // branches, one for the overflow and one for the bounds check itself.
@@ -2328,10 +2302,9 @@
             builder
                 .ins()
                 .uadd_overflow_trap(addr, offset, ir::TrapCode::HeapOutOfBounds);
-            let base = builder
+            builder
                 .ins()
-                .heap_addr(environ.pointer_type(), heap, addr, access_size);
-            (base, 0)
+                .heap_addr(environ.pointer_type(), heap, addr, 0, access_size)
         }
     };
@@ -2348,12 +2321,12 @@
 // vmctx, stack) accesses.
flags.set_heap(); - Ok((flags, addr, offset.into())) + Ok((flags, addr)) } fn prepare_atomic_addr( memarg: &MemArg, - loaded_bytes: u32, + loaded_bytes: u8, builder: &mut FunctionBuilder, state: &mut FuncTranslationState, environ: &mut FE, @@ -2386,18 +2359,7 @@ fn prepare_atomic_addr( builder.ins().trapnz(f, ir::TrapCode::HeapMisaligned); } - let (flags, mut addr, offset) = prepare_addr(memarg, loaded_bytes, builder, state, environ)?; - - // Currently cranelift IR operations for atomics don't have offsets - // associated with them so we fold the offset into the address itself. Note - // that via the `prepare_addr` helper we know that if execution reaches - // this point that this addition won't overflow. - let offset: i64 = offset.into(); - if offset != 0 { - addr = builder.ins().iadd_imm(addr, offset); - } - - Ok((flags, addr)) + prepare_addr(memarg, loaded_bytes, builder, state, environ) } /// Translate a load instruction. @@ -2409,14 +2371,16 @@ fn translate_load( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let (flags, base, offset) = prepare_addr( + let (flags, base) = prepare_addr( memarg, mem_op_size(opcode, result_ty), builder, state, environ, )?; - let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base); + let (load, dfg) = builder + .ins() + .Load(opcode, result_ty, flags, Offset32::new(0), base); state.push1(dfg.first_result(load)); Ok(()) } @@ -2432,20 +2396,19 @@ fn translate_store( let val = state.pop1(); let val_ty = builder.func.dfg.value_type(val); - let (flags, base, offset) = - prepare_addr(memarg, mem_op_size(opcode, val_ty), builder, state, environ)?; + let (flags, base) = prepare_addr(memarg, mem_op_size(opcode, val_ty), builder, state, environ)?; builder .ins() - .Store(opcode, val_ty, flags, offset.into(), val, base); + .Store(opcode, val_ty, flags, Offset32::new(0), val, base); Ok(()) } -fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 { +fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u8 { match opcode { ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1, ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2, ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4, - ir::Opcode::Store | ir::Opcode::Load => ty.bytes(), + ir::Opcode::Store | ir::Opcode::Load => u8::try_from(ty.bytes()).unwrap(), _ => panic!("unknown size of mem op for {:?}", opcode), } } @@ -2490,7 +2453,13 @@ fn translate_atomic_rmw( arg2 = builder.ins().ireduce(access_ty, arg2); } - let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + let (flags, addr) = prepare_atomic_addr( + memarg, + u8::try_from(access_ty.bytes()).unwrap(), + builder, + state, + environ, + )?; let mut res = builder.ins().atomic_rmw(access_ty, flags, op, addr, arg2); if access_ty != widened_ty { @@ -2538,7 +2507,13 @@ fn translate_atomic_cas( replacement = builder.ins().ireduce(access_ty, replacement); } - let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + let (flags, addr) = prepare_atomic_addr( + memarg, + u8::try_from(access_ty.bytes()).unwrap(), + builder, + state, + environ, + )?; let mut res = builder.ins().atomic_cas(flags, addr, expected, replacement); if access_ty != widened_ty { res = builder.ins().uextend(widened_ty, res); @@ -2572,7 +2547,13 @@ fn translate_atomic_load( }; assert!(w_ty_ok && widened_ty.bytes() >= access_ty.bytes()); - let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + 
let (flags, addr) = prepare_atomic_addr( + memarg, + u8::try_from(access_ty.bytes()).unwrap(), + builder, + state, + environ, + )?; let mut res = builder.ins().atomic_load(access_ty, flags, addr); if access_ty != widened_ty { res = builder.ins().uextend(widened_ty, res); @@ -2612,7 +2593,13 @@ fn translate_atomic_store( data = builder.ins().ireduce(access_ty, data); } - let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + let (flags, addr) = prepare_atomic_addr( + memarg, + u8::try_from(access_ty.bytes()).unwrap(), + builder, + state, + environ, + )?; builder.ins().atomic_store(flags, data, addr); Ok(()) }
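
A quick worked example of the interpreter's new bounds computation may help. Only the formula comes from the patch: the last byte touched is `dyn_offset + imm_offset + size - 1`, with a saturating subtraction so that a zero-sized access at offset zero cannot underflow. The concrete values below are illustrative, not from the patch:

    // Names mirror the interpreter hunk; values are made up for illustration.
    let dyn_offset: u64 = 16; // runtime index operand
    let imm_offset: u64 = 8;  // static `offset` immediate (Uimm32)
    let size: u64 = 4;        // static `size` immediate (Uimm8)

    // Last byte touched is at 16 + 8 + 4 - 1 = 27, so the bounds check
    // probes offset 27, i.e. the byte range 16..28.
    let bound_offset = (dyn_offset + imm_offset + size).saturating_sub(1);
    assert_eq!(bound_offset, 27);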
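With the parser change in cranelift/reader/src/parser.rs, the textual form of `heap_addr` gains a second immediate, so a CLIF line would plausibly read `v1 = heap_addr.i64 heap0, v0, 32, 4` (static offset 32, access size 4; inferred from the parse order, not quoted from a test). At the builder level the same call looks roughly like the sketch below, where `ptr_ty`, `h0`, and `idx` are stand-in names and only the argument order comes from the patch:

    // Sketch: compute a bounds-checked native address for `idx + 32`,
    // covering a 4-byte access. If `idx + 32 + 4` exceeds the heap bound,
    // the result traps (or yields an address guaranteed to trap).
    let base = builder.ins().heap_addr(ptr_ty, h0, idx, 32, 4);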
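For 64-bit `memarg.offset` values that do not fit the new `Uimm32` immediate, `prepare_addr` folds the offset into the address before bounds checking. Pieced together from the hunk above; the materialization of `offset` itself is elided in the hunk, so the `iconst` here is an assumption, and `pointer_ty` stands in for `environ.pointer_type()`:

    // Assumed materialization of the 64-bit offset (not shown in the hunk).
    let offset = builder.ins().iconst(pointer_ty, memarg.offset as i64);
    // Trap on wraparound, then bounds-check the folded address with a
    // zero offset immediate and the real access size.
    let addr = builder
        .ins()
        .uadd_overflow_trap(addr, offset, ir::TrapCode::HeapOutOfBounds);
    let base = builder
        .ins()
        .heap_addr(pointer_ty, heap, addr, 0, access_size);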
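The switch from `u32` to `u8` access sizes leaves several repeated `u8::try_from(...).unwrap()` call sites in the atomic helpers. A follow-up could centralize the conversion; a possible helper, not part of this patch:

    /// Hypothetical helper (not in this patch): CLIF types are at most
    /// 16 bytes wide, so the narrowing to `u8` cannot fail in practice.
    fn access_size_u8(ty: ir::Type) -> u8 {
        u8::try_from(ty.bytes()).expect("type width fits in u8")
    }

Call sites such as `prepare_atomic_addr(memarg, access_size_u8(access_ty), ...)` would then shrink back to one line each.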