From 1f6e901a88281294775caf1e71530114ac82a8b5 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Fri, 9 Feb 2024 17:44:05 -0500
Subject: [PATCH] x64: Refactor multiplication instructions (#7871)

* x64: Refactor multiplication instructions

This commit was inspired by reading over some code from #7865
and #7866. The goal of this commit is to refactor
scalar multiplication-related instructions in the x64 backend to more
closely align with their native instructions. Changes include:

* The `MulHi` instruction is renamed to `Mul`. This represents either
  `mul` or `imul` producing a doublewide result.
* A `Mul8` instruction was added to correspond to `Mul` for the 8-bit
  variants that produce a doublewide result in the `AX` register rather
  than the other instructions which split between `RAX` and `RDX`.
* The `UMulLo` instruction was removed as now it's covered by `Mul`
* The `AluRmiROpcode::Mul` opcode was removed in favor of new `IMul` and
  `IMulImm` instructions. Register allocation and emission already had
  special cases for `Mul` which felt better as standalone instructions
  rather than being folded into an existing variant.

Lowerings using `imul` are not affected in general but the `IMulImm`
instruction has different register allocation behavior than before which
allows the destination to have a different register than the first
operand. The `umulhi` and `smulhi` instructions are also reimplemented
with their 8-bit variants instead of extension-plus-16-bit variants.

* Remove outdated emit tests

These are all covered by the filetests framework now too.

* Fix Winch build
---
 cranelift/codegen/src/isa/x64/inst.isle       | 127 +++--
 cranelift/codegen/src/isa/x64/inst/args.rs    |   3 -
 cranelift/codegen/src/isa/x64/inst/emit.rs    | 279 +++++-----
 .../codegen/src/isa/x64/inst/emit_tests.rs    | 393 --------------
 cranelift/codegen/src/isa/x64/inst/mod.rs     |  92 ++--
 cranelift/codegen/src/isa/x64/lower.isle      |  87 ++--
 cranelift/codegen/src/isa/x64/pcc.rs          |  30 +-
 .../filetests/filetests/isa/x64/i128.clif     |   2 +-
 .../filetests/filetests/isa/x64/mul.clif      | 489 ++++++++++++++++++
 .../filetests/filetests/isa/x64/popcnt.clif   |  60 +--
 .../filetests/filetests/isa/x64/smulhi.clif   |  20 +-
 .../filetests/filetests/isa/x64/umulhi.clif   |  20 +-
 winch/codegen/src/isa/x64/asm.rs              |  10 +-
 13 files changed, 877 insertions(+), 735 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/mul.clif

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 82fbcd052dc2..3812d459ce73 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -90,20 +90,34 @@
              (dividend Gpr)
              (dst WritableGpr))
 
-       ;; The high (and low) bits of a (un)signed multiply: `RDX:RAX := RAX *
-       ;; rhs`.
-       (MulHi (size OperandSize)
-              (signed bool)
-              (src1 Gpr)
-              (src2 GprMem)
-              (dst_lo WritableGpr)
-              (dst_hi WritableGpr))
-
-       ;; x64 'mul' instruction but it only outputs the low half
-       (UMulLo    (size OperandSize)
-                  (src1 Gpr)
-                  (src2 GprMem)
-                  (dst WritableGpr))
+       ;; Signed or unsigned (per `signed`) widening multiplication producing
+       ;; the high bits of the result in one register and the low bits in another.
+       (Mul (size OperandSize)
+            (signed bool)
+            (src1 Gpr)
+            (src2 GprMem)
+            (dst_lo WritableGpr)
+            (dst_hi WritableGpr))
+
+       ;; Same as `Mul` but for 8-bit operands; the 16-bit result is in `AX`.
+       (Mul8 (signed bool)
+             (src1 Gpr)
+             (src2 GprMem)
+             (dst WritableGpr))
+
+       ;; The two-operand form of `imul` which produces a result truncated to
+       ;; the same size as the operands.
+       (IMul (size OperandSize)
+             (src1 Gpr)
+             (src2 GprMem)
+             (dst WritableGpr))
+
+       ;; The three-operand form of `imul` where the third operand must be
+       ;; a constant.
+       (IMulImm (size OperandSize)
+                (src1 GprMem)
+                (src2 i32)
+                (dst WritableGpr))
 
        ;; A synthetic instruction sequence used as part of the lowering of the
        ;; `srem` instruction which returns 0 if the divisor is -1 and
@@ -750,8 +764,7 @@
             Sbb
             And
             Or
-            Xor
-            Mul))
+            Xor))
 
 (type AluRmROpcode
       (enum Andn
@@ -2046,22 +2059,6 @@
             (_ Unit (emit (MInst.XmmRmRVex3 op src1 src2 src3 dst))))
         dst))
 
-;; Helper for creating `MInst.MulHi` instructions.
-;;
-;; Returns the (lo, hi) register halves of the multiplication.
-(decl mul_hi (Type bool Gpr GprMem) ValueRegs)
-(rule (mul_hi ty signed src1 src2)
-      (let ((dst_lo WritableGpr (temp_writable_gpr))
-            (dst_hi WritableGpr (temp_writable_gpr))
-            (size OperandSize (raw_operand_size_of_type ty))
-            (_ Unit (emit (MInst.MulHi size
-                                       signed
-                                       src1
-                                       src2
-                                       dst_lo
-                                       dst_hi))))
-        (value_gprs dst_lo dst_hi)))
-
 ;; Helper for creating `MInst.UnaryRmR` instructions.
 (decl unary_rm_r (UnaryRmROpcode Gpr OperandSize) Gpr)
 (rule (unary_rm_r op src size)
@@ -2559,31 +2556,55 @@
                         dst)
          dst)))
 
-;; Helper for creating `mul` instructions.
-(decl x64_mul (Type Gpr GprMemImm) Gpr)
-(rule (x64_mul ty src1 src2)
-      (alu_rmi_r ty
-                 (AluRmiROpcode.Mul)
-                 src1
-                 src2))
+;; Helper for creating `mul` or `imul` instructions (depending on `signed`).
+;; Returns the (lo, hi) register halves of the multiplication.
+(decl x64_mul (Type bool Gpr GprMem) ValueRegs)
+(rule (x64_mul ty signed src1 src2)
+      (let ((dst_lo WritableGpr (temp_writable_gpr))
+            (dst_hi WritableGpr (temp_writable_gpr))
+            (size OperandSize (raw_operand_size_of_type ty))
+            (_ Unit (emit (MInst.Mul size signed src1 src2 dst_lo dst_hi))))
+        (value_gprs dst_lo dst_hi)))
+
+;; Helper for creating `mul` instructions or `imul` instructions (depending
+;; on `signed`) for 8-bit operands.
+(decl x64_mul8 (bool Gpr GprMem) Gpr)
+(rule (x64_mul8 signed src1 src2)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.Mul8 signed src1 src2 dst))))
+        dst))
+
+;; Helper for creating `imul` instructions.
+(decl x64_imul (Type Gpr GprMem) Gpr)
+(rule (x64_imul ty src1 src2)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (raw_operand_size_of_type ty))
+            (_ Unit (emit (MInst.IMul size src1 src2 dst))))
+        dst))
 
-;; Helper for creating `umullo` instructions.
-(decl x64_umullo (Type Gpr GprMem) Gpr)
-(rule (x64_umullo ty src1 src2)
+;; Helper for creating `imul` instructions with an immediate operand.
+(decl x64_imul_imm (Type GprMem i32) Gpr)
+(rule (x64_imul_imm ty src1 src2)
       (let ((dst WritableGpr (temp_writable_gpr))
             (size OperandSize (raw_operand_size_of_type ty))
-            (_ Unit (emit (MInst.UMulLo size src1 src2 dst))))
+            (_ Unit (emit (MInst.IMulImm size src1 src2 dst))))
         dst))
 
-(decl x64_umullo_with_flags_paired (Type Gpr GprMem) ProducesFlags)
-(rule (x64_umullo_with_flags_paired ty src1 src2)
+(decl x64_mul8_with_flags_paired (bool Gpr GprMem) ProducesFlags)
+(rule (x64_mul8_with_flags_paired signed src1 src2)
       (let ((dst WritableGpr (temp_writable_gpr)))
            (ProducesFlags.ProducesFlagsReturnsResultWithConsumer
-                 (MInst.UMulLo (raw_operand_size_of_type ty)
-                               src1
-                               src2
-                               dst)
-                  dst)))
+                 (MInst.Mul8 signed src1 src2 dst)
+                 dst)))
+
+(decl x64_mul_lo_with_flags_paired (Type bool Gpr GprMem) ProducesFlags)
+(rule (x64_mul_lo_with_flags_paired ty signed src1 src2)
+      (let ((dst_lo WritableGpr (temp_writable_gpr))
+            (dst_hi WritableGpr (temp_writable_gpr))
+            (size OperandSize (raw_operand_size_of_type ty)))
+           (ProducesFlags.ProducesFlagsReturnsResultWithConsumer
+                 (MInst.Mul size signed src1 src2 dst_lo dst_hi)
+                 dst_lo)))
 
 ;; Helper for emitting `and` instructions.
 (decl x64_and (Type Gpr GprMemImm) Gpr)
@@ -3891,12 +3912,6 @@
                                              dst))))
         dst))
 
-;; Helper for creating `mul` instructions that return both the lower and
-;; (unsigned) higher halves of the result.
-(decl mulhi_u (Type Gpr GprMem) ValueRegs)
-(rule (mulhi_u ty src1 src2)
-      (mul_hi ty $false src1 src2))
-
 ;; Helper for creating `psllw` instructions.
 (decl x64_psllw (Xmm XmmMemImm) Xmm)
 (rule 0 (x64_psllw src1 src2)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 51f3bf01ae9e..05bb3aaf9069 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -818,8 +818,6 @@ pub enum AluRmiROpcode {
     Or,
     /// Bitwise exclusive OR.
     Xor,
-    /// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
-    Mul,
 }
 
 impl fmt::Debug for AluRmiROpcode {
@@ -832,7 +830,6 @@ impl fmt::Debug for AluRmiROpcode {
             AluRmiROpcode::And => "and",
             AluRmiROpcode::Or => "or",
             AluRmiROpcode::Xor => "xor",
-            AluRmiROpcode::Mul => "imul",
         };
         write!(fmt, "{}", name)
     }
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 1312a1578db6..546f961da636 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -168,134 +168,73 @@ pub(crate) fn emit(
             };
 
             let mut rex = RexFlags::from(*size);
-            if *op == AluRmiROpcode::Mul {
-                // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
-                // we have to special-case it.
-                if *size == OperandSize::Size8 {
-                    match src2 {
-                        RegMemImm::Reg { reg: reg_e } => {
-                            debug_assert!(reg_e.is_real());
-                            rex.always_emit_if_8bit_needed(reg_e);
-                            let enc_e = int_reg_enc(reg_e);
-                            emit_std_enc_enc(sink, LegacyPrefixes::None, 0xF6, 1, 5, enc_e, rex);
-                        }
-
-                        RegMemImm::Mem { addr } => {
-                            let amode = addr.finalize(state, sink);
-                            emit_std_enc_mem(
-                                sink,
-                                LegacyPrefixes::None,
-                                0xF6,
-                                1,
-                                5,
-                                &amode,
-                                rex,
-                                0,
-                            );
-                        }
+            let (opcode_r, opcode_m, subopcode_i) = match op {
+                AluRmiROpcode::Add => (0x01, 0x03, 0),
+                AluRmiROpcode::Adc => (0x11, 0x03, 0),
+                AluRmiROpcode::Sub => (0x29, 0x2B, 5),
+                AluRmiROpcode::Sbb => (0x19, 0x2B, 5),
+                AluRmiROpcode::And => (0x21, 0x23, 4),
+                AluRmiROpcode::Or => (0x09, 0x0B, 1),
+                AluRmiROpcode::Xor => (0x31, 0x33, 6),
+            };
 
-                        RegMemImm::Imm { .. } => {
-                            panic!("Cannot emit 8bit imul with 8bit immediate");
-                        }
-                    }
-                } else {
-                    match src2 {
-                        RegMemImm::Reg { reg: reg_e } => {
-                            emit_std_reg_reg(sink, prefix, 0x0FAF, 2, reg_g, reg_e, rex);
-                        }
+            let (opcode_r, opcode_m) = if *size == OperandSize::Size8 {
+                (opcode_r - 1, opcode_m - 1)
+            } else {
+                (opcode_r, opcode_m)
+            };
 
-                        RegMemImm::Mem { addr } => {
-                            let amode = addr.finalize(state, sink);
-                            emit_std_reg_mem(sink, prefix, 0x0FAF, 2, reg_g, &amode, rex, 0);
-                        }
+            if *size == OperandSize::Size8 {
+                debug_assert!(reg_g.is_real());
+                rex.always_emit_if_8bit_needed(reg_g);
+            }
 
-                        RegMemImm::Imm { simm32 } => {
-                            let imm_size = if low8_will_sign_extend_to_32(simm32) {
-                                1
-                            } else {
-                                if *size == OperandSize::Size16 {
-                                    2
-                                } else {
-                                    4
-                                }
-                            };
-                            let opcode = if imm_size == 1 { 0x6B } else { 0x69 };
-                            // Yes, really, reg_g twice.
-                            emit_std_reg_reg(sink, prefix, opcode, 1, reg_g, reg_g, rex);
-                            emit_simm(sink, imm_size, simm32);
-                        }
+            match src2 {
+                RegMemImm::Reg { reg: reg_e } => {
+                    if *size == OperandSize::Size8 {
+                        debug_assert!(reg_e.is_real());
+                        rex.always_emit_if_8bit_needed(reg_e);
                     }
-                }
-            } else {
-                let (opcode_r, opcode_m, subopcode_i) = match op {
-                    AluRmiROpcode::Add => (0x01, 0x03, 0),
-                    AluRmiROpcode::Adc => (0x11, 0x03, 0),
-                    AluRmiROpcode::Sub => (0x29, 0x2B, 5),
-                    AluRmiROpcode::Sbb => (0x19, 0x2B, 5),
-                    AluRmiROpcode::And => (0x21, 0x23, 4),
-                    AluRmiROpcode::Or => (0x09, 0x0B, 1),
-                    AluRmiROpcode::Xor => (0x31, 0x33, 6),
-                    AluRmiROpcode::Mul => panic!("unreachable"),
-                };
 
-                let (opcode_r, opcode_m) = if *size == OperandSize::Size8 {
-                    (opcode_r - 1, opcode_m - 1)
-                } else {
-                    (opcode_r, opcode_m)
-                };
-
-                if *size == OperandSize::Size8 {
-                    debug_assert!(reg_g.is_real());
-                    rex.always_emit_if_8bit_needed(reg_g);
+                    // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
+                    // duality). Do this too, so as to be able to compare generated machine
+                    // code easily.
+                    emit_std_reg_reg(sink, prefix, opcode_r, 1, reg_e, reg_g, rex);
                 }
 
-                match src2 {
-                    RegMemImm::Reg { reg: reg_e } => {
-                        if *size == OperandSize::Size8 {
-                            debug_assert!(reg_e.is_real());
-                            rex.always_emit_if_8bit_needed(reg_e);
-                        }
-
-                        // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
-                        // duality). Do this too, so as to be able to compare generated machine
-                        // code easily.
-                        emit_std_reg_reg(sink, prefix, opcode_r, 1, reg_e, reg_g, rex);
-                    }
-
-                    RegMemImm::Mem { addr } => {
-                        let amode = addr.finalize(state, sink);
-                        // Here we revert to the "normal" G-E ordering.
-                        emit_std_reg_mem(sink, prefix, opcode_m, 1, reg_g, &amode, rex, 0);
-                    }
+                RegMemImm::Mem { addr } => {
+                    let amode = addr.finalize(state, sink);
+                    // Here we revert to the "normal" G-E ordering.
+                    emit_std_reg_mem(sink, prefix, opcode_m, 1, reg_g, &amode, rex, 0);
+                }
 
-                    RegMemImm::Imm { simm32 } => {
-                        let imm_size = if *size == OperandSize::Size8 {
+                RegMemImm::Imm { simm32 } => {
+                    let imm_size = if *size == OperandSize::Size8 {
+                        1
+                    } else {
+                        if low8_will_sign_extend_to_32(simm32) {
                             1
                         } else {
-                            if low8_will_sign_extend_to_32(simm32) {
-                                1
+                            if *size == OperandSize::Size16 {
+                                2
                             } else {
-                                if *size == OperandSize::Size16 {
-                                    2
-                                } else {
-                                    4
-                                }
+                                4
                             }
-                        };
+                        }
+                    };
 
-                        let opcode = if *size == OperandSize::Size8 {
-                            0x80
-                        } else if low8_will_sign_extend_to_32(simm32) {
-                            0x83
-                        } else {
-                            0x81
-                        };
+                    let opcode = if *size == OperandSize::Size8 {
+                        0x80
+                    } else if low8_will_sign_extend_to_32(simm32) {
+                        0x83
+                    } else {
+                        0x81
+                    };
 
-                        // And also here we use the "normal" G-E ordering.
-                        let enc_g = int_reg_enc(reg_g);
-                        emit_std_enc_enc(sink, prefix, opcode, 1, subopcode_i, enc_g, rex);
-                        emit_simm(sink, imm_size, simm32);
-                    }
+                    // And also here we use the "normal" G-E ordering.
+                    let enc_g = int_reg_enc(reg_g);
+                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode_i, enc_g, rex);
+                    emit_simm(sink, imm_size, simm32);
                 }
             }
         }
@@ -611,9 +550,9 @@ pub(crate) fn emit(
             }
         }
 
-        Inst::MulHi {
-            size,
+        Inst::Mul {
             signed,
+            size,
             src1,
             src2,
             dst_lo,
@@ -625,6 +564,7 @@ pub(crate) fn emit(
             debug_assert_eq!(src1, regs::rax());
             debug_assert_eq!(dst_lo, regs::rax());
             debug_assert_eq!(dst_hi, regs::rdx());
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
 
             let rex_flags = RexFlags::from(*size);
             let prefix = match size {
@@ -635,21 +575,19 @@ pub(crate) fn emit(
             };
 
             let subopcode = if *signed { 5 } else { 4 };
-            match src2.clone().to_reg_mem() {
+            match src2 {
                 RegMem::Reg { reg } => {
-                    let reg = allocs.next(reg);
                     let src = int_reg_enc(reg);
                     emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags)
                 }
                 RegMem::Mem { addr: src } => {
-                    let amode = src.finalize(state, sink).with_allocs(allocs);
+                    let amode = src.finalize(state, sink);
                     emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
                 }
             }
         }
-
-        Inst::UMulLo {
-            size,
+        Inst::Mul8 {
+            signed,
             src1,
             src2,
             dst,
@@ -658,33 +596,98 @@ pub(crate) fn emit(
             let dst = allocs.next(dst.to_reg().to_reg());
             debug_assert_eq!(src1, regs::rax());
             debug_assert_eq!(dst, regs::rax());
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
 
-            let mut rex = RexFlags::from(*size);
+            let mut rex_flags = RexFlags::from(OperandSize::Size8);
+            let prefix = LegacyPrefixes::None;
+            let subopcode = if *signed { 5 } else { 4 };
+            match src2 {
+                RegMem::Reg { reg } => {
+                    // The intel manual states:
+                    //
+                    // > r/m8 can not be encoded to access the following byte
+                    // > registers if a REX prefix is used: AH, BH, CH, DH
+                    //
+                    // Without REX, encodings 4-7 select AH/CH/DH/BH, so
+                    // force REX unless the register is rax/rbx/rcx/rdx.
+                    if !(reg == regs::rax()
+                        || reg == regs::rbx()
+                        || reg == regs::rcx()
+                        || reg == regs::rdx())
+                    {
+                        rex_flags.always_emit();
+                    }
+                    let src = int_reg_enc(reg);
+                    emit_std_enc_enc(sink, prefix, 0xF6, 1, subopcode, src, rex_flags)
+                }
+                RegMem::Mem { addr } => {
+                    let amode = addr.finalize(state, sink);
+                    emit_std_enc_mem(sink, prefix, 0xF6, 1, subopcode, &amode, rex_flags, 0);
+                }
+            }
+        }
+        Inst::IMul {
+            size,
+            src1,
+            src2,
+            dst,
+        } => {
+            let src1 = allocs.next(src1.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+            debug_assert_eq!(src1, dst);
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
+
+            let rex = RexFlags::from(*size);
+            let prefix = LegacyPrefixes::None;
+            match src2 {
+                RegMem::Reg { reg } => {
+                    emit_std_reg_reg(sink, prefix, 0x0FAF, 2, dst, reg, rex);
+                }
+
+                RegMem::Mem { addr } => {
+                    let amode = addr.finalize(state, sink);
+                    emit_std_reg_mem(sink, prefix, 0x0FAF, 2, dst, &amode, rex, 0);
+                }
+            }
+        }
+
+        Inst::IMulImm {
+            size,
+            src1,
+            src2,
+            dst,
+        } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            let src1 = src1.clone().to_reg_mem().with_allocs(allocs);
+
+            let rex = RexFlags::from(*size);
             let prefix = match size {
+                // NB: the intel manual doesn't seem to mention this prefix as
+                // being required
                 OperandSize::Size16 => LegacyPrefixes::_66,
                 _ => LegacyPrefixes::None,
             };
-
-            let opcode = if *size == OperandSize::Size8 {
-                0xF6
+            let imm_size = if i8::try_from(*src2).is_ok() {
+                1
             } else {
-                0xF7
+                if *size == OperandSize::Size16 {
+                    2
+                } else {
+                    4
+                }
             };
-
-            match src2.clone().to_reg_mem() {
+            let opcode = if imm_size == 1 { 0x6B } else { 0x69 };
+            match src1 {
                 RegMem::Reg { reg } => {
-                    let reg = allocs.next(reg);
-                    if *size == OperandSize::Size8 {
-                        rex.always_emit_if_8bit_needed(reg);
-                    }
-                    let reg_e = int_reg_enc(reg);
-                    emit_std_enc_enc(sink, prefix, opcode, 1, 4, reg_e, rex);
+                    emit_std_reg_reg(sink, prefix, opcode, 1, dst, reg, rex);
                 }
-                RegMem::Mem { addr: src } => {
-                    let amode = src.finalize(state, sink).with_allocs(allocs);
-                    emit_std_enc_mem(sink, prefix, opcode, 1, 4, &amode, rex, 0);
+
+                RegMem::Mem { addr } => {
+                    let amode = addr.finalize(state, sink);
+                    emit_std_reg_mem(sink, prefix, opcode, 1, dst, &amode, rex, imm_size);
                 }
             }
+            emit_simm(sink, imm_size, *src2 as u32);
         }
 
         Inst::SignExtendData { size, src, dst } => {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index b340f6e2a61e..8b2d51e9432c 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -61,32 +61,6 @@ impl Inst {
         }
     }
 
-    fn mul_hi(size: OperandSize, signed: bool, rhs: RegMem) -> Inst {
-        debug_assert!(size.is_one_of(&[
-            OperandSize::Size16,
-            OperandSize::Size32,
-            OperandSize::Size64
-        ]));
-        rhs.assert_regclass_is(RegClass::Int);
-        Inst::MulHi {
-            size,
-            signed,
-            src1: Gpr::new(regs::rax()).unwrap(),
-            src2: GprMem::new(rhs).unwrap(),
-            dst_lo: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
-            dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
-        }
-    }
-
-    fn umul_lo(size: OperandSize, operand: RegMem) -> Inst {
-        Inst::UMulLo {
-            size,
-            src1: Gpr::new(regs::rax()).unwrap(),
-            src2: GprMem::new(operand).unwrap(),
-            dst: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
-        }
-    }
-
     fn xmm_rm_r_evex(op: Avx512Opcode, src1: Reg, src2: RegMem, dst: Writable<Reg>) -> Self {
         src2.assert_regclass_is(RegClass::Float);
         debug_assert!(src1.class() == RegClass::Float);
@@ -1352,157 +1326,6 @@ fn test_x64_emit() {
         "4C31FA",
         "xorq    %rdx, %r15, %rdx",
     ));
-    // Test all mul cases, though
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size64,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(r15),
-            w_rdx,
-        ),
-        "490FAFD7",
-        "imulq   %rdx, %r15, %rdx",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(rcx),
-            w_r8,
-        ),
-        "440FAFC1",
-        "imull   %r8d, %ecx, %r8d",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(rcx),
-            w_rsi,
-        ),
-        "0FAFF1",
-        "imull   %esi, %ecx, %esi",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size64,
-            AluRmiROpcode::Mul,
-            RegMemImm::mem(Amode::imm_reg(99, rdi)),
-            w_rdx,
-        ),
-        "480FAF5763",
-        "imulq   %rdx, 99(%rdi), %rdx",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::mem(Amode::imm_reg(99, rdi)),
-            w_r8,
-        ),
-        "440FAF4763",
-        "imull   %r8d, 99(%rdi), %r8d",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::mem(Amode::imm_reg(99, rdi)),
-            w_rsi,
-        ),
-        "0FAF7763",
-        "imull   %esi, 99(%rdi), %esi",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size64,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-127i32 as u32),
-            w_rdx,
-        ),
-        "486BD281",
-        "imulq   %rdx, $-127, %rdx",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size64,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-129i32 as u32),
-            w_rdx,
-        ),
-        "4869D27FFFFFFF",
-        "imulq   %rdx, $-129, %rdx",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size64,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(76543210),
-            w_rdx,
-        ),
-        "4869D2EAF48F04",
-        "imulq   %rdx, $76543210, %rdx",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-127i32 as u32),
-            w_r8,
-        ),
-        "456BC081",
-        "imull   %r8d, $-127, %r8d",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-129i32 as u32),
-            w_r8,
-        ),
-        "4569C07FFFFFFF",
-        "imull   %r8d, $-129, %r8d",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-76543210i32 as u32),
-            w_r8,
-        ),
-        "4569C0160B70FB",
-        "imull   %r8d, $-76543210, %r8d",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-127i32 as u32),
-            w_rsi,
-        ),
-        "6BF681",
-        "imull   %esi, $-127, %esi",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-129i32 as u32),
-            w_rsi,
-        ),
-        "69F67FFFFFFF",
-        "imull   %esi, $-129, %esi",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(76543210),
-            w_rsi,
-        ),
-        "69F6EAF48F04",
-        "imull   %esi, $76543210, %esi",
-    ));
 
     insns.push((
         Inst::alu_rmi_r(
@@ -1585,88 +1408,6 @@ fn test_x64_emit() {
         "andw    %r14w, $-512, %r14w",
     ));
 
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(10),
-            w_rax,
-        ),
-        "666BC00A",
-        "imulw   %ax, $10, %ax",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-512i32 as u32),
-            w_rax,
-        ),
-        "6669C000FE",
-        "imulw   %ax, $-512, %ax",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(10),
-            w_r11,
-        ),
-        "66456BDB0A",
-        "imulw   %r11w, $10, %r11w",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::imm(-512i32 as u32),
-            w_r11,
-        ),
-        "664569DB00FE",
-        "imulw   %r11w, $-512, %r11w",
-    ));
-
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(rdx),
-            w_rax,
-        ),
-        "660FAFC2",
-        "imulw   %ax, %dx, %ax",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(r12),
-            w_rax,
-        ),
-        "66410FAFC4",
-        "imulw   %ax, %r12w, %ax",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(rdx),
-            w_r11,
-        ),
-        "66440FAFDA",
-        "imulw   %r11w, %dx, %r11w",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size16,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(r12),
-            w_r11,
-        ),
-        "66450FAFDC",
-        "imulw   %r11w, %r12w, %r11w",
-    ));
-
     insns.push((
         Inst::alu_rmi_r(
             OperandSize::Size8,
@@ -1871,48 +1612,6 @@ fn test_x64_emit() {
         "andb    %r15b, %r15b, %r15b",
     ));
 
-    // the 8bit imul has rax as fixed dst
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size8,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(rcx),
-            w_rax,
-        ),
-        "F6E9",
-        "imulb   %al, %cl, %al",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size8,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(rbp),
-            w_rax,
-        ),
-        "40F6ED",
-        "imulb   %al, %bpl, %al",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size8,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(r10),
-            w_rax,
-        ),
-        "41F6EA",
-        "imulb   %al, %r10b, %al",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size8,
-            AluRmiROpcode::Mul,
-            RegMemImm::reg(r15),
-            w_rax,
-        ),
-        "41F6EF",
-        "imulb   %al, %r15b, %al",
-    ));
-
     // ========================================================
     // AluRM
 
@@ -2265,98 +1964,6 @@ fn test_x64_emit() {
         "div     %al, %sil, %al ; trap=int_divz",
     ));
 
-    // ========================================================
-    // MulHi
-    insns.push((
-        Inst::mul_hi(
-            OperandSize::Size32,
-            true, /*signed*/
-            RegMem::reg(regs::rsi()),
-        ),
-        "F7EE",
-        "imul    %eax, %esi, %eax, %edx",
-    ));
-    insns.push((
-        Inst::mul_hi(
-            OperandSize::Size64,
-            true, /*signed*/
-            RegMem::reg(regs::r15()),
-        ),
-        "49F7EF",
-        "imul    %rax, %r15, %rax, %rdx",
-    ));
-    insns.push((
-        Inst::mul_hi(
-            OperandSize::Size32,
-            false, /*signed*/
-            RegMem::reg(regs::r14()),
-        ),
-        "41F7E6",
-        "mul     %eax, %r14d, %eax, %edx",
-    ));
-    insns.push((
-        Inst::mul_hi(
-            OperandSize::Size64,
-            false, /*signed*/
-            RegMem::reg(regs::rdi()),
-        ),
-        "48F7E7",
-        "mul     %rax, %rdi, %rax, %rdx",
-    ));
-
-    // ========================================================
-    // UMulLo
-    insns.push((
-        Inst::umul_lo(OperandSize::Size64, RegMem::reg(regs::rdx())),
-        "48F7E2",
-        "mulq    %rax, %rdx, %rax",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size64, RegMem::reg(regs::r12())),
-        "49F7E4",
-        "mulq    %rax, %r12, %rax",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size32, RegMem::reg(regs::rdx())),
-        "F7E2",
-        "mull    %eax, %edx, %eax",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size32, RegMem::reg(regs::r12())),
-        "41F7E4",
-        "mull    %eax, %r12d, %eax",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size16, RegMem::reg(regs::rdx())),
-        "66F7E2",
-        "mulw    %ax, %dx, %ax",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size16, RegMem::reg(regs::r12())),
-        "6641F7E4",
-        "mulw    %ax, %r12w, %ax",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::rdx())),
-        "F6E2",
-        "mulb    %al, %dl, %al",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::rdi())),
-        "40F6E7",
-        "mulb    %al, %dil, %al",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::r9())),
-        "41F6E1",
-        "mulb    %al, %r9b, %al",
-    ));
-    insns.push((
-        Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::r12())),
-        "41F6E4",
-        "mulb    %al, %r12b, %al",
-    ));
-
     // ========================================================
     // Imm_R
     //
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 19bda166c19f..ea2ff83b3c83 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -130,8 +130,10 @@ impl Inst {
             | Inst::MovToPReg { .. }
             | Inst::MovsxRmR { .. }
             | Inst::MovzxRmR { .. }
-            | Inst::MulHi { .. }
-            | Inst::UMulLo { .. }
+            | Inst::Mul { .. }
+            | Inst::Mul8 { .. }
+            | Inst::IMul { .. }
+            | Inst::IMulImm { .. }
             | Inst::Neg { .. }
             | Inst::Not { .. }
             | Inst::Nop { .. }
@@ -857,7 +859,7 @@ impl PrettyPrint for Inst {
                 format!("{op} {dividend}, {divisor}, {dst} ; trap={trap}")
             }
 
-            Inst::MulHi {
+            Inst::Mul {
                 size,
                 signed,
                 src1,
@@ -869,15 +871,33 @@ impl PrettyPrint for Inst {
                 let dst_lo = pretty_print_reg(dst_lo.to_reg().to_reg(), size.to_bytes(), allocs);
                 let dst_hi = pretty_print_reg(dst_hi.to_reg().to_reg(), size.to_bytes(), allocs);
                 let src2 = src2.pretty_print(size.to_bytes(), allocs);
+                let suffix = suffix_bwlq(*size);
                 let op = ljustify(if *signed {
-                    "imul".to_string()
+                    format!("imul{suffix}")
                 } else {
-                    "mul".to_string()
+                    format!("mul{suffix}")
                 });
                 format!("{op} {src1}, {src2}, {dst_lo}, {dst_hi}")
             }
 
-            Inst::UMulLo {
+            Inst::Mul8 {
+                signed,
+                src1,
+                src2,
+                dst,
+            } => {
+                let src1 = pretty_print_reg(src1.to_reg(), 1, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 1, allocs);
+                let src2 = src2.pretty_print(1, allocs);
+                let op = ljustify(if *signed {
+                    "imulb".to_string()
+                } else {
+                    "mulb".to_string()
+                });
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
+            Inst::IMul {
                 size,
                 src1,
                 src2,
@@ -886,10 +906,24 @@ impl PrettyPrint for Inst {
                 let src1 = pretty_print_reg(src1.to_reg(), size.to_bytes(), allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
                 let src2 = src2.pretty_print(size.to_bytes(), allocs);
-                let op = ljustify2("mul".to_string(), suffix_bwlq(*size));
+                let suffix = suffix_bwlq(*size);
+                let op = ljustify(format!("imul{suffix}"));
                 format!("{op} {src1}, {src2}, {dst}")
             }
 
+            Inst::IMulImm {
+                size,
+                src1,
+                src2,
+                dst,
+            } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
+                let src1 = src1.pretty_print(size.to_bytes(), allocs);
+                let suffix = suffix_bwlq(*size);
+                let op = ljustify(format!("imul{suffix}"));
+                format!("{op} {src1}, {src2:#x}, {dst}")
+            }
+
             Inst::CheckedSRemSeq {
                 size,
                 divisor,
@@ -1902,23 +1936,11 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
     // method above.
     match inst {
         Inst::AluRmiR {
-            size,
-            op,
-            src1,
-            src2,
-            dst,
-            ..
+            src1, src2, dst, ..
         } => {
-            if *size == OperandSize::Size8 && *op == AluRmiROpcode::Mul {
-                // 8-bit imul has RAX as a fixed input/output
-                collector.reg_fixed_use(src1.to_reg(), regs::rax());
-                collector.reg_fixed_def(dst.to_writable_reg(), regs::rax());
-                src2.get_operands(collector);
-            } else {
-                collector.reg_use(src1.to_reg());
-                collector.reg_reuse_def(dst.to_writable_reg(), 0);
-                src2.get_operands(collector);
-            }
+            collector.reg_use(src1.to_reg());
+            collector.reg_reuse_def(dst.to_writable_reg(), 0);
+            src2.get_operands(collector);
         }
         Inst::AluConstOp { dst, .. } => collector.reg_def(dst.to_writable_reg()),
         Inst::AluRM { src1_dst, src2, .. } => {
@@ -1973,7 +1995,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             collector.reg_fixed_use(dividend.to_reg(), regs::rax());
             collector.reg_fixed_def(dst.to_writable_reg(), regs::rax());
         }
-        Inst::MulHi {
+        Inst::Mul {
             src1,
             src2,
             dst_lo,
@@ -1985,20 +2007,24 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             collector.reg_fixed_def(dst_hi.to_writable_reg(), regs::rdx());
             src2.get_operands(collector);
         }
-        Inst::UMulLo {
-            size,
-            src1,
-            src2,
-            dst,
-            ..
+        Inst::Mul8 {
+            src1, src2, dst, ..
         } => {
             collector.reg_fixed_use(src1.to_reg(), regs::rax());
             collector.reg_fixed_def(dst.to_writable_reg(), regs::rax());
-            if *size != OperandSize::Size8 {
-                collector.reg_clobbers(PRegSet::empty().with(regs::gpr_preg(regs::ENC_RDX)));
-            }
             src2.get_operands(collector);
         }
+        Inst::IMul {
+            src1, src2, dst, ..
+        } => {
+            collector.reg_use(src1.to_reg());
+            collector.reg_reuse_def(dst.to_writable_reg(), 0);
+            src2.get_operands(collector);
+        }
+        Inst::IMulImm { src1, dst, .. } => {
+            collector.reg_def(dst.to_writable_reg());
+            src1.get_operands(collector);
+        }
         Inst::SignExtendData { size, src, dst } => {
             match size {
                 OperandSize::Size8 => {
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 516e3151ba6c..7fc2f50053ee 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -167,17 +167,19 @@
 
 ;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 2 (lower (umul_overflow x y @ (value_type (fits_in_64 ty))))
-      (construct_overflow_op (CC.O) (x64_umullo_with_flags_paired ty x y)))
+(rule 2 (lower (umul_overflow x y @ (value_type $I8)))
+      (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired $false x y)))
+
+(rule 3 (lower (umul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty))))
+      (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty $false x y)))
 
 ;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 2 (lower (smul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty))))
-      (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Mul) x y))
+(rule 2 (lower (smul_overflow x y @ (value_type $I8)))
+      (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired $true x y)))
 
-;; there is no 8bit imul with an immediate operand so we need to put it in a register or memory
-(rule 1 (lower (smul_overflow x y @ (value_type $I8)))
-      (construct_overflow_op (CC.O) (x64_alurmi_with_flags_paired (AluRmiROpcode.Mul) $I8 x (reg_mem_to_reg_mem_imm (put_in_reg_mem y)))))
+(rule 3 (lower (smul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty))))
+      (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty $true x y)))
 
 ;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -965,19 +967,22 @@
 
 ;; `i64` and smaller.
 
-;; Multiply two registers.
-(rule -5 (lower (has_type (fits_in_64 ty) (imul x y)))
-      (x64_mul ty x y))
+;; 8-bit base case: this needs a special instruction encoding. Additionally,
+;; a sinkable load on the left is moved to the right-hand operand.
+(rule -7 (lower (has_type $I8 (imul x y))) (x64_mul8 $false x y))
+(rule -6 (lower (has_type $I8 (imul (sinkable_load x) y))) (x64_mul8 $false y x))
 
-;; Handle multiplication where the lhs is an immediate or sinkable load in
-;; addition to the automatic rhs handling above.
+;; 16-to-64-bit base cases; sinkable loads on the left are moved to the right.
+(rule -5 (lower (has_type (ty_int_ref_16_to_64 ty) (imul x y)))
+         (x64_imul ty x y))
+(rule -4 (lower (has_type (ty_int_ref_16_to_64 ty) (imul (sinkable_load x) y)))
+         (x64_imul ty y x))
 
-(rule -4 (lower (has_type (fits_in_64 ty)
-                       (imul (simm32_from_value x) y)))
-      (x64_mul ty y x))
-(rule -3 (lower (has_type (fits_in_64 ty)
-                       (imul (sinkable_load x) y)))
-      (x64_mul ty y x))
+;; Lift out constant operands to use the 3-operand (immediate) form of `imul`.
+(rule -3 (lower (has_type (ty_int_ref_16_to_64 ty) (imul x (iconst (simm32 y)))))
+         (x64_imul_imm ty x y))
+(rule -2 (lower (has_type (ty_int_ref_16_to_64 ty) (imul (iconst (simm32 x)) y)))
+         (x64_imul_imm ty y x))
 
 ;; `i128`.
 
@@ -1004,13 +1009,13 @@
             (y_lo Gpr (value_regs_get_gpr y_regs 0))
             (y_hi Gpr (value_regs_get_gpr y_regs 1))
             ;; lo_hi = mul x_lo, y_hi
-            (lo_hi Gpr (x64_mul $I64 x_lo y_hi))
+            (lo_hi Gpr (x64_imul $I64 x_lo y_hi))
             ;; hi_lo = mul x_hi, y_lo
-            (hi_lo Gpr (x64_mul $I64 x_hi y_lo))
+            (hi_lo Gpr (x64_imul $I64 x_hi y_lo))
             ;; hilo_hilo = add lo_hi, hi_lo
             (hilo_hilo Gpr (x64_add $I64 lo_hi hi_lo))
-            ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo
-            (mul_regs ValueRegs (mulhi_u $I64 x_lo y_lo))
+            ;; dst_lo:hi_lolo = x64_mul x_lo, y_lo
+            (mul_regs ValueRegs (x64_mul $I64 $false x_lo y_lo))
             (dst_lo Gpr (value_regs_get_gpr mul_regs 0))
             (hi_lolo Gpr (value_regs_get_gpr mul_regs 1))
             ;; dst_hi = add hilo_hilo, hi_lolo
@@ -2258,7 +2263,7 @@
             ;; top byte: it is the sum of the bytes (masked4 >> 56) *
             ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01
             ;; + ... + (masked4 >> 0).
-            (mul Gpr (x64_mul $I64 masked4 ones))
+            (mul Gpr (x64_imul $I64 masked4 ones))
             ;; Now take that top byte and return it as the popcount.
             (final Gpr (x64_shr $I64 mul (Imm8Reg.Imm8 56))))
         final))
@@ -2280,7 +2285,7 @@
                            (x64_shr $I32 diff3 (Imm8Reg.Imm8 4))
                            diff3))
             (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f)))
-            (mul Gpr (x64_mul $I32 masked4 (RegMemImm.Imm 0x01010101)))
+            (mul Gpr (x64_imul_imm $I32 masked4 0x01010101))
             (final Gpr (x64_shr $I32 mul (Imm8Reg.Imm8 24))))
         final))
 
@@ -4159,35 +4164,19 @@
 
 ;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; The umulhi instruction is not available for 8-bit types, so we can extend
-;; the inputs, use the 16-bit multiply and shift the result down.
-(rule 1 (lower (umulhi a @ (value_type $I8) b))
-      (let ((a_ext Gpr (extend_to_gpr a $I16 (ExtendKind.Zero)))
-            (b_ext Gpr (extend_to_gpr b $I16 (ExtendKind.Zero)))
-            (mul Gpr (x64_mul $I16 a_ext b_ext))
-            (hi Gpr (x64_shr $I64 mul (imm8_to_imm8_gpr 8))))
-        hi))
+(rule 0 (lower (umulhi a @ (value_type $I8) b))
+        (x64_shr $I16 (x64_mul8 $false a b) (imm8_to_imm8_gpr 8)))
 
-(rule 0 (lower (umulhi a @ (value_type ty) b))
-      (let ((res ValueRegs (mul_hi ty $false a b))
-            (hi Gpr (value_regs_get_gpr res 1)))
-        hi))
+(rule 1 (lower (umulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b))
+        (value_regs_get_gpr (x64_mul ty $false a b) 1))
 
 ;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; The smulhi instruction is not available for 8-bit types, so we can extend
-;; the inputs, use the 16-bit multiply and shift the result down.
-(rule 1 (lower (smulhi a @ (value_type $I8) b))
-      (let ((a_ext Gpr (extend_to_gpr a $I16 (ExtendKind.Sign)))
-            (b_ext Gpr (extend_to_gpr b $I16 (ExtendKind.Sign)))
-            (mul Gpr (x64_mul $I16 a_ext b_ext))
-            (hi Gpr (x64_sar $I64 mul (imm8_to_imm8_gpr 8))))
-        hi))
-
-(rule 0 (lower (smulhi a @ (value_type ty) b))
-      (let ((res ValueRegs (mul_hi ty $true a b))
-            (hi Gpr (value_regs_get_gpr res 1)))
-        hi))
+(rule 0 (lower (smulhi a @ (value_type $I8) b))
+        (x64_sar $I16 (x64_mul8 $true a b) (imm8_to_imm8_gpr 8)))
+
+(rule 1 (lower (smulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b))
+        (value_regs_get_gpr (x64_mul ty $true a b) 1))
 
 ;; Rules for `get_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/cranelift/codegen/src/isa/x64/pcc.rs b/cranelift/codegen/src/isa/x64/pcc.rs
index 97afc57466bd..6fa8f886e3a8 100644
--- a/cranelift/codegen/src/isa/x64/pcc.rs
+++ b/cranelift/codegen/src/isa/x64/pcc.rs
@@ -247,7 +247,7 @@ pub(crate) fn check(
             undefined_result(ctx, vcode, dst, 64, 64)?;
             Ok(())
         }
-        Inst::MulHi {
+        Inst::Mul {
             size,
             dst_lo,
             dst_hi,
@@ -264,7 +264,17 @@ pub(crate) fn check(
             undefined_result(ctx, vcode, dst_hi, 64, 64)?;
             Ok(())
         }
-        Inst::UMulLo {
+        Inst::Mul8 { dst, ref src2, .. } => {
+            match <&RegMem>::from(src2) {
+                RegMem::Mem { ref addr } => {
+                    check_load(ctx, None, addr, vcode, I8, 64)?;
+                }
+                RegMem::Reg { .. } => {}
+            }
+            undefined_result(ctx, vcode, dst, 64, 64)?;
+            Ok(())
+        }
+        Inst::IMul {
             size,
             dst,
             ref src2,
@@ -279,7 +289,21 @@ pub(crate) fn check(
             undefined_result(ctx, vcode, dst, 64, 64)?;
             Ok(())
         }
-
+        Inst::IMulImm {
+            size,
+            dst,
+            ref src1,
+            ..
+        } => {
+            match <&RegMem>::from(src1) {
+                RegMem::Mem { ref addr } => {
+                    check_load(ctx, None, addr, vcode, size.to_type(), 64)?;
+                }
+                RegMem::Reg { .. } => {}
+            }
+            undefined_result(ctx, vcode, dst, 64, 64)?;
+            Ok(())
+        }
         Inst::CheckedSRemSeq {
             dst_quotient,
             dst_remainder,
diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif
index eb69119cccb7..5310d3a4b607 100644
--- a/cranelift/filetests/filetests/isa/x64/i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/i128.clif
@@ -206,7 +206,7 @@ block0(v0: i128, v1: i128):
 ;   imulq   %rsi, %rcx, %rsi
 ;   addq    %rdx, %rsi, %rdx
 ;   movq    %rdx, %r9
-;   mul     %rax, %rcx, %rax, %rdx
+;   mulq    %rax, %rcx, %rax, %rdx
 ;   movq    %rdx, %rcx
 ;   movq    %r9, %rdx
 ;   addq    %rdx, %rcx, %rdx
diff --git a/cranelift/filetests/filetests/isa/x64/mul.clif b/cranelift/filetests/filetests/isa/x64/mul.clif
new file mode 100644
index 000000000000..f12a73f05919
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/mul.clif
@@ -0,0 +1,489 @@
+test compile precise-output
+set unwind_info=false
+target x86_64
+
+function %imul_i8(i8, i8) -> i8{
+block0(v0: i8, v1: i8):
+    v2 = imul v0, v1
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   mulb    %al, %sil, %al
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   mulb %sil
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i16(i16, i16) -> i16{
+block0(v0: i16, v1: i16):
+    v2 = imul v0, v1
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   imulw   %ax, %si, %ax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   imulw %si, %ax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i32(i32, i32) -> i32{
+block0(v0: i32, v1: i32):
+    v2 = imul v0, v1
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   imull   %eax, %esi, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   imull %esi, %eax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i64(i64, i64) -> i64{
+block0(v0: i64, v1: i64):
+    v2 = imul v0, v1
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   imulq   %rax, %rsi, %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   imulq %rsi, %rax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i8_three(i8, i8, i8) -> i8{
+block0(v0: i8, v1: i8, v2: i8):
+    v3 = imul v0, v1
+    v4 = imul v3, v2
+    return v4
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   mulb    %al, %sil, %al
+;   mulb    %al, %dl, %al
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   mulb %sil
+;   mulb %dl
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i32_three(i32, i32, i32) -> i32{
+block0(v0: i32, v1: i32, v2: i32):
+    v3 = imul v0, v1
+    v4 = imul v3, v2
+    return v4
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imull   %edi, %esi, %edi
+;   movq    %rdi, %rax
+;   imull   %eax, %edx, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imull %esi, %edi
+;   movq %rdi, %rax
+;   imull %edx, %eax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i32_load(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+    v2 = load.i32 notrap v1
+    v3 = imul v0, v2
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   imull   %eax, 0(%rsi), %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   imull (%rsi), %eax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i64_load(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = load.i64 notrap v1
+    v3 = imul v0, v2
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   imulq   %rax, 0(%rsi), %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   imulq (%rsi), %rax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i8_const(i8) -> i8{
+block0(v0: i8):
+    v3 = imul_imm v0, 97
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   mulb    %al, const(0), %al
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %rax
+;   mulb 0xb(%rip)
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+
+function %imul_i16_const(i16) -> i16{
+block0(v0: i16):
+    v3 = imul_imm v0, 97
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imulw   %di, 0x61, %ax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imulw $0x61, %di, %ax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i32_const(i32) -> i32{
+block0(v0: i32):
+    v3 = imul_imm v0, 97
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imull   %edi, 0x61, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imull $0x61, %edi, %eax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i64_const(i64) -> i64{
+block0(v0: i64):
+    v3 = imul_imm v0, 97
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imulq   %rdi, 0x61, %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imulq $0x61, %rdi, %rax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+
+function %imul_i16_bigger_const(i16) -> i16{
+block0(v0: i16):
+    v3 = imul_imm v0, 1021
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imulw   %di, 0x3fd, %ax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imulw $0x3fd, %di, %ax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i32_bigger_const(i32) -> i32{
+block0(v0: i32):
+    v3 = imul_imm v0, 1021
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imull   %edi, 0x3fd, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imull $0x3fd, %edi, %eax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i64_bigger_const(i64) -> i64{
+block0(v0: i64):
+    v3 = imul_imm v0, 1021
+    return v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imulq   %rdi, 0x3fd, %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imulq $0x3fd, %rdi, %rax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i16_const_and_load(i64) -> i16{
+block0(v0: i64):
+    v1 = load.i16 v0
+    v2 = imul_imm v1, 1021
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movzwq  0(%rdi), %rcx
+;   imulw   %cx, 0x3fd, %ax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movzwq (%rdi), %rcx ; trap: heap_oob
+;   imulw $0x3fd, %cx, %ax
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i32_const_and_load(i64) -> i32{
+block0(v0: i64):
+    v1 = load.i32 v0
+    v2 = imul_imm v1, 1021
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imull   0(%rdi), 0x3fd, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imull $0x3fd, (%rdi), %eax ; trap: heap_oob
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %imul_i64_const_and_load(i64) -> i64{
+block0(v0: i64):
+    v1 = load.i64 v0+100
+    v2 = imul_imm v1, 1021
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   imulq   100(%rdi), 0x3fd, %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   imulq $0x3fd, 0x64(%rdi), %rax ; trap: heap_oob
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/popcnt.clif b/cranelift/filetests/filetests/isa/x64/popcnt.clif
index 3aecd486fd12..b74ba4b41622 100644
--- a/cranelift/filetests/filetests/isa/x64/popcnt.clif
+++ b/cranelift/filetests/filetests/isa/x64/popcnt.clif
@@ -147,11 +147,11 @@ block0(v0: i32):
 ;   shrl    $1, %eax, %eax
 ;   andl    %eax, %edx, %eax
 ;   subl    %edi, %eax, %edi
-;   movq    %rdi, %rax
-;   shrl    $4, %eax, %eax
-;   addl    %eax, %edi, %eax
-;   andl    %eax, $252645135, %eax
-;   imull   %eax, $16843009, %eax
+;   movq    %rdi, %r9
+;   shrl    $4, %r9d, %r9d
+;   addl    %r9d, %edi, %r9d
+;   andl    %r9d, $252645135, %r9d
+;   imull   %r9d, 0x1010101, %eax
 ;   shrl    $24, %eax, %eax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -173,11 +173,11 @@ block0(v0: i32):
 ;   shrl $1, %eax
 ;   andl %edx, %eax
 ;   subl %eax, %edi
-;   movq %rdi, %rax
-;   shrl $4, %eax
-;   addl %edi, %eax
-;   andl $0xf0f0f0f, %eax
-;   imull $0x1010101, %eax, %eax
+;   movq %rdi, %r9
+;   shrl $4, %r9d
+;   addl %edi, %r9d
+;   andl $0xf0f0f0f, %r9d
+;   imull $0x1010101, %r9d, %eax
 ;   shrl $0x18, %eax
 ;   movq %rbp, %rsp
 ;   popq %rbp
@@ -194,23 +194,23 @@ block0(v0: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movl    0(%rdi), %edx
-;   movq    %rdx, %rcx
+;   movl    0(%rdi), %eax
+;   movq    %rax, %rcx
 ;   shrl    $1, %ecx, %ecx
 ;   movl    $2004318071, %r8d
 ;   andl    %ecx, %r8d, %ecx
-;   subl    %edx, %ecx, %edx
+;   subl    %eax, %ecx, %eax
 ;   shrl    $1, %ecx, %ecx
 ;   andl    %ecx, %r8d, %ecx
-;   subl    %edx, %ecx, %edx
+;   subl    %eax, %ecx, %eax
 ;   shrl    $1, %ecx, %ecx
 ;   andl    %ecx, %r8d, %ecx
-;   subl    %edx, %ecx, %edx
-;   movq    %rdx, %rax
-;   shrl    $4, %eax, %eax
-;   addl    %eax, %edx, %eax
-;   andl    %eax, $252645135, %eax
-;   imull   %eax, $16843009, %eax
+;   subl    %eax, %ecx, %eax
+;   movq    %rax, %r10
+;   shrl    $4, %r10d, %r10d
+;   addl    %r10d, %eax, %r10d
+;   andl    %r10d, $252645135, %r10d
+;   imull   %r10d, 0x1010101, %eax
 ;   shrl    $24, %eax, %eax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -221,23 +221,23 @@ block0(v0: i64):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movl (%rdi), %edx ; trap: heap_oob
-;   movq %rdx, %rcx
+;   movl (%rdi), %eax ; trap: heap_oob
+;   movq %rax, %rcx
 ;   shrl $1, %ecx
 ;   movl $0x77777777, %r8d
 ;   andl %r8d, %ecx
-;   subl %ecx, %edx
+;   subl %ecx, %eax
 ;   shrl $1, %ecx
 ;   andl %r8d, %ecx
-;   subl %ecx, %edx
+;   subl %ecx, %eax
 ;   shrl $1, %ecx
 ;   andl %r8d, %ecx
-;   subl %ecx, %edx
-;   movq %rdx, %rax
-;   shrl $4, %eax
-;   addl %edx, %eax
-;   andl $0xf0f0f0f, %eax
-;   imull $0x1010101, %eax, %eax
+;   subl %ecx, %eax
+;   movq %rax, %r10
+;   shrl $4, %r10d
+;   addl %eax, %r10d
+;   andl $0xf0f0f0f, %r10d
+;   imull $0x1010101, %r10d, %eax
 ;   shrl $0x18, %eax
 ;   movq %rbp, %rsp
 ;   popq %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/smulhi.clif b/cranelift/filetests/filetests/isa/x64/smulhi.clif
index fb8356e4b6c2..92589c2f0cfa 100644
--- a/cranelift/filetests/filetests/isa/x64/smulhi.clif
+++ b/cranelift/filetests/filetests/isa/x64/smulhi.clif
@@ -11,10 +11,9 @@ block0(v0: i8, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movsbl  %dil, %eax
-;   movsbl  %sil, %r8d
-;   imull   %eax, %r8d, %eax
-;   sarq    $8, %rax, %rax
+;   movq    %rdi, %rax
+;   imulb   %al, %sil, %al
+;   sarw    $8, %ax, %ax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -24,10 +23,9 @@ block0(v0: i8, v1: i8):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movsbl %dil, %eax
-;   movsbl %sil, %r8d
-;   imull %r8d, %eax
-;   sarq $8, %rax
+;   movq %rdi, %rax
+;   imulb %sil
+;   sarw $8, %ax
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -43,7 +41,7 @@ block0(v0: i16, v1: i16):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   imul    %ax, %si, %ax, %dx
+;   imulw   %ax, %si, %ax, %dx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -72,7 +70,7 @@ block0(v0: i32, v1: i32):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   imul    %eax, %esi, %eax, %edx
+;   imull   %eax, %esi, %eax, %edx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -101,7 +99,7 @@ block0(v0: i64, v1: i64):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   imul    %rax, %rsi, %rax, %rdx
+;   imulq   %rax, %rsi, %rax, %rdx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/umulhi.clif b/cranelift/filetests/filetests/isa/x64/umulhi.clif
index 487d2a897b51..e68df725c1b8 100644
--- a/cranelift/filetests/filetests/isa/x64/umulhi.clif
+++ b/cranelift/filetests/filetests/isa/x64/umulhi.clif
@@ -11,10 +11,9 @@ block0(v0: i8, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movzbl  %dil, %eax
-;   movzbl  %sil, %r8d
-;   imull   %eax, %r8d, %eax
-;   shrq    $8, %rax, %rax
+;   movq    %rdi, %rax
+;   mulb    %al, %sil, %al
+;   shrw    $8, %ax, %ax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -24,10 +23,9 @@ block0(v0: i8, v1: i8):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movzbl %dil, %eax
-;   movzbl %sil, %r8d
-;   imull %r8d, %eax
-;   shrq $8, %rax
+;   movq %rdi, %rax
+;   mulb %sil
+;   shrw $8, %ax
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -43,7 +41,7 @@ block0(v0: i16, v1: i16):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   mul     %ax, %si, %ax, %dx
+;   mulw    %ax, %si, %ax, %dx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -72,7 +70,7 @@ block0(v0: i32, v1: i32):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   mul     %eax, %esi, %eax, %edx
+;   mull    %eax, %esi, %eax, %edx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
@@ -101,7 +99,7 @@ block0(v0: i64, v1: i64):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   movq    %rdi, %rax
-;   mul     %rax, %rsi, %rax, %rdx
+;   mulq    %rax, %rsi, %rax, %rdx
 ;   movq    %rdx, %rax
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs
index 89f1a5b82e22..8db06519bb9a 100644
--- a/winch/codegen/src/isa/x64/asm.rs
+++ b/winch/codegen/src/isa/x64/asm.rs
@@ -926,22 +926,18 @@ impl Assembler {
 
     /// Multiply immediate and register.
     pub fn mul_ir(&mut self, imm: i32, dst: Reg, size: OperandSize) {
-        let imm = RegMemImm::imm(imm as u32);
-
-        self.emit(Inst::AluRmiR {
+        self.emit(Inst::IMulImm {
             size: size.into(),
-            op: AluRmiROpcode::Mul,
             src1: dst.into(),
-            src2: GprMemImm::new(imm).expect("valid immediate"),
+            src2: imm,
             dst: dst.into(),
         });
     }
 
     /// Multiply register and register.
     pub fn mul_rr(&mut self, src: Reg, dst: Reg, size: OperandSize) {
-        self.emit(Inst::AluRmiR {
+        self.emit(Inst::IMul {
             size: size.into(),
-            op: AluRmiROpcode::Mul,
             src1: dst.into(),
             src2: src.into(),
             dst: dst.into(),