From 1f6e901a88281294775caf1e71530114ac82a8b5 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 9 Feb 2024 17:44:05 -0500 Subject: [PATCH] x64: Refactor multiplication instructions (#7871) * x64: Refactor multiplication instructions This commit was inspired by reading over some code from #7865 and #7866. The goal of this commit was to refactor scalar multiplication-related instructions in the x64 backend to more closely align with their native instructions. Changes include: * The `MulHi` instruction is renamed to `Mul`. This represents either `mul` or `imul` producing a doublewide result. * A `Mul8` instruction was added to correspond to `Mul` for the 8-bit variants that produce a doublewide result in the `AX` register rather than the other instructions which split between `RAX` and `RDX`. * The `UMulLo` instruction was removed as now it's covered by `Mul`. * The `AluRmiROpcode::Mul` opcode was removed in favor of new `IMul` and `IMulImm` instructions. Register allocation and emission already had special cases for `Mul` which felt better as standalone instructions rather than being folded into an existing variant. Lowerings using `imul` are not affected in general but the `IMulImm` instruction has different register allocation behavior than before which allows the destination to have a different register than the first operand. The `umulhi` and `smulhi` instructions are also reimplemented with their 8-bit variants instead of extension-plus-16-bit variants. * Remove outdated emit tests These are all covered by the filetests framework now too. 
* Fix Winch build --- cranelift/codegen/src/isa/x64/inst.isle | 127 +++-- cranelift/codegen/src/isa/x64/inst/args.rs | 3 - cranelift/codegen/src/isa/x64/inst/emit.rs | 279 +++++----- .../codegen/src/isa/x64/inst/emit_tests.rs | 393 -------------- cranelift/codegen/src/isa/x64/inst/mod.rs | 92 ++-- cranelift/codegen/src/isa/x64/lower.isle | 87 ++-- cranelift/codegen/src/isa/x64/pcc.rs | 30 +- .../filetests/filetests/isa/x64/i128.clif | 2 +- .../filetests/filetests/isa/x64/mul.clif | 489 ++++++++++++++++++ .../filetests/filetests/isa/x64/popcnt.clif | 60 +-- .../filetests/filetests/isa/x64/smulhi.clif | 20 +- .../filetests/filetests/isa/x64/umulhi.clif | 20 +- winch/codegen/src/isa/x64/asm.rs | 10 +- 13 files changed, 877 insertions(+), 735 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/mul.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 82fbcd052dc2..3812d459ce73 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -90,20 +90,34 @@ (dividend Gpr) (dst WritableGpr)) - ;; The high (and low) bits of a (un)signed multiply: `RDX:RAX := RAX * - ;; rhs`. - (MulHi (size OperandSize) - (signed bool) - (src1 Gpr) - (src2 GprMem) - (dst_lo WritableGpr) - (dst_hi WritableGpr)) - - ;; x64 'mul' instruction but it only outputs the low half - (UMulLo (size OperandSize) - (src1 Gpr) - (src2 GprMem) - (dst WritableGpr)) + ;; Signed or unsigned multiplication (selected by `signed`) producing the high + ;; bits of the result in one register and the low bits in another register. + (Mul (size OperandSize) + (signed bool) + (src1 Gpr) + (src2 GprMem) + (dst_lo WritableGpr) + (dst_hi WritableGpr)) + + ;; Same as `Mul` but the 16-bit multiplication result is stored in `AX`. + (Mul8 (signed bool) + (src1 Gpr) + (src2 GprMem) + (dst WritableGpr)) + + ;; The two-operand form of `imul` which produces a truncated result of the + ;; same size as the operands. 
+ (IMul (size OperandSize) + (src1 Gpr) + (src2 GprMem) + (dst WritableGpr)) + + ;; The three-operand form of `imul` where the third operand must be + ;; a constant. + (IMulImm (size OperandSize) + (src1 GprMem) + (src2 i32) + (dst WritableGpr)) ;; A synthetic instruction sequence used as part of the lowering of the ;; `srem` instruction which returns 0 if the divisor is -1 and @@ -750,8 +764,7 @@ Sbb And Or - Xor - Mul)) + Xor)) (type AluRmROpcode (enum Andn @@ -2046,22 +2059,6 @@ (_ Unit (emit (MInst.XmmRmRVex3 op src1 src2 src3 dst)))) dst)) -;; Helper for creating `MInst.MulHi` instructions. -;; -;; Returns the (lo, hi) register halves of the multiplication. -(decl mul_hi (Type bool Gpr GprMem) ValueRegs) -(rule (mul_hi ty signed src1 src2) - (let ((dst_lo WritableGpr (temp_writable_gpr)) - (dst_hi WritableGpr (temp_writable_gpr)) - (size OperandSize (raw_operand_size_of_type ty)) - (_ Unit (emit (MInst.MulHi size - signed - src1 - src2 - dst_lo - dst_hi)))) - (value_gprs dst_lo dst_hi))) - ;; Helper for creating `MInst.UnaryRmR` instructions. (decl unary_rm_r (UnaryRmROpcode Gpr OperandSize) Gpr) (rule (unary_rm_r op src size) @@ -2559,31 +2556,55 @@ dst) dst))) -;; Helper for creating `mul` instructions. -(decl x64_mul (Type Gpr GprMemImm) Gpr) -(rule (x64_mul ty src1 src2) - (alu_rmi_r ty - (AluRmiROpcode.Mul) - src1 - src2)) +;; Helper for creating `mul` instructions or `imul` instructions (depending +;; on `signed`) +(decl x64_mul (Type bool Gpr GprMem) ValueRegs) +(rule (x64_mul ty signed src1 src2) + (let ((dst_lo WritableGpr (temp_writable_gpr)) + (dst_hi WritableGpr (temp_writable_gpr)) + (size OperandSize (raw_operand_size_of_type ty)) + (_ Unit (emit (MInst.Mul size signed src1 src2 dst_lo dst_hi)))) + (value_gprs dst_lo dst_hi))) + +;; Helper for creating `mul` instructions or `imul` instructions (depending +;; on `signed`) for 8-bit operands. 
+(decl x64_mul8 (bool Gpr GprMem) Gpr) +(rule (x64_mul8 signed src1 src2) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.Mul8 signed src1 src2 dst)))) + dst)) + +;; Helper for creating `imul` instructions. +(decl x64_imul (Type Gpr GprMem) Gpr) +(rule (x64_imul ty src1 src2) + (let ((dst WritableGpr (temp_writable_gpr)) + (size OperandSize (raw_operand_size_of_type ty)) + (_ Unit (emit (MInst.IMul size src1 src2 dst)))) + dst)) -;; Helper for creating `umullo` instructions. -(decl x64_umullo (Type Gpr GprMem) Gpr) -(rule (x64_umullo ty src1 src2) +;; Helper for creating `imul` instructions with an immediate operand. +(decl x64_imul_imm (Type GprMem i32) Gpr) +(rule (x64_imul_imm ty src1 src2) (let ((dst WritableGpr (temp_writable_gpr)) (size OperandSize (raw_operand_size_of_type ty)) - (_ Unit (emit (MInst.UMulLo size src1 src2 dst)))) + (_ Unit (emit (MInst.IMulImm size src1 src2 dst)))) dst)) -(decl x64_umullo_with_flags_paired (Type Gpr GprMem) ProducesFlags) -(rule (x64_umullo_with_flags_paired ty src1 src2) +(decl x64_mul8_with_flags_paired (bool Gpr GprMem) ProducesFlags) +(rule (x64_mul8_with_flags_paired signed src1 src2) (let ((dst WritableGpr (temp_writable_gpr))) (ProducesFlags.ProducesFlagsReturnsResultWithConsumer - (MInst.UMulLo (raw_operand_size_of_type ty) - src1 - src2 - dst) - dst))) + (MInst.Mul8 signed src1 src2 dst) + dst))) + +(decl x64_mul_lo_with_flags_paired (Type bool Gpr GprMem) ProducesFlags) +(rule (x64_mul_lo_with_flags_paired ty signed src1 src2) + (let ((dst_lo WritableGpr (temp_writable_gpr)) + (dst_hi WritableGpr (temp_writable_gpr)) + (size OperandSize (raw_operand_size_of_type ty))) + (ProducesFlags.ProducesFlagsReturnsResultWithConsumer + (MInst.Mul size signed src1 src2 dst_lo dst_hi) + dst_lo))) ;; Helper for emitting `and` instructions. 
(decl x64_and (Type Gpr GprMemImm) Gpr) @@ -3891,12 +3912,6 @@ dst)))) dst)) -;; Helper for creating `mul` instructions that return both the lower and -;; (unsigned) higher halves of the result. -(decl mulhi_u (Type Gpr GprMem) ValueRegs) -(rule (mulhi_u ty src1 src2) - (mul_hi ty $false src1 src2)) - ;; Helper for creating `psllw` instructions. (decl x64_psllw (Xmm XmmMemImm) Xmm) (rule 0 (x64_psllw src1 src2) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 51f3bf01ae9e..05bb3aaf9069 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -818,8 +818,6 @@ pub enum AluRmiROpcode { Or, /// Bitwise exclusive OR. Xor, - /// The signless, non-extending (N x N -> N, for N in {32,64}) variant. - Mul, } impl fmt::Debug for AluRmiROpcode { @@ -832,7 +830,6 @@ impl fmt::Debug for AluRmiROpcode { AluRmiROpcode::And => "and", AluRmiROpcode::Or => "or", AluRmiROpcode::Xor => "xor", - AluRmiROpcode::Mul => "imul", }; write!(fmt, "{}", name) } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 1312a1578db6..546f961da636 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -168,134 +168,73 @@ pub(crate) fn emit( }; let mut rex = RexFlags::from(*size); - if *op == AluRmiROpcode::Mul { - // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so - // we have to special-case it. 
- if *size == OperandSize::Size8 { - match src2 { - RegMemImm::Reg { reg: reg_e } => { - debug_assert!(reg_e.is_real()); - rex.always_emit_if_8bit_needed(reg_e); - let enc_e = int_reg_enc(reg_e); - emit_std_enc_enc(sink, LegacyPrefixes::None, 0xF6, 1, 5, enc_e, rex); - } - - RegMemImm::Mem { addr } => { - let amode = addr.finalize(state, sink); - emit_std_enc_mem( - sink, - LegacyPrefixes::None, - 0xF6, - 1, - 5, - &amode, - rex, - 0, - ); - } + let (opcode_r, opcode_m, subopcode_i) = match op { + AluRmiROpcode::Add => (0x01, 0x03, 0), + AluRmiROpcode::Adc => (0x11, 0x03, 0), + AluRmiROpcode::Sub => (0x29, 0x2B, 5), + AluRmiROpcode::Sbb => (0x19, 0x2B, 5), + AluRmiROpcode::And => (0x21, 0x23, 4), + AluRmiROpcode::Or => (0x09, 0x0B, 1), + AluRmiROpcode::Xor => (0x31, 0x33, 6), + }; - RegMemImm::Imm { .. } => { - panic!("Cannot emit 8bit imul with 8bit immediate"); - } - } - } else { - match src2 { - RegMemImm::Reg { reg: reg_e } => { - emit_std_reg_reg(sink, prefix, 0x0FAF, 2, reg_g, reg_e, rex); - } + let (opcode_r, opcode_m) = if *size == OperandSize::Size8 { + (opcode_r - 1, opcode_m - 1) + } else { + (opcode_r, opcode_m) + }; - RegMemImm::Mem { addr } => { - let amode = addr.finalize(state, sink); - emit_std_reg_mem(sink, prefix, 0x0FAF, 2, reg_g, &amode, rex, 0); - } + if *size == OperandSize::Size8 { + debug_assert!(reg_g.is_real()); + rex.always_emit_if_8bit_needed(reg_g); + } - RegMemImm::Imm { simm32 } => { - let imm_size = if low8_will_sign_extend_to_32(simm32) { - 1 - } else { - if *size == OperandSize::Size16 { - 2 - } else { - 4 - } - }; - let opcode = if imm_size == 1 { 0x6B } else { 0x69 }; - // Yes, really, reg_g twice. 
- emit_std_reg_reg(sink, prefix, opcode, 1, reg_g, reg_g, rex); - emit_simm(sink, imm_size, simm32); - } + match src2 { + RegMemImm::Reg { reg: reg_e } => { + if *size == OperandSize::Size8 { + debug_assert!(reg_e.is_real()); + rex.always_emit_if_8bit_needed(reg_e); } - } - } else { - let (opcode_r, opcode_m, subopcode_i) = match op { - AluRmiROpcode::Add => (0x01, 0x03, 0), - AluRmiROpcode::Adc => (0x11, 0x03, 0), - AluRmiROpcode::Sub => (0x29, 0x2B, 5), - AluRmiROpcode::Sbb => (0x19, 0x2B, 5), - AluRmiROpcode::And => (0x21, 0x23, 4), - AluRmiROpcode::Or => (0x09, 0x0B, 1), - AluRmiROpcode::Xor => (0x31, 0x33, 6), - AluRmiROpcode::Mul => panic!("unreachable"), - }; - let (opcode_r, opcode_m) = if *size == OperandSize::Size8 { - (opcode_r - 1, opcode_m - 1) - } else { - (opcode_r, opcode_m) - }; - - if *size == OperandSize::Size8 { - debug_assert!(reg_g.is_real()); - rex.always_emit_if_8bit_needed(reg_g); + // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R + // duality). Do this too, so as to be able to compare generated machine + // code easily. + emit_std_reg_reg(sink, prefix, opcode_r, 1, reg_e, reg_g, rex); } - match src2 { - RegMemImm::Reg { reg: reg_e } => { - if *size == OperandSize::Size8 { - debug_assert!(reg_e.is_real()); - rex.always_emit_if_8bit_needed(reg_e); - } - - // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R - // duality). Do this too, so as to be able to compare generated machine - // code easily. - emit_std_reg_reg(sink, prefix, opcode_r, 1, reg_e, reg_g, rex); - } - - RegMemImm::Mem { addr } => { - let amode = addr.finalize(state, sink); - // Here we revert to the "normal" G-E ordering. - emit_std_reg_mem(sink, prefix, opcode_m, 1, reg_g, &amode, rex, 0); - } + RegMemImm::Mem { addr } => { + let amode = addr.finalize(state, sink); + // Here we revert to the "normal" G-E ordering. 
+ emit_std_reg_mem(sink, prefix, opcode_m, 1, reg_g, &amode, rex, 0); + } - RegMemImm::Imm { simm32 } => { - let imm_size = if *size == OperandSize::Size8 { + RegMemImm::Imm { simm32 } => { + let imm_size = if *size == OperandSize::Size8 { + 1 + } else { + if low8_will_sign_extend_to_32(simm32) { 1 } else { - if low8_will_sign_extend_to_32(simm32) { - 1 + if *size == OperandSize::Size16 { + 2 } else { - if *size == OperandSize::Size16 { - 2 - } else { - 4 - } + 4 } - }; + } + }; - let opcode = if *size == OperandSize::Size8 { - 0x80 - } else if low8_will_sign_extend_to_32(simm32) { - 0x83 - } else { - 0x81 - }; + let opcode = if *size == OperandSize::Size8 { + 0x80 + } else if low8_will_sign_extend_to_32(simm32) { + 0x83 + } else { + 0x81 + }; - // And also here we use the "normal" G-E ordering. - let enc_g = int_reg_enc(reg_g); - emit_std_enc_enc(sink, prefix, opcode, 1, subopcode_i, enc_g, rex); - emit_simm(sink, imm_size, simm32); - } + // And also here we use the "normal" G-E ordering. 
+ let enc_g = int_reg_enc(reg_g); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode_i, enc_g, rex); + emit_simm(sink, imm_size, simm32); } } } @@ -611,9 +550,9 @@ pub(crate) fn emit( } } - Inst::MulHi { - size, + Inst::Mul { signed, + size, src1, src2, dst_lo, @@ -625,6 +564,7 @@ pub(crate) fn emit( debug_assert_eq!(src1, regs::rax()); debug_assert_eq!(dst_lo, regs::rax()); debug_assert_eq!(dst_hi, regs::rdx()); + let src2 = src2.clone().to_reg_mem().with_allocs(allocs); let rex_flags = RexFlags::from(*size); let prefix = match size { @@ -635,21 +575,19 @@ pub(crate) fn emit( }; let subopcode = if *signed { 5 } else { 4 }; - match src2.clone().to_reg_mem() { + match src2 { RegMem::Reg { reg } => { - let reg = allocs.next(reg); let src = int_reg_enc(reg); emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags) } RegMem::Mem { addr: src } => { - let amode = src.finalize(state, sink).with_allocs(allocs); + let amode = src.finalize(state, sink); emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0); } } } - - Inst::UMulLo { - size, + Inst::Mul8 { + signed, src1, src2, dst, @@ -658,33 +596,98 @@ pub(crate) fn emit( let dst = allocs.next(dst.to_reg().to_reg()); debug_assert_eq!(src1, regs::rax()); debug_assert_eq!(dst, regs::rax()); + let src2 = src2.clone().to_reg_mem().with_allocs(allocs); - let mut rex = RexFlags::from(*size); + let mut rex_flags = RexFlags::from(OperandSize::Size8); + let prefix = LegacyPrefixes::None; + let subopcode = if *signed { 5 } else { 4 }; + match src2 { + RegMem::Reg { reg } => { + // The intel manual states: + // + // > r/m8 can not be encoded to access the following byte + // > registers if a REX prefix is used: AH, BH, CH, DH + // + // And apparently that also means that a REX prefix must be + // used if it's not one of those registers. 
+ if !(reg == regs::rax() + || reg == regs::rbx() + || reg == regs::rcx() + || reg == regs::rdx()) + { + rex_flags.always_emit(); + } + let src = int_reg_enc(reg); + emit_std_enc_enc(sink, prefix, 0xF6, 1, subopcode, src, rex_flags) + } + RegMem::Mem { addr } => { + let amode = addr.finalize(state, sink); + emit_std_enc_mem(sink, prefix, 0xF6, 1, subopcode, &amode, rex_flags, 0); + } + } + } + Inst::IMul { + size, + src1, + src2, + dst, + } => { + let src1 = allocs.next(src1.to_reg()); + let dst = allocs.next(dst.to_reg().to_reg()); + debug_assert_eq!(src1, dst); + let src2 = src2.clone().to_reg_mem().with_allocs(allocs); + + let rex = RexFlags::from(*size); + let prefix = LegacyPrefixes::None; + match src2 { + RegMem::Reg { reg } => { + emit_std_reg_reg(sink, prefix, 0x0FAF, 2, dst, reg, rex); + } + + RegMem::Mem { addr } => { + let amode = addr.finalize(state, sink); + emit_std_reg_mem(sink, prefix, 0x0FAF, 2, dst, &amode, rex, 0); + } + } + } + + Inst::IMulImm { + size, + src1, + src2, + dst, + } => { + let dst = allocs.next(dst.to_reg().to_reg()); + let src1 = src1.clone().to_reg_mem().with_allocs(allocs); + + let rex = RexFlags::from(*size); let prefix = match size { + // NB: the intel manual doesn't seem to mention this prefix as + // being required OperandSize::Size16 => LegacyPrefixes::_66, _ => LegacyPrefixes::None, }; - - let opcode = if *size == OperandSize::Size8 { - 0xF6 + let imm_size = if i8::try_from(*src2).is_ok() { + 1 } else { - 0xF7 + if *size == OperandSize::Size16 { + 2 + } else { + 4 + } }; - - match src2.clone().to_reg_mem() { + let opcode = if imm_size == 1 { 0x6B } else { 0x69 }; + match src1 { RegMem::Reg { reg } => { - let reg = allocs.next(reg); - if *size == OperandSize::Size8 { - rex.always_emit_if_8bit_needed(reg); - } - let reg_e = int_reg_enc(reg); - emit_std_enc_enc(sink, prefix, opcode, 1, 4, reg_e, rex); + emit_std_reg_reg(sink, prefix, opcode, 1, dst, reg, rex); } - RegMem::Mem { addr: src } => { - let amode = 
src.finalize(state, sink).with_allocs(allocs); - emit_std_enc_mem(sink, prefix, opcode, 1, 4, &amode, rex, 0); + + RegMem::Mem { addr } => { + let amode = addr.finalize(state, sink); + emit_std_reg_mem(sink, prefix, opcode, 1, dst, &amode, rex, imm_size); } } + emit_simm(sink, imm_size, *src2 as u32); } Inst::SignExtendData { size, src, dst } => { diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index b340f6e2a61e..8b2d51e9432c 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -61,32 +61,6 @@ impl Inst { } } - fn mul_hi(size: OperandSize, signed: bool, rhs: RegMem) -> Inst { - debug_assert!(size.is_one_of(&[ - OperandSize::Size16, - OperandSize::Size32, - OperandSize::Size64 - ])); - rhs.assert_regclass_is(RegClass::Int); - Inst::MulHi { - size, - signed, - src1: Gpr::new(regs::rax()).unwrap(), - src2: GprMem::new(rhs).unwrap(), - dst_lo: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()), - dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()), - } - } - - fn umul_lo(size: OperandSize, operand: RegMem) -> Inst { - Inst::UMulLo { - size, - src1: Gpr::new(regs::rax()).unwrap(), - src2: GprMem::new(operand).unwrap(), - dst: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()), - } - } - fn xmm_rm_r_evex(op: Avx512Opcode, src1: Reg, src2: RegMem, dst: Writable) -> Self { src2.assert_regclass_is(RegClass::Float); debug_assert!(src1.class() == RegClass::Float); @@ -1352,157 +1326,6 @@ fn test_x64_emit() { "4C31FA", "xorq %rdx, %r15, %rdx", )); - // Test all mul cases, though - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::reg(r15), - w_rdx, - ), - "490FAFD7", - "imulq %rdx, %r15, %rdx", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::reg(rcx), - w_r8, - ), - "440FAFC1", - "imull %r8d, %ecx, %r8d", - )); - insns.push(( - 
Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::reg(rcx), - w_rsi, - ), - "0FAFF1", - "imull %esi, %ecx, %esi", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::mem(Amode::imm_reg(99, rdi)), - w_rdx, - ), - "480FAF5763", - "imulq %rdx, 99(%rdi), %rdx", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::mem(Amode::imm_reg(99, rdi)), - w_r8, - ), - "440FAF4763", - "imull %r8d, 99(%rdi), %r8d", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::mem(Amode::imm_reg(99, rdi)), - w_rsi, - ), - "0FAF7763", - "imull %esi, 99(%rdi), %esi", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::imm(-127i32 as u32), - w_rdx, - ), - "486BD281", - "imulq %rdx, $-127, %rdx", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::imm(-129i32 as u32), - w_rdx, - ), - "4869D27FFFFFFF", - "imulq %rdx, $-129, %rdx", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::imm(76543210), - w_rdx, - ), - "4869D2EAF48F04", - "imulq %rdx, $76543210, %rdx", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::imm(-127i32 as u32), - w_r8, - ), - "456BC081", - "imull %r8d, $-127, %r8d", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::imm(-129i32 as u32), - w_r8, - ), - "4569C07FFFFFFF", - "imull %r8d, $-129, %r8d", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::imm(-76543210i32 as u32), - w_r8, - ), - "4569C0160B70FB", - "imull %r8d, $-76543210, %r8d", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::imm(-127i32 as u32), - w_rsi, - ), - "6BF681", - "imull %esi, $-127, %esi", - )); - insns.push(( - 
Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::imm(-129i32 as u32), - w_rsi, - ), - "69F67FFFFFFF", - "imull %esi, $-129, %esi", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Mul, - RegMemImm::imm(76543210), - w_rsi, - ), - "69F6EAF48F04", - "imull %esi, $76543210, %esi", - )); insns.push(( Inst::alu_rmi_r( @@ -1585,88 +1408,6 @@ fn test_x64_emit() { "andw %r14w, $-512, %r14w", )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::imm(10), - w_rax, - ), - "666BC00A", - "imulw %ax, $10, %ax", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::imm(-512i32 as u32), - w_rax, - ), - "6669C000FE", - "imulw %ax, $-512, %ax", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::imm(10), - w_r11, - ), - "66456BDB0A", - "imulw %r11w, $10, %r11w", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::imm(-512i32 as u32), - w_r11, - ), - "664569DB00FE", - "imulw %r11w, $-512, %r11w", - )); - - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::reg(rdx), - w_rax, - ), - "660FAFC2", - "imulw %ax, %dx, %ax", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::reg(r12), - w_rax, - ), - "66410FAFC4", - "imulw %ax, %r12w, %ax", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::reg(rdx), - w_r11, - ), - "66440FAFDA", - "imulw %r11w, %dx, %r11w", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size16, - AluRmiROpcode::Mul, - RegMemImm::reg(r12), - w_r11, - ), - "66450FAFDC", - "imulw %r11w, %r12w, %r11w", - )); - insns.push(( Inst::alu_rmi_r( OperandSize::Size8, @@ -1871,48 +1612,6 @@ fn test_x64_emit() { "andb %r15b, %r15b, %r15b", )); - // the 8bit imul has rax as fixed dst - insns.push(( - Inst::alu_rmi_r( - 
OperandSize::Size8, - AluRmiROpcode::Mul, - RegMemImm::reg(rcx), - w_rax, - ), - "F6E9", - "imulb %al, %cl, %al", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size8, - AluRmiROpcode::Mul, - RegMemImm::reg(rbp), - w_rax, - ), - "40F6ED", - "imulb %al, %bpl, %al", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size8, - AluRmiROpcode::Mul, - RegMemImm::reg(r10), - w_rax, - ), - "41F6EA", - "imulb %al, %r10b, %al", - )); - insns.push(( - Inst::alu_rmi_r( - OperandSize::Size8, - AluRmiROpcode::Mul, - RegMemImm::reg(r15), - w_rax, - ), - "41F6EF", - "imulb %al, %r15b, %al", - )); - // ======================================================== // AluRM @@ -2265,98 +1964,6 @@ fn test_x64_emit() { "div %al, %sil, %al ; trap=int_divz", )); - // ======================================================== - // MulHi - insns.push(( - Inst::mul_hi( - OperandSize::Size32, - true, /*signed*/ - RegMem::reg(regs::rsi()), - ), - "F7EE", - "imul %eax, %esi, %eax, %edx", - )); - insns.push(( - Inst::mul_hi( - OperandSize::Size64, - true, /*signed*/ - RegMem::reg(regs::r15()), - ), - "49F7EF", - "imul %rax, %r15, %rax, %rdx", - )); - insns.push(( - Inst::mul_hi( - OperandSize::Size32, - false, /*signed*/ - RegMem::reg(regs::r14()), - ), - "41F7E6", - "mul %eax, %r14d, %eax, %edx", - )); - insns.push(( - Inst::mul_hi( - OperandSize::Size64, - false, /*signed*/ - RegMem::reg(regs::rdi()), - ), - "48F7E7", - "mul %rax, %rdi, %rax, %rdx", - )); - - // ======================================================== - // UMulLo - insns.push(( - Inst::umul_lo(OperandSize::Size64, RegMem::reg(regs::rdx())), - "48F7E2", - "mulq %rax, %rdx, %rax", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size64, RegMem::reg(regs::r12())), - "49F7E4", - "mulq %rax, %r12, %rax", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size32, RegMem::reg(regs::rdx())), - "F7E2", - "mull %eax, %edx, %eax", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size32, RegMem::reg(regs::r12())), - "41F7E4", - 
"mull %eax, %r12d, %eax", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size16, RegMem::reg(regs::rdx())), - "66F7E2", - "mulw %ax, %dx, %ax", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size16, RegMem::reg(regs::r12())), - "6641F7E4", - "mulw %ax, %r12w, %ax", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::rdx())), - "F6E2", - "mulb %al, %dl, %al", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::rdi())), - "40F6E7", - "mulb %al, %dil, %al", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::r9())), - "41F6E1", - "mulb %al, %r9b, %al", - )); - insns.push(( - Inst::umul_lo(OperandSize::Size8, RegMem::reg(regs::r12())), - "41F6E4", - "mulb %al, %r12b, %al", - )); - // ======================================================== // Imm_R // diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 19bda166c19f..ea2ff83b3c83 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -130,8 +130,10 @@ impl Inst { | Inst::MovToPReg { .. } | Inst::MovsxRmR { .. } | Inst::MovzxRmR { .. } - | Inst::MulHi { .. } - | Inst::UMulLo { .. } + | Inst::Mul { .. } + | Inst::Mul8 { .. } + | Inst::IMul { .. } + | Inst::IMulImm { .. } | Inst::Neg { .. } | Inst::Not { .. } | Inst::Nop { .. 
} @@ -857,7 +859,7 @@ impl PrettyPrint for Inst { format!("{op} {dividend}, {divisor}, {dst} ; trap={trap}") } - Inst::MulHi { + Inst::Mul { size, signed, src1, @@ -869,15 +871,33 @@ impl PrettyPrint for Inst { let dst_lo = pretty_print_reg(dst_lo.to_reg().to_reg(), size.to_bytes(), allocs); let dst_hi = pretty_print_reg(dst_hi.to_reg().to_reg(), size.to_bytes(), allocs); let src2 = src2.pretty_print(size.to_bytes(), allocs); + let suffix = suffix_bwlq(*size); let op = ljustify(if *signed { - "imul".to_string() + format!("imul{suffix}") } else { - "mul".to_string() + format!("mul{suffix}") }); format!("{op} {src1}, {src2}, {dst_lo}, {dst_hi}") } - Inst::UMulLo { + Inst::Mul8 { + signed, + src1, + src2, + dst, + } => { + let src1 = pretty_print_reg(src1.to_reg(), 1, allocs); + let dst = pretty_print_reg(dst.to_reg().to_reg(), 1, allocs); + let src2 = src2.pretty_print(1, allocs); + let op = ljustify(if *signed { + "imulb".to_string() + } else { + "mulb".to_string() + }); + format!("{op} {src1}, {src2}, {dst}") + } + + Inst::IMul { size, src1, src2, @@ -886,10 +906,24 @@ impl PrettyPrint for Inst { let src1 = pretty_print_reg(src1.to_reg(), size.to_bytes(), allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs); let src2 = src2.pretty_print(size.to_bytes(), allocs); - let op = ljustify2("mul".to_string(), suffix_bwlq(*size)); + let suffix = suffix_bwlq(*size); + let op = ljustify(format!("imul{suffix}")); format!("{op} {src1}, {src2}, {dst}") } + Inst::IMulImm { + size, + src1, + src2, + dst, + } => { + let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs); + let src1 = src1.pretty_print(size.to_bytes(), allocs); + let suffix = suffix_bwlq(*size); + let op = ljustify(format!("imul{suffix}")); + format!("{op} {src1}, {src2:#x}, {dst}") + } + Inst::CheckedSRemSeq { size, divisor, @@ -1902,23 +1936,11 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol // method above. 
match inst { Inst::AluRmiR { - size, - op, - src1, - src2, - dst, - .. + src1, src2, dst, .. } => { - if *size == OperandSize::Size8 && *op == AluRmiROpcode::Mul { - // 8-bit imul has RAX as a fixed input/output - collector.reg_fixed_use(src1.to_reg(), regs::rax()); - collector.reg_fixed_def(dst.to_writable_reg(), regs::rax()); - src2.get_operands(collector); - } else { - collector.reg_use(src1.to_reg()); - collector.reg_reuse_def(dst.to_writable_reg(), 0); - src2.get_operands(collector); - } + collector.reg_use(src1.to_reg()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); } Inst::AluConstOp { dst, .. } => collector.reg_def(dst.to_writable_reg()), Inst::AluRM { src1_dst, src2, .. } => { @@ -1973,7 +1995,7 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_fixed_use(dividend.to_reg(), regs::rax()); collector.reg_fixed_def(dst.to_writable_reg(), regs::rax()); } - Inst::MulHi { + Inst::Mul { src1, src2, dst_lo, @@ -1985,20 +2007,24 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_fixed_def(dst_hi.to_writable_reg(), regs::rdx()); src2.get_operands(collector); } - Inst::UMulLo { - size, - src1, - src2, - dst, - .. + Inst::Mul8 { + src1, src2, dst, .. } => { collector.reg_fixed_use(src1.to_reg(), regs::rax()); collector.reg_fixed_def(dst.to_writable_reg(), regs::rax()); - if *size != OperandSize::Size8 { - collector.reg_clobbers(PRegSet::empty().with(regs::gpr_preg(regs::ENC_RDX))); - } src2.get_operands(collector); } + Inst::IMul { + src1, src2, dst, .. + } => { + collector.reg_use(src1.to_reg()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); + } + Inst::IMulImm { src1, dst, .. 
} => { + collector.reg_def(dst.to_writable_reg()); + src1.get_operands(collector); + } Inst::SignExtendData { size, src, dst } => { match size { OperandSize::Size8 => { diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 516e3151ba6c..7fc2f50053ee 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -167,17 +167,19 @@ ;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 2 (lower (umul_overflow x y @ (value_type (fits_in_64 ty)))) - (construct_overflow_op (CC.O) (x64_umullo_with_flags_paired ty x y))) +(rule 2 (lower (umul_overflow x y @ (value_type $I8))) + (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired $false x y))) + +(rule 3 (lower (umul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty)))) + (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty $false x y))) ;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 2 (lower (smul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty)))) - (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Mul) x y)) +(rule 2 (lower (smul_overflow x y @ (value_type $I8))) + (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired $true x y))) -;; there is no 8bit imul with an immediate operand so we need to put it in a register or memory -(rule 1 (lower (smul_overflow x y @ (value_type $I8))) - (construct_overflow_op (CC.O) (x64_alurmi_with_flags_paired (AluRmiROpcode.Mul) $I8 x (reg_mem_to_reg_mem_imm (put_in_reg_mem y))))) +(rule 3 (lower (smul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty)))) + (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty $true x y))) ;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -965,19 +967,22 @@ ;; `i64` and smaller. -;; Multiply two registers. 
-(rule -5 (lower (has_type (fits_in_64 ty) (imul x y))) - (x64_mul ty x y)) +;; 8-bit base case, needs a special instruction encoding and additionally +;; move sinkable loads to the right. +(rule -7 (lower (has_type $I8 (imul x y))) (x64_mul8 $false x y)) +(rule -6 (lower (has_type $I8 (imul (sinkable_load x) y))) (x64_mul8 $false y x)) -;; Handle multiplication where the lhs is an immediate or sinkable load in -;; addition to the automatic rhs handling above. +;; 16-to-64-bit base cases, same as above by moving sinkable loads to the right. +(rule -5 (lower (has_type (ty_int_ref_16_to_64 ty) (imul x y))) + (x64_imul ty x y)) +(rule -4 (lower (has_type (ty_int_ref_16_to_64 ty) (imul (sinkable_load x) y))) + (x64_imul ty y x)) -(rule -4 (lower (has_type (fits_in_64 ty) - (imul (simm32_from_value x) y))) - (x64_mul ty y x)) -(rule -3 (lower (has_type (fits_in_64 ty) - (imul (sinkable_load x) y))) - (x64_mul ty y x)) +;; lift out constants to use 3-operand form +(rule -3 (lower (has_type (ty_int_ref_16_to_64 ty) (imul x (iconst (simm32 y))))) + (x64_imul_imm ty x y)) +(rule -2 (lower (has_type (ty_int_ref_16_to_64 ty) (imul (iconst (simm32 x)) y))) + (x64_imul_imm ty y x)) ;; `i128`. 
@@ -1004,13 +1009,13 @@ (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1)) ;; lo_hi = mul x_lo, y_hi - (lo_hi Gpr (x64_mul $I64 x_lo y_hi)) + (lo_hi Gpr (x64_imul $I64 x_lo y_hi)) ;; hi_lo = mul x_hi, y_lo - (hi_lo Gpr (x64_mul $I64 x_hi y_lo)) + (hi_lo Gpr (x64_imul $I64 x_hi y_lo)) ;; hilo_hilo = add lo_hi, hi_lo (hilo_hilo Gpr (x64_add $I64 lo_hi hi_lo)) - ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo - (mul_regs ValueRegs (mulhi_u $I64 x_lo y_lo)) + ;; dst_lo:hi_lolo = x64_mul x_lo, y_lo + (mul_regs ValueRegs (x64_mul $I64 $false x_lo y_lo)) (dst_lo Gpr (value_regs_get_gpr mul_regs 0)) (hi_lolo Gpr (value_regs_get_gpr mul_regs 1)) ;; dst_hi = add hilo_hilo, hi_lolo @@ -2258,7 +2263,7 @@ ;; top byte: it is the sum of the bytes (masked4 >> 56) * ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01 ;; + ... + (masked4 >> 0). - (mul Gpr (x64_mul $I64 masked4 ones)) + (mul Gpr (x64_imul $I64 masked4 ones)) ;; Now take that top byte and return it as the popcount. (final Gpr (x64_shr $I64 mul (Imm8Reg.Imm8 56)))) final)) @@ -2280,7 +2285,7 @@ (x64_shr $I32 diff3 (Imm8Reg.Imm8 4)) diff3)) (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f))) - (mul Gpr (x64_mul $I32 masked4 (RegMemImm.Imm 0x01010101))) + (mul Gpr (x64_imul_imm $I32 masked4 0x01010101)) (final Gpr (x64_shr $I32 mul (Imm8Reg.Imm8 24)))) final)) @@ -4159,35 +4164,19 @@ ;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; The umulhi instruction is not available for 8-bit types, so we can extend -;; the inputs, use the 16-bit multiply and shift the result down. 
-(rule 1 (lower (umulhi a @ (value_type $I8) b)) - (let ((a_ext Gpr (extend_to_gpr a $I16 (ExtendKind.Zero))) - (b_ext Gpr (extend_to_gpr b $I16 (ExtendKind.Zero))) - (mul Gpr (x64_mul $I16 a_ext b_ext)) - (hi Gpr (x64_shr $I64 mul (imm8_to_imm8_gpr 8)))) - hi)) +(rule 0 (lower (umulhi a @ (value_type $I8) b)) + (x64_shr $I16 (x64_mul8 $false a b) (imm8_to_imm8_gpr 8))) -(rule 0 (lower (umulhi a @ (value_type ty) b)) - (let ((res ValueRegs (mul_hi ty $false a b)) - (hi Gpr (value_regs_get_gpr res 1))) - hi)) +(rule 1 (lower (umulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b)) + (value_regs_get_gpr (x64_mul ty $false a b) 1)) ;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; The smulhi instruction is not available for 8-bit types, so we can extend -;; the inputs, use the 16-bit multiply and shift the result down. -(rule 1 (lower (smulhi a @ (value_type $I8) b)) - (let ((a_ext Gpr (extend_to_gpr a $I16 (ExtendKind.Sign))) - (b_ext Gpr (extend_to_gpr b $I16 (ExtendKind.Sign))) - (mul Gpr (x64_mul $I16 a_ext b_ext)) - (hi Gpr (x64_sar $I64 mul (imm8_to_imm8_gpr 8)))) - hi)) - -(rule 0 (lower (smulhi a @ (value_type ty) b)) - (let ((res ValueRegs (mul_hi ty $true a b)) - (hi Gpr (value_regs_get_gpr res 1))) - hi)) +(rule 0 (lower (smulhi a @ (value_type $I8) b)) + (x64_sar $I16 (x64_mul8 $true a b) (imm8_to_imm8_gpr 8))) + +(rule 1 (lower (smulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b)) + (value_regs_get_gpr (x64_mul ty $true a b) 1)) ;; Rules for `get_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/pcc.rs b/cranelift/codegen/src/isa/x64/pcc.rs index 97afc57466bd..6fa8f886e3a8 100644 --- a/cranelift/codegen/src/isa/x64/pcc.rs +++ b/cranelift/codegen/src/isa/x64/pcc.rs @@ -247,7 +247,7 @@ pub(crate) fn check( undefined_result(ctx, vcode, dst, 64, 64)?; Ok(()) } - Inst::MulHi { + Inst::Mul { size, dst_lo, dst_hi, @@ -264,7 +264,17 @@ pub(crate) fn check( 
undefined_result(ctx, vcode, dst_hi, 64, 64)?; Ok(()) } - Inst::UMulLo { + Inst::Mul8 { dst, ref src2, .. } => { + match <&RegMem>::from(src2) { + RegMem::Mem { ref addr } => { + check_load(ctx, None, addr, vcode, I8, 64)?; + } + RegMem::Reg { .. } => {} + } + undefined_result(ctx, vcode, dst, 64, 64)?; + Ok(()) + } + Inst::IMul { size, dst, ref src2, @@ -279,7 +289,21 @@ pub(crate) fn check( undefined_result(ctx, vcode, dst, 64, 64)?; Ok(()) } - + Inst::IMulImm { + size, + dst, + ref src1, + .. + } => { + match <&RegMem>::from(src1) { + RegMem::Mem { ref addr } => { + check_load(ctx, None, addr, vcode, size.to_type(), 64)?; + } + RegMem::Reg { .. } => {} + } + undefined_result(ctx, vcode, dst, 64, 64)?; + Ok(()) + } Inst::CheckedSRemSeq { dst_quotient, dst_remainder, diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif index eb69119cccb7..5310d3a4b607 100644 --- a/cranelift/filetests/filetests/isa/x64/i128.clif +++ b/cranelift/filetests/filetests/isa/x64/i128.clif @@ -206,7 +206,7 @@ block0(v0: i128, v1: i128): ; imulq %rsi, %rcx, %rsi ; addq %rdx, %rsi, %rdx ; movq %rdx, %r9 -; mul %rax, %rcx, %rax, %rdx +; mulq %rax, %rcx, %rax, %rdx ; movq %rdx, %rcx ; movq %r9, %rdx ; addq %rdx, %rcx, %rdx diff --git a/cranelift/filetests/filetests/isa/x64/mul.clif b/cranelift/filetests/filetests/isa/x64/mul.clif new file mode 100644 index 000000000000..f12a73f05919 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/mul.clif @@ -0,0 +1,489 @@ +test compile precise-output +set unwind_info=false +target x86_64 + +function %imul_i8(i8, i8) -> i8{ +block0(v0: i8, v1: i8): + v2 = imul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; mulb %al, %sil, %al +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; mulb %sil +; movq %rbp, %rsp +; popq %rbp +; retq + 
+function %imul_i16(i16, i16) -> i16{ +block0(v0: i16, v1: i16): + v2 = imul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imulw %ax, %si, %ax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; imull %esi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i32(i32, i32) -> i32{ +block0(v0: i32, v1: i32): + v2 = imul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imull %eax, %esi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; imull %esi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i64(i64, i64) -> i64{ +block0(v0: i64, v1: i64): + v2 = imul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imulq %rax, %rsi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; imulq %rsi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i8_three(i8, i8, i8) -> i8{ +block0(v0: i8, v1: i8, v2: i8): + v3 = imul v0, v1 + v4 = imul v3, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; mulb %al, %sil, %al +; mulb %al, %dl, %al +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; mulb %sil +; mulb %dl +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i32_three(i32, i32, i32) -> i32{ +block0(v0: i32, v1: i32, v2: i32): + v3 = imul v0, v1 + v4 = imul v3, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imull %edi, %esi, %edi +; movq %rdi, %rax +; imull %eax, %edx, 
%eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imull %esi, %edi +; movq %rdi, %rax +; imull %edx, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i32_load(i32, i64) -> i32 { +block0(v0: i32, v1: i64): + v2 = load.i32 notrap v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imull %eax, 0(%rsi), %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; imull (%rsi), %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i64_load(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = load.i64 notrap v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imulq %rax, 0(%rsi), %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; imulq (%rsi), %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i8_const(i8) -> i8{ +block0(v0: i8): + v3 = imul_imm v0, 97 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; mulb %al, const(0), %al +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rax +; mulb 0xb(%rip) +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) + +function %imul_i16_const(i16) -> i16{ +block0(v0: i16): + v3 = imul_imm v0, 97 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imulw %di, 0x61, %ax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imulw $0x61, %di, %ax +; movq %rbp, %rsp +; 
popq %rbp +; retq + +function %imul_i32_const(i32) -> i32{ +block0(v0: i32): + v3 = imul_imm v0, 97 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imull %edi, 0x61, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imull $0x61, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i64_const(i64) -> i64{ +block0(v0: i64): + v3 = imul_imm v0, 97 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imulq %rdi, 0x61, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imulq $0x61, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + + +function %imul_i16_bigger_const(i16) -> i16{ +block0(v0: i16): + v3 = imul_imm v0, 1021 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imulw %di, 0x3fd, %ax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imulw $0x3fd, %di, %ax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i32_bigger_const(i32) -> i32{ +block0(v0: i32): + v3 = imul_imm v0, 1021 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imull %edi, 0x3fd, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imull $0x3fd, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i64_bigger_const(i64) -> i64{ +block0(v0: i64): + v3 = imul_imm v0, 1021 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imulq %rdi, 0x3fd, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imulq $0x3fd, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function 
%imul_i16_const_and_load(i64) -> i16{ +block0(v0: i64): + v1 = load.i16 v0 + v2 = imul_imm v1, 1021 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwq 0(%rdi), %rcx +; imulw %cx, 0x3fd, %ax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwq (%rdi), %rcx ; trap: heap_oob +; imulw $0x3fd, %cx, %ax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i32_const_and_load(i64) -> i32{ +block0(v0: i64): + v1 = load.i32 v0 + v2 = imul_imm v1, 1021 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imull 0(%rdi), 0x3fd, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imull $0x3fd, (%rdi), %eax ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %imul_i64_const_and_load(i64) -> i64{ +block0(v0: i64): + v1 = load.i64 v0+100 + v2 = imul_imm v1, 1021 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; imulq 100(%rdi), 0x3fd, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; imulq $0x3fd, 0x64(%rdi), %rax ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/popcnt.clif b/cranelift/filetests/filetests/isa/x64/popcnt.clif index 3aecd486fd12..b74ba4b41622 100644 --- a/cranelift/filetests/filetests/isa/x64/popcnt.clif +++ b/cranelift/filetests/filetests/isa/x64/popcnt.clif @@ -147,11 +147,11 @@ block0(v0: i32): ; shrl $1, %eax, %eax ; andl %eax, %edx, %eax ; subl %edi, %eax, %edi -; movq %rdi, %rax -; shrl $4, %eax, %eax -; addl %eax, %edi, %eax -; andl %eax, $252645135, %eax -; imull %eax, $16843009, %eax +; movq %rdi, %r9 +; shrl $4, %r9d, %r9d +; addl %r9d, %edi, %r9d +; andl %r9d, $252645135, %r9d +; imull %r9d, 0x1010101, %eax ; shrl 
$24, %eax, %eax ; movq %rbp, %rsp ; popq %rbp @@ -173,11 +173,11 @@ block0(v0: i32): ; shrl $1, %eax ; andl %edx, %eax ; subl %eax, %edi -; movq %rdi, %rax -; shrl $4, %eax -; addl %edi, %eax -; andl $0xf0f0f0f, %eax -; imull $0x1010101, %eax, %eax +; movq %rdi, %r9 +; shrl $4, %r9d +; addl %edi, %r9d +; andl $0xf0f0f0f, %r9d +; imull $0x1010101, %r9d, %eax ; shrl $0x18, %eax ; movq %rbp, %rsp ; popq %rbp @@ -194,23 +194,23 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movl 0(%rdi), %edx -; movq %rdx, %rcx +; movl 0(%rdi), %eax +; movq %rax, %rcx ; shrl $1, %ecx, %ecx ; movl $2004318071, %r8d ; andl %ecx, %r8d, %ecx -; subl %edx, %ecx, %edx +; subl %eax, %ecx, %eax ; shrl $1, %ecx, %ecx ; andl %ecx, %r8d, %ecx -; subl %edx, %ecx, %edx +; subl %eax, %ecx, %eax ; shrl $1, %ecx, %ecx ; andl %ecx, %r8d, %ecx -; subl %edx, %ecx, %edx -; movq %rdx, %rax -; shrl $4, %eax, %eax -; addl %eax, %edx, %eax -; andl %eax, $252645135, %eax -; imull %eax, $16843009, %eax +; subl %eax, %ecx, %eax +; movq %rax, %r10 +; shrl $4, %r10d, %r10d +; addl %r10d, %eax, %r10d +; andl %r10d, $252645135, %r10d +; imull %r10d, 0x1010101, %eax ; shrl $24, %eax, %eax ; movq %rbp, %rsp ; popq %rbp @@ -221,23 +221,23 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movl (%rdi), %edx ; trap: heap_oob -; movq %rdx, %rcx +; movl (%rdi), %eax ; trap: heap_oob +; movq %rax, %rcx ; shrl $1, %ecx ; movl $0x77777777, %r8d ; andl %r8d, %ecx -; subl %ecx, %edx +; subl %ecx, %eax ; shrl $1, %ecx ; andl %r8d, %ecx -; subl %ecx, %edx +; subl %ecx, %eax ; shrl $1, %ecx ; andl %r8d, %ecx -; subl %ecx, %edx -; movq %rdx, %rax -; shrl $4, %eax -; addl %edx, %eax -; andl $0xf0f0f0f, %eax -; imull $0x1010101, %eax, %eax +; subl %ecx, %eax +; movq %rax, %r10 +; shrl $4, %r10d +; addl %eax, %r10d +; andl $0xf0f0f0f, %r10d +; imull $0x1010101, %r10d, %eax ; shrl $0x18, %eax ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/smulhi.clif 
b/cranelift/filetests/filetests/isa/x64/smulhi.clif index fb8356e4b6c2..92589c2f0cfa 100644 --- a/cranelift/filetests/filetests/isa/x64/smulhi.clif +++ b/cranelift/filetests/filetests/isa/x64/smulhi.clif @@ -11,10 +11,9 @@ block0(v0: i8, v1: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movsbl %dil, %eax -; movsbl %sil, %r8d -; imull %eax, %r8d, %eax -; sarq $8, %rax, %rax +; movq %rdi, %rax +; imulb %al, %sil, %al +; sarw $8, %ax, %ax ; movq %rbp, %rsp ; popq %rbp ; ret @@ -24,10 +23,9 @@ block0(v0: i8, v1: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movsbl %dil, %eax -; movsbl %sil, %r8d -; imull %r8d, %eax -; sarq $8, %rax +; movq %rdi, %rax +; imulb %sil +; sarw $8, %ax ; movq %rbp, %rsp ; popq %rbp ; retq @@ -43,7 +41,7 @@ block0(v0: i16, v1: i16): ; movq %rsp, %rbp ; block0: ; movq %rdi, %rax -; imul %ax, %si, %ax, %dx +; imulw %ax, %si, %ax, %dx ; movq %rdx, %rax ; movq %rbp, %rsp ; popq %rbp @@ -72,7 +70,7 @@ block0(v0: i32, v1: i32): ; movq %rsp, %rbp ; block0: ; movq %rdi, %rax -; imul %eax, %esi, %eax, %edx +; imull %eax, %esi, %eax, %edx ; movq %rdx, %rax ; movq %rbp, %rsp ; popq %rbp @@ -101,7 +99,7 @@ block0(v0: i64, v1: i64): ; movq %rsp, %rbp ; block0: ; movq %rdi, %rax -; imul %rax, %rsi, %rax, %rdx +; imulq %rax, %rsi, %rax, %rdx ; movq %rdx, %rax ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/umulhi.clif b/cranelift/filetests/filetests/isa/x64/umulhi.clif index 487d2a897b51..e68df725c1b8 100644 --- a/cranelift/filetests/filetests/isa/x64/umulhi.clif +++ b/cranelift/filetests/filetests/isa/x64/umulhi.clif @@ -11,10 +11,9 @@ block0(v0: i8, v1: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movzbl %dil, %eax -; movzbl %sil, %r8d -; imull %eax, %r8d, %eax -; shrq $8, %rax, %rax +; movq %rdi, %rax +; mulb %al, %sil, %al +; shrw $8, %ax, %ax ; movq %rbp, %rsp ; popq %rbp ; ret @@ -24,10 +23,9 @@ block0(v0: i8, v1: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movzbl %dil, %eax -; 
movzbl %sil, %r8d -; imull %r8d, %eax -; shrq $8, %rax +; movq %rdi, %rax +; mulb %sil +; shrw $8, %ax ; movq %rbp, %rsp ; popq %rbp ; retq @@ -43,7 +41,7 @@ block0(v0: i16, v1: i16): ; movq %rsp, %rbp ; block0: ; movq %rdi, %rax -; mul %ax, %si, %ax, %dx +; mulw %ax, %si, %ax, %dx ; movq %rdx, %rax ; movq %rbp, %rsp ; popq %rbp @@ -72,7 +70,7 @@ block0(v0: i32, v1: i32): ; movq %rsp, %rbp ; block0: ; movq %rdi, %rax -; mul %eax, %esi, %eax, %edx +; mull %eax, %esi, %eax, %edx ; movq %rdx, %rax ; movq %rbp, %rsp ; popq %rbp @@ -101,7 +99,7 @@ block0(v0: i64, v1: i64): ; movq %rsp, %rbp ; block0: ; movq %rdi, %rax -; mul %rax, %rsi, %rax, %rdx +; mulq %rax, %rsi, %rax, %rdx ; movq %rdx, %rax ; movq %rbp, %rsp ; popq %rbp diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs index 89f1a5b82e22..8db06519bb9a 100644 --- a/winch/codegen/src/isa/x64/asm.rs +++ b/winch/codegen/src/isa/x64/asm.rs @@ -926,22 +926,18 @@ impl Assembler { /// Multiply immediate and register. pub fn mul_ir(&mut self, imm: i32, dst: Reg, size: OperandSize) { - let imm = RegMemImm::imm(imm as u32); - - self.emit(Inst::AluRmiR { + self.emit(Inst::IMulImm { size: size.into(), - op: AluRmiROpcode::Mul, src1: dst.into(), - src2: GprMemImm::new(imm).expect("valid immediate"), + src2: imm, dst: dst.into(), }); } /// Multiply register and register. pub fn mul_rr(&mut self, src: Reg, dst: Reg, size: OperandSize) { - self.emit(Inst::AluRmiR { + self.emit(Inst::IMul { size: size.into(), - op: AluRmiROpcode::Mul, src1: dst.into(), src2: src.into(), dst: dst.into(),