From bd3dcd313d7bfb5fa602bf445a6f58539653f8aa Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Tue, 21 Feb 2023 14:51:22 -0600 Subject: [PATCH] x64: Add more `fma` instruction lowerings (#5846) The relaxed-simd proposal for WebAssembly adds a fused-multiply-add operation for `v128` types so I was poking around at Cranelift's existing support for its `fma` instruction. I was also poking around at the x86_64 ISA's offerings for the FMA operation and ended up with this PR that improves the lowering of the `fma` instruction on the x64 backend in a number of ways: * A libcall-based fallback is now provided for `f32x4` and `f64x2` types in preparation for eventual support of the relaxed-simd proposal. These encodings are horribly slow, but it's expected that if FMA semantics must be guaranteed then it's the best that can be done without the `fma` feature. Otherwise it'll be up to producers (e.g. Wasmtime embedders) whether wasm-level FMA operations should be FMA or multiply-then-add. * In addition to the existing `vfmadd213*` instructions opcodes were added for `vfmadd132*`. The `132` variant is selected based on which argument can have a sinkable load. * Any argument in the `fma` CLIF instruction can now have a `sinkable_load` and it'll generate a single FMA instruction. * All `vfnmadd*` opcodes were added as well. These are pattern-matched where one of the arguments to the CLIF instruction is an `fneg`. I opted to not add a new CLIF instruction here since it seemed like pattern matching was easy enough but I'm also not intimately familiar with the semantics here so if that's the preferred approach I can do that too. --- cranelift/codegen/src/isa/x64/inst.isle | 71 ++-- cranelift/codegen/src/isa/x64/inst/args.rs | 14 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 42 +- cranelift/codegen/src/isa/x64/inst/mod.rs | 11 - cranelift/codegen/src/isa/x64/lower.isle | 97 ++++- cranelift/codegen/src/isa/x64/lower/isle.rs | 2 +- .../filetests/filetests/isa/x64/fma-call.clif | 180 +++++++++ .../filetests/filetests/isa/x64/fma-inst.clif | 379 +++++++++++++++++- .../filetests/runtests/simd-fma.clif | 1 + 9 files changed, 719 insertions(+), 78 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 324c86900e21..954d0c364ee9 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1199,6 +1199,18 @@ Vfmadd213sd Vfmadd213ps Vfmadd213pd + Vfmadd132ss + Vfmadd132sd + Vfmadd132ps + Vfmadd132pd + Vfnmadd213ss + Vfnmadd213sd + Vfnmadd213ps + Vfnmadd213pd + Vfnmadd132ss + Vfnmadd132sd + Vfnmadd132ps + Vfnmadd132pd Vcmpps Vcmppd Vpsrlw @@ -1623,8 +1635,8 @@ (decl use_popcnt (bool) Type) (extern extractor infallible use_popcnt use_popcnt) -(decl use_fma (bool) Type) -(extern extractor infallible use_fma use_fma) +(decl pure use_fma () bool) +(extern constructor use_fma use_fma) (decl use_sse41 (bool) Type) (extern extractor infallible use_sse41 use_sse41) @@ -3598,34 +3610,33 @@ (_ Unit (emit (MInst.XmmRmRVex3 op src1 src2 src3 dst)))) dst)) -;; Helper for creating `vfmadd213ss` instructions. -; TODO: This should have the (Xmm Xmm XmmMem) signature -; but we don't support VEX memory encodings yet -(decl x64_vfmadd213ss (Xmm Xmm Xmm) Xmm) -(rule (x64_vfmadd213ss x y z) - (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ss) x y z)) - -;; Helper for creating `vfmadd213sd` instructions. 
-; TODO: This should have the (Xmm Xmm XmmMem) signature -; but we don't support VEX memory encodings yet -(decl x64_vfmadd213sd (Xmm Xmm Xmm) Xmm) -(rule (x64_vfmadd213sd x y z) - (xmm_rmr_vex3 (AvxOpcode.Vfmadd213sd) x y z)) - -;; Helper for creating `vfmadd213ps` instructions. -; TODO: This should have the (Xmm Xmm XmmMem) signature -; but we don't support VEX memory encodings yet -(decl x64_vfmadd213ps (Xmm Xmm Xmm) Xmm) -(rule (x64_vfmadd213ps x y z) - (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ps) x y z)) - -;; Helper for creating `vfmadd213pd` instructions. -; TODO: This should have the (Xmm Xmm XmmMem) signature -; but we don't support VEX memory encodings yet -(decl x64_vfmadd213pd (Xmm Xmm Xmm) Xmm) -(rule (x64_vfmadd213pd x y z) - (xmm_rmr_vex3 (AvxOpcode.Vfmadd213pd) x y z)) - +;; Helper for creating `vfmadd213*` instructions +(decl x64_vfmadd213 (Type Xmm Xmm XmmMem) Xmm) +(rule (x64_vfmadd213 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ss) a b c)) +(rule (x64_vfmadd213 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213sd) a b c)) +(rule (x64_vfmadd213 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ps) a b c)) +(rule (x64_vfmadd213 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd213pd) a b c)) + +;; Helper for creating `vfmadd132*` instructions +(decl x64_vfmadd132 (Type Xmm Xmm XmmMem) Xmm) +(rule (x64_vfmadd132 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132ss) a b c)) +(rule (x64_vfmadd132 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132sd) a b c)) +(rule (x64_vfmadd132 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132ps) a b c)) +(rule (x64_vfmadd132 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfmadd132pd) a b c)) + +;; Helper for creating `vfnmadd213*` instructions +(decl x64_vfnmadd213 (Type Xmm Xmm XmmMem) Xmm) +(rule (x64_vfnmadd213 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213ss) a b c)) +(rule (x64_vfnmadd213 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213sd) a b c)) +(rule (x64_vfnmadd213 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213ps) a b c)) +(rule (x64_vfnmadd213 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd213pd) a b c)) + +;; Helper for creating `vfnmadd132*` instructions +(decl x64_vfnmadd132 (Type Xmm Xmm XmmMem) Xmm) +(rule (x64_vfnmadd132 $F32 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132ss) a b c)) +(rule (x64_vfnmadd132 $F64 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132sd) a b c)) +(rule (x64_vfnmadd132 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132ps) a b c)) +(rule (x64_vfnmadd132 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132pd) a b c)) ;; Helper for creating `sqrtss` instructions. 
(decl x64_sqrtss (XmmMem) Xmm) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 59b6f6da24a0..b85fdfc20e04 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1515,7 +1515,19 @@ impl AvxOpcode { AvxOpcode::Vfmadd213ss | AvxOpcode::Vfmadd213sd | AvxOpcode::Vfmadd213ps - | AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA], + | AvxOpcode::Vfmadd213pd + | AvxOpcode::Vfmadd132ss + | AvxOpcode::Vfmadd132sd + | AvxOpcode::Vfmadd132ps + | AvxOpcode::Vfmadd132pd + | AvxOpcode::Vfnmadd213ss + | AvxOpcode::Vfnmadd213sd + | AvxOpcode::Vfnmadd213ps + | AvxOpcode::Vfnmadd213pd + | AvxOpcode::Vfnmadd132ss + | AvxOpcode::Vfnmadd132sd + | AvxOpcode::Vfnmadd132ps + | AvxOpcode::Vfnmadd132pd => smallvec![InstructionSet::FMA], AvxOpcode::Vminps | AvxOpcode::Vminpd | AvxOpcode::Vmaxps diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index e632833bb13e..44de9450f8f5 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2281,32 +2281,46 @@ pub(crate) fn emit( let dst = allocs.next(dst.to_reg().to_reg()); debug_assert_eq!(src1, dst); let src2 = allocs.next(src2.to_reg()); - let src3 = src3.clone().to_reg_mem().with_allocs(allocs); + let src3 = match src3.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; let (w, map, opcode) = match op { + AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99), AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9), + AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D), + AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD), + AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99), AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9), + AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D), + AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD), + AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98), AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8), + AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C), + AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC), + AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98), AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8), + AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C), + AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC), AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A), AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B), AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C), _ => unreachable!(), }; - match src3 { - RegMem::Reg { reg: src } => VexInstruction::new() - .length(VexVectorLength::V128) - .prefix(LegacyPrefixes::_66) - .map(map) - .w(w) - .opcode(opcode) - .reg(dst.to_real_reg().unwrap().hw_enc()) - .rm(src.to_real_reg().unwrap().hw_enc()) - .vvvv(src2.to_real_reg().unwrap().hw_enc()) - .encode(sink), - _ => todo!(), - }; + VexInstruction::new() + .length(VexVectorLength::V128) + .prefix(LegacyPrefixes::_66) + .map(map) + .w(w) + .opcode(opcode) + .reg(dst.to_real_reg().unwrap().hw_enc()) + .rm(src3) + .vvvv(src2.to_real_reg().unwrap().hw_enc()) + .encode(sink); } Inst::XmmRmRBlendVex { diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 0bfe391e2050..1a5ca835480d 100644 
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1944,23 +1944,12 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol src2.get_operands(collector); } Inst::XmmRmRVex3 { - op, src1, src2, src3, dst, .. } => { - // Vfmadd uses and defs the dst reg, that is not the case with all - // AVX's ops, if you're adding a new op, make sure to correctly define - // register uses. - assert!( - *op == AvxOpcode::Vfmadd213ss - || *op == AvxOpcode::Vfmadd213sd - || *op == AvxOpcode::Vfmadd213ps - || *op == AvxOpcode::Vfmadd213pd - ); - collector.reg_use(src1.to_reg()); collector.reg_reuse_def(dst.to_writable_reg(), 0); collector.reg_use(src2.to_reg()); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index db78850e6d2a..9945f0869896 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2167,13 +2167,13 @@ ;; The above rules automatically sink loads for rhs operands, so additionally ;; add rules for sinking loads with lhs operands. (rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y))) - (x64_addss y (sink_load x))) + (x64_addss y x)) (rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y))) - (x64_addsd y (sink_load x))) + (x64_addsd y x)) (rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y))) - (x64_addps y (sink_load x))) + (x64_addps y x)) (rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y))) - (x64_addpd y (sink_load x))) + (x64_addpd y x)) ;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2200,13 +2200,13 @@ ;; The above rules automatically sink loads for rhs operands, so additionally ;; add rules for sinking loads with lhs operands. (rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y))) - (x64_mulss y (sink_load x))) + (x64_mulss y x)) (rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y))) - (x64_mulsd y (sink_load x))) + (x64_mulsd y x)) (rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y))) - (x64_mulps y (sink_load x))) + (x64_mulps y x)) (rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y))) - (x64_mulpd y (sink_load x))) + (x64_mulpd y x)) ;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2438,18 +2438,83 @@ ;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Base case for fma is to call out to one of two libcalls. For vectors they +;; need to be decomposed, handle each element individually, and then recomposed. 
+ (rule (lower (has_type $F32 (fma x y z))) (libcall_3 (LibCall.FmaF32) x y z)) (rule (lower (has_type $F64 (fma x y z))) (libcall_3 (LibCall.FmaF64) x y z)) -(rule 1 (lower (has_type (and (use_fma $true) $F32) (fma x y z))) - (x64_vfmadd213ss x y z)) -(rule 1 (lower (has_type (and (use_fma $true) $F64) (fma x y z))) - (x64_vfmadd213sd x y z)) -(rule (lower (has_type (and (use_fma $true) $F32X4) (fma x y z))) - (x64_vfmadd213ps x y z)) -(rule (lower (has_type (and (use_fma $true) $F64X2) (fma x y z))) - (x64_vfmadd213pd x y z)) + +(rule (lower (has_type $F32X4 (fma x y z))) + (let ( + (x Xmm (put_in_xmm x)) + (y Xmm (put_in_xmm y)) + (z Xmm (put_in_xmm z)) + (x0 Xmm (libcall_3 (LibCall.FmaF32) x y z)) + (x1 Xmm (libcall_3 (LibCall.FmaF32) + (x64_pshufd x 1) + (x64_pshufd y 1) + (x64_pshufd z 1))) + (x2 Xmm (libcall_3 (LibCall.FmaF32) + (x64_pshufd x 2) + (x64_pshufd y 2) + (x64_pshufd z 2))) + (x3 Xmm (libcall_3 (LibCall.FmaF32) + (x64_pshufd x 3) + (x64_pshufd y 3) + (x64_pshufd z 3))) + + (tmp Xmm (vec_insert_lane $F32X4 x0 x1 1)) + (tmp Xmm (vec_insert_lane $F32X4 tmp x2 2)) + (tmp Xmm (vec_insert_lane $F32X4 tmp x3 3)) + ) + tmp)) +(rule (lower (has_type $F64X2 (fma x y z))) + (let ( + (x Xmm (put_in_xmm x)) + (y Xmm (put_in_xmm y)) + (z Xmm (put_in_xmm z)) + (x0 Xmm (libcall_3 (LibCall.FmaF64) x y z)) + (x1 Xmm (libcall_3 (LibCall.FmaF64) + (x64_pshufd x 0xee) + (x64_pshufd y 0xee) + (x64_pshufd z 0xee))) + ) + (vec_insert_lane $F64X2 x0 x1 1))) + + +;; Special case for when the `fma` feature is active and a native instruction +;; can be used. +(rule 1 (lower (has_type ty (fma x y z))) + (if-let $true (use_fma)) + (fmadd ty x y z)) + +(decl fmadd (Type Value Value Value) Xmm) +(decl fnmadd (Type Value Value Value) Xmm) + +;; Base case. Note that this will automatically sink a load with `z`, the value +;; to add. +(rule (fmadd ty x y z) (x64_vfmadd213 ty x y z)) + +;; Allow sinking loads with one of the two values being multiplied in addition +;; to the value being added. Note that both x and y can be sunk here due to +;; multiplication being commutative. 
+(rule 1 (fmadd ty (sinkable_load x) y z) (x64_vfmadd132 ty y z x)) +(rule 2 (fmadd ty x (sinkable_load y) z) (x64_vfmadd132 ty x z y)) + +;; If one of the values being multiplied is negated then use a `vfnmadd*` +;; instruction instead +(rule 3 (fmadd ty (fneg x) y z) (fnmadd ty x y z)) +(rule 4 (fmadd ty x (fneg y) z) (fnmadd ty x y z)) + +(rule (fnmadd ty x y z) (x64_vfnmadd213 ty x y z)) +(rule 1 (fnmadd ty (sinkable_load x) y z) (x64_vfnmadd132 ty y z x)) +(rule 2 (fnmadd ty x (sinkable_load y) z) (x64_vfnmadd132 ty x z y)) + +;; Like `fmadd` if one argument is negated switch which one is being codegen'd +(rule 3 (fnmadd ty (fneg x) y z) (fmadd ty x y z)) +(rule 4 (fnmadd ty x (fneg y) z) (fmadd ty x y z)) ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 9d684cb879d2..0267c3d32ce9 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -213,7 +213,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { } #[inline] - fn use_fma(&mut self, _: Type) -> bool { + fn use_fma(&mut self) -> bool { self.backend.x64_flags.use_fma() } diff --git a/cranelift/filetests/filetests/isa/x64/fma-call.clif b/cranelift/filetests/filetests/isa/x64/fma-call.clif index 25b62371b626..153fc48c690e 100644 --- a/cranelift/filetests/filetests/isa/x64/fma-call.clif +++ b/cranelift/filetests/filetests/isa/x64/fma-call.clif @@ -55,3 +55,183 @@ block0(v0: f64, v1: f64, v2: f64): ; popq %rbp ; retq +function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = fma v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $96, %rsp +; block0: +; movdqu %xmm0, rsp(0 + virtual offset) +; movdqu %xmm1, rsp(16 + virtual offset) +; movdqu %xmm2, rsp(32 + virtual offset) +; load_ext_name %FmaF32+0, %r8 +; movdqu rsp(0 + virtual offset), %xmm0 +; movdqu rsp(16 + virtual offset), %xmm1 +; movdqu rsp(32 + virtual offset), %xmm2 +; call *%r8 +; movdqu %xmm0, rsp(48 + virtual offset) +; movdqu rsp(0 + virtual offset), %xmm4 +; pshufd $1, %xmm4, %xmm0 +; movdqu rsp(16 + virtual offset), %xmm2 +; pshufd $1, %xmm2, %xmm1 +; movdqu rsp(32 + virtual offset), %xmm3 +; pshufd $1, %xmm3, %xmm2 +; load_ext_name %FmaF32+0, %r9 +; call *%r9 +; movdqu %xmm0, rsp(64 + virtual offset) +; movdqu rsp(0 + virtual offset), %xmm14 +; pshufd $2, %xmm14, %xmm0 +; movdqu rsp(16 + virtual offset), %xmm13 +; pshufd $2, %xmm13, %xmm1 +; movdqu rsp(32 + virtual offset), %xmm15 +; pshufd $2, %xmm15, %xmm2 +; load_ext_name %FmaF32+0, %r10 +; call *%r10 +; movdqu %xmm0, rsp(80 + virtual offset) +; movdqu rsp(0 + virtual offset), %xmm14 +; pshufd $3, %xmm14, %xmm0 +; movdqu rsp(16 + virtual offset), %xmm1 +; pshufd $3, %xmm1, %xmm1 +; movdqu rsp(32 + virtual offset), %xmm2 +; pshufd $3, %xmm2, %xmm2 +; load_ext_name %FmaF32+0, %r11 +; call *%r11 +; movdqa %xmm0, %xmm13 +; movdqu rsp(64 + virtual offset), %xmm4 +; movdqu rsp(48 + virtual offset), %xmm0 +; insertps $16, %xmm0, %xmm4, %xmm0 +; movdqu rsp(80 + virtual offset), %xmm10 +; insertps $32, %xmm0, %xmm10, %xmm0 +; movdqa %xmm13, %xmm1 +; insertps $48, %xmm0, %xmm1, %xmm0 +; addq %rsp, $96, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x60, %rsp +; block1: ; offset 0x8 +; movdqu %xmm0, (%rsp) +; movdqu %xmm1, 0x10(%rsp) +; movdqu %xmm2, 0x20(%rsp) +; 
movabsq $0, %r8 ; reloc_external Abs8 %FmaF32 0 +; movdqu (%rsp), %xmm0 +; movdqu 0x10(%rsp), %xmm1 +; movdqu 0x20(%rsp), %xmm2 +; callq *%r8 +; movdqu %xmm0, 0x30(%rsp) +; movdqu (%rsp), %xmm4 +; pshufd $1, %xmm4, %xmm0 +; movdqu 0x10(%rsp), %xmm2 +; pshufd $1, %xmm2, %xmm1 +; movdqu 0x20(%rsp), %xmm3 +; pshufd $1, %xmm3, %xmm2 +; movabsq $0, %r9 ; reloc_external Abs8 %FmaF32 0 +; callq *%r9 +; movdqu %xmm0, 0x40(%rsp) +; movdqu (%rsp), %xmm14 +; pshufd $2, %xmm14, %xmm0 +; movdqu 0x10(%rsp), %xmm13 +; pshufd $2, %xmm13, %xmm1 +; movdqu 0x20(%rsp), %xmm15 +; pshufd $2, %xmm15, %xmm2 +; movabsq $0, %r10 ; reloc_external Abs8 %FmaF32 0 +; callq *%r10 +; movdqu %xmm0, 0x50(%rsp) +; movdqu (%rsp), %xmm14 +; pshufd $3, %xmm14, %xmm0 +; movdqu 0x10(%rsp), %xmm1 +; pshufd $3, %xmm1, %xmm1 +; movdqu 0x20(%rsp), %xmm2 +; pshufd $3, %xmm2, %xmm2 +; movabsq $0, %r11 ; reloc_external Abs8 %FmaF32 0 +; callq *%r11 +; movdqa %xmm0, %xmm13 +; movdqu 0x40(%rsp), %xmm4 +; movdqu 0x30(%rsp), %xmm0 +; insertps $0x10, %xmm4, %xmm0 +; movdqu 0x50(%rsp), %xmm10 +; insertps $0x20, %xmm10, %xmm0 +; movdqa %xmm13, %xmm1 +; insertps $0x30, %xmm1, %xmm0 +; addq $0x60, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = fma v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $64, %rsp +; block0: +; movdqu %xmm0, rsp(0 + virtual offset) +; movdqu %xmm1, rsp(16 + virtual offset) +; movdqu %xmm2, rsp(32 + virtual offset) +; load_ext_name %FmaF64+0, %r8 +; movdqu rsp(0 + virtual offset), %xmm0 +; movdqu rsp(16 + virtual offset), %xmm1 +; movdqu rsp(32 + virtual offset), %xmm2 +; call *%r8 +; movdqu %xmm0, rsp(48 + virtual offset) +; movdqu rsp(0 + virtual offset), %xmm0 +; pshufd $238, %xmm0, %xmm0 +; movdqu rsp(16 + virtual offset), %xmm1 +; pshufd $238, %xmm1, %xmm1 +; movdqu rsp(32 + virtual offset), %xmm2 +; pshufd $238, %xmm2, %xmm2 +; load_ext_name %FmaF64+0, %r9 +; call *%r9 +; movdqa %xmm0, %xmm14 +; movdqu rsp(48 + virtual offset), %xmm0 +; movlhps %xmm0, %xmm14, %xmm0 +; addq %rsp, $64, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x40, %rsp +; block1: ; offset 0x8 +; movdqu %xmm0, (%rsp) +; movdqu %xmm1, 0x10(%rsp) +; movdqu %xmm2, 0x20(%rsp) +; movabsq $0, %r8 ; reloc_external Abs8 %FmaF64 0 +; movdqu (%rsp), %xmm0 +; movdqu 0x10(%rsp), %xmm1 +; movdqu 0x20(%rsp), %xmm2 +; callq *%r8 +; movdqu %xmm0, 0x30(%rsp) +; movdqu (%rsp), %xmm0 +; pshufd $0xee, %xmm0, %xmm0 +; movdqu 0x10(%rsp), %xmm1 +; pshufd $0xee, %xmm1, %xmm1 +; movdqu 0x20(%rsp), %xmm2 +; pshufd $0xee, %xmm2, %xmm2 +; movabsq $0, %r9 ; reloc_external Abs8 %FmaF64 0 +; callq *%r9 +; movdqa %xmm0, %xmm14 +; movdqu 0x30(%rsp), %xmm0 +; movlhps %xmm14, %xmm0 +; addq $0x40, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/fma-inst.clif b/cranelift/filetests/filetests/isa/x64/fma-inst.clif index 16f6ca226778..e8697e66cca6 100644 --- a/cranelift/filetests/filetests/isa/x64/fma-inst.clif +++ b/cranelift/filetests/filetests/isa/x64/fma-inst.clif @@ -1,7 +1,7 @@ test compile precise-output target x86_64 has_avx=true has_fma=true -function %fma_f32(f32, f32, f32) -> f32 { +function %vfmadd213ss(f32, f32, f32) -> f32 { block0(v0: f32, v1: f32, v2: f32): v3 = fma v0, v1, v2 return v3 @@ -26,8 +26,59 @@ block0(v0: f32, v1: f32, v2: f32): ; popq %rbp ; retq -function %fma_f64(f64, f64, f64) -> f64 { 
-block0(v0: f64, v1: f64, v2: f64): +function %vfmadd213sd(f64, f64, i64) -> f64 { +block0(v0: f64, v1: f64, v2: i64): + v3 = load.f64 v2 + v4 = fma v0, v1, v3 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd213sd %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd213sd (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfmadd213ps(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = fma v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd213ps %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd213ps %xmm2, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfmadd213pd(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): v3 = fma v0, v1, v2 return v3 } @@ -36,7 +87,325 @@ block0(v0: f64, v1: f64, v2: f64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vfmadd213sd %xmm0, %xmm1, %xmm2, %xmm0 +; vfmadd213pd %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd213pd %xmm2, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfmadd132ss(f32, i64, f32) -> f32 { +block0(v0: f32, v1: i64, v2: f32): + v3 = load.f32 v1 + v4 = fma v0, v3, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd132ss %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd132ss (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfmadd132sd(i64, f64, f64) -> f64 { +block0(v0: i64, v1: f64, v2: f64): + v3 = load.f64 v0 + v4 = fma v3, v1, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd132sd %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd132sd (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfmadd132ps(f32x4, i64, f32x4) -> f32x4 { +block0(v0: f32x4, v1: i64, v2: f32x4): + v3 = load.f32x4 v1 + v4 = fma v0, v3, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd132ps %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd132ps (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfmadd132pd(i64, f64x2, f64x2) -> f64x2 { +block0(v0: i64, v1: f64x2, v2: f64x2): + v3 = load.f64x2 v0 + v4 = fma v3, v1, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfmadd132pd %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfmadd132pd (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd213ss(f32, f32, f32) -> f32 { +block0(v0: f32, v1: f32, v2: f32): + v3 = fneg v0 + v4 = fma v3, v1, v2 + return v4 +} + +; VCode: 
+; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd213ss %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd213ss %xmm2, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd213sd(f64, f64, f64) -> f64 { +block0(v0: f64, v1: f64, v2: f64): + v3 = fneg v1 + v4 = fma v0, v3, v2 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd213sd %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd213sd %xmm2, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd213ps(f32x4, f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: i64): + v3 = fneg v0 + v4 = load.f32x4 v2 + v5 = fma v3, v1, v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd213ps %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd213ps (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd213pd(f64x2, f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: i64): + v3 = fneg v1 + v4 = load.f64x2 v2 + v5 = fma v0, v3, v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd213pd %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd213pd (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd132ss(f32, i64, f32) -> f32 { +block0(v0: f32, v1: i64, v2: f32): + v3 = fneg v0 + v4 = load.f32 v1 + v5 = fma v3, v4, v2 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd132ss %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd132ss (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd132sd(i64, f64, f64) -> f64 { +block0(v0: i64, v1: f64, v2: f64): + v3 = fneg v1 + v4 = load.f64 v0 + v5 = fma v4, v3, v2 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd132sd %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd132sd (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd132ps(i64, f32x4, f32x4) -> f32x4 { +block0(v0: i64, v1: f32x4, v2: f32x4): + v3 = load.f32x4 v0 + v4 = fneg v3 + v5 = fma v4, v1, v2 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd132ps %xmm0, %xmm1, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vfnmadd132ps (%rdi), %xmm1, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %vfnmadd132pd(f64x2, i64, f64x2) -> f64x2 { +block0(v0: f64x2, v1: i64, v2: f64x2): + v3 = load.f64x2 v1 + v4 = fneg v3 + v5 = fma v0, v4, v2 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vfnmadd132pd %xmm0, %xmm1, 0(%rdi), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; 
ret @@ -46,7 +415,7 @@ block0(v0: f64, v1: f64, v2: f64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vfmadd213sd %xmm2, %xmm1, %xmm0 +; vfnmadd132pd (%rdi), %xmm1, %xmm0 ; trap: heap_oob ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/runtests/simd-fma.clif b/cranelift/filetests/filetests/runtests/simd-fma.clif index 4ff5e510411d..37bb30a885c9 100644 --- a/cranelift/filetests/filetests/runtests/simd-fma.clif +++ b/cranelift/filetests/filetests/runtests/simd-fma.clif @@ -1,6 +1,7 @@ test interpret test run target x86_64 has_avx has_fma +target x86_64 has_avx=false has_fma=false target aarch64 function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
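
A quick aside on why the commit message distinguishes guaranteed FMA semantics from multiply-then-add: a fused multiply-add rounds once, while a separate multiply and add round twice. Below is a minimal standalone Rust sketch of that difference (illustrative only, not taken from this patch); it relies on `f64::mul_add`, which the standard library documents as computing the result with a single rounding.

fn main() {
    // Inputs chosen so the intermediate product `a * b` loses its low-order
    // bits when rounded on its own, but keeps them under a fused operation.
    let a = 1.0_f64 + f64::EPSILON;       // 1 + 2^-52
    let b = 1.0_f64 - f64::EPSILON / 2.0; // 1 - 2^-53
    let c = -1.0_f64;

    let unfused = a * b + c;     // two roundings: the product rounds to 1.0, so this is 0.0
    let fused = a.mul_add(b, c); // one rounding: the ~2^-53 residue survives

    println!("multiply-then-add: {unfused:e}");
    println!("fused mul_add:     {fused:e}");
    assert_ne!(unfused, fused);
}

That gap is why a producer that must guarantee CLIF `fma` semantics on hardware without the `fma` feature ends up paying for the libcall fallback added above, while a producer that only needs relaxed semantics can lower to a multiply followed by an add instead.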