From 90fd859f51d7a77ccb7978804af00c847e8e4d6d Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 27 Feb 2020 15:19:37 -0500
Subject: [PATCH] [x86] use instruction-level fast-math-flags to drive
 MachineCombiner

The code changes here are hopefully straightforward (a condensed sketch of
the new flag checks appears at the end of this message):

1. Use MachineInstr flags to decide if FP ops can be reassociated (use both
   "reassoc" and "nsz" to be consistent with IR transforms; we probably don't
   need "nsz", but that's a safer interpretation of the FMF).

2. Check that both nodes allow reassociation before changing instructions.
   This is a stronger requirement than we have usually implemented in IR/DAG,
   but it is needed to solve the motivating bug (see below), and it seems
   unlikely to impede optimization at this late stage.

3. Intersect/propagate MachineIR flags to enable further reassociation in
   MachineCombiner.

We managed to make MachineCombiner flexible enough that no changes are needed
to that pass itself, so this patch should only affect x86 (assuming no other
targets have implemented the hooks using MachineIR flags yet).

The motivating example in PR43609 is another case of fast-math transforms
interacting badly with special FP ops created during lowering:
https://bugs.llvm.org/show_bug.cgi?id=43609

The special fadd ops used for converting int to FP assume that they will not
be altered, so they are created without FMF. However, the MachineCombiner
pass was being enabled for FP ops based on the global/function-level
TargetOption "UnsafeFPMath". Instruction/node-level FMF has been plumbed all
the way down to MachineIR over the last 1-2 years, so we can do better now.

The test diffs require some explanation:

1. llvm/test/CodeGen/X86/fmf-flags.ll - no target option for unsafe math was
   specified here, so MachineCombiner kicks in where it did not previously;
   to make it behave consistently, we need to specify a CPU schedule model,
   so use the default model, and there are no code diffs.

2. llvm/test/CodeGen/X86/machine-combiner.ll - replace the target option for
   unsafe math with the equivalent IR-level flags, and there are no code
   diffs; we can't remove the NaN/nsz options because those are still used to
   drive x86 fmin/fmax codegen (special SDAG opcodes).

3. llvm/test/CodeGen/X86/pow.ll - similar to #1.

4. llvm/test/CodeGen/X86/sqrt-fastmath.ll - similar to #1, but MachineCombiner
   does some reassociation of the estimate sequence ops; presumably these are
   perf wins based on latency/throughput (and we get some reduction of move
   instructions too); I'm not sure how it affects numerical accuracy, but the
   test reflects reality better now because we would expect MachineCombiner
   to be enabled if the IR was generated via something like "-ffast-math"
   with clang.

5. llvm/test/CodeGen/X86/vec_int_to_fp.ll - this is the test added to model
   PR43609; the fadds are not reassociated now, so we should get the expected
   results.

6. llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll - similar to #1.

7. llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll - similar to #1.
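For the mechanics at a glance, here is a condensed sketch of the flag handling
described in points 1-3 above. It is illustrative only: the free-standing
helper names ("canReassociateFP", "propagateIntersectedFlags") are made up for
this note, and the actual changes are to the isAssociativeAndCommutative(),
hasReassociableSibling(), and setSpecialOperandAttr() hooks in the diff below.

    // Condensed sketch, not the literal patch; helper names are hypothetical.
    #include <cstdint>
    #include "llvm/CodeGen/MachineInstr.h"
    using namespace llvm;

    // Gate reassociation on per-instruction FMF ("reassoc" + "nsz") instead
    // of the function-level TargetOptions::UnsafeFPMath bit, and require the
    // flags on *both* instructions of the candidate pair.
    static bool canReassociateFP(const MachineInstr &MI1,
                                 const MachineInstr &MI2) {
      auto HasFMF = [](const MachineInstr &MI) {
        return MI.getFlag(MachineInstr::MIFlag::FmReassoc) &&
               MI.getFlag(MachineInstr::MIFlag::FmNsz);
      };
      return HasFMF(MI1) && HasFMF(MI2);
    }

    // When MachineCombiner rewrites the pair, keep only the flags common to
    // both originals and drop poison-generating flags that may no longer be
    // valid for the reassociated form.
    static void propagateIntersectedFlags(const MachineInstr &OldMI1,
                                          const MachineInstr &OldMI2,
                                          MachineInstr &NewMI) {
      uint16_t Intersected = OldMI1.getFlags() & OldMI2.getFlags();
      NewMI.setFlags(Intersected);
      NewMI.clearFlag(MachineInstr::MIFlag::NoSWrap);
      NewMI.clearFlag(MachineInstr::MIFlag::NoUWrap);
      NewMI.clearFlag(MachineInstr::MIFlag::IsExact);
    }

In the real hook the same intersection is applied to both new instructions,
which is what lets the flags keep flowing so MachineCombiner can continue
reassociating longer chains (point 3).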
Differential Revision: https://reviews.llvm.org/D74851
---
 llvm/lib/CodeGen/TargetInstrInfo.cpp        |   9 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp        |  17 +-
 llvm/test/CodeGen/X86/fmf-flags.ll          |   2 +-
 llvm/test/CodeGen/X86/machine-combiner.ll   | 158 +++++++++---------
 llvm/test/CodeGen/X86/pow.ll                |   2 +-
 llvm/test/CodeGen/X86/sqrt-fastmath.ll      | 155 ++++++++---------
 llvm/test/CodeGen/X86/vec_int_to_fp.ll      |  48 +++---
 .../CodeGen/X86/vector-reduce-fadd-fast.ll  |  14 +-
 .../CodeGen/X86/vector-reduce-fmul-fast.ll  |  12 +-
 9 files changed, 210 insertions(+), 207 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 77ca87b48fafc4..dbf11683e98ccd 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -699,10 +699,13 @@ bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
     std::swap(MI1, MI2);
 
   // 1. The previous instruction must be the same type as Inst.
-  // 2. The previous instruction must have virtual register definitions for its
+  // 2. The previous instruction must also be associative/commutative (this can
+  //    be different even for instructions with the same opcode if traits like
+  //    fast-math-flags are included).
+  // 3. The previous instruction must have virtual register definitions for its
   //    operands in the same basic block as Inst.
-  // 3. The previous instruction's result must only be used by Inst.
-  return MI1->getOpcode() == AssocOpcode &&
+  // 4. The previous instruction's result must only be used by Inst.
+  return MI1->getOpcode() == AssocOpcode && isAssociativeAndCommutative(*MI1) &&
          hasReassociableOperands(*MI1, MBB) &&
          MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
 }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ef019e05a7212d..a133641dc6a194 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7657,7 +7657,8 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
   case X86::VMULSSrr:
   case X86::VMULSDZrr:
   case X86::VMULSSZrr:
-    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
   default:
     return false;
   }
@@ -7843,6 +7844,20 @@ void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
                                          MachineInstr &OldMI2,
                                          MachineInstr &NewMI1,
                                          MachineInstr &NewMI2) const {
+  // Propagate FP flags from the original instructions.
+  // But clear poison-generating flags because those may not be valid now.
+  // TODO: There should be a helper function for copying only fast-math-flags.
+  uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+  NewMI1.setFlags(IntersectedFlags);
+  NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+  NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+  NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+  NewMI2.setFlags(IntersectedFlags);
+  NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+  NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+  NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+
   // Integer instructions may define an implicit EFLAGS dest register operand.
MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS); MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS); diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll index b6893e6e60d1b2..6a938bf28b23b6 100644 --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -check-prefix=X64 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86 declare float @llvm.sqrt.f32(float %x); diff --git a/llvm/test/CodeGen/X86/machine-combiner.ll b/llvm/test/CodeGen/X86/machine-combiner.ll index 22da63a083fc9f..cab9c86d7d0b37 100644 --- a/llvm/test/CodeGen/X86/machine-combiner.ll +++ b/llvm/test/CodeGen/X86/machine-combiner.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=AVX,AVX512 ; Incremental updates of the instruction depths should be enough for this test ; case. 
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512vl -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=sse -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=avx -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=avx512vl -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefixes=AVX,AVX512 ; Verify that the first two adds are independent regardless of how the inputs are ; commuted. The destination registers are used as source registers for the third add. @@ -26,9 +26,9 @@ define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) { ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fadd float %x0, %x1 - %t1 = fadd float %t0, %x2 - %t2 = fadd float %t1, %x3 + %t0 = fadd reassoc nsz float %x0, %x1 + %t1 = fadd reassoc nsz float %t0, %x2 + %t2 = fadd reassoc nsz float %t1, %x3 ret float %t2 } @@ -46,9 +46,9 @@ define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) { ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fadd float %x0, %x1 - %t1 = fadd float %x2, %t0 - %t2 = fadd float %t1, %x3 + %t0 = fadd reassoc nsz float %x0, %x1 + %t1 = fadd reassoc nsz float %x2, %t0 + %t2 = fadd reassoc nsz float %t1, %x3 ret float %t2 } @@ -66,9 +66,9 @@ define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) { ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fadd float %x0, %x1 - %t1 = fadd float %t0, %x2 - %t2 = fadd float %x3, %t1 + %t0 = fadd reassoc nsz float %x0, %x1 + %t1 = fadd reassoc nsz float %t0, %x2 + %t2 = fadd reassoc nsz float %x3, %t1 ret float %t2 } @@ -86,9 +86,9 @@ define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) { ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fadd float %x0, %x1 - %t1 = fadd float %x2, %t0 - %t2 = fadd float %x3, %t1 + %t0 = fadd reassoc nsz float %x0, %x1 + %t1 = fadd reassoc nsz float %x2, %t0 + %t2 = fadd reassoc nsz float %x3, %t1 ret float %t2 } @@ -117,13 +117,13 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm7, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fadd float %x0, %x1 - %t1 = fadd float %t0, %x2 - %t2 = fadd float %t1, %x3 - %t3 = fadd float %t2, %x4 - %t4 = fadd float %t3, %x5 - %t5 = fadd float %t4, %x6 - %t6 = fadd float %t5, %x7 + %t0 = fadd reassoc nsz float %x0, %x1 + %t1 = fadd 
reassoc nsz float %t0, %x2 + %t2 = fadd reassoc nsz float %t1, %x3 + %t3 = fadd reassoc nsz float %t2, %x4 + %t4 = fadd reassoc nsz float %t3, %x5 + %t5 = fadd reassoc nsz float %t4, %x6 + %t6 = fadd reassoc nsz float %t5, %x7 ret float %t6 } @@ -146,9 +146,9 @@ define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) { ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv float %x0, %x1 - %t1 = fadd float %x2, %t0 - %t2 = fadd float %x3, %t1 + %t0 = fdiv reassoc nsz float %x0, %x1 + %t1 = fadd reassoc nsz float %x2, %t0 + %t2 = fadd reassoc nsz float %x3, %t1 ret float %t2 } @@ -168,9 +168,9 @@ define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) { ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv float %x0, %x1 - %t1 = fmul float %x2, %t0 - %t2 = fmul float %x3, %t1 + %t0 = fdiv reassoc nsz float %x0, %x1 + %t1 = fmul reassoc nsz float %x2, %t0 + %t2 = fmul reassoc nsz float %x3, %t1 ret float %t2 } @@ -190,9 +190,9 @@ define double @reassociate_adds_double(double %x0, double %x1, double %x2, doubl ; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv double %x0, %x1 - %t1 = fadd double %x2, %t0 - %t2 = fadd double %x3, %t1 + %t0 = fdiv reassoc nsz double %x0, %x1 + %t1 = fadd reassoc nsz double %x2, %t0 + %t2 = fadd reassoc nsz double %x3, %t1 ret double %t2 } @@ -212,9 +212,9 @@ define double @reassociate_muls_double(double %x0, double %x1, double %x2, doubl ; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv double %x0, %x1 - %t1 = fmul double %x2, %t0 - %t2 = fmul double %x3, %t1 + %t0 = fdiv reassoc nsz double %x0, %x1 + %t1 = fmul reassoc nsz double %x2, %t0 + %t2 = fmul reassoc nsz double %x3, %t1 ret double %t2 } @@ -240,9 +240,9 @@ define <4 x float> @reassociate_adds_v4f32(<4 x float> %x0, <4 x float> %x1, <4 ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX512-NEXT: retq - %t0 = fmul <4 x float> %x0, %x1 - %t1 = fadd <4 x float> %x2, %t0 - %t2 = fadd <4 x float> %x3, %t1 + %t0 = fmul reassoc nsz <4 x float> %x0, %x1 + %t1 = fadd reassoc nsz <4 x float> %x2, %t0 + %t2 = fadd reassoc nsz <4 x float> %x3, %t1 ret <4 x float> %t2 } @@ -268,9 +268,9 @@ define <2 x double> @reassociate_adds_v2f64(<2 x double> %x0, <2 x double> %x1, ; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512-NEXT: retq - %t0 = fmul <2 x double> %x0, %x1 - %t1 = fadd <2 x double> %x2, %t0 - %t2 = fadd <2 x double> %x3, %t1 + %t0 = fmul reassoc nsz <2 x double> %x0, %x1 + %t1 = fadd reassoc nsz <2 x double> %x2, %t0 + %t2 = fadd reassoc nsz <2 x double> %x3, %t1 ret <2 x double> %t2 } @@ -290,9 +290,9 @@ define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 ; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fadd <4 x float> %x0, %x1 - %t1 = fmul <4 x float> %x2, %t0 - %t2 = fmul <4 x float> %x3, %t1 + %t0 = fadd reassoc nsz <4 x float> %x0, %x1 + %t1 = fmul reassoc nsz <4 x float> %x2, %t0 + %t2 = fmul reassoc nsz <4 x float> %x3, %t1 ret <4 x float> %t2 } @@ -312,9 +312,9 @@ define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1, ; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - 
%t0 = fadd <2 x double> %x0, %x1 - %t1 = fmul <2 x double> %x2, %t0 - %t2 = fmul <2 x double> %x3, %t1 + %t0 = fadd reassoc nsz <2 x double> %x0, %x1 + %t1 = fmul reassoc nsz <2 x double> %x2, %t0 + %t2 = fmul reassoc nsz <2 x double> %x3, %t1 ret <2 x double> %t2 } @@ -343,9 +343,9 @@ define <8 x float> @reassociate_adds_v8f32(<8 x float> %x0, <8 x float> %x1, <8 ; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; AVX512-NEXT: retq - %t0 = fmul <8 x float> %x0, %x1 - %t1 = fadd <8 x float> %x2, %t0 - %t2 = fadd <8 x float> %x3, %t1 + %t0 = fmul reassoc nsz <8 x float> %x0, %x1 + %t1 = fadd reassoc nsz <8 x float> %x2, %t0 + %t2 = fadd reassoc nsz <8 x float> %x3, %t1 ret <8 x float> %t2 } @@ -374,9 +374,9 @@ define <4 x double> @reassociate_adds_v4f64(<4 x double> %x0, <4 x double> %x1, ; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ; AVX512-NEXT: retq - %t0 = fmul <4 x double> %x0, %x1 - %t1 = fadd <4 x double> %x2, %t0 - %t2 = fadd <4 x double> %x3, %t1 + %t0 = fmul reassoc nsz <4 x double> %x0, %x1 + %t1 = fadd reassoc nsz <4 x double> %x2, %t0 + %t2 = fadd reassoc nsz <4 x double> %x3, %t1 ret <4 x double> %t2 } @@ -399,9 +399,9 @@ define <8 x float> @reassociate_muls_v8f32(<8 x float> %x0, <8 x float> %x1, <8 ; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm1 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq - %t0 = fadd <8 x float> %x0, %x1 - %t1 = fmul <8 x float> %x2, %t0 - %t2 = fmul <8 x float> %x3, %t1 + %t0 = fadd reassoc nsz <8 x float> %x0, %x1 + %t1 = fmul reassoc nsz <8 x float> %x2, %t0 + %t2 = fmul reassoc nsz <8 x float> %x3, %t1 ret <8 x float> %t2 } @@ -424,9 +424,9 @@ define <4 x double> @reassociate_muls_v4f64(<4 x double> %x0, <4 x double> %x1, ; AVX-NEXT: vmulpd %ymm3, %ymm2, %ymm1 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq - %t0 = fadd <4 x double> %x0, %x1 - %t1 = fmul <4 x double> %x2, %t0 - %t2 = fmul <4 x double> %x3, %t1 + %t0 = fadd reassoc nsz <4 x double> %x0, %x1 + %t1 = fmul reassoc nsz <4 x double> %x2, %t0 + %t2 = fmul reassoc nsz <4 x double> %x3, %t1 ret <4 x double> %t2 } @@ -464,9 +464,9 @@ define <16 x float> @reassociate_adds_v16f32(<16 x float> %x0, <16 x float> %x1, ; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: vaddps %zmm0, %zmm3, %zmm0 ; AVX512-NEXT: retq - %t0 = fmul <16 x float> %x0, %x1 - %t1 = fadd <16 x float> %x2, %t0 - %t2 = fadd <16 x float> %x3, %t1 + %t0 = fmul reassoc nsz <16 x float> %x0, %x1 + %t1 = fadd reassoc nsz <16 x float> %x2, %t0 + %t2 = fadd reassoc nsz <16 x float> %x3, %t1 ret <16 x float> %t2 } @@ -504,9 +504,9 @@ define <8 x double> @reassociate_adds_v8f64(<8 x double> %x0, <8 x double> %x1, ; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; AVX512-NEXT: retq - %t0 = fmul <8 x double> %x0, %x1 - %t1 = fadd <8 x double> %x2, %t0 - %t2 = fadd <8 x double> %x3, %t1 + %t0 = fmul reassoc nsz <8 x double> %x0, %x1 + %t1 = fadd reassoc nsz <8 x double> %x2, %t0 + %t2 = fadd reassoc nsz <8 x double> %x3, %t1 ret <8 x double> %t2 } @@ -545,9 +545,9 @@ define <16 x float> @reassociate_muls_v16f32(<16 x float> %x0, <16 x float> %x1, ; AVX512-NEXT: vmulps %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq - %t0 = fadd <16 x float> %x0, %x1 - %t1 = fmul <16 x float> %x2, %t0 - %t2 = fmul <16 x float> %x3, %t1 + %t0 = fadd reassoc nsz <16 x float> %x0, %x1 + %t1 = fmul reassoc nsz <16 x 
float> %x2, %t0 + %t2 = fmul reassoc nsz <16 x float> %x3, %t1 ret <16 x float> %t2 } @@ -586,9 +586,9 @@ define <8 x double> @reassociate_muls_v8f64(<8 x double> %x0, <8 x double> %x1, ; AVX512-NEXT: vmulpd %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq - %t0 = fadd <8 x double> %x0, %x1 - %t1 = fmul <8 x double> %x2, %t0 - %t2 = fmul <8 x double> %x3, %t1 + %t0 = fadd reassoc nsz <8 x double> %x0, %x1 + %t1 = fmul reassoc nsz <8 x double> %x2, %t0 + %t2 = fmul reassoc nsz <8 x double> %x3, %t1 ret <8 x double> %t2 } @@ -1114,9 +1114,9 @@ define double @reassociate_adds_from_calls() { %x1 = call double @bar() %x2 = call double @bar() %x3 = call double @bar() - %t0 = fadd double %x0, %x1 - %t1 = fadd double %t0, %x2 - %t2 = fadd double %t1, %x3 + %t0 = fadd reassoc nsz double %x0, %x1 + %t1 = fadd reassoc nsz double %t0, %x2 + %t2 = fadd reassoc nsz double %t1, %x3 ret double %t2 } @@ -1165,9 +1165,9 @@ define double @already_reassociated() { %x1 = call double @bar() %x2 = call double @bar() %x3 = call double @bar() - %t0 = fadd double %x0, %x1 - %t1 = fadd double %x2, %x3 - %t2 = fadd double %t0, %t1 + %t0 = fadd reassoc nsz double %x0, %x1 + %t1 = fadd reassoc nsz double %x2, %x3 + %t2 = fadd reassoc nsz double %t0, %t1 ret double %t2 } diff --git a/llvm/test/CodeGen/X86/pow.ll b/llvm/test/CodeGen/X86/pow.ll index 45600540289691..52e9ebbe852e26 100644 --- a/llvm/test/CodeGen/X86/pow.ll +++ b/llvm/test/CodeGen/X86/pow.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s declare float @llvm.pow.f32(float, float) declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index 6aad4e8f69eb9c..37e6b6954dc264 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 declare double @__sqrt_finite(double) declare float @__sqrtf_finite(float) @@ -135,8 +135,8 @@ define float @sqrtf_check_denorms(float %x) #3 { ; SSE-NEXT: mulss %xmm2, %xmm3 ; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: addss {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulss %xmm3, %xmm2 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: mulss %xmm3, %xmm2 ; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0 ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq @@ -148,8 +148,8 @@ define float @sqrtf_check_denorms(float %x) #3 { ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: 
vmulss %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -182,8 +182,8 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; SSE-NEXT: mulps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps {{.*}}(%rip), %xmm1 -; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SSE-NEXT: cmpleps %xmm0, %xmm2 ; SSE-NEXT: andps %xmm2, %xmm1 @@ -197,8 +197,8 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 ; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -211,8 +211,8 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] ; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] @@ -246,20 +246,18 @@ define float @f32_estimate(float %x) #1 { ; SSE-LABEL: f32_estimate: ; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 -; SSE-NEXT: mulss %xmm0, %xmm2 -; SSE-NEXT: addss {{.*}}(%rip), %xmm2 +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: addss {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 -; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vmulss %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -308,20 +306,18 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_estimate: ; SSE: # %bb.0: ; SSE-NEXT: rsqrtps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: mulps %xmm0, %xmm2 -; SSE-NEXT: addps {{.*}}(%rip), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: addps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v4f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 -; AVX1-NEXT: vmulps %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmulps %xmm1, %xmm0, 
%xmm0 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm0, %xmm1, %xmm0 @@ -334,7 +330,7 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 { ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX512-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) @@ -374,31 +370,27 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { define <8 x float> @v8f32_estimate(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_estimate: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm0, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm4, %xmm2 +; SSE-NEXT: rsqrtps %xmm0, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: rsqrtps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: mulps %xmm5, %xmm3 -; SSE-NEXT: mulps %xmm1, %xmm3 -; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: mulps %xmm4, %xmm3 -; SSE-NEXT: mulps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; SSE-NEXT: addps %xmm4, %xmm0 +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: addps %xmm4, %xmm1 +; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: v8f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %ymm0, %ymm1 -; AVX1-NEXT: vmulps %ymm1, %ymm1, %ymm2 -; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vmulps %ymm0, %ymm1, %ymm0 @@ -411,7 +403,7 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 { ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x) @@ -459,58 +451,51 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { define <16 x float> @v16f32_estimate(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_estimate: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: rsqrtps %xmm0, %xmm5 -; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm7 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; 
SSE-NEXT: addps %xmm7, %xmm0 -; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: rsqrtps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: addps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm4, %xmm6 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; SSE-NEXT: addps %xmm5, %xmm0 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm6 ; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: rsqrtps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: mulps %xmm5, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm7, %xmm4 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm4, %xmm6 +; SSE-NEXT: addps %xmm5, %xmm1 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: rsqrtps %xmm2, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm2 +; SSE-NEXT: mulps %xmm6, %xmm2 +; SSE-NEXT: mulps %xmm4, %xmm6 +; SSE-NEXT: addps %xmm5, %xmm2 +; SSE-NEXT: mulps %xmm6, %xmm2 +; SSE-NEXT: rsqrtps %xmm3, %xmm6 ; SSE-NEXT: mulps %xmm6, %xmm4 -; SSE-NEXT: mulps %xmm5, %xmm4 -; SSE-NEXT: rsqrtps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: mulps %xmm2, %xmm5 -; SSE-NEXT: mulps %xmm3, %xmm5 -; SSE-NEXT: addps %xmm7, %xmm5 -; SSE-NEXT: mulps %xmm6, %xmm5 -; SSE-NEXT: mulps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: mulps %xmm6, %xmm3 +; SSE-NEXT: mulps %xmm6, %xmm3 +; SSE-NEXT: addps %xmm5, %xmm3 +; SSE-NEXT: mulps %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %ymm0, %ymm2 ; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX1-NEXT: vmulps %ymm2, %ymm2, %ymm4 -; AVX1-NEXT: vmulps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vmulps %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vrsqrtps %ymm1, %ymm2 -; AVX1-NEXT: vmulps %ymm2, %ymm2, %ymm5 -; AVX1-NEXT: vmulps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vrsqrtps %ymm1, %ymm4 +; AVX1-NEXT: vmulps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: v16f32_estimate: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 5cc197c5854531..46db59e3d1eb95 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -5758,15 +5758,15 @@ define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 { ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: psrlq $32, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: subpd %xmm6, %xmm1 -; SSE2-NEXT: movapd {{.*#+}} xmm4 = 
[5.0E-1,5.0E-1] -; SSE2-NEXT: addpd %xmm4, %xmm0 -; SSE2-NEXT: addpd %xmm3, %xmm0 -; SSE2-NEXT: addpd %xmm4, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm2, %xmm1 ; SSE2-NEXT: movupd %xmm0, (%rdi) ; SSE2-NEXT: movupd %xmm1, 16(%rdi) @@ -5786,15 +5786,15 @@ define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 { ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; SSE41-NEXT: por %xmm4, %xmm2 ; SSE41-NEXT: psrlq $32, %xmm1 ; SSE41-NEXT: por %xmm5, %xmm1 ; SSE41-NEXT: subpd %xmm6, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1] -; SSE41-NEXT: addpd %xmm4, %xmm0 -; SSE41-NEXT: addpd %xmm3, %xmm0 -; SSE41-NEXT: addpd %xmm4, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; SSE41-NEXT: addpd %xmm2, %xmm0 ; SSE41-NEXT: addpd %xmm2, %xmm1 ; SSE41-NEXT: movupd %xmm0, (%rdi) ; SSE41-NEXT: movupd %xmm1, 16(%rdi) @@ -5812,16 +5812,16 @@ define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 { ; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX1-NEXT: vsubpd %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vsubpd %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1] -; AVX1-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vaddpd %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vaddpd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX1-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovupd %xmm0, (%rdi) ; AVX1-NEXT: vmovupd %xmm1, 16(%rdi) ; AVX1-NEXT: retq @@ -5838,16 +5838,16 @@ define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 { ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1] -; AVX2-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vaddpd %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovupd %xmm0, (%rdi) ; AVX2-NEXT: vmovupd %xmm1, 16(%rdi) ; AVX2-NEXT: retq @@ -5864,16 +5864,16 @@ define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 { ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlq 
$32, %xmm1, %xmm1 ; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1] -; AVX512F-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; AVX512F-NEXT: vaddpd %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vmovupd %xmm0, (%rdi) ; AVX512F-NEXT: vmovupd %xmm1, 16(%rdi) ; AVX512F-NEXT: retq @@ -5890,16 +5890,16 @@ define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 { ; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512VL-NEXT: vsubpd %xmm6, %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 ; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1 ; AVX512VL-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512VL-NEXT: vsubpd %xmm6, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovapd {{.*#+}} xmm4 = [5.0E-1,5.0E-1] -; AVX512VL-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vaddpd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vaddpd %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512VL-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovupd %xmm0, (%rdi) ; AVX512VL-NEXT: vmovupd %xmm1, 16(%rdi) ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll index c94f584dfdb0f2..79006690190560 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; ; 
vXf32 (accum) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll index b426518d403e09..f77ef5d7c2ed39 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL ; ; vXf32 (accum)