diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 5b657fb1712968..7305e3086fcd65 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -814,6 +814,26 @@ enum NodeType {
   /// TRUNCATE - Completely drop the high bits.
   TRUNCATE,
 
+  /// TRUNCATE_[SU]SAT_[SU] - Truncate with saturation. The [SU] in the middle
+  /// (the prefix of `SAT`) indicates whether the operand of the existing
+  /// truncate is saturated as a signed or an unsigned operation. For example,
+  /// `truncate(smin(smax(x, C), C))` saturates a signed input and gives `S`,
+  /// while `truncate(umin(x, C))` saturates an unsigned input and gives `U`.
+  /// The trailing [SU] indicates whether the range the truncated values are
+  /// clamped to is the signed or the unsigned range of the destination type.
+  /// For example, if `truncate(smin(smax(x, C), C))` is a truncation to `i8`
+  /// and C clamps the value to the range `-128 to 127`, the value is
+  /// saturated against signed values, giving `S`, and the pattern combines to
+  /// `TRUNCATE_SSAT_S`. If C clamps the value to the range `0 to 255`, the
+  /// value is saturated against unsigned values, giving `U`, and the pattern
+  /// combines to `TRUNCATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if
+  /// C clamps the value to the range `0 to 255`, it is saturated for unsigned
+  /// values, giving `U`, and the pattern combines to `TRUNCATE_USAT_U`.
+  TRUNCATE_SSAT_S, // saturate signed input to signed result -
+                   // truncate(smin(smax(x, C), C))
+  TRUNCATE_SSAT_U, // saturate signed input to unsigned result -
+                   // truncate(smin(smax(x, 0), C))
+  TRUNCATE_USAT_U, // saturate unsigned input to unsigned result -
+                   // truncate(umin(x, C))
+
   /// [SU]INT_TO_FP - These operators convert integers (whose interpreted sign
   /// depends on the first letter) to floating point.
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 46044aab79a832..92d10a94bd81e5 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -477,6 +477,9 @@ def sext : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>;
 def zext        : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>;
 def anyext      : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>;
 def trunc       : SDNode<"ISD::TRUNCATE"   , SDTIntTruncOp>;
+def truncssat_s : SDNode<"ISD::TRUNCATE_SSAT_S", SDTIntTruncOp>;
+def truncssat_u : SDNode<"ISD::TRUNCATE_SSAT_U", SDTIntTruncOp>;
+def truncusat_u : SDNode<"ISD::TRUNCATE_USAT_U", SDTIntTruncOp>;
 def bitconvert  : SDNode<"ISD::BITCAST"    , SDTUnaryOp>;
 def addrspacecast : SDNode<"ISD::ADDRSPACECAST", SDTUnaryOp>;
 def freeze      : SDNode<"ISD::FREEZE"     , SDTFreeze>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965c..640522ed938d9a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -486,6 +486,7 @@ namespace {
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitEXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitTRUNCATE(SDNode *N);
+    SDValue visitTRUNCATE_USAT_U(SDNode *N);
     SDValue visitBITCAST(SDNode *N);
     SDValue visitFREEZE(SDNode *N);
     SDValue visitBUILD_PAIR(SDNode *N);
@@ -1908,6 +1909,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::ZERO_EXTEND_VECTOR_INREG:
   case ISD::ANY_EXTEND_VECTOR_INREG: return visitEXTEND_VECTOR_INREG(N);
   case ISD::TRUNCATE:           return visitTRUNCATE(N);
+  case ISD::TRUNCATE_USAT_U:    return visitTRUNCATE_USAT_U(N);
   case ISD::BITCAST:            return visitBITCAST(N);
   case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
   case ISD::FADD:               return visitFADD(N);
@@ -13203,7 +13205,9 @@ SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
   unsigned CastOpcode = Cast->getOpcode();
   assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
           CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
-          CastOpcode == ISD::FP_ROUND) &&
+          CastOpcode == ISD::TRUNCATE_SSAT_S ||
+          CastOpcode == ISD::TRUNCATE_SSAT_U ||
+          CastOpcode == ISD::TRUNCATE_USAT_U || CastOpcode == ISD::FP_ROUND) &&
          "Unexpected opcode for vector select narrowing/widening");
 
   // We only do this transform before legal ops because the pattern may be
@@ -14915,6 +14919,132 @@ SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitTRUNCATE_USAT_U(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+
+  std::function<SDValue(SDValue)> MatchFPTOINT = [&](SDValue Val) -> SDValue {
+    if (Val.getOpcode() == ISD::FP_TO_UINT)
+      return Val;
+    return SDValue();
+  };
+
+  SDValue FPInstr = MatchFPTOINT(N0);
+  if (!FPInstr)
+    return SDValue();
+
+  EVT FPVT = FPInstr.getOperand(0).getValueType();
+  if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT,
+                                                        FPVT, VT))
+    return SDValue();
+  return DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(FPInstr), VT,
+                     FPInstr.getOperand(0),
+                     DAG.getValueType(VT.getScalarType()));
+}
+
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value x to be truncated or SDValue() if the pattern was
+/// not matched.
+///
+static SDValue detectUSatUPattern(SDValue In, EVT VT) {
+  unsigned NumDstBits = VT.getScalarSizeInBits();
+  unsigned NumSrcBits = In.getScalarValueSizeInBits();
+  // Saturation with truncation. We truncate from InVT to VT.
+  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
+
+  SDValue Min;
+  APInt UnsignedMax = APInt::getMaxValue(NumDstBits).zext(NumSrcBits);
+  if (sd_match(In, m_UMin(m_Value(Min), m_SpecificInt(UnsignedMax))))
+    return Min;
+
+  return SDValue();
+}
+
+/// Detect patterns of truncation with signed saturation:
+/// (truncate (smin (smax (x, signed_min_of_dest_type),
+///                 signed_max_of_dest_type)) to dest_type)
+/// or:
+/// (truncate (smax (smin (x, signed_max_of_dest_type),
+///                 signed_min_of_dest_type)) to dest_type).
+///
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectSSatSPattern(SDValue In, EVT VT) {
+  unsigned NumDstBits = VT.getScalarSizeInBits();
+  unsigned NumSrcBits = In.getScalarValueSizeInBits();
+  // Saturation with truncation. We truncate from InVT to VT.
+  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
+
+  SDValue Val;
+  APInt SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
+  APInt SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
+
+  if (sd_match(In, m_SMin(m_SMax(m_Value(Val), m_SpecificInt(SignedMin)),
+                          m_SpecificInt(SignedMax))))
+    return Val;
+
+  if (sd_match(In, m_SMax(m_SMin(m_Value(Val), m_SpecificInt(SignedMax)),
+                          m_SpecificInt(SignedMin))))
+    return Val;
+
+  return SDValue();
+}
+
+/// Detect patterns of truncation of a signed input with unsigned saturation,
+/// i.e. the input clamped to the range [0, unsigned_max_of_dest_type]:
+/// (truncate (smin (smax (x, 0), unsigned_max_of_dest_type)) to dest_type),
+/// including the forms with smin/smax reversed and with umin, as matched
+/// below. Return the source value x to be truncated or SDValue() if the
+/// pattern was not matched.
+static SDValue detectSSatUPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                  const SDLoc &DL) {
+  unsigned NumDstBits = VT.getScalarSizeInBits();
+  unsigned NumSrcBits = In.getScalarValueSizeInBits();
+  // Saturation with truncation. We truncate from InVT to VT.
+ assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation"); + + SDValue Val; + APInt UnsignedMax = APInt::getMaxValue(NumDstBits).zext(NumSrcBits); + // Min == 0, Max is unsigned max of destination type. + if (sd_match(In, m_SMax(m_SMin(m_Value(Val), m_SpecificInt(UnsignedMax)), + m_Zero()))) + return Val; + + if (sd_match(In, m_SMin(m_SMax(m_Value(Val), m_Zero()), + m_SpecificInt(UnsignedMax)))) + return Val; + + if (sd_match(In, m_UMin(m_SMax(m_Value(Val), m_Zero()), + m_SpecificInt(UnsignedMax)))) + return Val; + + return SDValue(); +} + +static SDValue foldToSaturated(SDNode *N, EVT &VT, SDValue &Src, EVT &SrcVT, + SDLoc &DL, const TargetLowering &TLI, + SelectionDAG &DAG) { + auto AllowedTruncateSat = [&](unsigned Opc, EVT SrcVT, EVT VT) -> bool { + return (TLI.isOperationLegalOrCustom(Opc, SrcVT) && + TLI.isTypeDesirableForOp(Opc, VT)); + }; + + if (Src.getOpcode() == ISD::SMIN || Src.getOpcode() == ISD::SMAX) { + if (AllowedTruncateSat(ISD::TRUNCATE_SSAT_S, SrcVT, VT)) + if (SDValue SSatVal = detectSSatSPattern(Src, VT)) + return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, VT, SSatVal); + if (AllowedTruncateSat(ISD::TRUNCATE_SSAT_U, SrcVT, VT)) + if (SDValue SSatVal = detectSSatUPattern(Src, VT, DAG, DL)) + return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, VT, SSatVal); + } else if (Src.getOpcode() == ISD::UMIN) { + if (AllowedTruncateSat(ISD::TRUNCATE_SSAT_U, SrcVT, VT)) + if (SDValue SSatVal = detectSSatUPattern(Src, VT, DAG, DL)) + return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, VT, SSatVal); + if (AllowedTruncateSat(ISD::TRUNCATE_USAT_U, SrcVT, VT)) + if (SDValue USatVal = detectUSatUPattern(Src, VT)) + return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, VT, USatVal); + } + + return SDValue(); +} + SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -14930,6 +15060,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0)); + // fold saturated truncate + if (SDValue SaturatedTR = foldToSaturated(N, VT, N0, SrcVT, DL, TLI, DAG)) + return SaturatedTR; + // fold (truncate c1) -> c1 if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, VT, {N0})) return C; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 16fc52caebb757..46e8e54ee4ed7d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -380,6 +380,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SIGN_EXTEND_VECTOR_INREG: return "sign_extend_vector_inreg"; case ISD::ZERO_EXTEND_VECTOR_INREG: return "zero_extend_vector_inreg"; case ISD::TRUNCATE: return "truncate"; + case ISD::TRUNCATE_SSAT_S: return "truncate_ssat_s"; + case ISD::TRUNCATE_SSAT_U: return "truncate_ssat_u"; + case ISD::TRUNCATE_USAT_U: return "truncate_usat_u"; case ISD::FP_ROUND: return "fp_round"; case ISD::STRICT_FP_ROUND: return "strict_fp_round"; case ISD::FP_EXTEND: return "fp_extend"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 6ca9955993d242..149b5dabee0565 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -753,6 +753,11 @@ void TargetLoweringBase::initActions() { // Absolute difference setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand); + // Saturated trunc + setOperationAction(ISD::TRUNCATE_SSAT_S, VT, 
Expand);
+    setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Expand);
+    setOperationAction(ISD::TRUNCATE_USAT_U, VT, Expand);
+
     // These default to Expand so they will be expanded to CTLZ/CTTZ by default.
     setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
                        Expand);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d86e52d49000ae..2ebe2ff2712d6d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1410,6 +1410,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
+  for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+    setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Legal);
+    setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Legal);
+    setOperationAction(ISD::TRUNCATE_USAT_U, VT, Legal);
+  }
+
   if (Subtarget->hasSME()) {
     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   }
@@ -28730,6 +28736,18 @@ bool AArch64TargetLowering::hasInlineStackProbe(
          MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
 }
 
+bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  switch (Opc) {
+  case ISD::TRUNCATE_SSAT_S:
+  case ISD::TRUNCATE_SSAT_U:
+  case ISD::TRUNCATE_USAT_U:
+    if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
+      return true;
+  }
+
+  return TargetLowering::isTypeDesirableForOp(Opc, VT);
+}
+
 #ifndef NDEBUG
 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
   switch (N->getOpcode()) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 81e15185f985d5..50e26612ac863e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -743,6 +743,11 @@ class AArch64TargetLowering : public TargetLowering {
   bool generateFMAsInMachineCombiner(EVT VT,
                                      CodeGenOptLevel OptLevel) const override;
 
+  /// Return true if the target has native support for the specified value
+  /// type and it is 'desirable' to use the type for the given node type.
+  bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
   const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
   ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768a..9ed8d3ee3dcee0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5418,64 +5418,75 @@ def VImm7FFF: PatLeaf<(AArch64movi_msl (i32 127), (i32 264))>;
 def VImm8000: PatLeaf<(AArch64mvni_msl (i32 127), (i32 264))>;
 
 // trunc(umin(X, 255)) -> UQXTRN v8i8
-def : Pat<(v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))),
+def : Pat<(v8i8 (truncusat_u (v8i16 V128:$Vn))),
           (UQXTNv8i8 V128:$Vn)>;
 // trunc(umin(X, 65535)) -> UQXTRN v4i16
-def : Pat<(v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))),
+def : Pat<(v4i16 (truncusat_u (v4i32 V128:$Vn))),
           (UQXTNv4i16 V128:$Vn)>;
+// trunc(umin(X, 4294967295)) -> UQXTRN v2i32
+def : Pat<(v2i32 (truncusat_u (v2i64 V128:$Vn))),
+          (UQXTNv2i32 V128:$Vn)>;
 // trunc(smin(smax(X, -128), 128)) -> SQXTRN
-// with reversed min/max
-def : Pat<(v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)),
-                             (v8i16 VImm7F)))),
-          (SQXTNv8i8 V128:$Vn)>;
-def : Pat<(v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)),
-                             (v8i16 VImm80)))),
+def : Pat<(v8i8 (truncssat_s (v8i16 V128:$Vn))),
           (SQXTNv8i8 V128:$Vn)>;
 // trunc(smin(smax(X, -32768), 32767)) -> SQXTRN
-// with reversed min/max
-def : Pat<(v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)),
-                              (v4i32 VImm7FFF)))),
-          (SQXTNv4i16 V128:$Vn)>;
-def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)),
-                              (v4i32 VImm8000)))),
+def : Pat<(v4i16 (truncssat_s (v4i32 V128:$Vn))),
           (SQXTNv4i16 V128:$Vn)>;
-
-// concat_vectors(Vd, trunc(umin(X, 255))) -> UQXTRN(Vd, Vn)
+// trunc(smin(smax(X, -2147483648), 2147483647)) -> SQXTRN
+def : Pat<(v2i32 (truncssat_s (v2i64 V128:$Vn))),
+          (SQXTNv2i32 V128:$Vn)>;
+// trunc(umin(smax(X, 0), 255)) -> SQXTUN
+def : Pat<(v8i8 (truncssat_u (v8i16 V128:$Vn))),
+          (SQXTUNv8i8 V128:$Vn)>;
+// trunc(umin(smax(X, 0), 65535)) -> SQXTUN
+def : Pat<(v4i16 (truncssat_u (v4i32 V128:$Vn))),
+          (SQXTUNv4i16 V128:$Vn)>;
+// trunc(umin(smax(X, 0), 4294967295)) -> SQXTUN
+def : Pat<(v2i32 (truncssat_u (v2i64 V128:$Vn))),
+          (SQXTUNv2i32 V128:$Vn)>;
+
+// truncusat_u
+// concat_vectors(Vd, truncusat_u(Vn)) ~> UQXTRN(Vd, Vn)
 def : Pat<(v16i8 (concat_vectors
                  (v8i8 V64:$Vd),
-                 (v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))))),
+                 (v8i8 (truncusat_u (v8i16 V128:$Vn))))),
           (UQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-// concat_vectors(Vd, trunc(umin(X, 65535))) -> UQXTRN(Vd, Vn)
 def : Pat<(v8i16 (concat_vectors
                  (v4i16 V64:$Vd),
-                 (v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))))),
+                 (v4i16 (truncusat_u (v4i32 V128:$Vn))))),
           (UQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
+def : Pat<(v4i32 (concat_vectors
+                 (v2i32 V64:$Vd),
+                 (v2i32 (truncusat_u (v2i64 V128:$Vn))))),
+          (UQXTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
 
-// concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn)
-// with reversed min/max
+// concat_vectors(Vd, truncssat_s(Vn)) ~> SQXTN2(Vd, Vn)
 def : Pat<(v16i8 (concat_vectors
                  (v8i8 V64:$Vd),
-                 (v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)),
-                                    (v8i16 VImm7F)))))),
+                 (v8i8 (truncssat_s (v8i16 V128:$Vn))))),
           (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-def :
Pat<(v16i8 (concat_vectors - (v8i8 V64:$Vd), - (v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), - (v8i16 VImm80)))))), - (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - -// concat_vectors(Vd, trunc(smin(smax Vm, -32768), 32767) ~> SQXTN2(Vd, Vn) -// with reversed min/max def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Vd), - (v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), - (v4i32 VImm7FFF)))))), + (v4i16 (truncssat_s (v4i32 V128:$Vn))))), (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v4i32 (concat_vectors + (v2i32 V64:$Vd), + (v2i32 (truncssat_s (v2i64 V128:$Vn))))), + (SQXTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + +// concat_vectors(Vd, truncssat_u(Vn)) ~> SQXTUN2(Vd, Vn) +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (truncssat_u (v8i16 V128:$Vn))))), + (SQXTUNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Vd), - (v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), - (v4i32 VImm8000)))))), - (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + (v4i16 (truncssat_u (v4i32 V128:$Vn))))), + (SQXTUNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v4i32 (concat_vectors + (v2i32 V64:$Vd), + (v2i32 (truncssat_u (v2i64 V128:$Vn))))), + (SQXTUNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; // Select BSWAP vector instructions into REV instructions def : Pat<(v4i16 (bswap (v4i16 V64:$Rn))), diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d40d4997d76149..704caeab90bb6e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -853,7 +853,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL" // nodes which truncate by one power of two at a time. - setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction({ISD::TRUNCATE, ISD::TRUNCATE_SSAT_S, + ISD::TRUNCATE_SSAT_U, ISD::TRUNCATE_USAT_U}, + VT, Custom); // Custom-lower insert/extract operations to simplify patterns. 
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, @@ -1168,7 +1170,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction({ISD::TRUNCATE, ISD::TRUNCATE_SSAT_S, + ISD::TRUNCATE_SSAT_U, ISD::TRUNCATE_USAT_U}, + VT, Custom); setOperationAction(ISD::BITCAST, VT, Custom); @@ -6395,6 +6399,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getNode(RISCVISD::BREV8, DL, VT, BSwap); } case ISD::TRUNCATE: + case ISD::TRUNCATE_SSAT_S: + case ISD::TRUNCATE_SSAT_U: + case ISD::TRUNCATE_USAT_U: // Only custom-lower vector truncates if (!Op.getSimpleValueType().isVector()) return Op; @@ -8234,7 +8241,8 @@ SDValue RISCVTargetLowering::lowerVectorMaskTruncLike(SDValue Op, SDValue RISCVTargetLowering::lowerVectorTruncLike(SDValue Op, SelectionDAG &DAG) const { - bool IsVPTrunc = Op.getOpcode() == ISD::VP_TRUNCATE; + unsigned Opc = Op.getOpcode(); + bool IsVPTrunc = Opc == ISD::VP_TRUNCATE; SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -8279,11 +8287,18 @@ SDValue RISCVTargetLowering::lowerVectorTruncLike(SDValue Op, getDefaultVLOps(SrcVT, ContainerVT, DL, DAG, Subtarget); } + unsigned NewOpc; + if (Opc == ISD::TRUNCATE_SSAT_S) + NewOpc = RISCVISD::TRUNCATE_VECTOR_VL_SSAT; + else if (Opc == ISD::TRUNCATE_SSAT_U || Opc == ISD::TRUNCATE_USAT_U) + NewOpc = RISCVISD::TRUNCATE_VECTOR_VL_USAT; + else + NewOpc = RISCVISD::TRUNCATE_VECTOR_VL; + do { SrcEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2); MVT ResultVT = ContainerVT.changeVectorElementType(SrcEltVT); - Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, ResultVT, Result, - Mask, VL); + Result = DAG.getNode(NewOpc, DL, ResultVT, Result, Mask, VL); } while (SrcEltVT != DstEltVT); if (SrcVT.isFixedLengthVector()) diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll index 0138bef9c38454..9157bcba59e9bb 100644 --- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll @@ -7,12 +7,8 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-LABEL: stest_f64i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzs w8, d0 -; CHECK-NEXT: fcvtzs w9, d1 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: sqxtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -45,12 +41,8 @@ entry: define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzu w8, d0 -; CHECK-NEXT: fcvtzu w9, d1 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: sqxtun v0.2s, v0.2d ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -242,8 +234,8 @@ entry: define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: sqxtun v0.4h, v0.4s ; CHECK-NEXT: ret entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -308,10 +300,10 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h ; 
CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtzu v2.4s, v0.4s -; CHECK-CVT-NEXT: uqxtn v0.4h, v1.4s -; CHECK-CVT-NEXT: uqxtn2 v0.8h, v2.4s +; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-NEXT: fcvtzs v2.4s, v0.4s +; CHECK-CVT-NEXT: sqxtun v0.4h, v1.4s +; CHECK-CVT-NEXT: sqxtun2 v0.8h, v2.4s ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: ustest_f16i16: @@ -656,12 +648,8 @@ entry: define <2 x i32> @stest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: stest_f64i32_mm: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzs w8, d0 -; CHECK-NEXT: fcvtzs w9, d1 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: sqxtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -691,12 +679,8 @@ entry: define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32_mm: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzu w8, d0 -; CHECK-NEXT: fcvtzu w9, d1 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: sqxtun v0.2s, v0.2d ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -868,8 +852,8 @@ entry: define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i16_mm: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: sqxtun v0.4h, v0.4s ; CHECK-NEXT: ret entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -929,10 +913,10 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h ; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtzu v2.4s, v0.4s -; CHECK-CVT-NEXT: uqxtn v0.4h, v1.4s -; CHECK-CVT-NEXT: uqxtn2 v0.8h, v2.4s +; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-NEXT: fcvtzs v2.4s, v0.4s +; CHECK-CVT-NEXT: sqxtun v0.4h, v1.4s +; CHECK-CVT-NEXT: sqxtun2 v0.8h, v2.4s ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: ustest_f16i16_mm: diff --git a/llvm/test/CodeGen/AArch64/qmovn.ll b/llvm/test/CodeGen/AArch64/qmovn.ll index 35c172adbad3d8..2685ea9fb5d202 100644 --- a/llvm/test/CodeGen/AArch64/qmovn.ll +++ b/llvm/test/CodeGen/AArch64/qmovn.ll @@ -84,15 +84,7 @@ entry: define <2 x i32> @vqmovni64_smaxmin(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_smaxmin: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: mov x8, #-2147483648 // =0xffffffff80000000 -; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: sqxtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: %c1 = icmp slt <2 x i64> %s0, @@ -106,15 +98,7 @@ entry: define <2 x i32> @vqmovni64_sminmax(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_sminmax: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, #-2147483648 // =0xffffffff80000000 -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff -; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bif v0.16b, v1.16b, 
v2.16b
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    sqxtn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %c1 = icmp sgt <2 x i64> %s0, <i64 -2147483648, i64 -2147483648>
@@ -125,14 +109,94 @@ entry:
   ret <2 x i32> %t
 }
 
+define <2 x i32> @vqmovni64_smaxmin_u(<2 x i64> %s0) {
+; CHECK-LABEL: vqmovni64_smaxmin_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqxtun v0.2s, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %c1 = icmp slt <2 x i64> %s0, <i64 4294967295, i64 4294967295>
+  %s1 = select <2 x i1> %c1, <2 x i64> %s0, <2 x i64> <i64 4294967295, i64 4294967295>
+  %c2 = icmp sgt <2 x i64> %s1, zeroinitializer
+  %s2 = select <2 x i1> %c2, <2 x i64> %s1, <2 x i64> zeroinitializer
+  %t = trunc <2 x i64> %s2 to <2 x i32>
+  ret <2 x i32> %t
+}
+
+define <2 x i32> @vqmovni64_sminmax_u(<2 x i64> %s0) {
+; CHECK-LABEL: vqmovni64_sminmax_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqxtun v0.2s, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %c1 = icmp sgt <2 x i64> %s0, zeroinitializer
+  %s1 = select <2 x i1> %c1, <2 x i64> %s0, <2 x i64> zeroinitializer
+  %c2 = icmp slt <2 x i64> %s1, <i64 4294967295, i64 4294967295>
+  %s2 = select <2 x i1> %c2, <2 x i64> %s1, <2 x i64> <i64 4294967295, i64 4294967295>
+  %t = trunc <2 x i64> %s2 to <2 x i32>
+  ret <2 x i32> %t
+}
+
+define <4 x i16> @vqmovni32_smaxmin_u(<4 x i32> %s0) {
+; CHECK-LABEL: vqmovni32_smaxmin_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqxtun v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %c1 = icmp slt <4 x i32> %s0, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %s1 = select <4 x i1> %c1, <4 x i32> %s0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+  %c2 = icmp sgt <4 x i32> %s1, zeroinitializer
+  %s2 = select <4 x i1> %c2, <4 x i32> %s1, <4 x i32> zeroinitializer
+  %t = trunc <4 x i32> %s2 to <4 x i16>
+  ret <4 x i16> %t
+}
+
+define <4 x i16> @vqmovni32_sminmax_u(<4 x i32> %s0) {
+; CHECK-LABEL: vqmovni32_sminmax_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqxtun v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %c1 = icmp sgt <4 x i32> %s0, zeroinitializer
+  %s1 = select <4 x i1> %c1, <4 x i32> %s0, <4 x i32> zeroinitializer
+  %c2 = icmp slt <4 x i32> %s1, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %s2 = select <4 x i1> %c2, <4 x i32> %s1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+  %t = trunc <4 x i32> %s2 to <4 x i16>
+  ret <4 x i16> %t
+}
+
+define <8 x i8> @vqmovni16_smaxmin_u(<8 x i16> %s0) {
+; CHECK-LABEL: vqmovni16_smaxmin_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqxtun v0.8b, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %c1 = icmp slt <8 x i16> %s0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %s1 = select <8 x i1> %c1, <8 x i16> %s0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %c2 = icmp sgt <8 x i16> %s1, zeroinitializer
+  %s2 = select <8 x i1> %c2, <8 x i16> %s1, <8 x i16> zeroinitializer
+  %t = trunc <8 x i16> %s2 to <8 x i8>
+  ret <8 x i8> %t
+}
+
+define <8 x i8> @vqmovni16_sminmax_u(<8 x i16> %s0) {
+; CHECK-LABEL: vqmovni16_sminmax_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqxtun v0.8b, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %c1 = icmp sgt <8 x i16> %s0, zeroinitializer
+  %s1 = select <8 x i1> %c1, <8 x i16> %s0, <8 x i16> zeroinitializer
+  %c2 = icmp slt <8 x i16> %s1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %s2 = select <8 x i1> %c2, <8 x i16> %s1, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %t = trunc <8 x i16> %s2 to <8 x i8>
+  ret <8 x i8> %t
+}
+
 define <2 x i32> @vqmovni64_umaxmin(<2 x i64> %s0) {
 ; CHECK-LABEL: vqmovni64_umaxmin:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT:    cmhi v1.2d, v1.2d, v0.2d
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    uqxtn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %c1 = icmp ult <2 x i64> %s0, <i64 4294967295, i64 4294967295>
@@ -174,16 +238,8 @@ define <4 x i32> @signed_minmax_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
 ; CHECK-LABEL: signed_minmax_v2i64_to_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
-; CHECK-NEXT:    cmgt v3.2d, v2.2d, v1.2d
-; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    cmgt v3.2d, v1.2d, v2.2d
-; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    sqxtn2 v0.4s, v1.2d
 ; CHECK-NEXT:    ret
 entry:
   %min = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %y, <2 x i64> <i64 2147483647, i64 2147483647>)
@@ -226,16 +282,8 @@ define <4 x i32> @signed_maxmin_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
 ; CHECK-LABEL: signed_maxmin_v2i64_to_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
-; CHECK-NEXT:    cmgt v3.2d, v1.2d, v2.2d
-; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    cmgt v3.2d, v2.2d, v1.2d
-; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    sqxtn2 v0.4s, v1.2d
 ; CHECK-NEXT:    ret
 entry:
   %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> <i64 -2147483648, i64 -2147483648>)
@@ -276,12 +324,8 @@ define <4 x i32> @unsigned_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
 ; CHECK-LABEL: unsigned_v2i64_to_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    cmhi v2.2d, v2.2d, v1.2d
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    orn v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    uqxtn2 v0.4s, v1.2d
 ; CHECK-NEXT:    ret
 entry:
   %min = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %y, <2 x i64> <i64 4294967295, i64 4294967295>)
@@ -295,12 +339,8 @@ define <16 x i8> @us_maxmin_v8i16_to_v16i8(<8 x i8> %x, <8 x i16> %y) {
 ; CHECK-LABEL: us_maxmin_v8i16_to_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEXT:    movi v3.2d, #0xff00ff00ff00ff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    smax v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    smin v1.8h, v1.8h, v3.8h
-; CHECK-NEXT:    xtn2 v0.16b, v1.8h
+; CHECK-NEXT:    sqxtun2 v0.16b, v1.8h
 ; CHECK-NEXT:    ret
 entry:
   %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %y, <8 x i16> zeroinitializer)
@@ -313,12 +353,8 @@ define <8 x i16> @us_maxmin_v4i32_to_v8i16(<4 x i16> %x, <4 x i32> %y) {
 ; CHECK-LABEL: us_maxmin_v4i32_to_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    xtn2 v0.8h, v1.4s
+; CHECK-NEXT:    sqxtun2 v0.8h, v1.4s
 ; CHECK-NEXT:    ret
 entry:
   %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> zeroinitializer)
@@ -331,14 +367,8 @@ define <4 x i32> @us_maxmin_v2i64_to_v4i32(<2 x i32> %x, <2 x i64> %y) {
 ; CHECK-LABEL: us_maxmin_v2i64_to_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmgt v2.2d, v1.2d, #0
-; CHECK-NEXT:    movi v3.2d, #0x000000ffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    cmgt v2.2d, v3.2d, v1.2d
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    orn v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    xtn2 v0.4s, v1.2d
+; CHECK-NEXT:    sqxtun2 v0.4s, v1.2d
 ; CHECK-NEXT:    ret
 entry:
   %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> zeroinitializer)
@@ -347,3 +377,158 @@ entry:
   %shuffle = shufflevector <2 x i32> %x, <2 x i32> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %shuffle
 }
+
+; Test the (concat_vectors (X), (trunc (smin (smax (Y, 0), 2^n)))) pattern.
+
+define <16 x i8> @sminsmax_range_unsigned_i16_to_i8(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: sminsmax_range_unsigned_i16_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtun2 v0.16b, v1.8h
+; CHECK-NEXT:    ret
+entry:
+  %min = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %y, <8 x i16> zeroinitializer)
+  %max = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %min, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>)
+  %trunc = trunc <8 x i16> %max to <8 x i8>
+  %shuffle = shufflevector <8 x i8> %x, <8 x i8> %trunc, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @sminsmax_range_unsigned_i32_to_i16(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: sminsmax_range_unsigned_i32_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtun2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %smax = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> zeroinitializer)
+  %smin = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %smax, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %trunc = trunc <4 x i32> %smin to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %x, <4 x i16> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @sminsmax_range_unsigned_i64_to_i32(<2 x i32> %x, <2 x i64> %y) {
+; CHECK-LABEL: sminsmax_range_unsigned_i64_to_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqxtun2 v0.4s, v1.2d
+; CHECK-NEXT:    ret
+entry:
+  %smax = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> zeroinitializer)
+  %smin = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %smax, <2 x i64> <i64 4294967295, i64 4294967295>)
+  %trunc = trunc <2 x i64> %smin to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %x, <2 x i32> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+; Type support verification - not supported with saturated truncate.
+; i64 -> i16
+define <4 x i16> @sminsmax_range_unsigned_i64_to_i16(<2 x i16> %x, <2 x i64> %y) {
+; CHECK-LABEL: sminsmax_range_unsigned_i64_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmgt v2.2d, v1.2d, #0
+; CHECK-NEXT:    movi v3.2d, #0x0000000000ffff
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmgt v2.2d, v3.2d, v1.2d
+; CHECK-NEXT:    bif v1.16b, v3.16b, v2.16b
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+entry:
+  %smax = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> zeroinitializer)
+  %smin = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %smax, <2 x i64> <i64 65535, i64 65535>)
+  %trunc = trunc <2 x i64> %smin to <2 x i16>
+  %shuffle = shufflevector <2 x i16> %x, <2 x i16> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @sminsmax_range_signed_i64_to_i16(<2 x i16> %x, <2 x i64> %y) {
+; CHECK-LABEL: sminsmax_range_signed_i64_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-NEXT:    cmgt v3.2d, v1.2d, v2.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    cmgt v3.2d, v2.2d, v1.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+entry:
+  %smax = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %y, <2 x i64> <i64 -32768, i64 -32768>)
+  %smin = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %smax, <2 x i64> <i64 32767, i64 32767>)
+  %trunc = trunc <2 x i64> %smin to <2 x i16>
+  %shuffle = shufflevector <2 x i16> %x, <2 x i16> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @umin_range_unsigned_i64_to_i16(<2 x i16> %x, <2 x i64> %y) {
+; CHECK-LABEL: umin_range_unsigned_i64_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0x0000000000ffff
+; CHECK-NEXT:    cmhi v3.2d, v2.2d, v1.2d
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+entry:
+  %umin = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %y, <2 x i64> <i64 65535, i64 65535>)
+  %trunc = trunc <2 x i64> %umin to <2 x i16>
+  %shuffle = shufflevector <2 x i16> %x, <2 x i16> %trunc, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+; i32 -> i8
+define <8 x i8> @sminsmax_range_unsigned_i64_to_i8(<4 x i8> %x, <4 x i32> %y) {
+; CHECK-LABEL: sminsmax_range_unsigned_i64_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+entry:
+  %smax = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> zeroinitializer)
+  %smin = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %smax, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
+  %trunc = trunc <4 x i32> %smin to <4 x i8>
+  %shuffle = shufflevector <4 x i8> %x, <4 x i8> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @sminsmax_range_signed_i32_to_i8(<4 x i8> %x, <4 x i32> %y) {
+; CHECK-LABEL: sminsmax_range_signed_i32_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mvni v2.4s, #127
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    movi v2.4s, #127
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+entry:
+  %smax = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %y, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>)
+  %smin = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %smax, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+  %trunc = trunc <4 x i32> %smin to <4 x i8>
+  %shuffle = shufflevector <4 x i8> %x, <4 x i8> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @umin_range_unsigned_i32_to_i8(<4 x i8> %x, <4 x i32> %y) {
+; CHECK-LABEL: umin_range_unsigned_i32_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+entry:
+  %umin = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
+  %trunc = trunc <4 x i32> %umin to <4 x i8>
+  %shuffle = shufflevector <4 x i8> %x, <4 x i8> %trunc, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
index 4e367bb0d70cd1..e2f540e991fd0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
@@ -101,10 +101,8 @@ define void @trunc_sat_u8u16_notopt(ptr %x, ptr %y) {
 define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u8u16_maxmin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vmax.vx v8, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 0
 ; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    ret
@@ -119,10 +117,8 @@ define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u8u16_minmax:
; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -356,10 +352,8 @@ define void @trunc_sat_u32u64_min(ptr %x, ptr %y) { define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u32u64_maxmin: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret @@ -374,10 +368,8 @@ define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) { define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u32u64_minmax: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret @@ -445,10 +437,8 @@ define void @trunc_sat_u8u32_min(ptr %x, ptr %y) { define void @trunc_sat_u8u32_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u32_maxmin: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 @@ -465,10 +455,8 @@ define void @trunc_sat_u8u32_maxmin(ptr %x, ptr %y) { define void @trunc_sat_u8u32_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u32_minmax: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 @@ -544,10 +532,8 @@ define void @trunc_sat_u8u64_min(ptr %x, ptr %y) { define void @trunc_sat_u8u64_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u64_maxmin: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v10, 0 @@ -566,10 +552,8 @@ define void @trunc_sat_u8u64_maxmin(ptr %x, ptr %y) { define void @trunc_sat_u8u64_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u64_minmax: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v10, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index 3e2db3fa4685dd..ffbcebf621fd7c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -113,7 +113,6 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 -; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret @@ -304,9 +303,6 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8 -; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-V-NEXT: vmax.vx v10, v10, zero -; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: @@ -801,17 +797,16 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vslideup.vi v8, v10, 2 -; CHECK-V-NEXT: vmax.vx v10, v8, zero +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -944,9 +939,8 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8 -; CHECK-V-NEXT: vmax.vx v8, v9, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnclipu.wi v8, v9, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -1139,7 +1133,6 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 -; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret @@ -2114,24 +2107,23 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; 
CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vslideup.vi v8, v10, 4 -; CHECK-V-NEXT: vmax.vx v10, v8, zero +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -3473,7 +3465,6 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 -; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret @@ -3659,9 +3650,6 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8 -; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-V-NEXT: vmax.vx v10, v10, zero -; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: @@ -4151,17 +4139,16 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vslideup.vi v8, v10, 2 -; CHECK-V-NEXT: vmax.vx v10, v8, zero +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -4289,9 +4276,8 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8 -; CHECK-V-NEXT: vmax.vx v8, v9, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnclipu.wi v8, v9, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -4479,7 +4465,6 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 -; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-V-NEXT: ret @@ -5449,24 +5434,23 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: 
addi a0, a0, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vslideup.vi v8, v10, 4 -; CHECK-V-NEXT: vmax.vx v10, v8, zero +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll index 01a90d8a33b6ec..f43faadc532f26 100644 --- a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll @@ -102,9 +102,7 @@ define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u16_maxmin: ; CHECK: # %bb.0: ; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -120,9 +118,7 @@ define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u16_minmax: ; CHECK: # %bb.0: ; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -357,9 +353,7 @@ define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u32u64_maxmin: ; CHECK: # %bb.0: ; CHECK-NEXT: vl4re64.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vnclipu.wi v12, v8, 0 ; CHECK-NEXT: vs2r.v v12, (a1) ; CHECK-NEXT: ret @@ -375,9 +369,7 @@ define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u32u64_minmax: ; CHECK: # %bb.0: ; CHECK-NEXT: vl4re64.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vnclipu.wi v12, v8, 0 ; CHECK-NEXT: vs2r.v v12, (a1) ; CHECK-NEXT: ret @@ -446,9 +438,7 @@ define void @trunc_sat_u8u32_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u32_maxmin: ; CHECK: # %bb.0: ; CHECK-NEXT: vl2re32.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v10, 0 @@ -466,9 +456,7 @@ define void @trunc_sat_u8u32_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u32_minmax: ; CHECK: # %bb.0: ; CHECK-NEXT: vl2re32.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; 
CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v10, 0 @@ -545,9 +533,7 @@ define void @trunc_sat_u8u64_maxmin(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u64_maxmin: ; CHECK: # %bb.0: ; CHECK-NEXT: vl4re64.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vnclipu.wi v12, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v12, 0 @@ -567,9 +553,7 @@ define void @trunc_sat_u8u64_minmax(ptr %x, ptr %y) { ; CHECK-LABEL: trunc_sat_u8u64_minmax: ; CHECK: # %bb.0: ; CHECK-NEXT: vl4re64.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vnclipu.wi v12, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v12, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll index 28d7588b9347a7..992ea8f8c18a5e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll @@ -4,9 +4,7 @@ define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) { ; CHECK-LABEL: test_v4i16_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i16> %x, zeroinitializer @@ -20,9 +18,7 @@ define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) { define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) { ; CHECK-LABEL: test_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 @@ -38,9 +34,7 @@ define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) { define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) { ; CHECK-LABEL: test_v4i64_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vnclipu.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v10, 0 @@ -58,9 +52,7 @@ define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) { define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) { ; CHECK-LABEL: test_v4i32_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vnclipu.wi v8, v8, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i32> %x, zeroinitializer @@ -74,9 +66,7 @@ define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) { define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) { ; CHECK-LABEL: test_v4i64_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, zero -; CHECK-NEXT: vsetvli zero, zero, e32, m1, 
ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v10, v8, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v10, 0
@@ -92,10 +82,9 @@ define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) {
 define <4 x i32> @test_v4i64_v4i32(<4 x i64> %x) {
 ; CHECK-LABEL: test_v4i64_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmax.vx v10, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %a = icmp sgt <4 x i64> %x, zeroinitializer
   %b = sext <4 x i1> %a to <4 x i64>
@@ -108,9 +97,7 @@ define <vscale x 4 x i8> @test_nxv4i16_nxv4i8(<vscale x 4 x i16> %x) {
 ; CHECK-LABEL: test_nxv4i16_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmax.vx v8, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 0
 ; CHECK-NEXT:    ret
   %a = icmp sgt <vscale x 4 x i16> %x, zeroinitializer
@@ -124,9 +111,7 @@ define <vscale x 4 x i8> @test_nxv4i16_nxv4i8(<vscale x 4 x i16> %x) {
 define <vscale x 4 x i8> @test_nxv4i32_nxv4i8(<vscale x 4 x i32> %x) {
 ; CHECK-LABEL: test_nxv4i32_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmax.vx v8, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v10, v8, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v10, 0
@@ -142,9 +127,7 @@ define <vscale x 4 x i8> @test_nxv4i32_nxv4i8(<vscale x 4 x i32> %x) {
 define <vscale x 4 x i8> @test_nxv4i64_nxv4i8(<vscale x 4 x i64> %x) {
 ; CHECK-LABEL: test_nxv4i64_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmax.vx v8, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v12, v8, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v12, 0
@@ -162,10 +145,9 @@ define <vscale x 4 x i8> @test_nxv4i64_nxv4i8(<vscale x 4 x i64> %x) {
 define <vscale x 4 x i16> @test_nxv4i32_nxv4i16(<vscale x 4 x i32> %x) {
 ; CHECK-LABEL: test_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmax.vx v10, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
   %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
@@ -178,9 +160,7 @@ define <vscale x 4 x i16> @test_nxv4i32_nxv4i16(<vscale x 4 x i32> %x) {
 define <vscale x 4 x i16> @test_nxv4i64_nxv4i16(<vscale x 4 x i64> %x) {
 ; CHECK-LABEL: test_nxv4i64_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmax.vx v8, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v12, v8, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v12, 0
@@ -196,10 +176,9 @@ define <vscale x 4 x i16> @test_nxv4i64_nxv4i16(<vscale x 4 x i64> %x) {
 define <vscale x 4 x i32> @test_nxv4i64_nxv4i32(<vscale x 4 x i64> %x) {
 ; CHECK-LABEL: test_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmax.vx v12, v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vnclipu.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v12, v8, 0
+; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
   %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
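
For reference, a minimal sketch of what this patch does end to end, written as LLVM IR in the style of the qmovn.ll tests above. The function name is illustrative, and the sqxtn expectation is inferred from the pieces of the patch itself (TRUNCATE_SSAT_S is made Legal for v4i32 in AArch64ISelLowering.cpp, v4i16 is accepted by isTypeDesirableForOp, and the truncssat_s pattern selects SQXTNv4i16); it is not one of the committed tests:

; truncate(smin(smax(x, -32768), 32767)) matches detectSSatSPattern in
; DAGCombiner::visitTRUNCATE, which emits TRUNCATE_SSAT_S; the TableGen
; pattern (v4i16 (truncssat_s (v4i32 V128:$Vn))) then selects a single
; sqxtn v0.4h, v0.4s instead of the previous smax+smin+xtn sequence.
define <4 x i16> @example_trunc_ssat_s(<4 x i32> %x) {
  %lo = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
  %hi = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %lo, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %t = trunc <4 x i32> %hi to <4 x i16>
  ret <4 x i16> %t
}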