diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 30a66c7ae4be84..72f9785a2aee65 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5330,7 +5330,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
+                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
       // In order to avoid insert_subvector, used v4i32 than v2i32.
       SDValue UADDLV =
           DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
@@ -22286,6 +22287,7 @@ static SDValue performSelectCombine(SDNode *N,
 static SDValue performDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
   // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
   // 128bit vector version.
   if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
@@ -22293,14 +22295,32 @@ static SDValue performDUPCombine(SDNode *N,
     SmallVector<SDValue> Ops(N->ops());
     if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
                                              DCI.DAG.getVTList(LVT), Ops)) {
-      SDLoc DL(N);
       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                              DCI.DAG.getConstant(0, DL, MVT::i64));
     }
   }
 
-  if (N->getOpcode() == AArch64ISD::DUP)
+  if (N->getOpcode() == AArch64ISD::DUP) {
+    if (DCI.isAfterLegalizeDAG()) {
+      // If scalar dup's operand is extract_vector_elt, try to combine them into
+      // duplane. For example,
+      //
+      //    t21: i32 = extract_vector_elt t19, Constant:i64<0>
+      //  t18: v4i32 = AArch64ISD::DUP t21
+      //  ==>
+      //  t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
+      SDValue EXTRACT_VEC_ELT = N->getOperand(0);
+      if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
+          unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+          return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
+                                 EXTRACT_VEC_ELT.getOperand(1));
+        }
+      }
+    }
+
     return performPostLD1Combine(N, DCI, false);
+  }
 
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 82b79cd7232cc9..f0934c9a3659e4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6472,12 +6472,24 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
            ssub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
+
+def : Pat<(v4i32 (AArch64uaddlv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v4i16 V64:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$Rn), ssub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
+
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
 // In effect, opNode is the same as (scalar_to_vector (IntNode)).
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index bf420700eb575f..55750ab34e17a0 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    ucvtf.2s v1, v1
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ucvtf.2s v0, v1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
 ; CHECK-NEXT:    str d0, [x0, #80]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -256,9 +256,9 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.4h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v2, v2
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.2d v1, v2
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
index 3b064b718cd679..20adcdf2956d69 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
@@ -9,16 +9,15 @@ define i32 @widget(i64 %arg, <8 x i16> %arg1) {
 ; CHECK:       // %bb.0: // %bb
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    bfi x10, x0, #1, #3
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    bfi x9, x0, #1, #3
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    dup v1.8h, w9
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ld1 { v1.h }[1], [x10]
-; CHECK-NEXT:    str q1, [x8]
+; CHECK-NEXT:    str q1, [sp]
+; CHECK-NEXT:    ld1 { v0.h }[1], [x9]
+; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 bb:
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 1b037c13aa4b54..0241091fae0254 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -195,7 +195,6 @@ entry:
 }
 
 declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
-
 declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
 
 define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
@@ -215,3 +214,36 @@ entry:
   %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
   ret <8 x i8> %vecinit7.i
 }
+
+define <4 x i32> @uaddlv_dup_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
+  %vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer
+  %vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %vshr_n
+}
+
+define <4 x i32> @uaddlv_dup_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+  %vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer
+  %vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %vshr_n
+}
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
index 8b48635b6694c0..f0856c43daf1d9 100644
--- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
@@ -26,7 +26,21 @@ define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
   ret i16 %4
 }
 
+define i16 @uaddlv_uaddlp_v8i8(<8 x i8> %0) {
+; CHECK-LABEL: uaddlv_uaddlp_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %2 = tail call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %0)
+  %3 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
 declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>)
+declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>)