From a4516da49f8bda1b99d21dae7e1caba772d7182c Mon Sep 17 00:00:00 2001
From: Jorge Botto
Date: Thu, 3 Oct 2024 17:56:01 +0100
Subject: [PATCH] [AArch64] - Fold and and cmp into tst (#110347)

Fixes https://github.com/llvm/llvm-project/issues/102703.
https://godbolt.org/z/nfj8xsb1Y

The following pattern:
```
%2 = and i32 %0, 254
%3 = icmp eq i32 %2, 0
```
is optimised by instcombine into:
```
%3 = icmp ult i32 %0, 2
```

However, the post-instcombine form leads to worse AArch64 code than the
unoptimised version.

Pre instcombine:
```
tst w0, #0xfe
cset w0, eq
ret
```
Post instcombine:
```
and w8, w0, #0xff
cmp w8, #2
cset w0, lo
ret
```

In the unoptimised version, SelectionDAG converts `SETCC (AND X 254) 0 EQ`
into `CSEL 0 1 1 (ANDS X 254)`, which gets emitted as a `tst`.

In the optimised version, SelectionDAG converts `SETCC (AND X 255) 2 ULT`
into `CSEL 0 1 2 (SUBS (AND X 255) 2)`, which gets emitted as an `and`/`cmp`.

This PR adds an optimisation to `AArch64ISelLowering`, converting
`SETCC (AND X Y) Z ULT` into `SETCC (AND X (Y & ~(Z - 1))) 0 EQ` when `Z` is
a power of two. This makes SelectionDAG/CodeGen produce the same optimised
code for both examples.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  26 +++
 llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll | 216 ++++++++++++++++++
 .../AArch64/signed-truncation-check.ll        |  25 +-
 3 files changed, 252 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e55e9989e6565c..48e1b96d841efb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4301,6 +4301,29 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
                      Op.getOperand(1));
 }
 
+// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
+// is a power of 2.
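+// For example, SETCC (AND X 0xff) 0x10 ULT becomes SETCC (AND X 0xf0) 0 EQ:
+// (X & Y) is below Z exactly when no bit at or above log2(Z) survives the
+// mask, i.e. when X & (Y & ~(Z - 1)) is zero.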
+// This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS (AND X Y) Z,
+// which produces better code via EmitComparison (tst rather than and + cmp).
+static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
+                                SelectionDAG &DAG, const SDLoc dl) {
+  if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
+    ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+    if (LHSConstOp && RHSConst) {
+      uint64_t LHSConstValue = LHSConstOp->getZExtValue();
+      uint64_t RHSConstant = RHSConst->getZExtValue();
+      if (isPowerOf2_64(RHSConstant)) {
+        uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
+        LHS =
+            DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
+                        DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
+        RHS = DAG.getConstant(0, dl, RHS.getValueType());
+        CC = ISD::SETEQ;
+      }
+    }
+  }
+}
+
 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
                                               SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -10596,6 +10619,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (LHS.getValueType().isInteger()) {
+
+    simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
+
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(
         LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
new file mode 100644
index 00000000000000..33c5ba7987974a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+
+define i1 @lt8_u8(i8 %0) {
+; CHECK-LABEL: lt8_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0xf8
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp ult i8 %0, 8
+  ret i1 %2
+}
+
+define i1 @lt32_u8(i8 %0) {
+; CHECK-LABEL: lt32_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0xe0
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp ult i8 %0, 32
+  ret i1 %2
+}
+
+define i1 @lt64_u8(i8 %0) {
+; CHECK-LABEL: lt64_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0xc0
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp ult i8 %0, 64
+  ret i1 %2
+}
+
+define i1 @lt8_u32(i32 %0) {
+; CHECK-LABEL: lt8_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #8
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i32 %0, 8
+  ret i1 %2
+}
+
+define i1 @lt32_u32(i32 %0) {
+; CHECK-LABEL: lt32_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #32
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i32 %0, 32
+  ret i1 %2
+}
+
+define i1 @lt64_u32(i32 %0) {
+; CHECK-LABEL: lt64_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #64
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i32 %0, 64
+  ret i1 %2
+}
+
+define i1 @lt8_u64(i64 %0) {
+; CHECK-LABEL: lt8_u64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, #8
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i64 %0, 8
+  ret i1 %2
+}
+
+define i1 @lt32_u64(i64 %0) {
+; CHECK-LABEL: lt32_u64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, #32
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i64 %0, 32
+  ret i1 %2
+}
+
+define i1 @lt64_u64(i64 %0) {
+; CHECK-LABEL: lt64_u64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, #64
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i64 %0, 64
+  ret i1 %2
+}
+
+define i1 @lt8_u16_and_5(i8 %0) {
+; CHECK-LABEL: lt8_u16_and_5:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = and i8 %0, 5 + %3 = icmp ult i8 %2, 16 + ret i1 %3 +} + +define i1 @lt8_u16_and_19(i8 %0) { +; CHECK-LABEL: lt8_u16_and_19: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0x10 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = and i8 %0, 19 + %3 = icmp ult i8 %2, 16 + ret i1 %3 +} + +define i1 @lt32_u16_and_7(i32 %0) { +; CHECK-LABEL: lt32_u16_and_7: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = and i32 %0, 7 + %3 = icmp ult i32 %2, 16 + ret i1 %3 +} + +define i1 @lt32_u16_and_21(i32 %0) { +; CHECK-LABEL: lt32_u16_and_21: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0x10 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = and i32 %0, 21 + %3 = icmp ult i32 %2, 16 + ret i1 %3 +} + +define i1 @lt64_u16_and_9(i64 %0) { +; CHECK-LABEL: lt64_u16_and_9: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = and i64 %0, 9 + %3 = icmp ult i64 %2, 16 + ret i1 %3 +} + +define i1 @lt64_u16_and_23(i64 %0) { +; CHECK-LABEL: lt64_u16_and_23: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0x10 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = and i64 %0, 23 + %3 = icmp ult i64 %2, 16 + ret i1 %3 +} + +; negative test +define i1 @lt3_u8(i8 %0) { +; CHECK-LABEL: lt3_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmp w8, #3 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %2 = icmp ult i8 %0, 3 + ret i1 %2 +} + +; negative test +define i1 @lt3_u32(i32 %0) { +; CHECK-LABEL: lt3_u32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, #3 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %2 = icmp ult i32 %0, 3 + ret i1 %2 +} + +; negative test +define i1 @lt3_u64(i64 %0) { +; CHECK-LABEL: lt3_u64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, #3 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %2 = icmp ult i64 %0, 3 + ret i1 %2 +} + +; negative test +define i32 @lt32_u16_multiple_use(i32 %0) { +; CHECK-LABEL: lt32_u16_multiple_use: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #21 // =0x15 +; CHECK-NEXT: mov w9, #10 // =0xa +; CHECK-NEXT: and w8, w0, w8 +; CHECK-NEXT: cmp w8, #16 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: mul w0, w8, w10 +; CHECK-NEXT: ret + %2 = and i32 %0, 21 + %3 = icmp ult i32 %2, 16 + %4 = add i32 %2, 10 + %5 = zext i1 %3 to i32 + %6 = mul i32 %4, %5 + ret i32 %6 +} diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll index bb4df6d8935b1b..7c80f9320faec1 100644 --- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll +++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll @@ -287,9 +287,8 @@ define i1 @add_ultcmp_bad_i16_i8_add(i16 %x, i16 %y) nounwind { ; CHECK-LABEL: add_ultcmp_bad_i16_i8_add: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, w1 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, #256 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: tst w8, #0xff00 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i16 %x, %y %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 @@ -328,9 +327,8 @@ define i1 @add_ultcmp_bad_i16_i8_c0notpoweroftwo(i16 %x) nounwind { ; CHECK-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, #192 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, #256 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: tst w8, #0xff00 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i16 %x, 192 ; (1U << 
(8-1)) + (1U << (8-1-1)) %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 @@ -356,9 +354,8 @@ define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind { ; CHECK-LABEL: add_ultcmp_bad_i16_i8_magic: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, #64 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, #256 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: tst w8, #0xff00 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i16 %x, 64 ; 1U << (8-1-1) %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 @@ -370,9 +367,8 @@ define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind { ; CHECK-LABEL: add_ultcmp_bad_i16_i4: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, #8 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, #16 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: tst w8, #0xfff0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i16 %x, 8 ; 1U << (4-1) %tmp1 = icmp ult i16 %tmp0, 16 ; 1U << 4 @@ -384,9 +380,8 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind { ; CHECK-LABEL: add_ultcmp_bad_i24_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, #128 -; CHECK-NEXT: and w8, w8, #0xffffff -; CHECK-NEXT: cmp w8, #256 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: tst w8, #0xffff00 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i24 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i24 %tmp0, 256 ; 1U << 8