[SVE] Add patterns for shift intrinsics with FalseLanesZero mode

This patch adds patterns to reduce redundant mov and sel instructions
for shift intrinsics with FalseLanesZero mode, when
FeatureExperimentalZeroingPseudos is supported.

For example, before:

mov     z1.b, #0
sel     z0.b, p0, z0.b, z1.b
asr     z0.b, p0/m, z0.b, #7

After:

movprfx z0.b, p0/z, z0.b
asr     z0.b, p0/m, z0.b, #7
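
For reference, the IR shape being matched here typically comes from a
zeroing-predicated ACLE shift. A minimal C sketch (assuming the
standard arm_sve.h intrinsics and an SVE-enabled compiler; the function
name is illustrative):

#include <arm_sve.h>

// Zeroing-predicated arithmetic shift right by immediate: inactive
// lanes of the result become zero, which is modelled in IR as
// select(pg, a, zeroinitializer) feeding @llvm.aarch64.sve.asr.
svint8_t asr_zeroing(svbool_t pg, svint8_t a) {
  return svasr_n_s8_z(pg, a, 7);
}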

Reviewed By: paulwalker-arm
Differential Revision: https://reviews.llvm.org/D145551
lizhijin1024 authored and vfdff committed Mar 19, 2023
1 parent 57aeb30 commit 22c3ba4
Showing 3 changed files with 200 additions and 0 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2060,6 +2060,10 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>;

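// Right shifts take immediates in [1, esize] (SVEShiftImmR*), while
// left shifts take [0, esize-1] (SVEShiftImmL*), hence the distinct
// ComplexPatterns per opcode below.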
defm ASR_ZPZI : sve_int_bin_pred_imm_zeroing_bhsd<int_aarch64_sve_asr, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
defm LSR_ZPZI : sve_int_bin_pred_imm_zeroing_bhsd<int_aarch64_sve_lsr, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
defm LSL_ZPZI : sve_int_bin_pred_imm_zeroing_bhsd<int_aarch64_sve_lsl, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>;
} // End HasSVEorSME, UseExperimentalZeroingPseudos

let Predicates = [HasSVEorSME] in {
20 changes: 20 additions & 0 deletions llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -571,6 +571,12 @@ class SVE_Shift_DupImm_Any_Predicate_Pat<ValueType vt, SDPatternOperator op,
: Pat<(vt (op (pt (SVEAnyPredicate)), vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))),
(inst $Rn, i32:$imm)>;

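// Matches op(Pg, vselect(Pg, Op1, 0), splat(imm)) and selects it to a
// zeroing pseudo: the merging intrinsic leaves inactive lanes equal to
// its first vector operand, which the vselect has already zeroed, so
// the expression as a whole has zeroing semantics.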
class SVE_2_Op_Imm_Pat_Zero<ValueType vt, SDPatternOperator op, ValueType pt,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Op1, (SVEDup0)),
(vt (splat_vector (it (cpx i32:$imm)))))),
(inst $Pg, $Op1, i32:$imm)>;

class SVE_2_Op_Fp_Imm_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
FPImmLeaf immL, int imm,
@@ -5894,6 +5900,20 @@ multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> {
def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
}

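// Zeroing pseudos and their selection patterns for predicated
// shift-by-immediate intrinsics, one per element size.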
multiclass sve_int_bin_pred_imm_zeroing_bhsd<SDPatternOperator op,
ComplexPattern imm_b, ComplexPattern imm_h,
ComplexPattern imm_s, ComplexPattern imm_d> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, Operand<i32>, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, Operand<i32>, FalseLanesZero>;
def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, Operand<i32>, FalseLanesZero>;
def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, Operand<i32>, FalseLanesZero>;

def : SVE_2_Op_Imm_Pat_Zero<nxv16i8, op, nxv16i1, i32, imm_b, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_2_Op_Imm_Pat_Zero<nxv8i16, op, nxv8i1, i32, imm_h, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_2_Op_Imm_Pat_Zero<nxv4i32, op, nxv4i1, i32, imm_s, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_2_Op_Imm_Pat_Zero<nxv2i64, op, nxv2i1, i64, imm_d, !cast<Pseudo>(NAME # _ZERO_D)>;
}

multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
176 changes: 176 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm-zero.ll
@@ -0,0 +1,176 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-experimental-zeroing-pseudos < %s | FileCheck %s

;; ASR
define <vscale x 16 x i8> @asr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
; CHECK-LABEL: asr_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.b, p0/z, z0.b
; CHECK-NEXT: asr z0.b, p0/m, z0.b, #8
; CHECK-NEXT: ret
%vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
%ele = insertelement <vscale x 16 x i8> poison, i8 8, i32 0
%shuffle = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %shuffle)
ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @asr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: asr_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
; CHECK-NEXT: asr z0.h, p0/m, z0.h, #16
; CHECK-NEXT: ret
%vsel = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
%ele = insertelement <vscale x 8 x i16> poison, i16 16, i32 0
%shuffle = shufflevector <vscale x 8 x i16> %ele, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %vsel, <vscale x 8 x i16> %shuffle)
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @asr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) local_unnamed_addr #0 {
; CHECK-LABEL: asr_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.s, p0/z, z0.s
; CHECK-NEXT: asr z0.s, p0/m, z0.s, #32
; CHECK-NEXT: ret
%vsel = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
%ele = insertelement <vscale x 4 x i32> poison, i32 32, i32 0
%shuffle = shufflevector <vscale x 4 x i32> %ele, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %vsel, <vscale x 4 x i32> %shuffle)
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @asr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) {
; CHECK-LABEL: asr_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.d, p0/z, z0.d
; CHECK-NEXT: asr z0.d, p0/m, z0.d, #64
; CHECK-NEXT: ret
%vsel = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
%ele = insertelement <vscale x 2 x i64> poison, i64 64, i32 0
%shuffle = shufflevector <vscale x 2 x i64> %ele, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %vsel, <vscale x 2 x i64> %shuffle)
ret <vscale x 2 x i64> %res
}

;; LSL
define <vscale x 16 x i8> @lsl_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
; CHECK-LABEL: lsl_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.b, p0/z, z0.b
; CHECK-NEXT: lsl z0.b, p0/m, z0.b, #7
; CHECK-NEXT: ret
%vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
%ele = insertelement <vscale x 16 x i8> poison, i8 7, i32 0
%shuffle = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %shuffle)
ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @lsl_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: lsl_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
; CHECK-NEXT: lsl z0.h, p0/m, z0.h, #15
; CHECK-NEXT: ret
%vsel = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
%ele = insertelement <vscale x 8 x i16> poison, i16 15, i32 0
%shuffle = shufflevector <vscale x 8 x i16> %ele, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %vsel, <vscale x 8 x i16> %shuffle)
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @lsl_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) local_unnamed_addr #0 {
; CHECK-LABEL: lsl_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.s, p0/z, z0.s
; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #31
; CHECK-NEXT: ret
%vsel = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
%ele = insertelement <vscale x 4 x i32> poison, i32 31, i32 0
%shuffle = shufflevector <vscale x 4 x i32> %ele, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %vsel, <vscale x 4 x i32> %shuffle)
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @lsl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) {
; CHECK-LABEL: lsl_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.d, p0/z, z0.d
; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #63
; CHECK-NEXT: ret
%vsel = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
%ele = insertelement <vscale x 2 x i64> poison, i64 63, i32 0
%shuffle = shufflevector <vscale x 2 x i64> %ele, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %vsel, <vscale x 2 x i64> %shuffle)
ret <vscale x 2 x i64> %res
}

;; LSR
define <vscale x 16 x i8> @lsr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg) {
; CHECK-LABEL: lsr_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.b, p0/z, z0.b
; CHECK-NEXT: lsr z0.b, p0/m, z0.b, #8
; CHECK-NEXT: ret
%vsel = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
%ele = insertelement <vscale x 16 x i8> poison, i8 8, i32 0
%shuffle = shufflevector <vscale x 16 x i8> %ele, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %vsel, <vscale x 16 x i8> %shuffle)
ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @lsr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: lsr_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
; CHECK-NEXT: lsr z0.h, p0/m, z0.h, #16
; CHECK-NEXT: ret
%vsel = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
%ele = insertelement <vscale x 8 x i16> poison, i16 16, i32 0
%shuffle = shufflevector <vscale x 8 x i16> %ele, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %vsel, <vscale x 8 x i16> %shuffle)
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @lsr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) local_unnamed_addr #0 {
; CHECK-LABEL: lsr_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.s, p0/z, z0.s
; CHECK-NEXT: lsr z0.s, p0/m, z0.s, #32
; CHECK-NEXT: ret
%vsel = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
%ele = insertelement <vscale x 4 x i32> poison, i32 32, i32 0
%shuffle = shufflevector <vscale x 4 x i32> %ele, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %vsel, <vscale x 4 x i32> %shuffle)
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @lsr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg) {
; CHECK-LABEL: lsr_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: movprfx z0.d, p0/z, z0.d
; CHECK-NEXT: lsr z0.d, p0/m, z0.d, #64
; CHECK-NEXT: ret
%vsel = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
%ele = insertelement <vscale x 2 x i64> poison, i64 64, i32 0
%shuffle = shufflevector <vscale x 2 x i64> %ele, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %vsel, <vscale x 2 x i64> %shuffle)
ret <vscale x 2 x i64> %res
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
