From ee9e1ca54586516c14d0c4a8dae63691a1d4b50c Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 19 Jun 2024 11:17:19 -0500 Subject: [PATCH] x64: Implement some minor optimizations related to SIMD lowerings (#8839) * Add tests for patterns I'm about to optimize * x64: Optimize vector compare-and-branch This commit implements lowering optimizations for the `vall_true` and `vany_true` CLIF instructions when combined with `brif`. This is in the same manner as `icmp` and `fcmp` combined with `brif` where the result of the comparison is never materialized into a general purpose register which helps lower register pressure and remove some instructions. * x64: Optimize `vconst` with an all-ones pattern This has a single-instruction lowering which doesn't load from memory so it's probably cheaper than loading all-ones from memory. --- cranelift/codegen/src/isa/x64/lower.isle | 35 +++++-- tests/disas/x64-simd-test-and-branch.wat | 125 +++++++++++++++++++++++ tests/disas/x64-vector-patterns.wat | 22 ++++ 3 files changed, 171 insertions(+), 11 deletions(-) create mode 100644 tests/disas/x64-simd-test-and-branch.wat create mode 100644 tests/disas/x64-vector-patterns.wat diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 92d8b74780b0..582ed13f9d00 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3308,6 +3308,12 @@ (rule 2 (lower_branch (brif (maybe_uextend (fcmp cc a b)) _ _) (two_targets then else)) (emit_side_effect (jmp_cond_fcmp (emit_fcmp cc a b) then else))) +(rule 2 (lower_branch (brif (maybe_uextend (vany_true a)) _ _) (two_targets then else)) + (emit_side_effect (jmp_cond_icmp (emit_vany_true a) then else))) + +(rule 2 (lower_branch (brif (maybe_uextend (vall_true a)) _ _) (two_targets then else)) + (emit_side_effect (jmp_cond_icmp (emit_vall_true a) then else))) + (rule 1 (lower_branch (brif val @ (value_type $I128) _ _) (two_targets then else)) (emit_side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.Z) val) then else))) @@ -4263,10 +4269,9 @@ ;; TODO use Inst::gen_constant() instead. (x64_xmm_load_const ty (const_to_vconst const))) -;; Special case for a zero-vector: don't load, xor instead. -(rule 1 (lower (has_type ty (vconst (u128_from_constant 0)))) - (let ((dst Xmm (xmm_uninit_value))) - (x64_pxor dst dst))) +;; Special cases for known constant patterns to skip a 16-byte load. +(rule 1 (lower (has_type ty (vconst (u128_from_constant 0)))) (xmm_zero ty)) +(rule 1 (lower (has_type ty (vconst (u128_from_constant -1)))) (vector_all_ones)) ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -4630,30 +4635,38 @@ ;; 0xffff then every byte was equal to zero, so test if the comparison is ;; not-equal or NZ. (rule (lower (vany_true val)) + (lower_icmp_bool (emit_vany_true val))) + +(decl emit_vany_true (Value) IcmpCondResult) +(rule (emit_vany_true val) (let ( (any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16))) (mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero)) ) - (with_flags (x64_cmp_imm (OperandSize.Size32) mask 0xffff) - (x64_setcc (CC.NZ))))) + (icmp_cond_result (x64_cmp_imm (OperandSize.Size32) mask 0xffff) + (CC.NZ)))) ;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (vall_true val @ (value_type ty))) +(rule (lower (vall_true val)) + (lower_icmp_bool (emit_vall_true val))) + +(decl emit_vall_true (Value) IcmpCondResult) +(rule 1 (emit_vall_true val @ (value_type ty)) (if-let $true (use_sse41)) (let ((src Xmm val) (zeros Xmm (xmm_zero ty)) (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros))) - (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z))))) + (icmp_cond_result (x64_ptest cmp cmp) (CC.Z)))) ;; Perform an appropriately-sized lane-wise comparison with zero. If the ;; result is all 0s then all of them are true because nothing was equal to ;; zero. -(rule (lower (vall_true val @ (value_type ty))) +(rule (emit_vall_true val @ (value_type ty)) (let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty))) (mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero))) - (with_flags (x64_test (OperandSize.Size32) mask mask) - (x64_setcc (CC.Z))))) + (icmp_cond_result (x64_test (OperandSize.Size32) mask mask) + (CC.Z)))) ;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/tests/disas/x64-simd-test-and-branch.wat b/tests/disas/x64-simd-test-and-branch.wat new file mode 100644 index 000000000000..1bbceb0f8d20 --- /dev/null +++ b/tests/disas/x64-simd-test-and-branch.wat @@ -0,0 +1,125 @@ +;;! target = "x86_64" +;;! test = "compile" +;;! flags = ["-Ccranelift-sse41"] + +(module + (func $i8x16.all_true (param v128) (result i32) + local.get 0 + i8x16.all_true + if (result i32) + i32.const 100 + else + i32.const 200 + end + ) + + (func $i16x8.all_true (param v128) (result i32) + local.get 0 + i16x8.all_true + if (result i32) + i32.const 100 + else + i32.const 200 + end + ) + + (func $i32x4.all_true (param v128) (result i32) + local.get 0 + i32x4.all_true + if (result i32) + i32.const 100 + else + i32.const 200 + end + ) + + (func $i64x2.all_true (param v128) (result i32) + local.get 0 + i64x2.all_true + if (result i32) + i32.const 100 + else + i32.const 200 + end + ) + + (func $v128.any_true (param v128) (result i32) + local.get 0 + v128.any_true + if (result i32) + i32.const 100 + else + i32.const 200 + end + ) +) +;; wasm[0]::function[0]::i8x16.all_true: +;; pushq %rbp +;; movq %rsp, %rbp +;; pxor %xmm7, %xmm7 +;; pcmpeqb %xmm7, %xmm0 +;; ptest %xmm0, %xmm0 +;; je 0x21 +;; 17: movl $0xc8, %eax +;; jmp 0x26 +;; 21: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[1]::i16x8.all_true: +;; pushq %rbp +;; movq %rsp, %rbp +;; pxor %xmm7, %xmm7 +;; pcmpeqw %xmm7, %xmm0 +;; ptest %xmm0, %xmm0 +;; je 0x61 +;; 57: movl $0xc8, %eax +;; jmp 0x66 +;; 61: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[2]::i32x4.all_true: +;; pushq %rbp +;; movq %rsp, %rbp +;; pxor %xmm7, %xmm7 +;; pcmpeqd %xmm7, %xmm0 +;; ptest %xmm0, %xmm0 +;; je 0xa1 +;; 97: movl $0xc8, %eax +;; jmp 0xa6 +;; a1: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[3]::i64x2.all_true: +;; pushq %rbp +;; movq %rsp, %rbp +;; pxor %xmm7, %xmm7 +;; pcmpeqq %xmm7, %xmm0 +;; ptest %xmm0, %xmm0 +;; je 0xe2 +;; d8: movl $0xc8, %eax +;; jmp 0xe7 +;; e2: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[4]::v128.any_true: +;; pushq %rbp +;; movq %rsp, %rbp +;; pxor %xmm7, %xmm7 +;; pcmpeqb %xmm7, %xmm0 +;; pmovmskb %xmm0, %ecx +;; cmpl $0xffff, %ecx +;; jne 0x126 +;; 11c: movl $0xc8, %eax +;; jmp 0x12b +;; 126: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq diff --git a/tests/disas/x64-vector-patterns.wat b/tests/disas/x64-vector-patterns.wat new file mode 100644 index 000000000000..e043427c3bb6 --- /dev/null +++ b/tests/disas/x64-vector-patterns.wat @@ -0,0 +1,22 @@ +;;! target = "x86_64" +;;! test = "compile" + +(module + (func $zero (result v128) v128.const i64x2 0 0) + (func $ones (result v128) v128.const i64x2 -1 -1) +) +;; wasm[0]::function[0]::zero: +;; pushq %rbp +;; movq %rsp, %rbp +;; pxor %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[1]::ones: +;; pushq %rbp +;; movq %rsp, %rbp +;; pcmpeqd %xmm0, %xmm0 +;; movq %rbp, %rsp +;; popq %rbp +;; retq