From ed7dfd3925eedb9c25f47617391fd25ccb93f8a5 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Tue, 9 Aug 2022 09:45:53 -0700 Subject: [PATCH] x64: Peephole optimization for `x < 0` (#4625) https://github.com/bytecodealliance/wasmtime/pull/4625 Fixes #4607 --- cranelift/codegen/src/isa/x64/lower.isle | 32 +++++ cranelift/filetests/filetests/isa/x64/b1.clif | 132 ++++++++++++++++++ .../filetests/filetests/isa/x64/branches.clif | 60 ++++++++ 3 files changed, 224 insertions(+) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index a11fa45dd379..5188dd322af4 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1501,6 +1501,38 @@ (rule (lower (icmp cc a @ (value_type $I128) b)) (lower_icmp_bool (emit_cmp cc a b))) +;; Peephole optimization for `x < 0`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 x (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `0 > x`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) + (x64_shr $I64 x (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) + (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `x < 0`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 x (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `0 > x`, when x is a 
signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) + (x64_shr $I32 x (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) + (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) + ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than ;; one. To note: what is different here about the output values is that each ;; lane will be filled with all 1s or all 0s according to the comparison, diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif index eb971b36fa5e..a67242437054 100644 --- a/cranelift/filetests/filetests/isa/x64/b1.clif +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -73,3 +73,135 @@ block2: ; popq %rbp ; ret +function %test_x_slt_0_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp slt v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_slt_0_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp slt v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sgt_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sgt v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sgt_x_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = 
icmp sgt v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sle_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sle v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sle_x_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sle v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_sge_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sge v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_sge_x_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sge v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/branches.clif b/cranelift/filetests/filetests/isa/x64/branches.clif index 4b4a587b6b00..ecb880084274 100644 --- a/cranelift/filetests/filetests/isa/x64/branches.clif +++ b/cranelift/filetests/filetests/isa/x64/branches.clif @@ -223,3 +223,63 @@ block2: ; popq %rbp ; ret +function %f6(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp slt v0, v1 + brnz v2, block1 + jump block2 +block1: + v3 = bconst.b1 true + return v3 +block2: + v4 = bconst.b1 false + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpq $0, %rdi +; jl label1; j label2 +; block1: +; movl $1, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; xorl %eax, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret + 
+function %f7(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp slt v0, v1 + brnz v2, block1 + jump block2 +block1: + v3 = bconst.b1 true + return v3 +block2: + v4 = bconst.b1 false + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpl $0, %edi +; jl label1; j label2 +; block1: +; movl $1, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; xorl %eax, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +