From ed7dfd3925eedb9c25f47617391fd25ccb93f8a5 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Tue, 9 Aug 2022 09:45:53 -0700 Subject: [PATCH] x64: Peephole optimization for `x < 0` (#4625) https://github.com/bytecodealliance/wasmtime/pull/4625 Fixes #4607 --- cranelift/codegen/src/isa/x64/lower.isle | 32 +++++ cranelift/filetests/filetests/isa/x64/b1.clif | 132 ++++++++++++++++++ .../filetests/filetests/isa/x64/branches.clif | 60 ++++++++ 3 files changed, 224 insertions(+) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index a11fa45dd379..5188dd322af4 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1501,6 +1501,38 @@ (rule (lower (icmp cc a @ (value_type $I128) b)) (lower_icmp_bool (emit_cmp cc a b))) +;; Peephole optimization for `x < 0`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 x (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `0 > x`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) + (x64_shr $I64 x (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) + (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `x < 0`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 x (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `0 > x`, when x is a 
signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) + (x64_shr $I32 x (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) + (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) + ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than ;; one. To note: what is different here about the output values is that each ;; lane will be filled with all 1s or all 0s according to the comparison, diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif index eb971b36fa5e..a67242437054 100644 --- a/cranelift/filetests/filetests/isa/x64/b1.clif +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -73,3 +73,135 @@ block2: ; popq %rbp ; ret +function %test_x_slt_0_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp slt v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_slt_0_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp slt v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sgt_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sgt v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sgt_x_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = 
icmp sgt v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sle_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sle v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sle_x_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sle v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_sge_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sge v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_sge_x_i32f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sge v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; notq %rdi, %rdi +; shrl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/branches.clif b/cranelift/filetests/filetests/isa/x64/branches.clif index 4b4a587b6b00..ecb880084274 100644 --- a/cranelift/filetests/filetests/isa/x64/branches.clif +++ b/cranelift/filetests/filetests/isa/x64/branches.clif @@ -223,3 +223,63 @@ block2: ; popq %rbp ; ret +function %f6(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp slt v0, v1 + brnz v2, block1 + jump block2 +block1: + v3 = bconst.b1 true + return v3 +block2: + v4 = bconst.b1 false + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpq $0, %rdi +; jl label1; j label2 +; block1: +; movl $1, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; xorl %eax, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret + 
+function %f7(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp slt v0, v1 + brnz v2, block1 + jump block2 +block1: + v3 = bconst.b1 true + return v3 +block2: + v4 = bconst.b1 false + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpl $0, %edi +; jl label1; j label2 +; block1: +; movl $1, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; xorl %eax, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +