riscv64: Improve base select lowering #8703

Merged 1 commit on May 31, 2024
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst/args.rs
@@ -310,6 +310,10 @@ impl IntegerCompare {
..self
}
}

pub(crate) fn regs(&self) -> [Reg; 2] {
[self.rs1, self.rs2]
}
}

#[derive(Debug, Clone, Copy, PartialEq)]
69 changes: 56 additions & 13 deletions cranelift/codegen/src/isa/riscv64/inst/emit.rs
@@ -1520,25 +1520,68 @@ impl Inst {
ref x,
ref y,
} => {
let label_true = sink.get_label();
let label_false = sink.get_label();
// The general form for this select is the following:
//
// mv rd, x
// b{cond} rcond, label_end
// mv rd, y
// label_end:
// ... etc
//
// This is built on the assumption that moves are cheap, but branches and jumps
// are not. So with this format we always avoid one jump instruction at the expense
// of an unconditional move.
//
// We also perform another optimization here. If the destination register is the same
// as one of the input registers, we can avoid emitting the first unconditional move
// and emit just the branch and the second move.
//
// To make sure that this happens as often as possible, we also try to invert the
// condition, so that if either of the input registers is the same as the destination,
// we avoid that move.

let label_end = sink.get_label();

let xregs = x.regs();
let yregs = y.regs();
let dstregs: Vec<Reg> = dst.regs().into_iter().map(|r| r.to_reg()).collect();
let condregs = condition.regs();

// We are going to write to the destination register before evaluating
// the condition, so we need to make sure that the destination register
// is not one of the condition registers.
//
// This should never happen, since hopefully the regalloc constraints
// for this register are set up correctly.
debug_assert_ne!(dstregs, condregs);

// Check if we can invert the condition and avoid moving the y registers into
// the destination. This allows us to only emit the branch and one of the moves.
let (uncond_move, cond_move, condition) = if yregs == dstregs {
(yregs, xregs, condition.inverse())
} else {
(xregs, yregs, condition)
};

// Unconditionally move one of the values to the destination register.
//
// These moves may not end up being emitted if the source and
// destination registers are the same. That logic is built into
// the emit function for `Inst::Mov`.
for i in gen_moves(dst.regs(), uncond_move) {
i.emit(sink, emit_info, state);
}

// If the condition passes, we skip over the conditional move.
Inst::CondBr {
taken: CondBrTarget::Label(label_true),
not_taken: CondBrTarget::Label(label_false),
taken: CondBrTarget::Label(label_end),
not_taken: CondBrTarget::Fallthrough,
kind: condition,
}
.emit(sink, emit_info, state);
sink.bind_label(label_true, &mut state.ctrl_plane);
// here is the true
// select the first value
for i in gen_moves(dst.regs(), x.regs()) {
i.emit(sink, emit_info, state);
}
Inst::gen_jump(label_end).emit(sink, emit_info, state);

sink.bind_label(label_false, &mut state.ctrl_plane);
for i in gen_moves(dst.regs(), y.regs()) {
// Move the conditional value to the destination register.
for i in gen_moves(dst.regs(), cond_move) {
i.emit(sink, emit_info, state);
}

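The operand and condition selection described in the comments above can be illustrated on its own. The sketch below is a hypothetical stand-in, not Cranelift code: `Cond` and `pick_operands` are made-up names and registers are plain `u8`s, but the decision mirrors the new lowering, where a destination that already holds `y` causes the condition to be inverted so the unconditional move becomes a same-register move that can be elided.

```rust
// A minimal, self-contained sketch (hypothetical, not Cranelift code) of the
// operand selection performed by the new select lowering.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Cond {
    Eq,
    Ne,
}

impl Cond {
    // Mirrors `IntegerCompare::inverse` for the two conditions modeled here.
    fn inverse(self) -> Cond {
        match self {
            Cond::Eq => Cond::Ne,
            Cond::Ne => Cond::Eq,
        }
    }
}

// Returns (uncond_src, cond_src, cond): the value moved before the branch, the
// value moved when the branch is not taken, and the branch condition to use.
// If the destination already holds `y`, invert the condition so the
// unconditional move targets `y` and becomes an elidable same-register move.
fn pick_operands(dst: u8, x: u8, y: u8, cond: Cond) -> (u8, u8, Cond) {
    if y == dst {
        (y, x, cond.inverse())
    } else {
        (x, y, cond)
    }
}

fn main() {
    // dst aliases y: move y first (a no-op), branch on the inverted condition.
    assert_eq!(pick_operands(5, 3, 5, Cond::Eq), (5, 3, Cond::Ne));
    // Otherwise: move x unconditionally, then conditionally overwrite with y.
    assert_eq!(pick_operands(7, 3, 5, Cond::Eq), (3, 5, Cond::Eq));
}
```

Either way only one branch and no unconditional jump are emitted, and at most one real move executes when an input already lives in the destination register.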
11 changes: 9 additions & 2 deletions cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -459,14 +459,19 @@ fn riscv64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
y,
..
} => {
collector.reg_use(rs1);
collector.reg_use(rs2);
// Mark the condition registers as late use so that they don't overlap with the destination
// register. We may potentially write to the destination register before evaluating the
// condition.
collector.reg_late_use(rs1);
collector.reg_late_use(rs2);

for reg in x.regs_mut() {
collector.reg_use(reg);
}
for reg in y.regs_mut() {
collector.reg_use(reg);
}

// If there's more than one destination register then use
// `reg_early_def` to prevent destination registers from overlapping
// with any operands. This ensures that the lowering doesn't have to
@@ -476,6 +481,8 @@ fn riscv64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
// When there's only one destination register though don't use an
// early def because once the register is written no other inputs
// are read so it's ok for the destination to overlap the sources.
// The condition registers are already marked as late use so they
// won't overlap with the destination.
match dst.regs_mut() {
[reg] => collector.reg_def(reg),
regs => {
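The late-use constraint above can also be modeled in isolation. The sketch below is a toy, with hypothetical `Pos`, `Operand`, and `may_share` names rather than the regalloc2 operand API; it only encodes the rule that a use and a def of the same instruction may share a register when the def is written after the use is read, which is exactly the sharing that marking the condition registers as late uses rules out.

```rust
// A toy model (hypothetical, not the regalloc2 API) of operand positions.
#[derive(Clone, Copy, PartialEq)]
enum Pos {
    Early, // read or written before the instruction takes effect
    Late,  // read or written after the instruction takes effect
}

#[derive(Clone, Copy)]
struct Operand {
    is_def: bool,
    pos: Pos,
}

// A use and a def of the same instruction may share a physical register only
// if the def is written strictly after the use is last read.
fn may_share(a: Operand, b: Operand) -> bool {
    match (a.is_def, b.is_def) {
        (false, true) => a.pos == Pos::Early && b.pos == Pos::Late,
        (true, false) => b.pos == Pos::Early && a.pos == Pos::Late,
        _ => false,
    }
}

fn main() {
    let plain_use = Operand { is_def: false, pos: Pos::Early };
    let late_use = Operand { is_def: false, pos: Pos::Late };
    let def = Operand { is_def: true, pos: Pos::Late };

    // An ordinary (early) use may be allocated the same register as the
    // destination, which is what allows `x` or `y` to alias `dst` above...
    assert!(may_share(plain_use, def));
    // ...but a late use (the condition registers) may not, because this
    // select writes the destination before the condition is evaluated.
    assert!(!may_share(late_use, def));
}
```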
66 changes: 24 additions & 42 deletions cranelift/filetests/filetests/isa/riscv64/bitops.clif
@@ -154,9 +154,8 @@ block0(v0: i128):
; addi a2, a2, -1
; srli a4, a4, 1
; j -0x18
; bnez a1, 0xc
; mv a0, a3
; j 8
; beqz a1, 8
; mv a0, zero
; add a0, a5, a0
; mv a1, zero
@@ -183,9 +182,8 @@ block0(v0: i8):
; slli a2, a0, 0x38
; srai a4, a2, 0x38
; not a0, a4
; bgez a4, 0xc
; mv a2, a0
; j 8
; bltz a4, 8
; mv a2, a4
; mv a0, zero
; addi a5, zero, 0x40
@@ -222,9 +220,8 @@ block0(v0: i16):
; slli a2, a0, 0x30
; srai a4, a2, 0x30
; not a0, a4
; bgez a4, 0xc
; mv a2, a0
; j 8
; bltz a4, 8
; mv a2, a4
; mv a0, zero
; addi a5, zero, 0x40
@@ -259,9 +256,8 @@ block0(v0: i32):
; block0: ; offset 0x0
; sext.w a2, a0
; not a4, a2
; bgez a2, 0xc
; mv a0, a4
; j 8
; bltz a2, 8
; mv a0, a2
; mv a4, zero
; addi a3, zero, 0x40
@@ -294,9 +290,8 @@ block0(v0: i64):
; Disassembled:
; block0: ; offset 0x0
; not a2, a0
; bgez a0, 0xc
; mv a4, a2
; j 8
; bltz a0, 8
; mv a4, a0
; mv a2, zero
; addi a1, zero, 0x40
@@ -335,14 +330,12 @@ block0(v0: i128):
; Disassembled:
; block0: ; offset 0x0
; not a3, a0
; bgez a1, 0xc
; mv a5, a3
; j 8
; bltz a1, 8
; mv a5, a0
; not a2, a1
; bgez a1, 0xc
; mv a3, a2
; j 8
; bltz a1, 8
; mv a3, a1
; mv a1, zero
; addi a0, zero, 0x40
@@ -366,9 +359,8 @@ block0(v0: i128):
; addi a4, a4, -1
; srli a2, a2, 1
; j -0x18
; bnez a3, 0xc
; mv a2, a0
; j 8
; beqz a3, 8
; mv a2, zero
; add a3, a1, a2
; addi a0, a3, -1
@@ -1444,10 +1436,10 @@ block0(v0: i128, v1: i8):
; srl a0,a0,a3
; select a3,zero,a0##condition=(a5 eq zero)
; sll a5,a1,a5
; or a5,a3,a5
; or t0,a3,a5
; li a3,64
; andi a2,a2,127
; select [a0,a1],[zero,a4],[a4,a5]##condition=(a2 uge a3)
; andi a5,a2,127
; select [a0,a1],[zero,a4],[a4,t0]##condition=(a5 uge a3)
; ret
;
; Disassembled:
@@ -1457,20 +1449,18 @@ block0(v0: i128, v1: i8):
; sub a3, a3, a5
; sll a4, a0, a5
; srl a0, a0, a3
; bnez a5, 0xc
; mv a3, zero
; j 8
; beqz a5, 8
; mv a3, a0
; sll a5, a1, a5
; or a5, a3, a5
; or t0, a3, a5
; addi a3, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a3, 0x10
; andi a5, a2, 0x7f
; mv a0, zero
; mv a1, a4
; j 0xc
; bgeu a5, a3, 0xc
; mv a0, a4
; mv a1, a5
; mv a1, t0
; ret

function %ishl_i128_i128(i128, i128) -> i128 {
@@ -1511,10 +1501,9 @@ block0(v0: i128, v1: i128):
; or a4, a3, a0
; addi a3, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a3, 0x10
; mv a0, zero
; mv a1, a5
; j 0xc
; bgeu a2, a3, 0xc
; mv a0, a5
; mv a1, a4
; ret
@@ -1546,19 +1535,17 @@ block0(v0: i128, v1: i8):
; addi a3, zero, 0x40
; sub a3, a3, a4
; sll a5, a1, a3
; bnez a4, 0xc
; mv a3, zero
; j 8
; beqz a4, 8
; mv a3, a5
; srl a5, a0, a4
; or a5, a3, a5
; addi t0, zero, 0x40
; srl a3, a1, a4
; andi a4, a2, 0x7f
; bltu a4, t0, 0x10
; mv a0, a3
; mv a1, zero
; j 0xc
; bgeu a4, t0, 0xc
; mv a0, a5
; mv a1, a3
; ret
@@ -1615,10 +1602,9 @@ block0(v0: i128, v1: i128):
; addi a3, zero, 0x40
; srl a4, a1, a5
; andi a5, a2, 0x7f
; bltu a5, a3, 0x10
; mv a0, a4
; mv a1, zero
; j 0xc
; bgeu a5, a3, 0xc
; mv a0, s11
; mv a1, a4
; ld s11, 8(sp)
@@ -1658,25 +1644,22 @@ block0(v0: i128, v1: i8):
; addi a3, zero, 0x40
; sub a3, a3, a4
; sll a5, a1, a3
; bnez a4, 0xc
; mv a3, zero
; j 8
; beqz a4, 8
; mv a3, a5
; srl a5, a0, a4
; or a5, a3, a5
; addi a0, zero, 0x40
; sra a3, a1, a4
; addi a4, zero, -1
; bgez a1, 0xc
; mv t4, a4
; j 8
; bltz a1, 8
; mv t4, zero
; addi a4, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a4, 0x10
; mv a0, a3
; mv a1, t4
; j 0xc
; bgeu a2, a4, 0xc
; mv a0, a5
; mv a1, a3
; ret
@@ -1740,10 +1723,9 @@ block0(v0: i128, v1: i128):
; mv a5, zero
; addi a4, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a4, 0x10
; mv a0, a3
; mv a1, a5
; j 0xc
; bgeu a2, a4, 0xc
; mv a0, s11
; mv a1, a3
; ld s11, 8(sp)