riscv64: Improve base select lowering #8703

Merged 1 commit on May 31, 2024
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst/args.rs
@@ -310,6 +310,10 @@ impl IntegerCompare {
..self
}
}

pub(crate) fn regs(&self) -> [Reg; 2] {
[self.rs1, self.rs2]
}
}

#[derive(Debug, Clone, Copy, PartialEq)]
69 changes: 56 additions & 13 deletions cranelift/codegen/src/isa/riscv64/inst/emit.rs
@@ -1520,25 +1520,68 @@ impl Inst {
ref x,
ref y,
} => {
let label_true = sink.get_label();
let label_false = sink.get_label();
// The general form for this select is the following:
//
// mv rd, x
// b{cond} rcond, label_end
// mv rd, y
// label_end:
// ... etc
//
// This is built on the assumption that moves are cheap, but branches and jumps
// are not. So with this format we always avoid one jump instruction at the expense
// of an unconditional move.
//
// We also perform another optimization here. If the destination register is the same
// as one of the input registers, we can avoid emitting the first unconditional move
// and emit just the branch and the second move.
//
// To make sure that this happens as often as possible, we also try to invert the
// condition, so that if either of the input registers is the same as the destination,
// we avoid that move.

let label_end = sink.get_label();

let xregs = x.regs();
let yregs = y.regs();
let dstregs: Vec<Reg> = dst.regs().into_iter().map(|r| r.to_reg()).collect();
let condregs = condition.regs();

// We are going to write to the destination register before evaluating
// the condition, so we need to make sure that the destination register
// is not one of the condition registers.
//
// This should never happen, since hopefully the regalloc constraints
// for this register are set up correctly.
debug_assert_ne!(dstregs, condregs);

// Check if we can invert the condition and avoid moving the y registers into
// the destination. This allows us to only emit the branch and one of the moves.
let (uncond_move, cond_move, condition) = if yregs == dstregs {
(yregs, xregs, condition.inverse())
} else {
(xregs, yregs, condition)
};

// Unconditionally move one of the values to the destination register.
//
// These moves may not end up being emitted if the source and
// destination registers are the same. That logic is built into
// the emit function for `Inst::Mov`.
for i in gen_moves(dst.regs(), uncond_move) {
i.emit(sink, emit_info, state);
}

// If the condition passes, we skip over the conditional move.
Inst::CondBr {
taken: CondBrTarget::Label(label_true),
not_taken: CondBrTarget::Label(label_false),
taken: CondBrTarget::Label(label_end),
not_taken: CondBrTarget::Fallthrough,
kind: condition,
}
.emit(sink, emit_info, state);
sink.bind_label(label_true, &mut state.ctrl_plane);
// here is the true
// select the first value
for i in gen_moves(dst.regs(), x.regs()) {
i.emit(sink, emit_info, state);
}
Inst::gen_jump(label_end).emit(sink, emit_info, state);

sink.bind_label(label_false, &mut state.ctrl_plane);
for i in gen_moves(dst.regs(), y.regs()) {
// Move the conditional value to the destination register.
for i in gen_moves(dst.regs(), cond_move) {
i.emit(sink, emit_info, state);
}

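The operand and condition selection described in the comments above can be illustrated on its own. The sketch below is a hypothetical stand-in, not Cranelift code: `Cond` and `pick_operands` are made-up names and registers are plain `u8`s, but the decision mirrors the new lowering, where a destination that already holds `y` causes the condition to be inverted so the unconditional move becomes a same-register move that can be elided.

```rust
// A minimal, self-contained sketch (hypothetical, not Cranelift code) of the
// operand selection performed by the new select lowering.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Cond {
    Eq,
    Ne,
}

impl Cond {
    // Mirrors `IntegerCompare::inverse` for the two conditions modeled here.
    fn inverse(self) -> Cond {
        match self {
            Cond::Eq => Cond::Ne,
            Cond::Ne => Cond::Eq,
        }
    }
}

// Returns (uncond_src, cond_src, cond): the value moved before the branch, the
// value moved when the branch is not taken, and the branch condition to use.
// If the destination already holds `y`, invert the condition so the
// unconditional move targets `y` and becomes an elidable same-register move.
fn pick_operands(dst: u8, x: u8, y: u8, cond: Cond) -> (u8, u8, Cond) {
    if y == dst {
        (y, x, cond.inverse())
    } else {
        (x, y, cond)
    }
}

fn main() {
    // dst aliases y: move y first (a no-op), branch on the inverted condition.
    assert_eq!(pick_operands(5, 3, 5, Cond::Eq), (5, 3, Cond::Ne));
    // Otherwise: move x unconditionally, then conditionally overwrite with y.
    assert_eq!(pick_operands(7, 3, 5, Cond::Eq), (3, 5, Cond::Eq));
}
```

Either way only one branch and no unconditional jump are emitted, and at most one real move executes when an input already lives in the destination register.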
11 changes: 9 additions & 2 deletions cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -459,14 +459,19 @@ fn riscv64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
y,
..
} => {
collector.reg_use(rs1);
collector.reg_use(rs2);
// Mark the condition registers as late use so that they don't overlap with the destination
// register. We may potentially write to the destination register before evaluating the
// condition.
collector.reg_late_use(rs1);
collector.reg_late_use(rs2);

for reg in x.regs_mut() {
collector.reg_use(reg);
}
for reg in y.regs_mut() {
collector.reg_use(reg);
}

// If there's more than one destination register then use
// `reg_early_def` to prevent destination registers from overlapping
// with any operands. This ensures that the lowering doesn't have to
@@ -476,6 +481,8 @@ fn riscv64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
// When there's only one destination register though don't use an
// early def because once the register is written no other inputs
// are read so it's ok for the destination to overlap the sources.
// The condition registers are already marked as late use so they
// won't overlap with the destination.
match dst.regs_mut() {
[reg] => collector.reg_def(reg),
regs => {
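The late-use constraint above can also be modeled in isolation. The sketch below is a toy, with hypothetical `Pos`, `Operand`, and `may_share` names rather than the regalloc2 operand API; it only encodes the rule that a use and a def of the same instruction may share a register when the def is written after the use is read, which is exactly the sharing that marking the condition registers as late uses rules out.

```rust
// A toy model (hypothetical, not the regalloc2 API) of operand positions.
#[derive(Clone, Copy, PartialEq)]
enum Pos {
    Early, // read or written before the instruction takes effect
    Late,  // read or written after the instruction takes effect
}

#[derive(Clone, Copy)]
struct Operand {
    is_def: bool,
    pos: Pos,
}

// A use and a def of the same instruction may share a physical register only
// if the def is written strictly after the use is last read.
fn may_share(a: Operand, b: Operand) -> bool {
    match (a.is_def, b.is_def) {
        (false, true) => a.pos == Pos::Early && b.pos == Pos::Late,
        (true, false) => b.pos == Pos::Early && a.pos == Pos::Late,
        _ => false,
    }
}

fn main() {
    let plain_use = Operand { is_def: false, pos: Pos::Early };
    let late_use = Operand { is_def: false, pos: Pos::Late };
    let def = Operand { is_def: true, pos: Pos::Late };

    // An ordinary (early) use may be allocated the same register as the
    // destination, which is what allows `x` or `y` to alias `dst` above...
    assert!(may_share(plain_use, def));
    // ...but a late use (the condition registers) may not, because this
    // select writes the destination before the condition is evaluated.
    assert!(!may_share(late_use, def));
}
```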
66 changes: 24 additions & 42 deletions cranelift/filetests/filetests/isa/riscv64/bitops.clif
@@ -154,9 +154,8 @@ block0(v0: i128):
; addi a2, a2, -1
; srli a4, a4, 1
; j -0x18
; bnez a1, 0xc
; mv a0, a3
; j 8
; beqz a1, 8
; mv a0, zero
; add a0, a5, a0
; mv a1, zero
@@ -183,9 +182,8 @@ block0(v0: i8):
; slli a2, a0, 0x38
; srai a4, a2, 0x38
; not a0, a4
; bgez a4, 0xc
; mv a2, a0
; j 8
; bltz a4, 8
; mv a2, a4
; mv a0, zero
; addi a5, zero, 0x40
@@ -222,9 +220,8 @@ block0(v0: i16):
; slli a2, a0, 0x30
; srai a4, a2, 0x30
; not a0, a4
; bgez a4, 0xc
; mv a2, a0
; j 8
; bltz a4, 8
; mv a2, a4
; mv a0, zero
; addi a5, zero, 0x40
@@ -259,9 +256,8 @@ block0(v0: i32):
; block0: ; offset 0x0
; sext.w a2, a0
; not a4, a2
; bgez a2, 0xc
; mv a0, a4
; j 8
; bltz a2, 8
; mv a0, a2
; mv a4, zero
; addi a3, zero, 0x40
@@ -294,9 +290,8 @@ block0(v0: i64):
; Disassembled:
; block0: ; offset 0x0
; not a2, a0
; bgez a0, 0xc
; mv a4, a2
; j 8
; bltz a0, 8
; mv a4, a0
; mv a2, zero
; addi a1, zero, 0x40
@@ -335,14 +330,12 @@ block0(v0: i128):
; Disassembled:
; block0: ; offset 0x0
; not a3, a0
; bgez a1, 0xc
; mv a5, a3
; j 8
; bltz a1, 8
; mv a5, a0
; not a2, a1
; bgez a1, 0xc
; mv a3, a2
; j 8
; bltz a1, 8
; mv a3, a1
; mv a1, zero
; addi a0, zero, 0x40
@@ -366,9 +359,8 @@ block0(v0: i128):
; addi a4, a4, -1
; srli a2, a2, 1
; j -0x18
; bnez a3, 0xc
; mv a2, a0
; j 8
; beqz a3, 8
; mv a2, zero
; add a3, a1, a2
; addi a0, a3, -1
@@ -1444,10 +1436,10 @@ block0(v0: i128, v1: i8):
; srl a0,a0,a3
; select a3,zero,a0##condition=(a5 eq zero)
; sll a5,a1,a5
; or a5,a3,a5
; or t0,a3,a5
; li a3,64
; andi a2,a2,127
; select [a0,a1],[zero,a4],[a4,a5]##condition=(a2 uge a3)
; andi a5,a2,127
; select [a0,a1],[zero,a4],[a4,t0]##condition=(a5 uge a3)
; ret
;
; Disassembled:
@@ -1457,20 +1449,18 @@ block0(v0: i128, v1: i8):
; sub a3, a3, a5
; sll a4, a0, a5
; srl a0, a0, a3
; bnez a5, 0xc
; mv a3, zero
; j 8
; beqz a5, 8
; mv a3, a0
; sll a5, a1, a5
; or a5, a3, a5
; or t0, a3, a5
; addi a3, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a3, 0x10
; andi a5, a2, 0x7f
; mv a0, zero
; mv a1, a4
; j 0xc
; bgeu a5, a3, 0xc
; mv a0, a4
; mv a1, a5
; mv a1, t0
; ret

function %ishl_i128_i128(i128, i128) -> i128 {
@@ -1511,10 +1501,9 @@ block0(v0: i128, v1: i128):
; or a4, a3, a0
; addi a3, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a3, 0x10
; mv a0, zero
; mv a1, a5
; j 0xc
; bgeu a2, a3, 0xc
; mv a0, a5
; mv a1, a4
; ret
@@ -1546,19 +1535,17 @@ block0(v0: i128, v1: i8):
; addi a3, zero, 0x40
; sub a3, a3, a4
; sll a5, a1, a3
; bnez a4, 0xc
; mv a3, zero
; j 8
; beqz a4, 8
; mv a3, a5
; srl a5, a0, a4
; or a5, a3, a5
; addi t0, zero, 0x40
; srl a3, a1, a4
; andi a4, a2, 0x7f
; bltu a4, t0, 0x10
; mv a0, a3
; mv a1, zero
; j 0xc
; bgeu a4, t0, 0xc
; mv a0, a5
; mv a1, a3
; ret
@@ -1615,10 +1602,9 @@ block0(v0: i128, v1: i128):
; addi a3, zero, 0x40
; srl a4, a1, a5
; andi a5, a2, 0x7f
; bltu a5, a3, 0x10
; mv a0, a4
; mv a1, zero
; j 0xc
; bgeu a5, a3, 0xc
; mv a0, s11
; mv a1, a4
; ld s11, 8(sp)
@@ -1658,25 +1644,22 @@ block0(v0: i128, v1: i8):
; addi a3, zero, 0x40
; sub a3, a3, a4
; sll a5, a1, a3
; bnez a4, 0xc
; mv a3, zero
; j 8
; beqz a4, 8
; mv a3, a5
; srl a5, a0, a4
; or a5, a3, a5
; addi a0, zero, 0x40
; sra a3, a1, a4
; addi a4, zero, -1
; bgez a1, 0xc
; mv t4, a4
; j 8
; bltz a1, 8
; mv t4, zero
; addi a4, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a4, 0x10
; mv a0, a3
; mv a1, t4
; j 0xc
; bgeu a2, a4, 0xc
; mv a0, a5
; mv a1, a3
; ret
@@ -1740,10 +1723,9 @@ block0(v0: i128, v1: i128):
; mv a5, zero
; addi a4, zero, 0x40
; andi a2, a2, 0x7f
; bltu a2, a4, 0x10
; mv a0, a3
; mv a1, a5
; j 0xc
; bgeu a2, a4, 0xc
; mv a0, s11
; mv a1, a3
; ld s11, 8(sp)