aarch64: Implement I128 Loads and Stores #2985

Merged · 2 commits · Jun 17, 2021
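
For context on the approach: Cranelift's aarch64 backend holds an i128 value in two 64-bit registers, so this PR lowers a 128-bit load to a single LDP (load pair) and a 128-bit store to a single STP (store pair), with the new lower_pair_address helper computing the base register and offset. Below is a minimal, self-contained sketch of the memory layout this relies on for a little-endian target (low 64 bits at the base address, high 64 bits at base + 8); the function names are illustrative, not part of the PR.

use std::convert::TryInto;

/// Models what a single LDP does for an i128 load on a little-endian target:
/// the low half is read from `base`, the high half from `base + 8`.
fn load_i128_le(mem: &[u8], base: usize) -> u128 {
    let lo = u64::from_le_bytes(mem[base..base + 8].try_into().unwrap());
    let hi = u64::from_le_bytes(mem[base + 8..base + 16].try_into().unwrap());
    ((hi as u128) << 64) | (lo as u128)
}

/// The matching STP model for an i128 store.
fn store_i128_le(mem: &mut [u8], base: usize, value: u128) {
    mem[base..base + 8].copy_from_slice(&(value as u64).to_le_bytes());
    mem[base + 8..base + 16].copy_from_slice(&((value >> 64) as u64).to_le_bytes());
}

fn main() {
    let mut mem = [0u8; 32];
    let v = 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00_u128;
    store_i128_le(&mut mem, 8, v);
    assert_eq!(load_i128_le(&mem, 8), v);
}

In the diff below, the two halves correspond to dst.regs()[0] and dst.regs()[1] in the emitted LoadP64/StoreP64 instructions.
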
139 changes: 107 additions & 32 deletions cranelift/codegen/src/isa/aarch64/lower.rs
@@ -692,6 +692,63 @@ fn collect_address_addends<C: LowerCtx<I = Inst>>(
(result64, result32, offset)
}

/// Lower the address of a pair load or store.
pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
ctx: &mut C,
roots: &[InsnInput],
offset: i32,
) -> PairAMode {
// Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
// extends and addition ops. We update these as we consume address
// components, so they represent the remaining addends not yet handled.
let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
let offset = args_offset + (offset as i64);

trace!(
"lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}",
addends64,
addends32,
offset
);

// Pair instructions essentially only have a reg + imm addressing form, so that is all we need to handle.

let base_reg = if let Some(reg64) = addends64.pop() {
reg64
} else if let Some((reg32, extendop)) = addends32.pop() {
let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
let signed = match extendop {
ExtendOp::SXTW => true,
ExtendOp::UXTW => false,
_ => unreachable!(),
};
ctx.emit(Inst::Extend {
rd: tmp,
rn: reg32,
signed,
from_bits: 32,
to_bits: 64,
});
tmp.to_reg()
} else {
zero_reg()
};

let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(addr, base_reg, I64));

// We have the base register; if there are any other addends, add them in.
lower_add_addends(ctx, addr, addends64, addends32);

// Figure out what offset we should emit
let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| {
lower_add_immediate(ctx, addr, addr.to_reg(), offset);
SImm7Scaled::maybe_from_i64(0, I64).unwrap()
});

PairAMode::SignedOffset(addr.to_reg(), imm7)
}

/// Lower the address of a load or store.
pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
ctx: &mut C,
@@ -792,36 +849,23 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
// If there is any offset, load that first into `addr`, and add the `reg`
// that we kicked out of the `AMode`; otherwise, start with that reg.
if offset != 0 {
// If we can fit offset or -offset in an imm12, use an add-imm
// to combine the reg and offset. Otherwise, load value first then add.
if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: addr,
rn: reg,
imm12,
});
} else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub64,
rd: addr,
rn: reg,
imm12,
});
} else {
lower_constant_u64(ctx, addr, offset as u64);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: addr,
rn: addr.to_reg(),
rm: reg,
});
}
lower_add_immediate(ctx, addr, reg, offset)
} else {
ctx.emit(Inst::gen_move(addr, reg, I64));
}

// Now handle reg64 and reg32-extended components.
lower_add_addends(ctx, addr, addends64, addends32);

memarg
}

fn lower_add_addends<C: LowerCtx<I = Inst>>(
ctx: &mut C,
rd: Writable<Reg>,
addends64: AddressAddend64List,
addends32: AddressAddend32List,
) {
for reg in addends64 {
// If the register is the stack reg, we must move it to another reg
// before adding it.
@@ -834,23 +878,51 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
};
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: addr,
rn: addr.to_reg(),
rd,
rn: rd.to_reg(),
rm: reg,
});
}
for (reg, extendop) in addends32 {
assert!(reg != stack_reg());
ctx.emit(Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd: addr,
rn: addr.to_reg(),
rd,
rn: rd.to_reg(),
rm: reg,
extendop,
});
}
}

memarg
/// Adds a signed immediate into `dst`, choosing the best instruction pattern for it.
// TODO: This function is duplicated in ctx.gen_add_imm
fn lower_add_immediate<C: LowerCtx<I = Inst>>(ctx: &mut C, dst: Writable<Reg>, src: Reg, imm: i64) {
// If we can fit `imm` or `-imm` in an imm12, use a single add/sub-immediate.
// Otherwise, lower the constant into a register first, then add.
if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: dst,
rn: src,
imm12,
});
} else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub64,
rd: dst,
rn: src,
imm12,
});
} else {
lower_constant_u64(ctx, dst, imm as u64);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: dst,
rn: dst.to_reg(),
rm: src,
});
}
}

pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
@@ -1248,7 +1320,10 @@ fn load_op_to_ty(op: Opcode) -> Option<Type> {

/// Helper to lower a load instruction; this is used in several places, because
/// a load can sometimes be merged into another operation.
pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>, Type, AMode)>(
pub(crate) fn lower_load<
C: LowerCtx<I = Inst>,
F: FnMut(&mut C, ValueRegs<Writable<Reg>>, Type, AMode),
>(
ctx: &mut C,
ir_inst: IRInst,
inputs: &[InsnInput],
@@ -1261,7 +1336,7 @@ pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>,

let off = ctx.data(ir_inst).load_store_offset().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[..], off);
let rd = get_output_reg(ctx, output).only_reg().unwrap();
let rd = get_output_reg(ctx, output);

f(ctx, rd, elem_ty, mem);
}
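
A note on the offset handling in lower_pair_address above: the LDP/STP signed-offset form encodes a 7-bit signed immediate scaled by the access size, so for 64-bit register pairs only byte offsets that are multiples of 8 in the range -512..=504 can be folded into the instruction; anything else is first added into the base register via lower_add_immediate. A rough, self-contained model of that check follows (illustrative only; the real check is SImm7Scaled::maybe_from_i64 in the backend).

/// Returns the encoded 7-bit field if `offset` fits an LDP/STP signed
/// offset with the given scale (8 for 64-bit register pairs), else None.
fn simm7_scaled(offset: i64, scale: i64) -> Option<i64> {
    if offset % scale == 0 && (-64 * scale..=63 * scale).contains(&offset) {
        Some(offset / scale)
    } else {
        None // lower_pair_address then folds the offset into the base register
    }
}

fn main() {
    assert_eq!(simm7_scaled(504, 8), Some(63));
    assert_eq!(simm7_scaled(-512, 8), Some(-64));
    assert_eq!(simm7_scaled(512, 8), None); // out of range
    assert_eq!(simm7_scaled(12, 8), None);  // not a multiple of the scale
}
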
150 changes: 88 additions & 62 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1180,56 +1180,71 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.memflags(insn)
.expect("Load instruction should have memflags");

lower_load(
ctx,
insn,
&inputs[..],
outputs[0],
|ctx, rd, elem_ty, mem| {
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
(1, _, _) => Inst::ULoad8 { rd, mem, flags },
(8, false, _) => Inst::ULoad8 { rd, mem, flags },
(8, true, _) => Inst::SLoad8 { rd, mem, flags },
(16, false, _) => Inst::ULoad16 { rd, mem, flags },
(16, true, _) => Inst::SLoad16 { rd, mem, flags },
(32, false, false) => Inst::ULoad32 { rd, mem, flags },
(32, true, false) => Inst::SLoad32 { rd, mem, flags },
(32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
(64, _, false) => Inst::ULoad64 { rd, mem, flags },
// Note that we treat some of the vector loads as scalar floating-point loads,
// which is correct in a little endian environment.
(64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
(128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
_ => panic!("Unsupported size in load"),
});

let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
_ => None,
};

if let Some(t) = vec_extend {
ctx.emit(Inst::VecExtend {
t,
rd,
rn: rd.to_reg(),
high_half: false,
let out_ty = ctx.output_ty(insn, 0);
if out_ty == I128 {
let off = ctx.data(insn).load_store_offset().unwrap();
let mem = lower_pair_address(ctx, &inputs[..], off);
let dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::LoadP64 {
rt: dst.regs()[0],
rt2: dst.regs()[1],
mem,
flags,
});
} else {
lower_load(
ctx,
insn,
&inputs[..],
outputs[0],
|ctx, dst, elem_ty, mem| {
let rd = dst.only_reg().unwrap();
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
(1, _, _) => Inst::ULoad8 { rd, mem, flags },
(8, false, _) => Inst::ULoad8 { rd, mem, flags },
(8, true, _) => Inst::SLoad8 { rd, mem, flags },
(16, false, _) => Inst::ULoad16 { rd, mem, flags },
(16, true, _) => Inst::SLoad16 { rd, mem, flags },
(32, false, false) => Inst::ULoad32 { rd, mem, flags },
(32, true, false) => Inst::SLoad32 { rd, mem, flags },
(32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
(64, _, false) => Inst::ULoad64 { rd, mem, flags },
// Note that we treat some of the vector loads as scalar floating-point loads,
// which is correct in a little endian environment.
(64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
(128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
_ => panic!("Unsupported size in load"),
});
}
},
);

let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
_ => None,
};

if let Some(t) = vec_extend {
let rd = dst.only_reg().unwrap();
ctx.emit(Inst::VecExtend {
t,
rd,
rn: rd.to_reg(),
high_half: false,
});
}
},
);
}
}

Opcode::Store
@@ -1253,19 +1268,30 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.memflags(insn)
.expect("Store instruction should have memflags");

let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

ctx.emit(match (ty_bits(elem_ty), is_float) {
(1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
(16, _) => Inst::Store16 { rd, mem, flags },
(32, false) => Inst::Store32 { rd, mem, flags },
(32, true) => Inst::FpuStore32 { rd, mem, flags },
(64, false) => Inst::Store64 { rd, mem, flags },
(64, true) => Inst::FpuStore64 { rd, mem, flags },
(128, _) => Inst::FpuStore128 { rd, mem, flags },
_ => panic!("Unsupported size in store"),
});
let dst = put_input_in_regs(ctx, inputs[0]);

if elem_ty == I128 {
let mem = lower_pair_address(ctx, &inputs[1..], off);
ctx.emit(Inst::StoreP64 {
rt: dst.regs()[0],
rt2: dst.regs()[1],
mem,
flags,
});
} else {
let rd = dst.only_reg().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
ctx.emit(match (ty_bits(elem_ty), is_float) {
(1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
(16, _) => Inst::Store16 { rd, mem, flags },
(32, false) => Inst::Store32 { rd, mem, flags },
(32, true) => Inst::FpuStore32 { rd, mem, flags },
(64, false) => Inst::Store64 { rd, mem, flags },
(64, true) => Inst::FpuStore64 { rd, mem, flags },
(128, _) => Inst::FpuStore128 { rd, mem, flags },
_ => panic!("Unsupported size in store"),
});
}
}

Opcode::StackAddr => {
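
Finally, a note on the factored-out lower_add_immediate helper used by both address lowerings: AArch64 add/sub immediates are 12 bits wide, so the helper first tries an add-immediate, then a sub-immediate of the negated value, and otherwise materializes the constant into a register and adds it with a register-register add. The sketch below classifies an offset the same way, modelling only the plain 12-bit form (the backend's Imm12 also accepts a 12-bit value shifted left by 12); names and types here are illustrative.

#[derive(Debug, PartialEq)]
enum AddImmLowering {
    AddImm12(u64),      // add rd, rn, #imm
    SubImm12(u64),      // sub rd, rn, #imm (for negative offsets)
    MaterializeThenAdd, // load the constant into a register, then add rd, rd, rn
}

fn classify_add_imm(offset: i64) -> AddImmLowering {
    let fits = |v: u64| v < (1 << 12);
    if fits(offset as u64) {
        AddImmLowering::AddImm12(offset as u64)
    } else if fits(offset.wrapping_neg() as u64) {
        AddImmLowering::SubImm12(offset.wrapping_neg() as u64)
    } else {
        AddImmLowering::MaterializeThenAdd
    }
}

fn main() {
    assert_eq!(classify_add_imm(24), AddImmLowering::AddImm12(24));
    assert_eq!(classify_add_imm(-16), AddImmLowering::SubImm12(16));
    assert_eq!(classify_add_imm(0x12345), AddImmLowering::MaterializeThenAdd);
}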