
Cranelift AArch64: Various small fixes

* Use FMOV to move 64-bit FP registers and SIMD vectors.
* Add support for additional vector load types.
* Fix the printing of Inst::LoadAddr.

Copyright (c) 2020, Arm Limited.
Author: Anton Kirilov
Branch: pull/2369/head
Commit: edaada3f57
Changed files:
1. cranelift/codegen/src/isa/aarch64/inst/emit.rs (8 changed lines)
2. cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (4 changed lines)
3. cranelift/codegen/src/isa/aarch64/inst/mod.rs (94 changed lines)
4. cranelift/codegen/src/isa/aarch64/lower_inst.rs (56 changed lines)
5. cranelift/filetests/filetests/isa/aarch64/amodes.clif (45 changed lines)

cranelift/codegen/src/isa/aarch64/inst/emit.rs (8 changed lines)

@@ -1239,7 +1239,7 @@ impl MachInstEmit for Inst {
sink.put4(enc_dmb_ish()); // dmb ish
}
&Inst::FpuMove64 { rd, rn } => {
sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
sink.put4(enc_fpurr(0b000_11110_01_1_000000_10000, rd, rn));
}
&Inst::FpuMove128 { rd, rn } => {
sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
@@ -1984,7 +1984,9 @@ impl MachInstEmit for Inst {
if top22 != 0 {
sink.put4(enc_extend(top22, rd, rn));
} else {
Inst::mov32(rd, rn).emit(sink, emit_info, state);
let mov = Inst::Mov32 { rd, rm: rn };
mov.emit(sink, emit_info, state);
}
}
&Inst::Extend {
@@ -2264,7 +2266,7 @@ impl MachInstEmit for Inst {
add.emit(sink, emit_info, state);
} else if offset == 0 {
if reg != rd.to_reg() {
let mov = Inst::mov(rd, reg);
let mov = Inst::Mov64 { rd, rm: reg };
mov.emit(sink, emit_info, state);
}

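A note on the Extend fallback above: when top22 is zero the extend is emitted as a plain 32-bit register move (typically a 32-to-64-bit zero extension), which is sufficient because AArch64 writes to a W register clear the upper 32 bits of the corresponding X register. The hunks also construct Inst::Mov32/Mov64 directly now that the Inst::mov/mov32 helpers are removed in mod.rs below. A minimal sketch of the equivalent semantics in plain Rust (the function name is made up for illustration):

fn zero_extend_32_to_64(x: u32) -> u64 {
    // A 32-bit register move already yields the zero-extended 64-bit value,
    // so no additional masking or shifting is required.
    u64::from(x)
}

fn main() {
    assert_eq!(zero_extend_32_to_64(u32::MAX), 0x0000_0000_FFFF_FFFF);
}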
cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (4 changed lines)

@@ -4219,8 +4219,8 @@ fn test_aarch64_binemit() {
rd: writable_vreg(8),
rn: vreg(4),
},
"881CA40E",
"mov v8.8b, v4.8b",
"8840601E",
"fmov d8, d4",
));
insns.push((

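The FpuMove64 hunks above replace the ORR-based vector move (previously printed as "mov v8.8b, v4.8b") with a scalar FMOV, and the test now expects the bytes 8840601E for "fmov d8, d4". A rough, self-contained sketch of that encoding follows; the helper name and field split are illustrative assumptions, not the actual Cranelift enc_fpurr API:

fn enc_fmov_d(rd: u32, rn: u32) -> u32 {
    // The top 22 bits select "FMOV (register), double precision"; bits 9:5
    // hold the source register and bits 4:0 the destination.
    const TOP22: u32 = 0b000_11110_01_1_000000_10000;
    (TOP22 << 10) | (rn << 5) | rd
}

fn main() {
    // fmov d8, d4 encodes to 0x1E604088, emitted little-endian as 88 40 60 1E,
    // matching the expected string in the test above.
    assert_eq!(enc_fmov_d(8, 4), 0x1E60_4088);
}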
cranelift/codegen/src/isa/aarch64/inst/mod.rs (94 changed lines)

@@ -5,8 +5,9 @@
use crate::binemit::CodeOffset;
use crate::ir::types::{
B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8,
I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64,
B1, B16, B16X4, B16X8, B32, B32X2, B32X4, B64, B64X2, B8, B8X16, B8X8, F32, F32X2, F32X4, F64,
F64X2, FFLAGS, I16, I16X4, I16X8, I32, I32X2, I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, R32,
R64,
};
use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
use crate::isa::CallConv;
@@ -1192,35 +1193,6 @@ fn inst_size_test() {
}
impl Inst {
/// Create a move instruction.
pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
assert!(to_reg.to_reg().get_class() == from_reg.get_class());
if from_reg.get_class() == RegClass::I64 {
Inst::Mov64 {
rd: to_reg,
rm: from_reg,
}
} else if from_reg.get_class() == RegClass::V128 {
Inst::FpuMove128 {
rd: to_reg,
rn: from_reg,
}
} else {
Inst::FpuMove64 {
rd: to_reg,
rn: from_reg,
}
}
}
/// Create a 32-bit move instruction.
pub fn mov32(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
Inst::Mov32 {
rd: to_reg,
rm: from_reg,
}
}
/// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
/// logical immediate, or constant pool).
pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
@@ -2709,8 +2681,31 @@ impl MachInst for Inst {
}
fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
assert!(ty.bits() <= 128);
Inst::mov(to_reg, from_reg)
let bits = ty.bits();
assert!(bits <= 128);
assert!(to_reg.to_reg().get_class() == from_reg.get_class());
if from_reg.get_class() == RegClass::I64 {
Inst::Mov64 {
rd: to_reg,
rm: from_reg,
}
} else if from_reg.get_class() == RegClass::V128 {
if bits > 64 {
Inst::FpuMove128 {
rd: to_reg,
rn: from_reg,
}
} else {
Inst::FpuMove64 {
rd: to_reg,
rn: from_reg,
}
}
} else {
panic!("Unexpected register class: {:?}", from_reg.get_class());
}
}
fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
@@ -2761,9 +2756,9 @@ impl MachInst for Inst {
I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 | R32 | R64 => Ok(RegClass::I64),
F32 | F64 => Ok(RegClass::V128),
IFLAGS | FFLAGS => Ok(RegClass::I64),
B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => {
Ok(RegClass::V128)
}
B8X8 | B8X16 | B16X4 | B16X8 | B32X2 | B32X4 | B64X2 => Ok(RegClass::V128),
F32X2 | I8X8 | I16X4 | I32X2 => Ok(RegClass::V128),
F32X4 | F64X2 | I8X16 | I16X8 | I32X4 | I64X2 => Ok(RegClass::V128),
_ => Err(CodegenError::Unsupported(format!(
"Unexpected SSA-value type: {}",
ty
@@ -3149,9 +3144,9 @@ impl Inst {
format!("dmb ish")
}
&Inst::FpuMove64 { rd, rn } => {
let rd = rd.to_reg().show_rru(mb_rru);
let rn = rn.show_rru(mb_rru);
format!("mov {}.8b, {}.8b", rd, rn)
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64);
format!("fmov {}, {}", rd, rn)
}
&Inst::FpuMove128 { rd, rn } => {
let rd = rd.to_reg().show_rru(mb_rru);
@@ -3800,9 +3795,10 @@ impl Inst {
for inst in mem_insts.into_iter() {
ret.push_str(&inst.show_rru(mb_rru));
}
let (reg, offset) = match mem {
AMode::Unscaled(r, simm9) => (r, simm9.value()),
AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
let (reg, index_reg, offset) = match mem {
AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
AMode::UnsignedOffset(r, uimm12scaled) => (r, None, uimm12scaled.value() as i32),
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
@@ -3816,8 +3812,18 @@ impl Inst {
ALUOp::Add64
};
if offset == 0 {
let mov = Inst::mov(rd, reg);
if let Some((idx, extendop)) = index_reg {
let add = Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd,
rn: reg,
rm: idx,
extendop,
};
ret.push_str(&add.show_rru(mb_rru));
} else if offset == 0 {
let mov = Inst::gen_move(rd, reg, I64);
ret.push_str(&mov.show_rru(mb_rru));
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {

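The LoadAddr pretty-printing fix above adds a RegExtended arm so that a base register plus an extended index with no displacement is rendered as an extended-register add instead of hitting the panic!. Below is a standalone sketch of the same dispatch using simplified stand-in types; these are not the real AMode or register types, and the real code also handles displacements that do not fit in an imm12:

#[derive(Debug)]
enum AMode {
    RegExtended { base: &'static str, index: &'static str, extend: &'static str },
    Unscaled { base: &'static str, offset: i32 },
    UnsignedOffset { base: &'static str, offset: u32 },
}

fn show_load_addr(rd: &str, mem: &AMode) -> String {
    match mem {
        // New case: fold the extended index into an extended-register add.
        AMode::RegExtended { base, index, extend } => {
            format!("add {}, {}, {}, {}", rd, base, index, extend)
        }
        // A zero displacement degenerates to a register-to-register move.
        AMode::Unscaled { base, offset: 0 } | AMode::UnsignedOffset { base, offset: 0 } => {
            format!("mov {}, {}", rd, base)
        }
        // Otherwise materialize the displacement with an immediate add.
        AMode::Unscaled { base, offset } => format!("add {}, {}, #{}", rd, base, offset),
        AMode::UnsignedOffset { base, offset } => format!("add {}, {}, #{}", rd, base, offset),
    }
}

fn main() {
    let mem = AMode::RegExtended { base: "x0", index: "w1", extend: "uxtw" };
    println!("{}", show_load_addr("x2", &mem)); // add x2, x0, w1, uxtw
}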
cranelift/codegen/src/isa/aarch64/lower_inst.rs (56 changed lines)

@@ -1127,7 +1127,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Sload16x4
| Opcode::Uload16x4
| Opcode::Sload32x2
| Opcode::Uload32x2 => {
| Opcode::Uload32x2
| Opcode::Uload8x8Complex
| Opcode::Sload8x8Complex
| Opcode::Uload16x4Complex
| Opcode::Sload16x4Complex
| Opcode::Uload32x2Complex
| Opcode::Sload32x2Complex => {
let off = ctx.data(insn).load_store_offset().unwrap();
let elem_ty = match op {
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
@@ -1142,9 +1148,18 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Sload32Complex
| Opcode::Uload32Complex => I32,
Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
Opcode::Sload8x8
| Opcode::Uload8x8
| Opcode::Sload8x8Complex
| Opcode::Uload8x8Complex => I8X8,
Opcode::Sload16x4
| Opcode::Uload16x4
| Opcode::Sload16x4Complex
| Opcode::Uload16x4Complex => I16X4,
Opcode::Sload32x2
| Opcode::Uload32x2
| Opcode::Sload32x2Complex
| Opcode::Uload32x2Complex => I32X2,
_ => unreachable!(),
};
let sign_extend = match op {
@@ -1180,11 +1195,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
_ => None,
};
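The *Complex widening loads added above reuse the lowering of their simple counterparts: a 64-bit vector load followed by a sign- or zero-extending lengthen (sxtl/uxtl), as the filetests at the end of this diff show. A small sketch of what the 8-bit-to-16-bit lengthen ops compute, written in plain Rust rather than NEON (the function names are illustrative only):

fn sxtl8(lanes: [i8; 8]) -> [i16; 8] {
    // SXTL v.8h, v.8b: sign-extend each byte lane to a 16-bit lane.
    lanes.map(i16::from)
}

fn uxtl8(lanes: [u8; 8]) -> [i16; 8] {
    // UXTL v.8h, v.8b: zero-extend each byte lane to a 16-bit lane.
    lanes.map(|b| b as i16)
}

fn main() {
    assert_eq!(sxtl8([-1, 0, 1, 2, 3, 4, 5, 6])[0], -1);
    assert_eq!(uxtl8([0xFF, 0, 1, 2, 3, 4, 5, 6])[0], 0x00FF);
}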
@@ -1641,11 +1662,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let rd = get_output_reg(ctx, outputs[0]);
let ity = ctx.input_ty(insn, 0);
let oty = ctx.output_ty(insn, 0);
let ity_bits = ty_bits(ity);
let ity_vec_reg = ty_has_float_or_vec_representation(ity);
let oty_bits = ty_bits(oty);
let oty_vec_reg = ty_has_float_or_vec_representation(oty);
debug_assert_eq!(ity_bits, oty_bits);
match (ity_vec_reg, oty_vec_reg) {
(true, true) => {
let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 {
let narrow_mode = if ity_bits <= 32 {
NarrowValueMode::ZeroExtend32
} else {
NarrowValueMode::ZeroExtend64
@@ -1667,11 +1693,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
ctx.emit(Inst::MovFromVec {
rd,
rn,
idx: 0,
size: VectorSize::Size64x2,
size,
});
}
}
@@ -1877,12 +1905,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::GetPinnedReg => {
let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
}
Opcode::SetPinnedReg => {
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
}
Opcode::Spill
@@ -2314,14 +2342,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::Vsplit
| Opcode::Vconcat
| Opcode::Uload8x8Complex
| Opcode::Sload8x8Complex
| Opcode::Uload16x4Complex
| Opcode::Sload16x4Complex
| Opcode::Uload32x2Complex
| Opcode::Sload32x2Complex => {
Opcode::Vsplit | Opcode::Vconcat => {
// TODO
panic!("Vector ops not implemented.");
}
@@ -2569,6 +2590,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
//
// This is a scalar Fcopysign.
// This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
// In the latter case it still sets all bits except the lowest 32 to 0.
//
// mov vd, vn
// ushr vtmp, vm, #63 / #31
@@ -2583,7 +2605,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let tmp = ctx.alloc_tmp(RegClass::V128, F64);
// Copy LHS to rd.
ctx.emit(Inst::FpuMove64 { rd, rn });
ctx.emit(Inst::gen_move(rd, rn, ty));
// Copy the sign bit to the lowest bit in tmp.
let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
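The expanded comment above describes the scalar Fcopysign sequence: copy the magnitude operand, shift the sign operand right so its sign bit lands in bit 0 (ushr), then shift it back up and insert it into the destination (sli). A minimal sketch of the same bit manipulation on a plain u64, standing in for the 64-bit NEON scalar ops:

fn copysign_f64(magnitude: f64, sign: f64) -> f64 {
    let vd = magnitude.to_bits();
    // ushr vtmp, vm, #63: isolate the sign bit of the sign operand in bit 0.
    let vtmp = sign.to_bits() >> 63;
    // sli vd, vtmp, #63: shift the bit back to position 63 and insert it,
    // keeping the lower 63 bits (exponent and mantissa) of the destination.
    f64::from_bits((vd & !(1u64 << 63)) | (vtmp << 63))
}

fn main() {
    assert_eq!(copysign_f64(1.5, -0.0), -1.5);
    assert_eq!(copysign_f64(-2.0, 1.0), 2.0);
}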

cranelift/filetests/filetests/isa/aarch64/amodes.clif (45 changed lines)

@@ -299,3 +299,48 @@ block0(v0: i64):
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f18(i64, i32) -> i16x8 {
block0(v0: i64, v1: i32):
v2 = uextend.i64 v1
v3 = sload8x8_complex v2+v0
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr d0, [x0, w1, UXTW]
; nextln: sxtl v0.8h, v0.8b
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f19(i64, i64) -> i32x4 {
block0(v0: i64, v1: i64):
v2 = uload16x4_complex v0+v1+8
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: add x0, x0, x1
; nextln: ldr d0, [x0, #8]
; nextln: uxtl v0.4s, v0.4h
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f20(i64, i32) -> i64x2 {
block0(v0: i64, v1: i32):
v2 = sextend.i64 v1
v3 = uload32x2_complex v2+v0
return v3
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr d0, [x0, w1, SXTW]
; nextln: uxtl v0.2d, v0.2s
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
