
Merge pull request #3679 from FreddieLiardet/fp_const_fmov

Improve code generation for floating-point constants
Chris Fallin, 3 years ago (committed by GitHub); commit 4a331b8981
Changed files:
  1. cranelift/codegen/src/isa/aarch64/inst.isle (6 lines changed)
  2. cranelift/codegen/src/isa/aarch64/inst/emit.rs (13 lines changed)
  3. cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (19 lines changed)
  4. cranelift/codegen/src/isa/aarch64/inst/mod.rs (52 lines changed)
  5. cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest (2 lines changed)
  6. cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs (383 lines changed)
  7. cranelift/codegen/src/isa/aarch64/lower_inst.rs (15 lines changed)
  8. cranelift/filetests/filetests/isa/aarch64/constants.clif (121 lines changed)
  9. cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif (84 lines changed)
  10. cranelift/filetests/filetests/isa/aarch64/floating-point.clif (104 lines changed)

cranelift/codegen/src/isa/aarch64/inst.isle (6 lines changed)

@@ -454,6 +454,12 @@
(rn Reg)
(size ScalarSize))
;; Loads a floating-point immediate.
(FpuMoveFPImm
(rd WritableReg)
(imm ASIMDFPModImm)
(size ScalarSize))
;; Move to a vector element from a GPR.
(MovToVec
(rd WritableReg)

cranelift/codegen/src/isa/aarch64/inst/emit.rs (13 lines changed)

@@ -1983,6 +1983,19 @@ impl MachInstEmit for Inst {
};
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
}
&Inst::FpuMoveFPImm { rd, imm, size } => {
let size_code = match size {
ScalarSize::Size32 => 0b00,
ScalarSize::Size64 => 0b01,
_ => unimplemented!(),
};
sink.put4(
0b000_11110_00_1_00_000_000100_00000_00000
| size_code << 22
| ((imm.enc_bits() as u32) << 13)
| machreg_to_vec(rd.to_reg()),
);
}
&Inst::MovToVec { rd, rn, idx, size } => {
let (imm5, shift) = match size.lane_size() {
ScalarSize::Size8 => (0b00001, 1),
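
As a quick cross-check of the new FpuMoveFPImm arm, the same bit packing can be reproduced standalone; this is just the template above filled in for the "fmov d31, #1" emit test added below (illustrative only, not Cranelift code).

    fn main() {
        let template: u32 = 0b000_11110_00_1_00_000_000100_00000_00000;
        let size_code: u32 = 0b01;   // ScalarSize::Size64
        let imm8: u32 = 0b0111_0000; // FMOV immediate encoding of 1.0
        let rd: u32 = 31;            // d31
        let word = template | (size_code << 22) | (imm8 << 13) | rd;
        assert_eq!(word, 0x1E6E101F);
        // sink.put4 writes little-endian bytes, i.e. "1F106E1E" in the test.
        assert_eq!(word.to_le_bytes(), [0x1F, 0x10, 0x6E, 0x1E]);
    }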

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (19 lines changed)

@@ -2051,6 +2051,25 @@ fn test_aarch64_binemit() {
"8103271E",
"fmov s1, w28",
));
insns.push((
Inst::FpuMoveFPImm {
rd: writable_vreg(31),
imm: ASIMDFPModImm::maybe_from_u64(f64::to_bits(1.0), ScalarSize::Size64).unwrap(),
size: ScalarSize::Size64,
},
"1F106E1E",
"fmov d31, #1",
));
insns.push((
Inst::FpuMoveFPImm {
rd: writable_vreg(1),
imm: ASIMDFPModImm::maybe_from_u64(f32::to_bits(31.0).into(), ScalarSize::Size32)
.unwrap(),
size: ScalarSize::Size32,
},
"01F0271E",
"fmov s1, #31",
));
insns.push((
Inst::MovToVec {
rd: writable_vreg(0),

cranelift/codegen/src/isa/aarch64/inst/mod.rs (52 lines changed)

@@ -239,29 +239,35 @@ impl Inst {
/// Create instructions that load a 32-bit floating-point constant.
pub fn load_fp_constant32<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
value: u32,
const_data: u32,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
// Note that we must make sure that all bits outside the lowest 32 are set to 0
// because this function is also used to load wider constants (that have zeros
// in their most significant bits).
if value == 0 {
if const_data == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size32),
invert: false,
size: VectorSize::Size32x2
size: VectorSize::Size32x2,
}]
} else if let Some(imm) =
ASIMDFPModImm::maybe_from_u64(const_data.into(), ScalarSize::Size32)
{
smallvec![Inst::FpuMoveFPImm {
rd,
imm,
size: ScalarSize::Size32,
}]
} else {
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
// bits.
let tmp = alloc_tmp(I32);
let mut insts = Inst::load_constant(tmp, value as u64);
let mut insts = Inst::load_constant(tmp, const_data as u64);
insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size64,
size: ScalarSize::Size32,
});
insts
@@ -277,11 +283,23 @@ impl Inst {
// Note that we must make sure that all bits outside the lowest 64 are set to 0
// because this function is also used to load wider constants (that have zeros
// in their most significant bits).
if let Ok(const_data) = u32::try_from(const_data) {
// TODO: Treat as half of a 128 bit vector and consider replicated patterns.
// Scalar MOVI might also be an option.
if const_data == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size32),
invert: false,
size: VectorSize::Size32x2,
}]
} else if let Some(imm) = ASIMDFPModImm::maybe_from_u64(const_data, ScalarSize::Size64) {
smallvec![Inst::FpuMoveFPImm {
rd,
imm,
size: ScalarSize::Size64,
}]
} else if let Ok(const_data) = u32::try_from(const_data) {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
// bits. Also, treat it as half of a 128-bit vector and consider replicated
// patterns. Scalar MOVI might also be an option.
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(I64);
let mut insts = Inst::load_constant(tmp, const_data);
@@ -879,6 +897,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::FpuMoveFPImm { rd, .. } => {
collector.add_def(rd);
}
&Inst::MovToVec { rd, rn, .. } => {
collector.add_mod(rd);
collector.add_use(rn);
@@ -1654,6 +1675,9 @@ pub fn aarch64_map_regs<RM: RegMapper>(inst: &mut Inst, mapper: &RM) {
mapper.map_def(rd);
mapper.map_use(rn);
}
&mut Inst::FpuMoveFPImm { ref mut rd, .. } => {
mapper.map_def(rd);
}
&mut Inst::MovToVec {
ref mut rd,
ref mut rn,
@@ -2693,6 +2717,12 @@ impl Inst {
let rn = show_ireg_sized(rn, mb_rru, operand_size);
format!("fmov {}, {}", rd, rn)
}
&Inst::FpuMoveFPImm { rd, imm, size } => {
let imm = imm.show_rru(mb_rru);
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
format!("fmov {}, {}", rd, imm)
}
&Inst::MovToVec { rd, rn, idx, size } => {
let rd = show_vreg_element(rd.to_reg(), mb_rru, idx, size);
let rn = show_ireg_sized(rn, mb_rru, size.operand_size());

cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest (2 lines changed)

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle 22dd5ff133398960
src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/inst.isle f946561093de4ff5
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23

cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs (383 lines changed)

File diff suppressed because it is too large

cranelift/codegen/src/isa/aarch64/lower_inst.rs (15 lines changed)

@@ -53,16 +53,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => implemented_in_isle(ctx),
Opcode::F32const => {
let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
lower_constant_f32(ctx, rd, value);
}
Opcode::F64const => {
let value = f64::from_bits(ctx.get_constant(insn).unwrap());
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
lower_constant_f64(ctx, rd, value);
}
Opcode::F32const | Opcode::F64const => unreachable!(
"Should never see constant ops at top level lowering entry
point, as constants are rematerialized at use-sites"
),
Opcode::Iadd => implemented_in_isle(ctx),
Opcode::Isub => implemented_in_isle(ctx),
Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {

cranelift/filetests/filetests/isa/aarch64/constants.clif (121 lines changed)

@@ -292,3 +292,124 @@ block0:
; Inst 1: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const 0x1.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov d0, #1
; Inst 1: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const 0x5.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov s0, #5
; Inst 1: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const 0x32.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: movz x0, #16457, LSL #48
; Inst 1: fmov d0, x0
; Inst 2: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const 0x32.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: movz x0, #16968, LSL #16
; Inst 1: fmov s0, w0
; Inst 2: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const 0x0.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: movi v0.2s, #0
; Inst 1: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const 0x0.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: movi v0.2s, #0
; Inst 1: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const -0x10.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov d0, #-16
; Inst 1: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const -0x10.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov s0, #-16
; Inst 1: ret
; }}
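
The new expectations line up with the +/-(n/16) * 2^r rule (n in 16..=31, r in -3..=4); here is a quick standalone check of the arithmetic only, not Cranelift code:

    fn main() {
        // Encodable constants become a single fmov #imm above.
        assert_eq!(1.0_f64, (16.0 / 16.0) * 2f64.powi(0));    // fmov d0, #1
        assert_eq!(5.0_f32, (20.0 / 16.0) * 2f32.powi(2));    // fmov s0, #5
        assert_eq!(-16.0_f64, -(16.0 / 16.0) * 2f64.powi(4)); // fmov d0, #-16
        // 0x32.0 = 50.0 = 25/16 * 2^5: r = 5 is out of range, so those cases
        // keep the movz + fmov sequence, and 0.0 keeps using movi.
        assert_eq!(50.0_f64, (25.0 / 16.0) * 2f64.powi(5));
    }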

cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif (84 lines changed)

@@ -76,19 +76,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #17280, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #17280, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, s0
; Inst 10: ret
; }}
function u0:0(f64) -> i8 {
@@ -101,19 +100,18 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #16496, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #16496, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, d0
; Inst 10: ret
; }}
function u0:0(f32) -> i16 {
@@ -126,19 +124,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #18304, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #18304, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, s0
; Inst 10: ret
; }}
function u0:0(f64) -> i16 {
@@ -151,18 +148,17 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #16624, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #16624, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, d0
; Inst 10: ret
; }}
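
The bounds that changed here are exactly the FMOV-encodable ones; below is a quick check of the old movz immediates (illustration only, assuming the shifts shown above):

    fn main() {
        // Lower bound -1.0 = -(16/16) * 2^0 is encodable, hence the single
        // "fmov s1, #-1" / "fmov d1, #-1" in the updated expectations.
        assert_eq!(f32::from_bits(49024u32 << 16), -1.0); // was: movz x0, #49024, LSL #16
        assert_eq!(f64::from_bits(49136u64 << 48), -1.0); // was: movz x0, #49136, LSL #48
        // Upper bounds such as 256.0 = 2^8 fall outside r in -3..=4, so they
        // still use movz + fmov, now into a 32-bit view (fmov s1, w0) for f32.
        assert_eq!(f32::from_bits(17280u32 << 16), 256.0);
    }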

cranelift/filetests/filetests/isa/aarch64/floating-point.clif (104 lines changed)

@@ -494,19 +494,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #20352, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #20352, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, s0
; Inst 10: ret
; }}
function %f34(f32) -> i32 {
@@ -523,11 +522,11 @@ block0(v0: f32):
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #52992, LSL #16
; Inst 3: fmov d1, x0
; Inst 3: fmov s1, w0
; Inst 4: fcmp s0, s1
; Inst 5: b.ge 8 ; udf
; Inst 6: movz x0, #20224, LSL #16
; Inst 7: fmov d1, x0
; Inst 7: fmov s1, w0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzs w0, s0
@@ -544,19 +543,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #24448, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu x0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #24448, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu x0, s0
; Inst 10: ret
; }}
function %f36(f32) -> i64 {
@@ -573,11 +571,11 @@ block0(v0: f32):
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #57088, LSL #16
; Inst 3: fmov d1, x0
; Inst 3: fmov s1, w0
; Inst 4: fcmp s0, s1
; Inst 5: b.ge 8 ; udf
; Inst 6: movz x0, #24320, LSL #16
; Inst 7: fmov d1, x0
; Inst 7: fmov s1, w0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzs x0, s0
@@ -594,19 +592,18 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #16880, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #16880, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, d0
; Inst 10: ret
; }}
function %f38(f64) -> i32 {
@@ -643,19 +640,18 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #17392, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu x0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #17392, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu x0, d0
; Inst 10: ret
; }}
function %f40(f64) -> i64 {
@@ -815,7 +811,7 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 9)
; Inst 0: movz x0, #20352, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s2, s0, s1
; Inst 3: movi v1.2s, #0
; Inst 4: fmax s2, s2, s1
@@ -837,10 +833,10 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 11)
; Inst 0: movz x0, #20224, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s1, s0, s1
; Inst 3: movz x0, #52992, LSL #16
; Inst 4: fmov d2, x0
; Inst 4: fmov s2, w0
; Inst 5: fmax s1, s1, s2
; Inst 6: movi v2.2s, #0
; Inst 7: fcmp s0, s0
@@ -861,7 +857,7 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 9)
; Inst 0: movz x0, #24448, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s2, s0, s1
; Inst 3: movi v1.2s, #0
; Inst 4: fmax s2, s2, s1
@@ -883,10 +879,10 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 11)
; Inst 0: movz x0, #24320, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s1, s0, s1
; Inst 3: movz x0, #57088, LSL #16
; Inst 4: fmov d2, x0
; Inst 4: fmov s2, w0
; Inst 5: fmax s1, s1, s2
; Inst 6: movi v2.2s, #0
; Inst 7: fcmp s0, s0
