
Merge pull request #3679 from FreddieLiardet/fp_const_fmov

Improve code generation for floating-point constants
Chris Fallin, 3 years ago (committed by GitHub); commit 4a331b8981
Changed files:
  1. cranelift/codegen/src/isa/aarch64/inst.isle (6 lines changed)
  2. cranelift/codegen/src/isa/aarch64/inst/emit.rs (13 lines changed)
  3. cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (19 lines changed)
  4. cranelift/codegen/src/isa/aarch64/inst/mod.rs (52 lines changed)
  5. cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest (2 lines changed)
  6. cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs (383 lines changed)
  7. cranelift/codegen/src/isa/aarch64/lower_inst.rs (15 lines changed)
  8. cranelift/filetests/filetests/isa/aarch64/constants.clif (121 lines changed)
  9. cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif (84 lines changed)
  10. cranelift/filetests/filetests/isa/aarch64/floating-point.clif (104 lines changed)

cranelift/codegen/src/isa/aarch64/inst.isle (6 lines changed)

@@ -454,6 +454,12 @@
(rn Reg)
(size ScalarSize))
;; Loads a floating-point immediate.
(FpuMoveFPImm
(rd WritableReg)
(imm ASIMDFPModImm)
(size ScalarSize))
;; Move to a vector element from a GPR.
(MovToVec
(rd WritableReg)

cranelift/codegen/src/isa/aarch64/inst/emit.rs (13 lines changed)

@@ -1983,6 +1983,19 @@ impl MachInstEmit for Inst {
};
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
}
&Inst::FpuMoveFPImm { rd, imm, size } => {
let size_code = match size {
ScalarSize::Size32 => 0b00,
ScalarSize::Size64 => 0b01,
_ => unimplemented!(),
};
sink.put4(
0b000_11110_00_1_00_000_000100_00000_00000
| size_code << 22
| ((imm.enc_bits() as u32) << 13)
| machreg_to_vec(rd.to_reg()),
);
}
&Inst::MovToVec { rd, rn, idx, size } => {
let (imm5, shift) = match size.lane_size() {
ScalarSize::Size8 => (0b00001, 1),
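
As a quick cross-check of the new FpuMoveFPImm arm, the same bit packing can be reproduced standalone; this is just the template above filled in for the "fmov d31, #1" emit test added below (illustrative only, not Cranelift code).

    fn main() {
        let template: u32 = 0b000_11110_00_1_00_000_000100_00000_00000;
        let size_code: u32 = 0b01;   // ScalarSize::Size64
        let imm8: u32 = 0b0111_0000; // FMOV immediate encoding of 1.0
        let rd: u32 = 31;            // d31
        let word = template | (size_code << 22) | (imm8 << 13) | rd;
        assert_eq!(word, 0x1E6E101F);
        // sink.put4 writes little-endian bytes, i.e. "1F106E1E" in the test.
        assert_eq!(word.to_le_bytes(), [0x1F, 0x10, 0x6E, 0x1E]);
    }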

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (19 lines changed)

@@ -2051,6 +2051,25 @@ fn test_aarch64_binemit() {
"8103271E",
"fmov s1, w28",
));
insns.push((
Inst::FpuMoveFPImm {
rd: writable_vreg(31),
imm: ASIMDFPModImm::maybe_from_u64(f64::to_bits(1.0), ScalarSize::Size64).unwrap(),
size: ScalarSize::Size64,
},
"1F106E1E",
"fmov d31, #1",
));
insns.push((
Inst::FpuMoveFPImm {
rd: writable_vreg(1),
imm: ASIMDFPModImm::maybe_from_u64(f32::to_bits(31.0).into(), ScalarSize::Size32)
.unwrap(),
size: ScalarSize::Size32,
},
"01F0271E",
"fmov s1, #31",
));
insns.push((
Inst::MovToVec {
rd: writable_vreg(0),

cranelift/codegen/src/isa/aarch64/inst/mod.rs (52 lines changed)

@@ -239,29 +239,35 @@ impl Inst {
/// Create instructions that load a 32-bit floating-point constant.
pub fn load_fp_constant32<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
value: u32,
const_data: u32,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
// Note that we must make sure that all bits outside the lowest 32 are set to 0
// because this function is also used to load wider constants (that have zeros
// in their most significant bits).
if value == 0 {
if const_data == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size32),
invert: false,
size: VectorSize::Size32x2
size: VectorSize::Size32x2,
}]
} else if let Some(imm) =
ASIMDFPModImm::maybe_from_u64(const_data.into(), ScalarSize::Size32)
{
smallvec![Inst::FpuMoveFPImm {
rd,
imm,
size: ScalarSize::Size32,
}]
} else {
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
// bits.
let tmp = alloc_tmp(I32);
let mut insts = Inst::load_constant(tmp, value as u64);
let mut insts = Inst::load_constant(tmp, const_data as u64);
insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size64,
size: ScalarSize::Size32,
});
insts
@@ -277,11 +283,23 @@ impl Inst {
// Note that we must make sure that all bits outside the lowest 64 are set to 0
// because this function is also used to load wider constants (that have zeros
// in their most significant bits).
if let Ok(const_data) = u32::try_from(const_data) {
// TODO: Treat as half of a 128 bit vector and consider replicated patterns.
// Scalar MOVI might also be an option.
if const_data == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size32),
invert: false,
size: VectorSize::Size32x2,
}]
} else if let Some(imm) = ASIMDFPModImm::maybe_from_u64(const_data, ScalarSize::Size64) {
smallvec![Inst::FpuMoveFPImm {
rd,
imm,
size: ScalarSize::Size64,
}]
} else if let Ok(const_data) = u32::try_from(const_data) {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
// bits. Also, treat it as half of a 128-bit vector and consider replicated
// patterns. Scalar MOVI might also be an option.
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(I64);
let mut insts = Inst::load_constant(tmp, const_data);
@@ -879,6 +897,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::FpuMoveFPImm { rd, .. } => {
collector.add_def(rd);
}
&Inst::MovToVec { rd, rn, .. } => {
collector.add_mod(rd);
collector.add_use(rn);
@@ -1654,6 +1675,9 @@ pub fn aarch64_map_regs<RM: RegMapper>(inst: &mut Inst, mapper: &RM) {
mapper.map_def(rd);
mapper.map_use(rn);
}
&mut Inst::FpuMoveFPImm { ref mut rd, .. } => {
mapper.map_def(rd);
}
&mut Inst::MovToVec {
ref mut rd,
ref mut rn,
@@ -2693,6 +2717,12 @@ impl Inst {
let rn = show_ireg_sized(rn, mb_rru, operand_size);
format!("fmov {}, {}", rd, rn)
}
&Inst::FpuMoveFPImm { rd, imm, size } => {
let imm = imm.show_rru(mb_rru);
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
format!("fmov {}, {}", rd, imm)
}
&Inst::MovToVec { rd, rn, idx, size } => {
let rd = show_vreg_element(rd.to_reg(), mb_rru, idx, size);
let rn = show_ireg_sized(rn, mb_rru, size.operand_size());

cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest (2 lines changed)

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle 22dd5ff133398960
src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/inst.isle f946561093de4ff5
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23

cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs (383 lines changed)

File diff suppressed because it is too large

cranelift/codegen/src/isa/aarch64/lower_inst.rs (15 lines changed)

@@ -53,16 +53,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => implemented_in_isle(ctx),
Opcode::F32const => {
let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
lower_constant_f32(ctx, rd, value);
}
Opcode::F64const => {
let value = f64::from_bits(ctx.get_constant(insn).unwrap());
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
lower_constant_f64(ctx, rd, value);
}
Opcode::F32const | Opcode::F64const => unreachable!(
"Should never see constant ops at top level lowering entry
point, as constants are rematerialized at use-sites"
),
Opcode::Iadd => implemented_in_isle(ctx),
Opcode::Isub => implemented_in_isle(ctx),
Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {

cranelift/filetests/filetests/isa/aarch64/constants.clif (121 lines changed)

@@ -292,3 +292,124 @@ block0:
; Inst 1: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const 0x1.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov d0, #1
; Inst 1: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const 0x5.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov s0, #5
; Inst 1: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const 0x32.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: movz x0, #16457, LSL #48
; Inst 1: fmov d0, x0
; Inst 2: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const 0x32.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: movz x0, #16968, LSL #16
; Inst 1: fmov s0, w0
; Inst 2: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const 0x0.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: movi v0.2s, #0
; Inst 1: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const 0x0.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: movi v0.2s, #0
; Inst 1: ret
; }}
function %f() -> f64 {
block0:
v0 = f64const -0x10.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov d0, #-16
; Inst 1: ret
; }}
function %f() -> f32 {
block0:
v0 = f32const -0x10.0
return v0
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fmov s0, #-16
; Inst 1: ret
; }}
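
The new expectations line up with the +/-(n/16) * 2^r rule (n in 16..=31, r in -3..=4); here is a quick standalone check of the arithmetic only, not Cranelift code:

    fn main() {
        // Encodable constants become a single fmov #imm above.
        assert_eq!(1.0_f64, (16.0 / 16.0) * 2f64.powi(0));    // fmov d0, #1
        assert_eq!(5.0_f32, (20.0 / 16.0) * 2f32.powi(2));    // fmov s0, #5
        assert_eq!(-16.0_f64, -(16.0 / 16.0) * 2f64.powi(4)); // fmov d0, #-16
        // 0x32.0 = 50.0 = 25/16 * 2^5: r = 5 is out of range, so those cases
        // keep the movz + fmov sequence, and 0.0 keeps using movi.
        assert_eq!(50.0_f64, (25.0 / 16.0) * 2f64.powi(5));
    }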

cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif (84 lines changed)

@@ -76,19 +76,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #17280, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #17280, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, s0
; Inst 10: ret
; }}
function u0:0(f64) -> i8 {
@@ -101,19 +100,18 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #16496, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #16496, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, d0
; Inst 10: ret
; }}
function u0:0(f32) -> i16 {
@@ -126,19 +124,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #18304, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #18304, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, s0
; Inst 10: ret
; }}
function u0:0(f64) -> i16 {
@@ -151,18 +148,17 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #16624, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #16624, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, d0
; Inst 10: ret
; }}
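
The bounds that changed here are exactly the FMOV-encodable ones; below is a quick check of the old movz immediates (illustration only, assuming the shifts shown above):

    fn main() {
        // Lower bound -1.0 = -(16/16) * 2^0 is encodable, hence the single
        // "fmov s1, #-1" / "fmov d1, #-1" in the updated expectations.
        assert_eq!(f32::from_bits(49024u32 << 16), -1.0); // was: movz x0, #49024, LSL #16
        assert_eq!(f64::from_bits(49136u64 << 48), -1.0); // was: movz x0, #49136, LSL #48
        // Upper bounds such as 256.0 = 2^8 fall outside r in -3..=4, so they
        // still use movz + fmov, now into a 32-bit view (fmov s1, w0) for f32.
        assert_eq!(f32::from_bits(17280u32 << 16), 256.0);
    }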

cranelift/filetests/filetests/isa/aarch64/floating-point.clif (104 lines changed)

@@ -494,19 +494,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #20352, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #20352, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, s0
; Inst 10: ret
; }}
function %f34(f32) -> i32 {
@@ -523,11 +522,11 @@ block0(v0: f32):
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #52992, LSL #16
; Inst 3: fmov d1, x0
; Inst 3: fmov s1, w0
; Inst 4: fcmp s0, s1
; Inst 5: b.ge 8 ; udf
; Inst 6: movz x0, #20224, LSL #16
; Inst 7: fmov d1, x0
; Inst 7: fmov s1, w0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzs w0, s0
@@ -544,19 +543,18 @@ block0(v0: f32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49024, LSL #16
; Inst 3: fmov d1, x0
; Inst 4: fcmp s0, s1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #24448, LSL #16
; Inst 7: fmov d1, x0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu x0, s0
; Inst 11: ret
; Inst 2: fmov s1, #-1
; Inst 3: fcmp s0, s1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #24448, LSL #16
; Inst 6: fmov s1, w0
; Inst 7: fcmp s0, s1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu x0, s0
; Inst 10: ret
; }}
function %f36(f32) -> i64 {
@@ -573,11 +571,11 @@ block0(v0: f32):
; Inst 0: fcmp s0, s0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #57088, LSL #16
; Inst 3: fmov d1, x0
; Inst 3: fmov s1, w0
; Inst 4: fcmp s0, s1
; Inst 5: b.ge 8 ; udf
; Inst 6: movz x0, #24320, LSL #16
; Inst 7: fmov d1, x0
; Inst 7: fmov s1, w0
; Inst 8: fcmp s0, s1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzs x0, s0
@@ -594,19 +592,18 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #16880, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu w0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #16880, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu w0, d0
; Inst 10: ret
; }}
function %f38(f64) -> i32 {
@@ -643,19 +640,18 @@ block0(v0: f64):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 11)
; Inst 0: fcmp d0, d0
; Inst 1: b.vc 8 ; udf
; Inst 2: movz x0, #49136, LSL #48
; Inst 3: fmov d1, x0
; Inst 4: fcmp d0, d1
; Inst 5: b.gt 8 ; udf
; Inst 6: movz x0, #17392, LSL #48
; Inst 7: fmov d1, x0
; Inst 8: fcmp d0, d1
; Inst 9: b.mi 8 ; udf
; Inst 10: fcvtzu x0, d0
; Inst 11: ret
; Inst 2: fmov d1, #-1
; Inst 3: fcmp d0, d1
; Inst 4: b.gt 8 ; udf
; Inst 5: movz x0, #17392, LSL #48
; Inst 6: fmov d1, x0
; Inst 7: fcmp d0, d1
; Inst 8: b.mi 8 ; udf
; Inst 9: fcvtzu x0, d0
; Inst 10: ret
; }}
function %f40(f64) -> i64 {
@@ -815,7 +811,7 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 9)
; Inst 0: movz x0, #20352, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s2, s0, s1
; Inst 3: movi v1.2s, #0
; Inst 4: fmax s2, s2, s1
@@ -837,10 +833,10 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 11)
; Inst 0: movz x0, #20224, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s1, s0, s1
; Inst 3: movz x0, #52992, LSL #16
; Inst 4: fmov d2, x0
; Inst 4: fmov s2, w0
; Inst 5: fmax s1, s1, s2
; Inst 6: movi v2.2s, #0
; Inst 7: fcmp s0, s0
@@ -861,7 +857,7 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 9)
; Inst 0: movz x0, #24448, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s2, s0, s1
; Inst 3: movi v1.2s, #0
; Inst 4: fmax s2, s2, s1
@@ -883,10 +879,10 @@ block0(v0: f32):
; (original IR block: block0)
; (instruction range: 0 .. 11)
; Inst 0: movz x0, #24320, LSL #16
; Inst 1: fmov d1, x0
; Inst 1: fmov s1, w0
; Inst 2: fmin s1, s0, s1
; Inst 3: movz x0, #57088, LSL #16
; Inst 4: fmov d2, x0
; Inst 4: fmov s2, w0
; Inst 5: fmax s1, s1, s2
; Inst 6: movi v2.2s, #0
; Inst 7: fcmp s0, s0
