cranelift(aarch64): Add single precision fmov (#8453)

This commit is a follow-up to https://github.com/bytecodealliance/wasmtime/pull/8365/files#r1565962730 that enables emission of a 32-bit `fmov` from Winch. To keep things simple, I opted to introduce a new instruction rather than refactor the existing `FpuMove64` to be more generic, but I'm definitely open to exploring a refactoring if that's preferred. Encoding reference: https://developer.arm.com/documentation/ddi0602/2024-03/SIMD-FP-Instructions/FMOV--register---Floating-point-Move-register-without-conversion-?lang=en
7 months ago · bfb759d7ae
4 changed files with 28 additions and 0 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@ -331,6 +331,11 @@
       ;; Consumption of speculative data barrier.
       (Csdb)

+       ;; FPU 32-bit move.
+       (FpuMove32
+         (rd WritableReg)
+         (rn Reg))
+
       ;; FPU move. Note that this is distinct from a vector-register
       ;; move; moving just 64 bits seems to be significantly faster.
       (FpuMove64
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@ -1831,6 +1831,11 @@ impl MachInstEmit for Inst {
            &Inst::Csdb {} => {
                sink.put4(0xd503229f);
            }
+            &Inst::FpuMove32 { rd, rn } => {
+                let rd = allocs.next_writable(rd);
+                let rn = allocs.next(rn);
+                sink.put4(enc_fpurr(0b000_11110_00_1_000000_10000, rd, rn));
+            }
            &Inst::FpuMove64 { rd, rn } => {
                let rd = allocs.next_writable(rd);
                let rn = allocs.next(rn);
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@ -6135,6 +6135,15 @@ fn test_aarch64_binemit() {
        "fmov d8, d4",
    ));

+    insns.push((
+        Inst::FpuMove32 {
+            rd: writable_vreg(8),
+            rn: vreg(4),
+        },
+        "8840201E",
+        "fmov s8, s4",
+    ));
+
    insns.push((
        Inst::FpuMove128 {
            rd: writable_vreg(17),
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@ -595,6 +595,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
            collector.reg_use(rt);
        }
        &Inst::Fence {} | &Inst::Csdb {} => {}
+        &Inst::FpuMove32 { rd, rn } => {
+            collector.reg_def(rd);
+            collector.reg_use(rn);
+        }
        &Inst::FpuMove64 { rd, rn } => {
            collector.reg_def(rd);
            collector.reg_use(rn);
@ -1718,6 +1722,11 @@ impl Inst {
            &Inst::Csdb {} => {
                format!("csdb")
            }
+            &Inst::FpuMove32 { rd, rn } => {
+                let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size32, allocs);
+                let rn = pretty_print_vreg_scalar(rn, ScalarSize::Size32, allocs);
+                format!("fmov {}, {}", rd, rn)
+            }
            &Inst::FpuMove64 { rd, rn } => {
                let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size64, allocs);
                let rn = pretty_print_vreg_scalar(rn, ScalarSize::Size64, allocs);