Merge pull request #2075 from akirilov-arm/simd_fp_arith

AArch64: Implement SIMD floating-point arithmetic
4 years ago · f8f79ba9ca
5 changed files with 248 additions and 38 deletions
--- a/build.rs
+++ b/build.rs
@@ -185,7 +185,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
            ("simd", "simd_bitwise") => return false,
            ("simd", "simd_bit_shift") => return false,
            ("simd", "simd_boolean") => return false,
+            ("simd", "simd_f32x4") => return false,
+            ("simd", "simd_f32x4_arith") => return false,
            ("simd", "simd_f32x4_cmp") => return false,
+            ("simd", "simd_f64x2") => return false,
+            ("simd", "simd_f64x2_arith") => return false,
            ("simd", "simd_f64x2_cmp") => return false,
            ("simd", "simd_i8x16_arith") => return false,
            ("simd", "simd_i8x16_arith2") => return false,
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1123,6 +1123,18 @@ impl MachInstEmit for Inst {
                    VecMisc2::Not => (0b1, 0b00101, 0b00),
                    VecMisc2::Neg => (0b1, 0b01011, enc_size),
                    VecMisc2::Abs => (0b0, 0b01011, enc_size),
+                    VecMisc2::Fabs => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b01111, enc_size)
+                    }
+                    VecMisc2::Fneg => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b1, 0b01111, enc_size)
+                    }
+                    VecMisc2::Fsqrt => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b1, 0b11111, enc_size)
+                    }
                };
                sink.put4(enc_vec_rr_misc(u, size, bits_12_16, rd, rn));
            }
@@ -1363,9 +1375,22 @@ impl MachInstEmit for Inst {
                    VectorSize::Size64x2 => 0b11,
                    _ => 0,
                };
-                let enc_size_for_fcmp = match size {
-                    VectorSize::Size32x4 => 0b0,
-                    VectorSize::Size64x2 => 0b1,
+                let is_float = match alu_op {
+                    VecALUOp::Fcmeq
+                    | VecALUOp::Fcmgt
+                    | VecALUOp::Fcmge
+                    | VecALUOp::Fadd
+                    | VecALUOp::Fsub
+                    | VecALUOp::Fdiv
+                    | VecALUOp::Fmax
+                    | VecALUOp::Fmin
+                    | VecALUOp::Fmul => true,
+                    _ => false,
+                };
+                let enc_float_size = match (is_float, size) {
+                    (true, VectorSize::Size32x4) => 0b0,
+                    (true, VectorSize::Size64x2) => 0b1,
+                    (true, _) => unimplemented!(),
                    _ => 0,
                };

@@ -1379,9 +1404,9 @@ impl MachInstEmit for Inst {
                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Fcmeq => (0b010_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
-                    VecALUOp::Fcmgt => (0b011_01110_10_1 | enc_size_for_fcmp << 1, 0b111001),
-                    VecALUOp::Fcmge => (0b011_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
+                    VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
+                    VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
+                    VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
                    // The following logical instructions operate on bytes, so are not encoded differently
                    // for the different vector types.
                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
@@ -1403,6 +1428,17 @@ impl MachInstEmit for Inst {
                    VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
                    VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
                    VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
+                    VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
+                    VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
+                    VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
+                    VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
+                    VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
+                    VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
+                };
+                let top11 = if is_float {
+                    top11 | enc_float_size << 1
+                } else {
+                    top11
                };
                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
            }
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2953,6 +2953,78 @@ fn test_aarch64_binemit() {
        "urhadd v8.4s, v12.4s, v14.4s",
    ));

+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fadd,
+            rd: writable_vreg(31),
+            rn: vreg(0),
+            rm: vreg(16),
+            size: VectorSize::Size32x4,
+        },
+        "1FD4304E",
+        "fadd v31.4s, v0.4s, v16.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fsub,
+            rd: writable_vreg(8),
+            rn: vreg(7),
+            rm: vreg(15),
+            size: VectorSize::Size64x2,
+        },
+        "E8D4EF4E",
+        "fsub v8.2d, v7.2d, v15.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fdiv,
+            rd: writable_vreg(1),
+            rn: vreg(3),
+            rm: vreg(4),
+            size: VectorSize::Size32x4,
+        },
+        "61FC246E",
+        "fdiv v1.4s, v3.4s, v4.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fmax,
+            rd: writable_vreg(31),
+            rn: vreg(16),
+            rm: vreg(0),
+            size: VectorSize::Size64x2,
+        },
+        "1FF6604E",
+        "fmax v31.2d, v16.2d, v0.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fmin,
+            rd: writable_vreg(5),
+            rn: vreg(19),
+            rm: vreg(26),
+            size: VectorSize::Size32x4,
+        },
+        "65F6BA4E",
+        "fmin v5.4s, v19.4s, v26.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fmul,
+            rd: writable_vreg(2),
+            rn: vreg(0),
+            rm: vreg(5),
+            size: VectorSize::Size64x2,
+        },
+        "02DC656E",
+        "fmul v2.2d, v0.2d, v5.2d",
+    ));
+
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Not,
@@ -3052,6 +3124,39 @@ fn test_aarch64_binemit() {
        "abs v1.2d, v10.2d",
    ));

+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Fabs,
+            rd: writable_vreg(15),
+            rn: vreg(16),
+            size: VectorSize::Size32x4,
+        },
+        "0FFAA04E",
+        "fabs v15.4s, v16.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Fneg,
+            rd: writable_vreg(31),
+            rn: vreg(0),
+            size: VectorSize::Size32x4,
+        },
+        "1FF8A06E",
+        "fneg v31.4s, v0.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Fsqrt,
+            rd: writable_vreg(7),
+            rn: vreg(18),
+            size: VectorSize::Size64x2,
+        },
+        "47FAE16E",
+        "fsqrt v7.2d, v18.2d",
+    ));
+
    insns.push((
        Inst::VecLanes {
            op: VecLanesOp::Uminv,
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -271,6 +271,18 @@ pub enum VecALUOp {
    Smax,
    /// Unsigned rounding halving add
    Urhadd,
+    /// Floating-point add
+    Fadd,
+    /// Floating-point subtract
+    Fsub,
+    /// Floating-point divide
+    Fdiv,
+    /// Floating-point maximum
+    Fmax,
+    /// Floating-point minimum
+    Fmin,
+    /// Floating-point multiply
+    Fmul,
 }

 /// A Vector miscellaneous operation with two registers.
@@ -282,6 +294,12 @@ pub enum VecMisc2 {
    Neg,
    /// Absolute value
    Abs,
+    /// Floating-point absolute value
+    Fabs,
+    /// Floating-point negate
+    Fneg,
+    /// Floating-point square root
+    Fsqrt,
 }

 /// An operation across the lanes of vectors.
@@ -2810,6 +2828,12 @@ impl Inst {
                    VecALUOp::Umax => ("umax", size),
                    VecALUOp::Smax => ("smax", size),
                    VecALUOp::Urhadd => ("urhadd", size),
+                    VecALUOp::Fadd => ("fadd", size),
+                    VecALUOp::Fsub => ("fsub", size),
+                    VecALUOp::Fdiv => ("fdiv", size),
+                    VecALUOp::Fmax => ("fmax", size),
+                    VecALUOp::Fmin => ("fmin", size),
+                    VecALUOp::Fmul => ("fmul", size),
                };
                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
                let rn = show_vreg_vector(rn, mb_rru, size);
@@ -2821,6 +2845,9 @@ impl Inst {
                    VecMisc2::Not => ("mvn", VectorSize::Size8x16),
                    VecMisc2::Neg => ("neg", size),
                    VecMisc2::Abs => ("abs", size),
+                    VecMisc2::Fabs => ("fabs", size),
+                    VecMisc2::Fneg => ("fneg", size),
+                    VecMisc2::Fsqrt => ("fsqrt", size),
                };

                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1802,46 +1802,84 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }

        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
-            let bits = ty_bits(ctx.output_ty(insn, 0));
-            let fpu_op = match (op, bits) {
-                (Opcode::Fadd, 32) => FPUOp2::Add32,
-                (Opcode::Fadd, 64) => FPUOp2::Add64,
-                (Opcode::Fsub, 32) => FPUOp2::Sub32,
-                (Opcode::Fsub, 64) => FPUOp2::Sub64,
-                (Opcode::Fmul, 32) => FPUOp2::Mul32,
-                (Opcode::Fmul, 64) => FPUOp2::Mul64,
-                (Opcode::Fdiv, 32) => FPUOp2::Div32,
-                (Opcode::Fdiv, 64) => FPUOp2::Div64,
-                (Opcode::Fmin, 32) => FPUOp2::Min32,
-                (Opcode::Fmin, 64) => FPUOp2::Min64,
-                (Opcode::Fmax, 32) => FPUOp2::Max32,
-                (Opcode::Fmax, 64) => FPUOp2::Max64,
-                _ => panic!("Unknown op/bits combination"),
-            };
+            let ty = ty.unwrap();
+            let bits = ty_bits(ty);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
+            if bits < 128 {
+                let fpu_op = match (op, bits) {
+                    (Opcode::Fadd, 32) => FPUOp2::Add32,
+                    (Opcode::Fadd, 64) => FPUOp2::Add64,
+                    (Opcode::Fsub, 32) => FPUOp2::Sub32,
+                    (Opcode::Fsub, 64) => FPUOp2::Sub64,
+                    (Opcode::Fmul, 32) => FPUOp2::Mul32,
+                    (Opcode::Fmul, 64) => FPUOp2::Mul64,
+                    (Opcode::Fdiv, 32) => FPUOp2::Div32,
+                    (Opcode::Fdiv, 64) => FPUOp2::Div64,
+                    (Opcode::Fmin, 32) => FPUOp2::Min32,
+                    (Opcode::Fmin, 64) => FPUOp2::Min64,
+                    (Opcode::Fmax, 32) => FPUOp2::Max32,
+                    (Opcode::Fmax, 64) => FPUOp2::Max64,
+                    _ => panic!("Unknown op/bits combination"),
+                };
+                ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
+            } else {
+                let alu_op = match op {
+                    Opcode::Fadd => VecALUOp::Fadd,
+                    Opcode::Fsub => VecALUOp::Fsub,
+                    Opcode::Fdiv => VecALUOp::Fdiv,
+                    Opcode::Fmax => VecALUOp::Fmax,
+                    Opcode::Fmin => VecALUOp::Fmin,
+                    Opcode::Fmul => VecALUOp::Fmul,
+                    _ => unreachable!(),
+                };
+
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op,
+                    size: VectorSize::from_ty(ty),
+                });
+            }
        }

        Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
-            let bits = ty_bits(ctx.output_ty(insn, 0));
-            let fpu_op = match (op, bits) {
-                (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
-                (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
-                (Opcode::Fneg, 32) => FPUOp1::Neg32,
-                (Opcode::Fneg, 64) => FPUOp1::Neg64,
-                (Opcode::Fabs, 32) => FPUOp1::Abs32,
-                (Opcode::Fabs, 64) => FPUOp1::Abs64,
-                (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
-                (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
-                (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
-                (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
-                _ => panic!("Unknown op/bits combination"),
-            };
+            let ty = ty.unwrap();
+            let bits = ty_bits(ty);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
+            if bits < 128 {
+                let fpu_op = match (op, bits) {
+                    (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
+                    (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
+                    (Opcode::Fneg, 32) => FPUOp1::Neg32,
+                    (Opcode::Fneg, 64) => FPUOp1::Neg64,
+                    (Opcode::Fabs, 32) => FPUOp1::Abs32,
+                    (Opcode::Fabs, 64) => FPUOp1::Abs64,
+                    (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
+                    (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
+                    (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
+                    (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
+                    _ => panic!("Unknown op/bits combination"),
+                };
+                ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
+            } else {
+                let op = match op {
+                    Opcode::Fabs => VecMisc2::Fabs,
+                    Opcode::Fneg => VecMisc2::Fneg,
+                    Opcode::Sqrt => VecMisc2::Fsqrt,
+                    _ => unimplemented!(),
+                };
+
+                ctx.emit(Inst::VecMisc {
+                    op,
+                    rd,
+                    rn,
+                    size: VectorSize::from_ty(ty),
+                });
+            }
        }

        Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {