From 90bafae1dc2dbc42da62719961f81570ab622bdd Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Fri, 12 Jun 2020 23:19:53 +0100
Subject: [PATCH] AArch64: Implement SIMD floating-point comparisons

Copyright (c) 2020, Arm Limited.
---
 build.rs                                      |  2 +
 .../codegen/src/isa/aarch64/inst/emit.rs      |  8 ++
 .../src/isa/aarch64/inst/emit_tests.rs        | 36 ++++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 13 ++-
 cranelift/codegen/src/isa/aarch64/lower.rs    | 74 ++++++++++++++++-
 .../codegen/src/isa/aarch64/lower_inst.rs     | 82 ++++---------------
 6 files changed, 149 insertions(+), 66 deletions(-)

diff --git a/build.rs b/build.rs
index 3118925b77..b33f022ad0 100644
--- a/build.rs
+++ b/build.rs
@@ -184,6 +184,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", "simd_align") => return false,
             ("simd", "simd_bitwise") => return false,
             ("simd", "simd_boolean") => return false,
+            ("simd", "simd_f32x4_cmp") => return false,
+            ("simd", "simd_f64x2_cmp") => return false,
             ("simd", "simd_i8x16_cmp") => return false,
             ("simd", "simd_i16x8_cmp") => return false,
             ("simd", "simd_i32x4_cmp") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 7668465d62..abb9aa0045 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1279,6 +1279,11 @@ impl MachInstEmit for Inst {
                     I32X4 => 0b10,
                     _ => 0,
                 };
+                let enc_size_for_fcmp = match ty {
+                    F32X4 => 0b0,
+                    F64X2 => 0b1,
+                    _ => 0,
+                };

                 let (top11, bit15_10) = match alu_op {
                     VecALUOp::SQAddScalar => {
@@ -1302,6 +1307,9 @@ impl MachInstEmit for Inst {
                     VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
                     VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
                     VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Fcmeq => (0b010_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
+                    VecALUOp::Fcmgt => (0b011_01110_10_1 | enc_size_for_fcmp << 1, 0b111001),
+                    VecALUOp::Fcmge => (0b011_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
                     VecALUOp::And => {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 05dce50151..aaf4cfbae3 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2209,6 +2209,42 @@ fn test_aarch64_binemit() {
         "cmhs v8.4s, v2.4s, v15.4s",
     ));

+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fcmeq,
+            rd: writable_vreg(28),
+            rn: vreg(12),
+            rm: vreg(4),
+            ty: F32X4,
+        },
+        "9CE5244E",
+        "fcmeq v28.4s, v12.4s, v4.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fcmgt,
+            rd: writable_vreg(3),
+            rn: vreg(16),
+            rm: vreg(31),
+            ty: F64X2,
+        },
+        "03E6FF6E",
+        "fcmgt v3.2d, v16.2d, v31.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fcmge,
+            rd: writable_vreg(18),
+            rn: vreg(23),
+            rm: vreg(0),
+            ty: F64X2,
+        },
+        "F2E6606E",
+        "fcmge v18.2d, v23.2d, v0.2d",
+    ));
+
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::And,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 6d14d53448..bd14cf0ba7 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -225,6 +225,12 @@ pub enum VecALUOp {
     Cmhs,
     /// Compare unsigned higher or same
     Cmhi,
+    /// Floating-point compare equal
+    Fcmeq,
+    /// Floating-point compare greater than
+    Fcmgt,
+    /// Floating-point compare greater than or equal
+    Fcmge,
     /// Bitwise and
     And,
     /// Bitwise bit clear
@@ -2085,7 +2091,9 @@ impl MachInst for Inst {
             I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64),
             F32 | F64 => Ok(RegClass::V128),
             IFLAGS | FFLAGS => Ok(RegClass::I64),
-            B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 => Ok(RegClass::V128),
+            B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => {
+                Ok(RegClass::V128)
+            }
             _ => Err(CodegenError::Unsupported(format!(
                 "Unexpected SSA-value type: {}",
                 ty
@@ -2720,6 +2728,9 @@ impl ShowWithRRU for Inst {
                     VecALUOp::Cmgt => ("cmgt", true, ty),
                     VecALUOp::Cmhs => ("cmhs", true, ty),
                     VecALUOp::Cmhi => ("cmhi", true, ty),
+                    VecALUOp::Fcmeq => ("fcmeq", true, ty),
+                    VecALUOp::Fcmgt => ("fcmgt", true, ty),
+                    VecALUOp::Fcmge => ("fcmge", true, ty),
                     VecALUOp::And => ("and", true, I8X16),
                     VecALUOp::Bic => ("bic", true, I8X16),
                     VecALUOp::Orr => ("orr", true, I8X16),
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index d1526c2ae9..831eeec9bb 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -14,7 +14,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode, Type};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::CodegenResult;
+use crate::{CodegenError, CodegenResult};

 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::AArch64Backend;
@@ -726,6 +726,77 @@ pub(crate) fn lower_fp_condcode(cc: FloatCC) -> Cond {
     }
 }

+pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    mut rn: Reg,
+    mut rm: Reg,
+    ty: Type,
+    cond: Cond,
+) -> CodegenResult<()> {
+    match ty {
+        F32X4 | F64X2 | I8X16 | I16X8 | I32X4 => {}
+        _ => {
+            return Err(CodegenError::Unsupported(format!(
+                "unsupported SIMD type: {:?}",
+                ty
+            )));
+        }
+    };
+
+    let is_float = match ty {
+        F32X4 | F64X2 => true,
+        _ => false,
+    };
+    // 'Less than' operations are implemented by swapping
+    // the order of operands and using the 'greater than'
+    // instructions.
+    // 'Not equal' is implemented with 'equal' and inverting
+    // the result.
+    let (alu_op, swap) = match (is_float, cond) {
+        (false, Cond::Eq) => (VecALUOp::Cmeq, false),
+        (false, Cond::Ne) => (VecALUOp::Cmeq, false),
+        (false, Cond::Ge) => (VecALUOp::Cmge, false),
+        (false, Cond::Gt) => (VecALUOp::Cmgt, false),
+        (false, Cond::Le) => (VecALUOp::Cmge, true),
+        (false, Cond::Lt) => (VecALUOp::Cmgt, true),
+        (false, Cond::Hs) => (VecALUOp::Cmhs, false),
+        (false, Cond::Hi) => (VecALUOp::Cmhi, false),
+        (false, Cond::Ls) => (VecALUOp::Cmhs, true),
+        (false, Cond::Lo) => (VecALUOp::Cmhi, true),
+        (true, Cond::Eq) => (VecALUOp::Fcmeq, false),
+        (true, Cond::Ne) => (VecALUOp::Fcmeq, false),
+        (true, Cond::Mi) => (VecALUOp::Fcmgt, true),
+        (true, Cond::Ls) => (VecALUOp::Fcmge, true),
+        (true, Cond::Ge) => (VecALUOp::Fcmge, false),
+        (true, Cond::Gt) => (VecALUOp::Fcmgt, false),
+        _ => unreachable!(),
+    };
+
+    if swap {
+        std::mem::swap(&mut rn, &mut rm);
+    }
+
+    ctx.emit(Inst::VecRRR {
+        alu_op,
+        rd,
+        rn,
+        rm,
+        ty,
+    });
+
+    if cond == Cond::Ne {
+        ctx.emit(Inst::VecMisc {
+            op: VecMisc2::Not,
+            rd,
+            rn: rd.to_reg(),
+            ty: I8X16,
+        });
+    }
+
+    Ok(())
+}
+
 /// Determines whether this condcode interprets inputs as signed or
 /// unsigned. See the documentation for the `icmp` instruction in
 /// cranelift-codegen/meta/src/shared/instructions.rs for further insights
@@ -762,6 +833,7 @@ pub fn ty_bits(ty: Type) -> usize {
         IFLAGS | FFLAGS => 32,
         B8X8 | I8X8 | B16X4 | I16X4 | B32X2 | I32X2 => 64,
         B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 => 128,
+        F32X4 | F64X2 => 128,
         _ => panic!("ty_bits() on unknown type: {:?}", ty),
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index e77c641630..651c8c02e9 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -7,7 +7,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::{CodegenError, CodegenResult};
+use crate::CodegenResult;

 use crate::isa::aarch64::abi::*;
 use crate::isa::aarch64::inst::*;
@@ -1234,6 +1234,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let condcode = inst_condcode(ctx.data(insn)).unwrap();
             let cond = lower_condcode(condcode);
             let is_signed = condcode_is_signed(condcode);
+            let rd = output_to_reg(ctx, outputs[0]);
             let ty = ctx.input_ty(insn, 0);
             let bits = ty_bits(ty);
             let narrow_mode = match (bits <= 32, is_signed) {
@@ -1242,68 +1243,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 (false, true) => NarrowValueMode::SignExtend64,
                 (false, false) => NarrowValueMode::ZeroExtend64,
             };
+            let rn = input_to_reg(ctx, inputs[0], narrow_mode);

             if ty_bits(ty) < 128 {
                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
-                let rn = input_to_reg(ctx, inputs[0], narrow_mode);
                 let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
-                let rd = output_to_reg(ctx, outputs[0]);
                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
                 ctx.emit(Inst::CondSet { cond, rd });
             } else {
-                match ty {
-                    I8X16 | I16X8 | I32X4 => {}
-                    _ => {
-                        return Err(CodegenError::Unsupported(format!(
-                            "unsupported simd type: {:?}",
-                            ty
-                        )));
-                    }
-                };
-
-                let mut rn = input_to_reg(ctx, inputs[0], narrow_mode);
-                let mut rm = input_to_reg(ctx, inputs[1], narrow_mode);
-                let rd = output_to_reg(ctx, outputs[0]);
-
-                // 'Less than' operations are implemented by swapping
-                // the order of operands and using the 'greater than'
-                // instructions.
-                // 'Not equal' is implemented with 'equal' and inverting
-                // the result.
-                let (alu_op, swap) = match cond {
-                    Cond::Eq => (VecALUOp::Cmeq, false),
-                    Cond::Ne => (VecALUOp::Cmeq, false),
-                    Cond::Ge => (VecALUOp::Cmge, false),
-                    Cond::Gt => (VecALUOp::Cmgt, false),
-                    Cond::Le => (VecALUOp::Cmge, true),
-                    Cond::Lt => (VecALUOp::Cmgt, true),
-                    Cond::Hs => (VecALUOp::Cmhs, false),
-                    Cond::Hi => (VecALUOp::Cmhi, false),
-                    Cond::Ls => (VecALUOp::Cmhs, true),
-                    Cond::Lo => (VecALUOp::Cmhi, true),
-                    _ => unreachable!(),
-                };
-
-                if swap {
-                    std::mem::swap(&mut rn, &mut rm);
-                }
-
-                ctx.emit(Inst::VecRRR {
-                    alu_op,
-                    rd,
-                    rn,
-                    rm,
-                    ty,
-                });
-
-                if cond == Cond::Ne {
-                    ctx.emit(Inst::VecMisc {
-                        op: VecMisc2::Not,
-                        rd,
-                        rn: rd.to_reg(),
-                        ty: I8X16,
-                    });
-                }
+                let rm = input_to_reg(ctx, inputs[1], narrow_mode);
+                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
         }

@@ -1314,16 +1263,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = output_to_reg(ctx, outputs[0]);
-            match ty_bits(ty) {
-                32 => {
-                    ctx.emit(Inst::FpuCmp32 { rn, rm });
-                }
-                64 => {
-                    ctx.emit(Inst::FpuCmp64 { rn, rm });
+
+            if ty_bits(ty) < 128 {
+                match ty_bits(ty) {
+                    32 => {
+                        ctx.emit(Inst::FpuCmp32 { rn, rm });
+                    }
+                    64 => {
+                        ctx.emit(Inst::FpuCmp64 { rn, rm });
+                    }
+                    _ => panic!("Bad float size"),
                 }
-                _ => panic!("Bad float size"),
+                ctx.emit(Inst::CondSet { cond, rd });
+            } else {
+                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
-            ctx.emit(Inst::CondSet { cond, rd });
         }

         Opcode::JumpTableEntry | Opcode::JumpTableBase => {
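
Note on the encoding: the `top11`/`bit15_10` pairs added in emit.rs drop into
the usual AArch64 three-register-same vector layout, with `enc_size_for_fcmp`
supplying the `sz` bit (bit 22 of the word). The sketch below is not
Cranelift's actual emitter (the helper name and plain-u32 register numbers are
assumptions for illustration only), but it reproduces the fcmeq test vector
added above:

    // Compose a three-register-same vector instruction word:
    // top11 | Rm | bit15_10 | Rn | Rd (bits 31-21, 20-16, 15-10, 9-5, 4-0).
    fn enc_vec_rrr(top11: u32, rm: u32, bit15_10: u32, rn: u32, rd: u32) -> u32 {
        (top11 << 21) | (rm << 16) | (bit15_10 << 10) | (rn << 5) | rd
    }

    fn main() {
        // FCMEQ v28.4s, v12.4s, v4.4s: F32X4 gives enc_size_for_fcmp = 0,
        // so top11 = 0b010_01110_00_1 is used unchanged.
        let word = enc_vec_rrr(0b010_01110_00_1, 4, 0b111001, 12, 28);
        assert_eq!(word, 0x4E24_E59C);
        // Emitted little-endian, this is the "9CE5244E" string in the new test.
        assert_eq!(word.to_le_bytes(), [0x9C, 0xE5, 0x24, 0x4E]);
    }

The same composition checks out against the other two test vectors
("03E6FF6E" and "F2E6606E"), where enc_size_for_fcmp = 1 sets bit 22 for
the .2d forms.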
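Note on the lowering: AArch64 provides only the EQ, GE, and GT forms of the
vector floating-point compares, so lower_vector_compare synthesises the rest:
'less than (or equal)' swaps the operands of FCMGT/FCMGE, and 'not equal'
inverts FCMEQ with a vector NOT. (FloatCC::Lt and FloatCC::Le arrive here as
Cond::Mi and Cond::Ls via lower_fp_condcode, hence the float arms for those
conditions.) The inversion is also what IEEE 754 requires for NaN lanes:
FCMEQ yields all-zeros for an unordered lane, and NOT turns that into the
all-ones 'not equal' result. A lane-wise model of these identities — plain
Rust scalars standing in for 128-bit lanes, not Cranelift code:

    fn fcmeq(a: f32, b: f32) -> bool { a == b }
    fn fcmgt(a: f32, b: f32) -> bool { a > b }
    fn fcmge(a: f32, b: f32) -> bool { a >= b }

    fn main() {
        let pairs = [(1.0f32, 2.0), (2.0, 2.0), (f32::NAN, 2.0)];
        for &(a, b) in &pairs {
            // 'Less than' is the swapped 'greater than', likewise for LE.
            assert_eq!(a < b, fcmgt(b, a));
            assert_eq!(a <= b, fcmge(b, a));
            // 'Not equal' is the inverted 'equal'; NaN operands compare
            // not-equal, which inverting an all-zeros FCMEQ lane delivers.
            assert_eq!(a != b, !fcmeq(a, b));
        }
    }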