From 90bafae1dc2dbc42da62719961f81570ab622bdd Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Fri, 12 Jun 2020 23:19:53 +0100
Subject: [PATCH] AArch64: Implement SIMD floating-point comparisons

Copyright (c) 2020, Arm Limited.
---
 build.rs                                      |  2 +
 .../codegen/src/isa/aarch64/inst/emit.rs      |  8 ++
 .../src/isa/aarch64/inst/emit_tests.rs        | 36 ++++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 13 ++-
 cranelift/codegen/src/isa/aarch64/lower.rs    | 74 ++++++++++++++++-
 .../codegen/src/isa/aarch64/lower_inst.rs     | 82 ++++---------------
 6 files changed, 149 insertions(+), 66 deletions(-)

diff --git a/build.rs b/build.rs
index 3118925b77..b33f022ad0 100644
--- a/build.rs
+++ b/build.rs
@@ -184,6 +184,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("simd", "simd_align") => return false,
             ("simd", "simd_bitwise") => return false,
             ("simd", "simd_boolean") => return false,
+            ("simd", "simd_f32x4_cmp") => return false,
+            ("simd", "simd_f64x2_cmp") => return false,
             ("simd", "simd_i8x16_cmp") => return false,
             ("simd", "simd_i16x8_cmp") => return false,
             ("simd", "simd_i32x4_cmp") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 7668465d62..abb9aa0045 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1279,6 +1279,11 @@ impl MachInstEmit for Inst {
                     I32X4 => 0b10,
                     _ => 0,
                 };
+                let enc_size_for_fcmp = match ty {
+                    F32X4 => 0b0,
+                    F64X2 => 0b1,
+                    _ => 0,
+                };

                 let (top11, bit15_10) = match alu_op {
                     VecALUOp::SQAddScalar => {
@@ -1302,6 +1307,9 @@ impl MachInstEmit for Inst {
                     VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
                     VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
                     VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Fcmeq => (0b010_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
+                    VecALUOp::Fcmgt => (0b011_01110_10_1 | enc_size_for_fcmp << 1, 0b111001),
+                    VecALUOp::Fcmge => (0b011_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
                     VecALUOp::And => {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 05dce50151..aaf4cfbae3 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2209,6 +2209,42 @@ fn test_aarch64_binemit() {
         "cmhs v8.4s, v2.4s, v15.4s",
     ));

+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fcmeq,
+            rd: writable_vreg(28),
+            rn: vreg(12),
+            rm: vreg(4),
+            ty: F32X4,
+        },
+        "9CE5244E",
+        "fcmeq v28.4s, v12.4s, v4.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fcmgt,
+            rd: writable_vreg(3),
+            rn: vreg(16),
+            rm: vreg(31),
+            ty: F64X2,
+        },
+        "03E6FF6E",
+        "fcmgt v3.2d, v16.2d, v31.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Fcmge,
+            rd: writable_vreg(18),
+            rn: vreg(23),
+            rm: vreg(0),
+            ty: F64X2,
+        },
+        "F2E6606E",
+        "fcmge v18.2d, v23.2d, v0.2d",
+    ));
+
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::And,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 6d14d53448..bd14cf0ba7 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -225,6 +225,12 @@ pub enum VecALUOp {
     Cmhs,
     /// Compare unsigned higher or same
     Cmhi,
+    /// Floating-point compare equal
+    Fcmeq,
+    /// Floating-point compare greater than
+    Fcmgt,
+    /// Floating-point compare greater than or equal
+    Fcmge,
     /// Bitwise and
     And,
     /// Bitwise bit clear
@@ -2085,7 +2091,9 @@ impl MachInst for Inst {
             I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64),
             F32 | F64 => Ok(RegClass::V128),
             IFLAGS | FFLAGS => Ok(RegClass::I64),
-            B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 => Ok(RegClass::V128),
+            B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => {
+                Ok(RegClass::V128)
+            }
             _ => Err(CodegenError::Unsupported(format!(
                 "Unexpected SSA-value type: {}",
                 ty
@@ -2720,6 +2728,9 @@ impl ShowWithRRU for Inst {
                     VecALUOp::Cmgt => ("cmgt", true, ty),
                     VecALUOp::Cmhs => ("cmhs", true, ty),
                     VecALUOp::Cmhi => ("cmhi", true, ty),
+                    VecALUOp::Fcmeq => ("fcmeq", true, ty),
+                    VecALUOp::Fcmgt => ("fcmgt", true, ty),
+                    VecALUOp::Fcmge => ("fcmge", true, ty),
                     VecALUOp::And => ("and", true, I8X16),
                     VecALUOp::Bic => ("bic", true, I8X16),
                     VecALUOp::Orr => ("orr", true, I8X16),
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index d1526c2ae9..831eeec9bb 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -14,7 +14,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode, Type};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::CodegenResult;
+use crate::{CodegenError, CodegenResult};

 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::AArch64Backend;
@@ -726,6 +726,77 @@ pub(crate) fn lower_fp_condcode(cc: FloatCC) -> Cond {
     }
 }

+pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    mut rn: Reg,
+    mut rm: Reg,
+    ty: Type,
+    cond: Cond,
+) -> CodegenResult<()> {
+    match ty {
+        F32X4 | F64X2 | I8X16 | I16X8 | I32X4 => {}
+        _ => {
+            return Err(CodegenError::Unsupported(format!(
+                "unsupported SIMD type: {:?}",
+                ty
+            )));
+        }
+    };
+
+    let is_float = match ty {
+        F32X4 | F64X2 => true,
+        _ => false,
+    };
+    // 'Less than' operations are implemented by swapping
+    // the order of operands and using the 'greater than'
+    // instructions.
+    // 'Not equal' is implemented with 'equal' and inverting
+    // the result.
+    let (alu_op, swap) = match (is_float, cond) {
+        (false, Cond::Eq) => (VecALUOp::Cmeq, false),
+        (false, Cond::Ne) => (VecALUOp::Cmeq, false),
+        (false, Cond::Ge) => (VecALUOp::Cmge, false),
+        (false, Cond::Gt) => (VecALUOp::Cmgt, false),
+        (false, Cond::Le) => (VecALUOp::Cmge, true),
+        (false, Cond::Lt) => (VecALUOp::Cmgt, true),
+        (false, Cond::Hs) => (VecALUOp::Cmhs, false),
+        (false, Cond::Hi) => (VecALUOp::Cmhi, false),
+        (false, Cond::Ls) => (VecALUOp::Cmhs, true),
+        (false, Cond::Lo) => (VecALUOp::Cmhi, true),
+        (true, Cond::Eq) => (VecALUOp::Fcmeq, false),
+        (true, Cond::Ne) => (VecALUOp::Fcmeq, false),
+        (true, Cond::Mi) => (VecALUOp::Fcmgt, true),
+        (true, Cond::Ls) => (VecALUOp::Fcmge, true),
+        (true, Cond::Ge) => (VecALUOp::Fcmge, false),
+        (true, Cond::Gt) => (VecALUOp::Fcmgt, false),
+        _ => unreachable!(),
+    };
+
+    if swap {
+        std::mem::swap(&mut rn, &mut rm);
+    }
+
+    ctx.emit(Inst::VecRRR {
+        alu_op,
+        rd,
+        rn,
+        rm,
+        ty,
+    });
+
+    if cond == Cond::Ne {
+        ctx.emit(Inst::VecMisc {
+            op: VecMisc2::Not,
+            rd,
+            rn: rd.to_reg(),
+            ty: I8X16,
+        });
+    }
+
+    Ok(())
+}
+
 /// Determines whether this condcode interprets inputs as signed or
 /// unsigned. See the documentation for the `icmp` instruction in
 /// cranelift-codegen/meta/src/shared/instructions.rs for further insights
@@ -762,6 +833,7 @@ pub fn ty_bits(ty: Type) -> usize {
         IFLAGS | FFLAGS => 32,
         B8X8 | I8X8 | B16X4 | I16X4 | B32X2 | I32X2 => 64,
         B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 => 128,
+        F32X4 | F64X2 => 128,
         _ => panic!("ty_bits() on unknown type: {:?}", ty),
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index e77c641630..651c8c02e9 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -7,7 +7,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::{CodegenError, CodegenResult};
+use crate::CodegenResult;

 use crate::isa::aarch64::abi::*;
 use crate::isa::aarch64::inst::*;
@@ -1234,6 +1234,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let condcode = inst_condcode(ctx.data(insn)).unwrap();
             let cond = lower_condcode(condcode);
             let is_signed = condcode_is_signed(condcode);
+            let rd = output_to_reg(ctx, outputs[0]);
             let ty = ctx.input_ty(insn, 0);
             let bits = ty_bits(ty);
             let narrow_mode = match (bits <= 32, is_signed) {
@@ -1242,68 +1243,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 (false, true) => NarrowValueMode::SignExtend64,
                 (false, false) => NarrowValueMode::ZeroExtend64,
             };
+            let rn = input_to_reg(ctx, inputs[0], narrow_mode);

             if ty_bits(ty) < 128 {
                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
-                let rn = input_to_reg(ctx, inputs[0], narrow_mode);
                 let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode);
-                let rd = output_to_reg(ctx, outputs[0]);
                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
                 ctx.emit(Inst::CondSet { cond, rd });
             } else {
-                match ty {
-                    I8X16 | I16X8 | I32X4 => {}
-                    _ => {
-                        return Err(CodegenError::Unsupported(format!(
-                            "unsupported simd type: {:?}",
-                            ty
-                        )));
-                    }
-                };
-
-                let mut rn = input_to_reg(ctx, inputs[0], narrow_mode);
-                let mut rm = input_to_reg(ctx, inputs[1], narrow_mode);
-                let rd = output_to_reg(ctx, outputs[0]);
-
-                // 'Less than' operations are implemented by swapping
-                // the order of operands and using the 'greater than'
-                // instructions.
-                // 'Not equal' is implemented with 'equal' and inverting
-                // the result.
-                let (alu_op, swap) = match cond {
-                    Cond::Eq => (VecALUOp::Cmeq, false),
-                    Cond::Ne => (VecALUOp::Cmeq, false),
-                    Cond::Ge => (VecALUOp::Cmge, false),
-                    Cond::Gt => (VecALUOp::Cmgt, false),
-                    Cond::Le => (VecALUOp::Cmge, true),
-                    Cond::Lt => (VecALUOp::Cmgt, true),
-                    Cond::Hs => (VecALUOp::Cmhs, false),
-                    Cond::Hi => (VecALUOp::Cmhi, false),
-                    Cond::Ls => (VecALUOp::Cmhs, true),
-                    Cond::Lo => (VecALUOp::Cmhi, true),
-                    _ => unreachable!(),
-                };
-
-                if swap {
-                    std::mem::swap(&mut rn, &mut rm);
-                }
-
-                ctx.emit(Inst::VecRRR {
-                    alu_op,
-                    rd,
-                    rn,
-                    rm,
-                    ty,
-                });
-
-                if cond == Cond::Ne {
-                    ctx.emit(Inst::VecMisc {
-                        op: VecMisc2::Not,
-                        rd,
-                        rn: rd.to_reg(),
-                        ty: I8X16,
-                    });
-                }
+                let rm = input_to_reg(ctx, inputs[1], narrow_mode);
+                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
         }

@@ -1314,16 +1263,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = output_to_reg(ctx, outputs[0]);
-            match ty_bits(ty) {
-                32 => {
-                    ctx.emit(Inst::FpuCmp32 { rn, rm });
-                }
-                64 => {
-                    ctx.emit(Inst::FpuCmp64 { rn, rm });
+
+            if ty_bits(ty) < 128 {
+                match ty_bits(ty) {
+                    32 => {
+                        ctx.emit(Inst::FpuCmp32 { rn, rm });
+                    }
+                    64 => {
+                        ctx.emit(Inst::FpuCmp64 { rn, rm });
+                    }
+                    _ => panic!("Bad float size"),
                 }
-                _ => panic!("Bad float size"),
+                ctx.emit(Inst::CondSet { cond, rd });
+            } else {
+                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
-            ctx.emit(Inst::CondSet { cond, rd });
         }

         Opcode::JumpTableEntry | Opcode::JumpTableBase => {
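
Note on the encoding: the `top11`/`bit15_10` pairs added in emit.rs drop into
the usual AArch64 three-register-same vector layout, with `enc_size_for_fcmp`
supplying the `sz` bit (bit 22 of the word). The sketch below is not
Cranelift's actual emitter (the helper name and plain-u32 register numbers are
assumptions for illustration only), but it reproduces the fcmeq test vector
added above:

    // Compose a three-register-same vector instruction word:
    // top11 | Rm | bit15_10 | Rn | Rd (bits 31-21, 20-16, 15-10, 9-5, 4-0).
    fn enc_vec_rrr(top11: u32, rm: u32, bit15_10: u32, rn: u32, rd: u32) -> u32 {
        (top11 << 21) | (rm << 16) | (bit15_10 << 10) | (rn << 5) | rd
    }

    fn main() {
        // FCMEQ v28.4s, v12.4s, v4.4s: F32X4 gives enc_size_for_fcmp = 0,
        // so top11 = 0b010_01110_00_1 is used unchanged.
        let word = enc_vec_rrr(0b010_01110_00_1, 4, 0b111001, 12, 28);
        assert_eq!(word, 0x4E24_E59C);
        // Emitted little-endian, this is the "9CE5244E" string in the new test.
        assert_eq!(word.to_le_bytes(), [0x9C, 0xE5, 0x24, 0x4E]);
    }

The same composition checks out against the other two test vectors
("03E6FF6E" and "F2E6606E"), where enc_size_for_fcmp = 1 sets bit 22 for
the .2d forms.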
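Note on the lowering: AArch64 provides only the EQ, GE, and GT forms of the
vector floating-point compares, so lower_vector_compare synthesises the rest:
'less than (or equal)' swaps the operands of FCMGT/FCMGE, and 'not equal'
inverts FCMEQ with a vector NOT. (FloatCC::Lt and FloatCC::Le arrive here as
Cond::Mi and Cond::Ls via lower_fp_condcode, hence the float arms for those
conditions.) The inversion is also what IEEE 754 requires for NaN lanes:
FCMEQ yields all-zeros for an unordered lane, and NOT turns that into the
all-ones 'not equal' result. A lane-wise model of these identities — plain
Rust scalars standing in for 128-bit lanes, not Cranelift code:

    fn fcmeq(a: f32, b: f32) -> bool { a == b }
    fn fcmgt(a: f32, b: f32) -> bool { a > b }
    fn fcmge(a: f32, b: f32) -> bool { a >= b }

    fn main() {
        let pairs = [(1.0f32, 2.0), (2.0, 2.0), (f32::NAN, 2.0)];
        for &(a, b) in &pairs {
            // 'Less than' is the swapped 'greater than', likewise for LE.
            assert_eq!(a < b, fcmgt(b, a));
            assert_eq!(a <= b, fcmge(b, a));
            // 'Not equal' is the inverted 'equal'; NaN operands compare
            // not-equal, which inverting an all-zeros FCMEQ lane delivers.
            assert_eq!(a != b, !fcmeq(a, b));
        }
    }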