[x64] Add the new i64x2 comparisons

4 years ago · d730f18a78
3 changed files with 53 additions and 20 deletions
--- a/build.rs
+++ b/build.rs
@@ -182,7 +182,6 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
    }

    match (testsuite, testname) {
-        ("simd", "simd_i64x2_cmp") => return true,
        ("simd", "simd_i8x16_arith2") => return true, // Unsupported feature: proposed simd operator I8x16Popcnt
        ("simd", "simd_i64x2_arith2") => return true, // Unsupported feature: proposed simd operator I64x2Abs
        ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6
@@ -231,8 +230,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
            ("simd", "simd_boolean") | ("simd", "simd_lane") => return true,

            // These are new instructions that are not really implemented in any backend.
-            ("simd", "simd_i64x2_cmp")
-            | ("simd", "simd_i8x16_arith2")
+            ("simd", "simd_i8x16_arith2")
            | ("simd", "simd_i64x2_arith2")
            | ("simd", "simd_conversions")
            | ("simd", "simd_i16x8_extadd_pairwise_i8x16")
@@ -252,6 +250,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
            | ("simd", "simd_store64_lane")
            | ("simd", "simd_store8_lane") => return true,

+            // These are only implemented on x64.
+            ("simd", "simd_i64x2_cmp") => return !cfg!(feature = "experimental_x64"),
+
            // These are only implemented on aarch64 and x64.
            ("simd", "simd_f32x4_pmin_pmax")
            | ("simd", "simd_f64x2_pmin_pmax")
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3185,11 +3185,27 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    ),
                };

-                // Here we decide which operand to use as the read/write `dst` (ModRM reg field)
-                // and which to use as the read `input` (ModRM r/m field). In the normal case we
-                // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for
-                // the less-than cases so that we can reuse the greater-than implementation.
+                // Here we decide which operand to use as the read/write `dst` (ModRM reg field) and
+                // which to use as the read `input` (ModRM r/m field). In the normal case we use
+                // Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for the
+                // less-than cases so that we can reuse the greater-than implementation.
+                //
+                // In a surprising twist, the operands for i64x2 `sge`/`sle` must also be flipped
+                // from the normal order because of the special-case lowering for these instructions
+                // (i.e. we use PCMPGTQ with flipped operands and negate the result).
                let input = match condcode {
+                    IntCC::SignedLessThanOrEqual if ty == types::I64X2 => {
+                        let lhs = put_input_in_reg(ctx, inputs[0]);
+                        let rhs = input_to_reg_mem(ctx, inputs[1]);
+                        ctx.emit(Inst::gen_move(dst, lhs, ty));
+                        rhs
+                    }
+                    IntCC::SignedGreaterThanOrEqual if ty == types::I64X2 => {
+                        let lhs = input_to_reg_mem(ctx, inputs[0]);
+                        let rhs = put_input_in_reg(ctx, inputs[1]);
+                        ctx.emit(Inst::gen_move(dst, rhs, ty));
+                        lhs
+                    }
                    IntCC::SignedLessThan
                    | IntCC::SignedLessThanOrEqual
                    | IntCC::UnsignedLessThan
@@ -3220,10 +3236,25 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
                    }
-                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
+                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
+                        if ty != types::I64X2 =>
+                    {
                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
                    }
+                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
+                        if ty == types::I64X2 =>
+                    {
+                        // The PMINS* instruction for 64-bit lanes (`VPMINSQ`) is only available in
+                        // AVX512VL/F so we must instead compare with flipped operands and negate
+                        // the result (emitting one more instruction).
+                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst));
+                        // Emit all 1s into the `tmp` register.
+                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+                        // Invert the result of the `PCMPGT*`.
+                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+                    }
                    IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1642,16 +1642,16 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
            let a = pop1_with_bitcast(state, type_of(op), builder);
            state.push1(builder.ins().vhigh_bits(I32, a));
        }
-        Operator::I8x16Eq | Operator::I16x8Eq | Operator::I32x4Eq => {
+        Operator::I8x16Eq | Operator::I16x8Eq | Operator::I32x4Eq | Operator::I64x2Eq => {
            translate_vector_icmp(IntCC::Equal, type_of(op), builder, state)
        }
-        Operator::I8x16Ne | Operator::I16x8Ne | Operator::I32x4Ne => {
+        Operator::I8x16Ne | Operator::I16x8Ne | Operator::I32x4Ne | Operator::I64x2Ne => {
            translate_vector_icmp(IntCC::NotEqual, type_of(op), builder, state)
        }
-        Operator::I8x16GtS | Operator::I16x8GtS | Operator::I32x4GtS => {
+        Operator::I8x16GtS | Operator::I16x8GtS | Operator::I32x4GtS | Operator::I64x2GtS => {
            translate_vector_icmp(IntCC::SignedGreaterThan, type_of(op), builder, state)
        }
-        Operator::I8x16LtS | Operator::I16x8LtS | Operator::I32x4LtS => {
+        Operator::I8x16LtS | Operator::I16x8LtS | Operator::I32x4LtS | Operator::I64x2LtS => {
            translate_vector_icmp(IntCC::SignedLessThan, type_of(op), builder, state)
        }
        Operator::I8x16GtU | Operator::I16x8GtU | Operator::I32x4GtU => {
@@ -1660,10 +1660,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
        Operator::I8x16LtU | Operator::I16x8LtU | Operator::I32x4LtU => {
            translate_vector_icmp(IntCC::UnsignedLessThan, type_of(op), builder, state)
        }
-        Operator::I8x16GeS | Operator::I16x8GeS | Operator::I32x4GeS => {
+        Operator::I8x16GeS | Operator::I16x8GeS | Operator::I32x4GeS | Operator::I64x2GeS => {
            translate_vector_icmp(IntCC::SignedGreaterThanOrEqual, type_of(op), builder, state)
        }
-        Operator::I8x16LeS | Operator::I16x8LeS | Operator::I32x4LeS => {
+        Operator::I8x16LeS | Operator::I16x8LeS | Operator::I32x4LeS | Operator::I64x2LeS => {
            translate_vector_icmp(IntCC::SignedLessThanOrEqual, type_of(op), builder, state)
        }
        Operator::I8x16GeU | Operator::I16x8GeU | Operator::I32x4GeU => translate_vector_icmp(
@@ -1852,12 +1852,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
        | Operator::I64x2ExtMulHighI32x4S
        | Operator::I64x2ExtMulLowI32x4U
        | Operator::I64x2ExtMulHighI32x4U
-        | Operator::I64x2Eq
-        | Operator::I64x2Ne
-        | Operator::I64x2LtS
-        | Operator::I64x2GtS
-        | Operator::I64x2LeS
-        | Operator::I64x2GeS
        | Operator::I64x2Abs
        | Operator::I64x2AllTrue
        | Operator::I16x8ExtAddPairwiseI8x16S
@@ -2646,7 +2640,14 @@ fn type_of(operator: &Operator) -> Type {
        | Operator::V128Load64Splat { .. }
        | Operator::I64x2ExtractLane { .. }
        | Operator::I64x2ReplaceLane { .. }
+        | Operator::I64x2Eq
+        | Operator::I64x2Ne
+        | Operator::I64x2LtS
+        | Operator::I64x2GtS
+        | Operator::I64x2LeS
+        | Operator::I64x2GeS
        | Operator::I64x2Neg
+        | Operator::I64x2Abs
        | Operator::I64x2Shl
        | Operator::I64x2ShrS
        | Operator::I64x2ShrU