Add x86 pack instructions

5 years ago · fb6e8f784d
6 changed files with 63 additions and 2 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@ -1619,6 +1619,7 @@ fn define_simd(
    let x86_insertps = x86.by_name("x86_insertps");
    let x86_movlhps = x86.by_name("x86_movlhps");
    let x86_movsd = x86.by_name("x86_movsd");
+    let x86_packss = x86.by_name("x86_packss");
    let x86_pextr = x86.by_name("x86_pextr");
    let x86_pinsr = x86.by_name("x86_pinsr");
    let x86_pmaxs = x86.by_name("x86_pmaxs");
@ -1804,6 +1805,10 @@ fn define_simd(
            rec_fa.opcodes(low),
        );
    }
+    for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
+        let x86_packss = x86_packss.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred(x86_packss, rec_fa.opcodes(*opcodes));
+    }

    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
    for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@ -6,7 +6,6 @@ use crate::cdsl::instructions::{
 use crate::cdsl::operands::Operand;
 use crate::cdsl::types::ValueType;
 use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar};
-
 use crate::shared::entities::EntityRefs;
 use crate::shared::formats::Formats;
 use crate::shared::immediates::Immediates;
@ -275,7 +274,7 @@ pub(crate) fn define(
    );
    let a = &Operand::new("a", TxN).with_doc("A vector value (i.e. held in an XMM register)");
    let b = &Operand::new("b", TxN).with_doc("A vector value (i.e. held in an XMM register)");
-    let i = &Operand::new("i", uimm8,).with_doc( "An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details");
+    let i = &Operand::new("i", uimm8).with_doc("An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details");

    ig.push(
        Inst::new(
@ -410,6 +409,35 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let I16xN = &TypeVar::new(
+        "I16xN",
+        "A SIMD vector type containing integers 16-bits wide and up",
+        TypeSetBuilder::new()
+            .ints(16..32)
+            .simd_lanes(4..8)
+            .includes_scalars(false)
+            .build(),
+    );
+
+    let x = &Operand::new("x", I16xN);
+    let y = &Operand::new("y", I16xN);
+    let a = &Operand::new("a", &I16xN.split_lanes());
+
+    ig.push(
+        Inst::new(
+            "x86_packss",
+            r#"
+        Convert packed signed integers in the lanes of ``x`` and ``y`` into half-width integers, using
+        signed saturation to handle overflows. For example, with notional i16x2 vectors, where 
+        ``x = [x1, x0]`` and ``y = [y1, y0]``, this operation would result in 
+        ``a = [y1', y0', x1', x0']`` (using the Intel manual's right-to-left lane ordering).
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
    let x = &Operand::new("x", FxN);
    let y = &Operand::new("y", FxN);
    let a = &Operand::new("a", FxN);
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@ -291,6 +291,14 @@ pub static OR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
 /// Return the bitwise logical OR of packed single-precision values in xmm and x/m (SSE).
 pub static ORPS: [u8; 2] = [0x0f, 0x56];

+/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed signed byte
+/// integers in xmm1 using signed saturation (SSE2).
+pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63];
+
+/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed signed
+/// word integers in xmm1 using signed saturation (SSE2).
+pub static PACKSSDW: [u8; 3] = [0x66, 0x0f, 0x6b];
+
 /// Add packed byte integers from xmm2/m128 and xmm1 (SSE2).
 pub static PADDB: [u8; 3] = [0x66, 0x0f, 0xfc];

--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@ -2375,6 +2375,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
        | Opcode::X86Pmaxu
        | Opcode::X86Pmins
        | Opcode::X86Pminu
+        | Opcode::X86Packss
        | Opcode::X86Punpckh
        | Opcode::X86Punpckl
        | Opcode::X86ElfTlsGetAddr
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
@ -109,3 +109,9 @@ block0(v0: i32x4 [%xmm7], v1: i32x4 [%xmm6]):
 [-, %xmm7]  v2 = x86_punpckl v0, v1         ; bin: 66 0f 62 fe
            return
 }
+
+function %packss_i16x8(i16x8, i16x8) {
+block0(v0: i16x8 [%xmm7], v1: i16x8 [%xmm8]):
+[-, %xmm7]  v2 = x86_packss v0, v1          ; bin: 66 41 0f 63 f8
+            return
+}
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
@ -205,3 +205,16 @@ block0:
    return v5
 }
 ; run
+
+function %pack() -> b1 {
+block0:
+    v0 = vconst.i32x4 [0 1 -1 0x0001ffff]
+    v1 = vconst.i32x4 [4 5 -6 0xffffffff]
+    v2 = x86_packss v0, v1
+
+    v3 = vconst.i16x8 [0 1 -1 0x7fff 4 5 -6 0xffff]
+    v4 = icmp eq v2, v3
+    v5 = vall_true v4
+    return v5
+}
+; run