From 65e6de234437b54c1f13697aa2c817acc2c6ef7a Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 1 Jul 2020 10:30:43 -0700
Subject: [PATCH] Replace `x86_packss` with `snarrow`

Since the Wasm SIMD specification contains integer narrowing instructions (see https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md#integer-to-integer-narrowing) that lower to PACKSS*, the x86-specific `x86_packss` instruction is not necessary in the CLIF IR; it is replaced by the shared `snarrow` instruction.
---
 .../codegen/meta/src/isa/x86/encodings.rs     |  6 ++--
 .../codegen/meta/src/isa/x86/instructions.rs  | 29 -----------------
 .../codegen/meta/src/isa/x86/legalize.rs      |  4 +--
 .../codegen/meta/src/shared/instructions.rs   | 31 +++++++++++++++++++
 .../codegen/src/isa/aarch64/lower_inst.rs     |  4 +--
 .../isa/x86/simd-bitwise-legalize.clif        |  2 +-
 .../isa/x86/simd-lane-access-binemit.clif     |  4 +--
 .../isa/x86/simd-lane-access-run.clif         | 16 +++-------
 8 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 303b1bfaeb..a1d4de8ca5 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1676,6 +1676,7 @@ fn define_simd(
     let uload16x4_complex = shared.by_name("uload16x4_complex");
     let uload32x2 = shared.by_name("uload32x2");
     let uload32x2_complex = shared.by_name("uload32x2_complex");
+    let snarrow = shared.by_name("snarrow");
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
@@ -1686,7 +1687,6 @@ fn define_simd(
     let x86_fmin = x86.by_name("x86_fmin");
     let x86_movlhps = x86.by_name("x86_movlhps");
     let x86_movsd = x86.by_name("x86_movsd");
-    let x86_packss = x86.by_name("x86_packss");
     let x86_pblendw = x86.by_name("x86_pblendw");
     let x86_pextr = x86.by_name("x86_pextr");
     let x86_pinsr = x86.by_name("x86_pinsr");
@@ -1901,8 +1901,8 @@ fn define_simd(
         );
     }
     for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
-        let x86_packss = x86_packss.bind(vector(*ty, sse_vector_size));
-        e.enc_both_inferred(x86_packss, rec_fa.opcodes(*opcodes));
+        let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
     }
 
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs
index 4afbc88747..0e48784f23 100644
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -454,35 +454,6 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
-    let I16xN = &TypeVar::new(
-        "I16xN",
-        "A SIMD vector type containing integers 16-bits wide and up",
-        TypeSetBuilder::new()
-            .ints(16..32)
-            .simd_lanes(4..8)
-            .includes_scalars(false)
-            .build(),
-    );
-
-    let x = &Operand::new("x", I16xN);
-    let y = &Operand::new("y", I16xN);
-    let a = &Operand::new("a", &I16xN.split_lanes());
-
-    ig.push(
-        Inst::new(
-            "x86_packss",
-            r#"
-        Convert packed signed integers the lanes of ``x`` and ``y`` into half-width integers, using
-        signed saturation to handle overflows. For example, with notional i16x2 vectors, where 
-        ``x = [x1, x0]`` and ``y = [y1, y0]``, this operation would result in 
-        ``a = [y1', y0', x1', x0']`` (using the Intel manual's right-to-left lane ordering).
-        "#,
-            &formats.binary,
-        )
-        .operands_in(vec![x, y])
-        .operands_out(vec![a]),
-    );
-
     let x = &Operand::new("x", FxN);
     let y = &Operand::new("y", FxN);
     let a = &Operand::new("a", FxN);
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 51453322e9..30c6789dff 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -405,6 +405,7 @@ fn define_simd(
     let uadd_sat = insts.by_name("uadd_sat");
     let umax = insts.by_name("umax");
     let umin = insts.by_name("umin");
+    let snarrow = insts.by_name("snarrow");
     let ushr_imm = insts.by_name("ushr_imm");
     let ushr = insts.by_name("ushr");
     let vconst = insts.by_name("vconst");
@@ -412,7 +413,6 @@ fn define_simd(
     let vany_true = insts.by_name("vany_true");
     let vselect = insts.by_name("vselect");
 
-    let x86_packss = x86_instructions.by_name("x86_packss");
     let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
     let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
     let x86_pmins = x86_instructions.by_name("x86_pmins");
@@ -575,7 +575,7 @@ fn define_simd(
                 def!(g = raw_bitcast_i16x8_again(f)),
                 def!(h = x86_psra(g, b)),
                 // Re-pack the vector.
-                def!(z = x86_packss(e, h)),
+                def!(z = snarrow(e, h)),
             ],
         );
     }
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index fb91ae0ae9..bad56b5f27 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3883,6 +3883,37 @@ pub(crate) fn define(
         .constraints(vec![WiderOrEq(Int.clone(), IntTo.clone())]),
     );
 
+    let I16xN = &TypeVar::new(
+        "I16xN",
+        "A SIMD vector type containing integers 16-bits wide and up",
+        TypeSetBuilder::new()
+            .ints(16..32)
+            .simd_lanes(4..8)
+            .includes_scalars(false)
+            .build(),
+    );
+
+    let x = &Operand::new("x", I16xN);
+    let y = &Operand::new("y", I16xN);
+    let a = &Operand::new("a", &I16xN.split_lanes());
+
+    ig.push(
+        Inst::new(
+            "snarrow",
+            r#"
+        Combine `x` and `y` into a vector with twice the lanes but half the integer width while
+        saturating overflowing values to the signed maximum and minimum.
+
+        The lanes will be concatenated after narrowing. For example, when `x` and `y` are `i32x4`
+        and `x = [x3, x2, x1, x0]` and `y = [y3, y2, y1, y0]` (highest lane first), then after
+        narrowing the value returned is an `i16x8`: `a = [y3', y2', y1', y0', x3', x2', x1', x0']`.
+            "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let IntTo = &TypeVar::new(
         "IntTo",
         "A larger integer type with the same number of lanes",
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 82eb35f13f..2c67c1cd46 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2060,7 +2060,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Pminu
         | Opcode::X86Pmullq
         | Opcode::X86Pmuludq
-        | Opcode::X86Packss
         | Opcode::X86Punpckh
         | Opcode::X86Punpckl
         | Opcode::X86Vcvtudq2ps
@@ -2069,8 +2068,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             panic!("x86-specific opcode in supposedly arch-neutral IR!");
         }
 
-        Opcode::Iabs => unimplemented!(),
         Opcode::AvgRound => unimplemented!(),
+        Opcode::Iabs => unimplemented!(),
+        Opcode::Snarrow => unimplemented!(),
         Opcode::TlsValue => unimplemented!(),
     }
 
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
index 7193aa2b54..ad459563ef 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -34,7 +34,7 @@ block0:
     ; nextln: v9 = raw_bitcast.i16x8 v8
     ; nextln: v10 = x86_psra v9, v4
 
-    ; nextln: v2 = x86_packss v7, v10
+    ; nextln: v2 = snarrow v7, v10
     return v2
 }
 
diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
index 24bc8cfa24..e15d059eef 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
@@ -118,8 +118,8 @@ block0(v0: i32x4 [%xmm7], v1: i32x4 [%xmm6]):
             return
 }
 
-function %packss_i16x8(i16x8, i16x8) {
+function %snarrow_i16x8(i16x8, i16x8) {
 block0(v0: i16x8 [%xmm7], v1: i16x8 [%xmm8]):
-[-, %xmm7]  v2 = x86_packss v0, v1          ; bin: 66 41 0f 63 f8
+[-, %xmm7]  v2 = snarrow v0, v1             ; bin: 66 41 0f 63 f8
             return
 }
diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
index 00ebae26f6..013ea78679 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
@@ -206,15 +206,9 @@ block0:
 }
 ; run
 
-function %pack() -> b1 {
-block0:
-    v0 = vconst.i32x4 [0 1 -1 0x0001ffff]
-    v1 = vconst.i32x4 [4 5 -6 0xffffffff]
-    v2 = x86_packss v0, v1
-
-    v3 = vconst.i16x8 [0 1 -1 0x7fff 4 5 -6 0xffff]
-    v4 = icmp eq v2, v3
-    v5 = vall_true v4
-    return v5
+function %snarrow(i32x4, i32x4) -> i16x8 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = snarrow v0, v1
+    return v2
 }
-; run
+; run: %snarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 -1 0x7fff 4 5 -6 0xffff]