
x64: Add non-SSSE3 lowerings of `pshufb` (#6606)

* x64: Add non-SSSE3 lowerings of `pshufb`

Or, more accurately, add lowerings which don't use `pshufb`'s
functionality at all where possible, or otherwise fall back to a new
libcall. This particular instruction seemed uniquely difficult to
implement in the backend, so I decided to "cop out" and use a libcall
instead. The libcall will be used for `popcnt`, `shuffle`, and
`swizzle` instructions when SSSE3 isn't available (a scalar sketch of
the byte-select semantics the libcall implements follows this commit
message).

* Implement SSE2 popcnt with Hacker's Delight

* x64: Implement passing vector arguments in the fastcall convention

Windows specifies that vector arguments are passed indirectly, so handle that
here through the `ABIArg::ImplicitPtrArg` variant. Some additional
handling is also added to the general machinst backend.

* Update `gen_load_base_offset` for x64

* Fill out remaining bits of fastcall and vector parameters

* Remove now-unnecessary `Clone` bound
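
For reference, here is a scalar sketch (not part of the patch) of the
byte-select semantics that `pshufb` and the new `X86Pshufb` libcall implement:
each output byte is zero when the high bit of the corresponding mask byte is
set, and otherwise selects `src[mask[i] & 0xf]`.

fn pshufb(src: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    // Scalar model of `pshufb` / the `X86Pshufb` libcall; it mirrors the host
    // fallback added in crates/runtime/src/libcalls.rs below.
    let mut out = [0u8; 16];
    for i in 0..16 {
        // A set high bit zeroes the lane; otherwise the low 4 bits index `src`.
        out[i] = if mask[i] & 0x80 != 0 {
            0
        } else {
            src[(mask[i] & 0xf) as usize]
        };
    }
    out
}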
Alex Crichton, committed by GitHub (branch pull/6607/head, parent commit 3dfbfb61a9)
25 changed files:
   11  cranelift/codegen/src/ir/libcall.rs
    4  cranelift/codegen/src/isa/aarch64/mod.rs
    4  cranelift/codegen/src/isa/mod.rs
    4  cranelift/codegen/src/isa/riscv64/mod.rs
    4  cranelift/codegen/src/isa/s390x/mod.rs
   57  cranelift/codegen/src/isa/x64/abi.rs
    9  cranelift/codegen/src/isa/x64/inst.isle
   66  cranelift/codegen/src/isa/x64/lower.isle
   18  cranelift/codegen/src/isa/x64/lower/isle.rs
    4  cranelift/codegen/src/isa/x64/mod.rs
   35  cranelift/codegen/src/machinst/abi.rs
  235  cranelift/filetests/filetests/isa/x64/call-conv.clif
    2  cranelift/filetests/filetests/runtests/simd-popcnt.clif
    9  cranelift/filetests/filetests/runtests/simd-shuffle.clif
    1  cranelift/filetests/filetests/runtests/simd-splat.clif
    6  cranelift/filetests/filetests/runtests/simd-swizzle.clif
   59  cranelift/filetests/src/function_runner.rs
    1  cranelift/module/src/lib.rs
    4  cranelift/wasm/src/code_translator.rs
    6  cranelift/wasm/src/environ/spec.rs
    1  crates/cranelift-shared/src/obj.rs
    4  crates/cranelift/src/func_environ.rs
    1  crates/environ/src/obj.rs
    4  crates/jit/src/code_memory.rs
   48  crates/runtime/src/libcalls.rs

11
cranelift/codegen/src/ir/libcall.rs

@ -56,6 +56,9 @@ pub enum LibCall {
ElfTlsGetAddr,
/// Elf __tls_get_offset
ElfTlsGetOffset,
/// The `pshufb` on x86 when SSSE3 isn't available.
X86Pshufb,
// When adding a new variant make sure to add it to `all_libcalls` too.
}
@ -88,6 +91,8 @@ impl FromStr for LibCall {
"ElfTlsGetAddr" => Ok(Self::ElfTlsGetAddr),
"ElfTlsGetOffset" => Ok(Self::ElfTlsGetOffset),
"X86Pshufb" => Ok(Self::X86Pshufb),
_ => Err(()),
}
}
@ -115,6 +120,7 @@ impl LibCall {
Memcmp,
ElfTlsGetAddr,
ElfTlsGetOffset,
X86Pshufb,
]
}
@ -166,6 +172,11 @@ impl LibCall {
LibCall::Probestack | LibCall::ElfTlsGetAddr | LibCall::ElfTlsGetOffset => {
unimplemented!()
}
LibCall::X86Pshufb => {
sig.params.push(AbiParam::new(I8X16));
sig.params.push(AbiParam::new(I8X16));
sig.returns.push(AbiParam::new(I8X16));
}
}
sig

4
cranelift/codegen/src/isa/aarch64/mod.rs

@ -211,6 +211,10 @@ impl TargetIsa for AArch64Backend {
fn has_x86_blendv_lowering(&self, _: Type) -> bool {
false
}
fn has_x86_pshufb_lowering(&self) -> bool {
false
}
}
impl fmt::Display for AArch64Backend {

4
cranelift/codegen/src/isa/mod.rs

@ -344,6 +344,10 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
/// Returns whether the CLIF `x86_blendv` instruction is implemented for
/// this ISA for the specified type.
fn has_x86_blendv_lowering(&self, ty: Type) -> bool;
/// Returns whether the CLIF `x86_pshufb` instruction is implemented for
/// this ISA.
fn has_x86_pshufb_lowering(&self) -> bool;
}
/// Function alignment specifications as required by an ISA, returned by

4
cranelift/codegen/src/isa/riscv64/mod.rs

@ -186,6 +186,10 @@ impl TargetIsa for Riscv64Backend {
fn has_x86_blendv_lowering(&self, _: Type) -> bool {
false
}
fn has_x86_pshufb_lowering(&self) -> bool {
false
}
}
impl fmt::Display for Riscv64Backend {

4
cranelift/codegen/src/isa/s390x/mod.rs

@ -186,6 +186,10 @@ impl TargetIsa for S390xBackend {
fn has_x86_blendv_lowering(&self, _: Type) -> bool {
false
}
fn has_x86_pshufb_lowering(&self) -> bool {
false
}
}
impl fmt::Display for S390xBackend {

57
cranelift/codegen/src/isa/x64/abi.rs

@ -154,6 +154,44 @@ impl ABIMachineSpec for X64ABIMachineSpec {
);
}
// Windows fastcall dictates that `__m128i` parameters to a function
// are passed indirectly as pointers, so handle that as a special
// case before the loop below.
if param.value_type.is_vector()
&& param.value_type.bits() >= 128
&& args_or_rets == ArgsOrRets::Args
&& is_fastcall
{
let pointer = match get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) {
Some(reg) => {
next_gpr += 1;
ABIArgSlot::Reg {
reg: reg.to_real_reg().unwrap(),
ty: ir::types::I64,
extension: ir::ArgumentExtension::None,
}
}
None => {
next_stack = align_to(next_stack, 8) + 8;
ABIArgSlot::Stack {
offset: (next_stack - 8) as i64,
ty: ir::types::I64,
extension: param.extension,
}
}
};
next_param_idx += 1;
args.push(ABIArg::ImplicitPtrArg {
// NB: this is filled in after this loop
offset: 0,
pointer,
ty: param.value_type,
purpose: param.purpose,
});
continue;
}
let mut slots = ABIArgSlotVec::new();
for (rc, reg_ty) in rcs.iter().zip(reg_tys.iter()) {
let intreg = *rc == RegClass::Int;
@ -221,6 +259,20 @@ impl ABIMachineSpec for X64ABIMachineSpec {
});
}
// Fastcall's indirect 128+ bit vector arguments are all located on the
// stack, and stack space is reserved after all parameters are passed,
// so allocate from the space now.
if args_or_rets == ArgsOrRets::Args && is_fastcall {
for arg in args.args_mut() {
if let ABIArg::ImplicitPtrArg { offset, .. } = arg {
assert_eq!(*offset, 0);
next_stack = align_to(next_stack, 16);
*offset = next_stack as i64;
next_stack += 16;
}
}
}
let extra_arg = if add_ret_area_ptr {
debug_assert!(args_or_rets == ArgsOrRets::Args);
if let Some(reg) = get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) {
@ -348,8 +400,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
}
fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I {
// Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed.
assert_eq!(ty, I64);
// Only ever used for I64s and vectors; if that changes, see if the
// ExtKind below needs to be changed.
assert!(ty == I64 || ty.is_vector());
let simm32 = offset as u32;
let mem = Amode::imm_reg(simm32, base);
Inst::load(ty, mem, into_reg, ExtKind::None)

9
cranelift/codegen/src/isa/x64/inst.isle

@ -4957,6 +4957,7 @@
(convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned)
(convert VCodeConstant SyntheticAmode const_to_synthetic_amode)
(convert VCodeConstant XmmMem const_to_xmm_mem)
(convert VCodeConstant RegMem const_to_reg_mem)
(convert IntCC CC intcc_to_cc)
(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
@ -5010,6 +5011,8 @@
(extern constructor const_to_synthetic_amode const_to_synthetic_amode)
(decl const_to_xmm_mem (VCodeConstant) XmmMem)
(rule (const_to_xmm_mem c) (const_to_synthetic_amode c))
(decl const_to_reg_mem (VCodeConstant) RegMem)
(rule (const_to_reg_mem c) (RegMem.Mem (const_to_synthetic_amode c)))
(decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned)
(rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg))
@ -5054,10 +5057,14 @@
NearestF32
NearestF64
TruncF32
TruncF64))
TruncF64
X86Pshufb))
(decl libcall_1 (LibCall Reg) Reg)
(extern constructor libcall_1 libcall_1)
(decl libcall_2 (LibCall Reg Reg) Reg)
(extern constructor libcall_2 libcall_2)
(decl libcall_3 (LibCall Reg Reg Reg) Reg)
(extern constructor libcall_3 libcall_3)

66
cranelift/codegen/src/isa/x64/lower.isle

@ -2095,11 +2095,11 @@
;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 3 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
(if-let $true (use_popcnt))
(x64_popcnt ty src))
(rule 2 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
(if-let $true (use_popcnt))
(x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
@ -2192,7 +2192,7 @@
final))
(rule 1 (lower (has_type $I8X16 (popcnt src)))
(rule 2 (lower (has_type $I8X16 (popcnt src)))
(if-let $true (use_avx512vl_simd))
(if-let $true (use_avx512bitalg_simd))
(x64_vpopcntb src))
@ -2218,8 +2218,8 @@
;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
(rule (lower (has_type $I8X16
(popcnt src)))
(rule 1 (lower (has_type $I8X16 (popcnt src)))
(if-let $true (use_ssse3))
(let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
(low_nibbles Xmm (sse_and $I8X16 src low_mask))
;; Note that this is a 16x8 shift, but that's OK; we mask
@ -2233,6 +2233,19 @@
(bit_counts_high Xmm (x64_pshufb lookup high_nibbles)))
(x64_paddb bit_counts_low bit_counts_high)))
;; A modified version of the popcnt method from Hacker's Delight.
(rule (lower (has_type $I8X16 (popcnt src)))
(let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777))
(src Xmm src)
(shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
(shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
(shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
(src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4)))))
(x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))))
;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8 (bitrev src)))
@ -4181,7 +4194,8 @@
;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
;; register.
(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
(x64_pshufb a (xmm_zero $I8X16)))
(if-let $true (use_ssse3))
(x64_pshufb a (xmm_zero $I8X16)))
;; Special case for the `shufps` instruction which will select two 32-bit values
;; from the first operand and two 32-bit values from the second operand. Note
@ -4209,7 +4223,8 @@
;; indices (may not be completely necessary: verification could fail incorrect
;; mask values) and fix the indexes to all point to the `dst` vector.
(rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
(x64_pshufb a (shuffle_0_31_mask mask)))
(if-let $true (use_ssse3))
(x64_pshufb a (shuffle_0_31_mask mask)))
;; For the case where the shuffle mask contains out-of-bounds values (values
;; greater than 31) we must mask off those resulting values in the result of
@ -4231,8 +4246,8 @@
;; above, we build the `constructed_mask` for each case statically.
(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
(x64_por
(x64_pshufb a (shuffle_0_15_mask mask))
(x64_pshufb b (shuffle_16_31_mask mask))))
(lower_pshufb a (shuffle_0_15_mask mask))
(lower_pshufb b (shuffle_16_31_mask mask))))
;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -4244,13 +4259,28 @@
;; variables like: %dst = swizzle %src, %mask
(rule (lower (swizzle src mask))
(let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070))))
(x64_pshufb src mask)))
(lower_pshufb src mask)))
;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (x86_pshufb src mask))
(if-let $true (use_ssse3))
(x64_pshufb src mask))
;; A helper function to generate either the `pshufb` instruction or a call to
;; the `X86Pshufb` libcall. Note that the libcall is not exactly the most
;; performant thing in the world, so this is primarily here for completeness
;; of lowerings on all x86 CPUs; rules are ideally gated on the presence
;; of SSSE3 so that the `pshufb` instruction itself is used.
(decl lower_pshufb (Xmm RegMem) Xmm)
(rule 1 (lower_pshufb src mask)
(if-let $true (use_ssse3))
(x64_pshufb src mask))
(rule (lower_pshufb src (RegMem.Reg mask))
(libcall_2 (LibCall.X86Pshufb) src mask))
(rule (lower_pshufb src (RegMem.Mem addr))
(lower_pshufb src (x64_movdqu_load addr)))
;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Remove the extractlane instruction, leaving the float where it is. The upper
@ -4343,14 +4373,18 @@
;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
;; with a mask of zero which is calculated with an xor-against-itself register.
(rule 0 (lower (has_type $I8X16 (splat src)))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(let ((src Xmm (x64_movd_to_xmm src)))
(x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0)))
(rule 1 (lower (has_type $I8X16 (splat src)))
(if-let $true (use_ssse3))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(rule 2 (lower (has_type $I8X16 (splat src)))
(if-let $true (use_avx2_simd))
(x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (use_sse41))
(x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (use_avx2_simd))
(x64_vpbroadcastb addr))
@ -4399,10 +4433,10 @@
;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `use_avx_simd` test.
(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(let ((tmp Xmm (x64_movss_load addr)))
(x64_shufps tmp tmp 0)))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(if-let $true (use_avx_simd))
(x64_vbroadcastss addr))
@ -4413,7 +4447,7 @@
(x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00))
(rule 0 (lower (has_type $F64X2 (splat src)))
(x64_pshufd src 0b01_00_01_00))
(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(if-let $true (use_ssse3))
(x64_movddup addr))
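
The SSE2 `popcnt` rule added to this file above follows the Hacker's Delight
shift-and-subtract method; for clarity, here is a scalar per-byte sketch of the
same computation (illustration only, not part of the patch):

fn popcnt_byte(x: u8) -> u8 {
    // Within each nibble, popcount(v) = v - (v>>1) - (v>>2) - (v>>3); the 0x77
    // mask keeps the shifted values from crossing nibble boundaries (this is
    // what the psrlq/pand/psubb sequence in the rule computes lane-wise).
    let mut n = (x >> 1) & 0x77;
    let mut x = x.wrapping_sub(n);
    n = (n >> 1) & 0x77;
    x = x.wrapping_sub(n);
    n = (n >> 1) & 0x77;
    x = x.wrapping_sub(n);
    // Fold the two nibble counts of each byte together and mask, matching the
    // final psrlw-by-4, paddb, and pand against 0x0f..0f in the rule.
    x = x.wrapping_add(x >> 4);
    x & 0x0f
}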

18
cranelift/codegen/src/isa/x64/lower/isle.rs

@ -645,6 +645,24 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
output_reg.to_reg()
}
fn libcall_2(&mut self, libcall: &LibCall, a: Reg, b: Reg) -> Reg {
let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs());
let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type;
let output_reg = self.lower_ctx.alloc_tmp(ret_ty).only_reg().unwrap();
emit_vm_call(
self.lower_ctx,
&self.backend.flags,
&self.backend.triple,
libcall.clone(),
&[a, b],
&[output_reg],
)
.expect("Failed to emit LibCall");
output_reg.to_reg()
}
fn libcall_3(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg {
let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs());
let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type;

4
cranelift/codegen/src/isa/x64/mod.rs

@ -186,6 +186,10 @@ impl TargetIsa for X64Backend {
// operation, so that always returns `false`
self.x64_flags.use_sse41() && ty != types::I16X8
}
fn has_x86_pshufb_lowering(&self) -> bool {
self.x64_flags.use_ssse3()
}
}
impl fmt::Display for X64Backend {

35
cranelift/codegen/src/machinst/abi.rs

@ -2193,7 +2193,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
from_regs: ValueRegs<Reg>,
) {
match &ctx.sigs().args(self.sig)[idx] {
&ABIArg::Slots { .. } => {}
&ABIArg::Slots { .. } | &ABIArg::ImplicitPtrArg { .. } => {}
&ABIArg::StructArg { offset, size, .. } => {
let src_ptr = from_regs.only_reg().unwrap();
let dst_ptr = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
@ -2220,7 +2220,6 @@ impl<M: ABIMachineSpec> CallSite<M> {
ctx.emit(insn);
}
}
&ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE.
}
}
@ -2260,6 +2259,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
&ABIArgSlot::Stack { .. } => 0,
})
.sum(),
ABIArg::ImplicitPtrArg { .. } => 1,
_ => 0,
};
let mut temps: SmallVec<[Writable<Reg>; 16]> = (0..needed_tmps)
@ -2355,7 +2355,36 @@ impl<M: ABIMachineSpec> CallSite<M> {
&ABIArg::StructArg { pointer, .. } => {
assert!(pointer.is_none()); // Only supported via ISLE.
}
&ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE.
&ABIArg::ImplicitPtrArg {
offset,
pointer,
ty,
purpose: _,
} => {
assert_eq!(from_regs.len(), 1);
let vreg = from_regs.regs()[0];
let amode = StackAMode::SPOffset(offset, ty);
let tmp = temps[0];
insts.push(M::gen_get_stack_addr(amode, tmp, ty));
let tmp = tmp.to_reg();
insts.push(M::gen_store_base_offset(tmp, 0, vreg, ty));
match pointer {
ABIArgSlot::Reg { reg, .. } => {
self.uses.push(CallArgPair {
vreg: tmp,
preg: reg.into(),
});
}
ABIArgSlot::Stack { offset, .. } => {
let ty = M::word_type();
insts.push(M::gen_store_stack(
StackAMode::SPOffset(offset, ty),
tmp,
ty,
));
}
};
}
}
insts
}

235
cranelift/filetests/filetests/isa/x64/call-conv.clif

@ -660,3 +660,238 @@ block0(v0: f32, v1: i64, v2: i32, v3: f32):
; popq %rbp
; retq
function %fastcall_m128i_param(i32, i8x16) system_v {
sig0 = (i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; subq %rsp, $48, %rsp
; virtual_sp_offset_adjust 48
; lea 32(%rsp), %rcx
; movdqu %xmm0, 0(%rcx)
; call *%rdi
; addq %rsp, $48, %rsp
; virtual_sp_offset_adjust -48
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; subq $0x30, %rsp
; leaq 0x20(%rsp), %rcx
; movdqu %xmm0, (%rcx)
; callq *%rdi
; addq $0x30, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_params_and_results(i32, i32, i8x16, i64, i8x16) -> i8x16 system_v {
sig0 = (i32, i8x16, i64, i8x16) -> i8x16 windows_fastcall
block0(v0: i32, v1: i32, v2: i8x16, v3: i64, v4: i8x16):
v5 = call_indirect sig0, v0(v1, v2, v3, v4)
v6 = iadd v5, v5
return v6
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdx, %r8
; movq %rsi, %rcx
; subq %rsp, $64, %rsp
; virtual_sp_offset_adjust 64
; lea 32(%rsp), %rdx
; movdqu %xmm0, 0(%rdx)
; lea 48(%rsp), %r9
; movdqu %xmm1, 0(%r9)
; call *%rdi
; addq %rsp, $64, %rsp
; virtual_sp_offset_adjust -64
; paddb %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdx, %r8
; movq %rsi, %rcx
; subq $0x40, %rsp
; leaq 0x20(%rsp), %rdx
; movdqu %xmm0, (%rdx)
; leaq 0x30(%rsp), %r9
; movdqu %xmm1, (%r9)
; callq *%rdi
; addq $0x40, %rsp
; paddb %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_one_stack_param(i32, i8x16) system_v {
sig0 = (i32, i32, i32, i32, i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v0, v0, v0, v0, v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; subq %rsp, $64, %rsp
; virtual_sp_offset_adjust 64
; lea 48(%rsp), %r8
; movdqu %xmm0, 0(%r8)
; movq %r8, 32(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; call *%r9
; addq %rsp, $64, %rsp
; virtual_sp_offset_adjust -64
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; subq $0x40, %rsp
; leaq 0x30(%rsp), %r8
; movdqu %xmm0, (%r8)
; movq %r8, 0x20(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; callq *%r9
; addq $0x40, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_two_stack_param(i32, i8x16) system_v {
sig0 = (i32, i32, i32, i32, i8x16, i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v0, v0, v0, v0, v1, v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; subq %rsp, $80, %rsp
; virtual_sp_offset_adjust 80
; lea 48(%rsp), %r8
; movdqu %xmm0, 0(%r8)
; movq %r8, 32(%rsp)
; lea 64(%rsp), %rsi
; movdqu %xmm0, 0(%rsi)
; movq %rsi, 40(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; call *%r9
; addq %rsp, $80, %rsp
; virtual_sp_offset_adjust -80
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; subq $0x50, %rsp
; leaq 0x30(%rsp), %r8
; movdqu %xmm0, (%r8)
; movq %r8, 0x20(%rsp)
; leaq 0x40(%rsp), %rsi
; movdqu %xmm0, (%rsi)
; movq %rsi, 0x28(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; callq *%r9
; addq $0x50, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_reg_and_stack_param(i32, i8x16) system_v {
sig0 = (i32, i8x16, i32, i32, i8x16, i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v0, v1, v0, v0, v1, v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; subq %rsp, $96, %rsp
; virtual_sp_offset_adjust 96
; lea 48(%rsp), %rdx
; movdqu %xmm0, 0(%rdx)
; lea 64(%rsp), %r11
; movdqu %xmm0, 0(%r11)
; movq %r11, 32(%rsp)
; lea 80(%rsp), %rcx
; movdqu %xmm0, 0(%rcx)
; movq %rcx, 40(%rsp)
; movq %r9, %rcx
; movq %r9, %r8
; call *%r9
; addq %rsp, $96, %rsp
; virtual_sp_offset_adjust -96
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; subq $0x60, %rsp
; leaq 0x30(%rsp), %rdx
; movdqu %xmm0, (%rdx)
; leaq 0x40(%rsp), %r11
; movdqu %xmm0, (%r11)
; movq %r11, 0x20(%rsp)
; leaq 0x50(%rsp), %rcx
; movdqu %xmm0, (%rcx)
; movq %rcx, 0x28(%rsp)
; movq %r9, %rcx
; movq %r9, %r8
; callq *%r9
; addq $0x60, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
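
To make the frame layout above concrete, take `%fastcall_m128i_one_stack_param`
(reading directly off the disassembly; the Windows x64 convention passes
`__m128i` arguments as pointers to 16-byte-aligned, caller-allocated memory):

  - bytes 0x00-0x1f of the 0x40-byte allocation are the fastcall shadow space
    for the four register arguments;
  - offset 0x20 is the fifth argument's stack slot, which receives a pointer
    rather than the vector itself;
  - offset 0x30 is the 16-byte-aligned slot holding the vector data, whose
    address (`lea 0x30(%rsp), %r8`) is what gets stored at 0x20.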

2
cranelift/filetests/filetests/runtests/simd-popcnt.clif

@ -2,7 +2,9 @@ test interpret
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64
target x86_64 sse42
target x86_64 sse42 has_avx has_avx512vl has_avx512bitalg
target riscv64 has_v

9
cranelift/filetests/filetests/runtests/simd-shuffle.clif

@ -2,10 +2,13 @@
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
target x86_64
target x86_64 sse41
target x86_64 sse42
target x86_64 sse42 has_avx
target x86_64 sse42 has_avx has_avx512vl has_avx512vbmi
target riscv64gc has_v
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {

1
cranelift/filetests/filetests/runtests/simd-splat.clif

@ -2,6 +2,7 @@
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64
target x86_64 sse41

6
cranelift/filetests/filetests/runtests/simd-swizzle.clif

@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx
target riscv64gc has_v
function %swizzle_i8x16(i8x16, i8x16) -> i8x16 {

59
cranelift/filetests/src/function_runner.rs

@ -89,7 +89,18 @@ impl TestFileCompiler {
/// host machine, this [TargetIsa] must match the host machine's ISA (see
/// [TestFileCompiler::with_host_isa]).
pub fn new(isa: OwnedTargetIsa) -> Self {
let builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
let mut builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
drop(&mut builder); // require mutability on all architectures
#[cfg(target_arch = "x86_64")]
{
builder.symbol_lookup_fn(Box::new(|name| {
if name == "__cranelift_x86_pshufb" {
Some(__cranelift_x86_pshufb as *const u8)
} else {
None
}
}));
}
let module = JITModule::new(builder);
let ctx = module.make_context();
@ -500,6 +511,52 @@ fn make_trampoline(name: UserFuncName, signature: &ir::Signature, isa: &dyn Targ
func
}
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::__m128i;
#[cfg(target_arch = "x86_64")]
#[allow(improper_ctypes_definitions)]
extern "C" fn __cranelift_x86_pshufb(a: __m128i, b: __m128i) -> __m128i {
union U {
reg: __m128i,
mem: [u8; 16],
}
unsafe {
let a = U { reg: a }.mem;
let b = U { reg: b }.mem;
let select = |arr: &[u8; 16], byte: u8| {
if byte & 0x80 != 0 {
0x00
} else {
arr[(byte & 0xf) as usize]
}
};
U {
mem: [
select(&a, b[0]),
select(&a, b[1]),
select(&a, b[2]),
select(&a, b[3]),
select(&a, b[4]),
select(&a, b[5]),
select(&a, b[6]),
select(&a, b[7]),
select(&a, b[8]),
select(&a, b[9]),
select(&a, b[10]),
select(&a, b[11]),
select(&a, b[12]),
select(&a, b[13]),
select(&a, b[14]),
select(&a, b[15]),
],
}
.reg
}
}
#[cfg(test)]
mod test {
use super::*;

1
cranelift/module/src/lib.rs

@ -72,5 +72,6 @@ pub fn default_libcall_names() -> Box<dyn Fn(ir::LibCall) -> String + Send + Syn
ir::LibCall::ElfTlsGetAddr => "__tls_get_addr".to_owned(),
ir::LibCall::ElfTlsGetOffset => "__tls_get_offset".to_owned(),
ir::LibCall::X86Pshufb => "__cranelift_x86_pshufb".to_owned(),
})
}

4
cranelift/wasm/src/code_translator.rs

@ -2197,7 +2197,9 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
Operator::I8x16RelaxedSwizzle => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
if environ.relaxed_simd_deterministic()
|| !environ.use_x86_pshufb_for_relaxed_swizzle()
{
// Deterministic semantics match the `i8x16.swizzle`
// instruction which is the CLIF `swizzle`.
builder.ins().swizzle(a, b)
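
For context (not part of the patch): the deterministic branch shown above emits
CLIF `swizzle`, while the other branch (elided in this hunk) emits `x86_pshufb`;
the two differ only for mask bytes in the 16..=127 range, roughly:

// Deterministic semantics (CLIF `swizzle`): any index >= 16 selects zero.
fn swizzle_lane(a: &[u8; 16], idx: u8) -> u8 {
    if (idx as usize) < 16 { a[idx as usize] } else { 0 }
}

// x86 `pshufb` semantics: only bit 7 zeroes the lane; bits 4..6 are ignored.
fn pshufb_lane(a: &[u8; 16], idx: u8) -> u8 {
    if idx & 0x80 != 0 { 0 } else { a[(idx & 0xf) as usize] }
}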

6
cranelift/wasm/src/environ/spec.rs

@ -569,6 +569,12 @@ pub trait FuncEnvironment: TargetEnvironment {
let _ = ty;
false
}
/// Returns whether the CLIF `x86_pshufb` instruction should be used for the
/// `i8x16.relaxed_swizzle` instruction.
fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
false
}
}
/// An object satisfying the `ModuleEnvironment` trait can be passed as argument to the

1
crates/cranelift-shared/src/obj.rs

@ -558,6 +558,7 @@ fn libcall_name(call: LibCall) -> &'static str {
LibCall::TruncF64 => LC::TruncF64,
LibCall::FmaF32 => LC::FmaF32,
LibCall::FmaF64 => LC::FmaF64,
LibCall::X86Pshufb => LC::X86Pshufb,
_ => panic!("unknown libcall to give a name to: {call:?}"),
};
other.symbol()

4
crates/cranelift/src/func_environ.rs

@ -2203,4 +2203,8 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
fn use_x86_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool {
self.isa.has_x86_blendv_lowering(ty)
}
fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
self.isa.has_x86_pshufb_lowering()
}
}

1
crates/environ/src/obj.rs

@ -168,4 +168,5 @@ libcalls! {
TruncF64 = "libcall_truncf64"
FmaF32 = "libcall_fmaf32"
FmaF64 = "libcall_fmaf64"
X86Pshufb = "libcall_x86_pshufb"
}

4
crates/jit/src/code_memory.rs

@ -284,6 +284,10 @@ impl CodeMemory {
obj::LibCall::TruncF64 => libcalls::relocs::truncf64 as usize,
obj::LibCall::FmaF32 => libcalls::relocs::fmaf32 as usize,
obj::LibCall::FmaF64 => libcalls::relocs::fmaf64 as usize,
#[cfg(target_arch = "x86_64")]
obj::LibCall::X86Pshufb => libcalls::relocs::x86_pshufb as usize,
#[cfg(not(target_arch = "x86_64"))]
obj::LibCall::X86Pshufb => unreachable!(),
};
self.mmap
.as_mut_ptr()

48
crates/runtime/src/libcalls.rs

@ -575,4 +575,52 @@ pub mod relocs {
pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 {
a.mul_add(b, c)
}
// This intrinsic is only used on x86_64 platforms as an implementation of
// the `pshufb` instruction when SSSE3 is not available.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::__m128i;
#[cfg(target_arch = "x86_64")]
#[allow(improper_ctypes_definitions)]
pub extern "C" fn x86_pshufb(a: __m128i, b: __m128i) -> __m128i {
union U {
reg: __m128i,
mem: [u8; 16],
}
unsafe {
let a = U { reg: a }.mem;
let b = U { reg: b }.mem;
let select = |arr: &[u8; 16], byte: u8| {
if byte & 0x80 != 0 {
0x00
} else {
arr[(byte & 0xf) as usize]
}
};
U {
mem: [
select(&a, b[0]),
select(&a, b[1]),
select(&a, b[2]),
select(&a, b[3]),
select(&a, b[4]),
select(&a, b[5]),
select(&a, b[6]),
select(&a, b[7]),
select(&a, b[8]),
select(&a, b[9]),
select(&a, b[10]),
select(&a, b[11]),
select(&a, b[12]),
select(&a, b[13]),
select(&a, b[14]),
select(&a, b[15]),
],
}
.reg
}
}
}
