diff --git a/cranelift/codegen/src/ir/libcall.rs b/cranelift/codegen/src/ir/libcall.rs
index e6b2d539d0..77a680c93b 100644
--- a/cranelift/codegen/src/ir/libcall.rs
+++ b/cranelift/codegen/src/ir/libcall.rs
@@ -56,6 +56,9 @@ pub enum LibCall {
     ElfTlsGetAddr,
     /// Elf __tls_get_offset
     ElfTlsGetOffset,
+
+    /// Fallback for the x86 `pshufb` instruction when SSSE3 isn't available.
+    X86Pshufb,
     // When adding a new variant make sure to add it to `all_libcalls` too.
 }
 
@@ -88,6 +91,8 @@ impl FromStr for LibCall {
 
             "ElfTlsGetAddr" => Ok(Self::ElfTlsGetAddr),
             "ElfTlsGetOffset" => Ok(Self::ElfTlsGetOffset),
+
+            "X86Pshufb" => Ok(Self::X86Pshufb),
             _ => Err(()),
         }
     }
@@ -115,6 +120,7 @@ impl LibCall {
             Memcmp,
             ElfTlsGetAddr,
             ElfTlsGetOffset,
+            X86Pshufb,
         ]
     }
 
@@ -166,6 +172,11 @@ impl LibCall {
             LibCall::Probestack | LibCall::ElfTlsGetAddr | LibCall::ElfTlsGetOffset => {
                 unimplemented!()
             }
+            LibCall::X86Pshufb => {
+                sig.params.push(AbiParam::new(I8X16));
+                sig.params.push(AbiParam::new(I8X16));
+                sig.returns.push(AbiParam::new(I8X16));
+            }
         }
 
         sig
diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs
index 87633accfb..0d34e07b4a 100644
--- a/cranelift/codegen/src/isa/aarch64/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/mod.rs
@@ -211,6 +211,10 @@ impl TargetIsa for AArch64Backend {
     fn has_x86_blendv_lowering(&self, _: Type) -> bool {
         false
     }
+
+    fn has_x86_pshufb_lowering(&self) -> bool {
+        false
+    }
 }
 
 impl fmt::Display for AArch64Backend {
diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs
index 404d7982b4..1fe2b5e47b 100644
--- a/cranelift/codegen/src/isa/mod.rs
+++ b/cranelift/codegen/src/isa/mod.rs
@@ -344,6 +344,10 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
     /// Returns whether the CLIF `x86_blendv` instruction is implemented for
     /// this ISA for the specified type.
     fn has_x86_blendv_lowering(&self, ty: Type) -> bool;
+
+    /// Returns whether the CLIF `x86_pshufb` instruction is implemented for
+    /// this ISA.
+    fn has_x86_pshufb_lowering(&self) -> bool;
 }
 
 /// Function alignment specifications as required by an ISA, returned by
diff --git a/cranelift/codegen/src/isa/riscv64/mod.rs b/cranelift/codegen/src/isa/riscv64/mod.rs
index 6844e1708f..af66580a0f 100644
--- a/cranelift/codegen/src/isa/riscv64/mod.rs
+++ b/cranelift/codegen/src/isa/riscv64/mod.rs
@@ -186,6 +186,10 @@ impl TargetIsa for Riscv64Backend {
     fn has_x86_blendv_lowering(&self, _: Type) -> bool {
         false
     }
+
+    fn has_x86_pshufb_lowering(&self) -> bool {
+        false
+    }
 }
 
 impl fmt::Display for Riscv64Backend {
diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs
index 8a08ba9b83..238f7d5a1f 100644
--- a/cranelift/codegen/src/isa/s390x/mod.rs
+++ b/cranelift/codegen/src/isa/s390x/mod.rs
@@ -186,6 +186,10 @@ impl TargetIsa for S390xBackend {
     fn has_x86_blendv_lowering(&self, _: Type) -> bool {
         false
     }
+
+    fn has_x86_pshufb_lowering(&self) -> bool {
+        false
+    }
 }
 
 impl fmt::Display for S390xBackend {
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index 28b2f7c4df..1e24ef10a3 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -154,6 +154,44 @@ impl ABIMachineSpec for X64ABIMachineSpec {
                 );
             }
 
+            // Windows fastcall dictates that `__m128i` parameters to a function
+            // are passed indirectly as pointers, so handle that as a special
+            // case before the loop below.
+            if param.value_type.is_vector()
+                && param.value_type.bits() >= 128
+                && args_or_rets == ArgsOrRets::Args
+                && is_fastcall
+            {
+                let pointer = match get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) {
+                    Some(reg) => {
+                        next_gpr += 1;
+                        ABIArgSlot::Reg {
+                            reg: reg.to_real_reg().unwrap(),
+                            ty: ir::types::I64,
+                            extension: ir::ArgumentExtension::None,
+                        }
+                    }
+
+                    None => {
+                        next_stack = align_to(next_stack, 8) + 8;
+                        ABIArgSlot::Stack {
+                            offset: (next_stack - 8) as i64,
+                            ty: ir::types::I64,
+                            extension: param.extension,
+                        }
+                    }
+                };
+                next_param_idx += 1;
+                args.push(ABIArg::ImplicitPtrArg {
+                    // NB: this is filled in after this loop
+                    offset: 0,
+                    pointer,
+                    ty: param.value_type,
+                    purpose: param.purpose,
+                });
+                continue;
+            }
+
             let mut slots = ABIArgSlotVec::new();
             for (rc, reg_ty) in rcs.iter().zip(reg_tys.iter()) {
                 let intreg = *rc == RegClass::Int;
@@ -221,6 +259,20 @@ impl ABIMachineSpec for X64ABIMachineSpec {
             });
         }
 
+        // Fastcall's indirect 128+ bit vector arguments are all located on the
+        // stack, and stack space is reserved after all parameters are passed,
+        // so allocate from the space now.
+        if args_or_rets == ArgsOrRets::Args && is_fastcall {
+            for arg in args.args_mut() {
+                if let ABIArg::ImplicitPtrArg { offset, .. } = arg {
+                    assert_eq!(*offset, 0);
+                    next_stack = align_to(next_stack, 16);
+                    *offset = next_stack as i64;
+                    next_stack += 16;
+                }
+            }
+        }
+
         let extra_arg = if add_ret_area_ptr {
             debug_assert!(args_or_rets == ArgsOrRets::Args);
             if let Some(reg) = get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) {
@@ -348,8 +400,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
     }
 
     fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I {
-        // Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed.
-        assert_eq!(ty, I64);
+        // Only ever used for I64s and vectors; if that changes, see if the
+        // ExtKind below needs to be changed.
+        assert!(ty == I64 || ty.is_vector());
         let simm32 = offset as u32;
         let mem = Amode::imm_reg(simm32, base);
         Inst::load(ty, mem, into_reg, ExtKind::None)
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 06fcbac98f..b1cb08c707 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -4957,6 +4957,7 @@
 (convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned)
 (convert VCodeConstant SyntheticAmode const_to_synthetic_amode)
 (convert VCodeConstant XmmMem const_to_xmm_mem)
+(convert VCodeConstant RegMem const_to_reg_mem)
 
 (convert IntCC CC intcc_to_cc)
 (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
@@ -5010,6 +5011,8 @@
 (extern constructor const_to_synthetic_amode const_to_synthetic_amode)
 (decl const_to_xmm_mem (VCodeConstant) XmmMem)
 (rule (const_to_xmm_mem c) (const_to_synthetic_amode c))
+(decl const_to_reg_mem (VCodeConstant) RegMem)
+(rule (const_to_reg_mem c) (RegMem.Mem (const_to_synthetic_amode c)))
 
 (decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned)
 (rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg))
@@ -5054,10 +5057,14 @@
         NearestF32
         NearestF64
         TruncF32
-        TruncF64))
+        TruncF64
+        X86Pshufb))
 
 (decl libcall_1 (LibCall Reg) Reg)
 (extern constructor libcall_1 libcall_1)
 
+(decl libcall_2 (LibCall Reg Reg) Reg)
+(extern constructor libcall_2 libcall_2)
+
 (decl libcall_3 (LibCall Reg Reg Reg) Reg)
 (extern constructor libcall_3 libcall_3)
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 3273bf903c..16d66326bd 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2095,11 +2095,11 @@
 
 ;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 3 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
+(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
       (if-let $true (use_popcnt))
       (x64_popcnt ty src))
 
-(rule 2 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
+(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
       (if-let $true (use_popcnt))
       (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
 
@@ -2192,7 +2192,7 @@
         final))
 
 
-(rule 1 (lower (has_type $I8X16 (popcnt src)))
+(rule 2 (lower (has_type $I8X16 (popcnt src)))
       (if-let $true (use_avx512vl_simd))
       (if-let $true (use_avx512bitalg_simd))
       (x64_vpopcntb src))
@@ -2218,8 +2218,8 @@
 ;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
 
 
-(rule (lower (has_type $I8X16
-                       (popcnt src)))
+(rule 1 (lower (has_type $I8X16 (popcnt src)))
+      (if-let $true (use_ssse3))
       (let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
             (low_nibbles Xmm (sse_and $I8X16 src low_mask))
             ;; Note that this is a 16x8 shift, but that's OK; we mask
@@ -2233,6 +2233,19 @@
             (bit_counts_high Xmm (x64_pshufb lookup high_nibbles)))
         (x64_paddb bit_counts_low bit_counts_high)))
 
+;; A modified version of the popcnt method from Hacker's Delight.
+(rule (lower (has_type $I8X16 (popcnt src)))
+      (let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777))
+            (src Xmm src)
+            (shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1))
+            (src Xmm (x64_psubb src shifted))
+            (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
+            (src Xmm (x64_psubb src shifted))
+            (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
+            (src Xmm (x64_psubb src shifted))
+            (src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4)))))
+        (x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))))
+
 ;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (bitrev src)))
@@ -4181,7 +4194,8 @@
 ;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
 ;; register.
 (rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
-      (x64_pshufb a (xmm_zero $I8X16)))
+        (if-let $true (use_ssse3))
+        (x64_pshufb a (xmm_zero $I8X16)))
 
 ;; Special case for the `shufps` instruction which will select two 32-bit values
 ;; from the first operand and two 32-bit values from the second operand. Note
@@ -4209,7 +4223,8 @@
 ;; indices (may not be completely necessary: verification could fail incorrect
 ;; mask values) and fix the indexes to all point to the `dst` vector.
 (rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
-      (x64_pshufb a (shuffle_0_31_mask mask)))
+        (if-let $true (use_ssse3))
+        (x64_pshufb a (shuffle_0_31_mask mask)))
 
 ;; For the case where the shuffle mask contains out-of-bounds values (values
 ;; greater than 31) we must mask off those resulting values in the result of
@@ -4231,8 +4246,8 @@
 ;; above, we build the `constructed_mask` for each case statically.
 (rule (lower (shuffle a b (vec_mask_from_immediate mask)))
       (x64_por
-        (x64_pshufb a (shuffle_0_15_mask mask))
-        (x64_pshufb b (shuffle_16_31_mask mask))))
+        (lower_pshufb a (shuffle_0_15_mask mask))
+        (lower_pshufb b (shuffle_16_31_mask mask))))
 
 ;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -4244,13 +4259,28 @@
 ;; variables like: %dst = swizzle %src, %mask
 (rule (lower (swizzle src mask))
       (let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070))))
-        (x64_pshufb src mask)))
+        (lower_pshufb src mask)))
 
 ;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (x86_pshufb src mask))
+      (if-let $true (use_ssse3))
       (x64_pshufb src mask))
 
+;; A helper function to generate either the `pshufb` instruction or a call to
+;; the `X86Pshufb` libcall. Note that the libcall is not exactly the most
+;; performant thing in the world, so this is primarily here for completeness
+;; of lowerings on all x86 cpus; rules should ideally be gated on the presence
+;; of SSSE3 so that they use the `pshufb` instruction itself.
+(decl lower_pshufb (Xmm RegMem) Xmm)
+(rule 1 (lower_pshufb src mask)
+        (if-let $true (use_ssse3))
+        (x64_pshufb src mask))
+(rule (lower_pshufb src (RegMem.Reg mask))
+      (libcall_2 (LibCall.X86Pshufb) src mask))
+(rule (lower_pshufb src (RegMem.Mem addr))
+      (lower_pshufb src (x64_movdqu_load addr)))
+
 ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Remove the extractlane instruction, leaving the float where it is. The upper
@@ -4343,14 +4373,18 @@
 ;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
 ;; with a mask of zero which is calculated with an xor-against-itself register.
 (rule 0 (lower (has_type $I8X16 (splat src)))
-        (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
+        (let ((src Xmm (x64_movd_to_xmm src)))
+          (x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0)))
 (rule 1 (lower (has_type $I8X16 (splat src)))
+        (if-let $true (use_ssse3))
+        (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
+(rule 2 (lower (has_type $I8X16 (splat src)))
         (if-let $true (use_avx2_simd))
         (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
-(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
+(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
         (if-let $true (use_sse41))
         (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
-(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
+(rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
         (if-let $true (use_avx2_simd))
         (x64_vpbroadcastb addr))
 
@@ -4399,10 +4433,10 @@
 ;; the register-based encoding is only available with AVX2. With the
 ;; `sinkable_load` extractor this should be guaranteed to use the memory-based
 ;; encoding hence the `use_avx_simd` test.
-(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
+(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
         (let ((tmp Xmm (x64_movss_load addr)))
           (x64_shufps tmp tmp 0)))
-(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
+(rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
         (if-let $true (use_avx_simd))
         (x64_vbroadcastss addr))
 
@@ -4413,7 +4447,7 @@
         (x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00))
 (rule 0 (lower (has_type $F64X2 (splat src)))
         (x64_pshufd src 0b01_00_01_00))
-(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
+(rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
         (if-let $true (use_ssse3))
         (x64_movddup addr))
 
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index f7c5456d5c..6b1d9a14e3 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -645,6 +645,24 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         output_reg.to_reg()
     }
 
+    fn libcall_2(&mut self, libcall: &LibCall, a: Reg, b: Reg) -> Reg {
+        let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs());
+        let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type;
+        let output_reg = self.lower_ctx.alloc_tmp(ret_ty).only_reg().unwrap();
+
+        emit_vm_call(
+            self.lower_ctx,
+            &self.backend.flags,
+            &self.backend.triple,
+            libcall.clone(),
+            &[a, b],
+            &[output_reg],
+        )
+        .expect("Failed to emit LibCall");
+
+        output_reg.to_reg()
+    }
+
     fn libcall_3(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg {
         let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs());
         let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type;
diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 0c4fd48554..bcdc397c2b 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -186,6 +186,10 @@ impl TargetIsa for X64Backend {
         // operation, so that always returns `false`
         self.x64_flags.use_sse41() && ty != types::I16X8
     }
+
+    fn has_x86_pshufb_lowering(&self) -> bool {
+        self.x64_flags.use_ssse3()
+    }
 }
 
 impl fmt::Display for X64Backend {
diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
index 1889f144a9..df0a714e26 100644
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -2193,7 +2193,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
         from_regs: ValueRegs<Reg>,
     ) {
         match &ctx.sigs().args(self.sig)[idx] {
-            &ABIArg::Slots { .. } => {}
+            &ABIArg::Slots { .. } | &ABIArg::ImplicitPtrArg { .. } => {}
             &ABIArg::StructArg { offset, size, .. } => {
                 let src_ptr = from_regs.only_reg().unwrap();
                 let dst_ptr = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
@@ -2220,7 +2220,6 @@ impl<M: ABIMachineSpec> CallSite<M> {
                     ctx.emit(insn);
                 }
             }
-            &ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE.
         }
     }
 
@@ -2260,6 +2259,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
                     &ABIArgSlot::Stack { .. } => 0,
                 })
                 .sum(),
+            ABIArg::ImplicitPtrArg { .. } => 1,
             _ => 0,
         };
         let mut temps: SmallVec<[Writable<Reg>; 16]> = (0..needed_tmps)
@@ -2355,7 +2355,36 @@ impl<M: ABIMachineSpec> CallSite<M> {
             &ABIArg::StructArg { pointer, .. } => {
                 assert!(pointer.is_none()); // Only supported via ISLE.
             }
-            &ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE.
+            &ABIArg::ImplicitPtrArg {
+                offset,
+                pointer,
+                ty,
+                purpose: _,
+            } => {
+                assert_eq!(from_regs.len(), 1);
+                let vreg = from_regs.regs()[0];
+                let amode = StackAMode::SPOffset(offset, ty);
+                let tmp = temps[0];
+                insts.push(M::gen_get_stack_addr(amode, tmp, ty));
+                let tmp = tmp.to_reg();
+                insts.push(M::gen_store_base_offset(tmp, 0, vreg, ty));
+                match pointer {
+                    ABIArgSlot::Reg { reg, .. } => {
+                        self.uses.push(CallArgPair {
+                            vreg: tmp,
+                            preg: reg.into(),
+                        });
+                    }
+                    ABIArgSlot::Stack { offset, .. } => {
+                        let ty = M::word_type();
+                        insts.push(M::gen_store_stack(
+                            StackAMode::SPOffset(offset, ty),
+                            tmp,
+                            ty,
+                        ));
+                    }
+                };
+            }
         }
         insts
     }
diff --git a/cranelift/filetests/filetests/isa/x64/call-conv.clif b/cranelift/filetests/filetests/isa/x64/call-conv.clif
index 775637edfe..6040fa4675 100644
--- a/cranelift/filetests/filetests/isa/x64/call-conv.clif
+++ b/cranelift/filetests/filetests/isa/x64/call-conv.clif
@@ -660,3 +660,238 @@ block0(v0: f32, v1: i64, v2: i32, v3: f32):
 ;   popq %rbp
 ;   retq
 
+function %fastcall_m128i_param(i32, i8x16) system_v {
+    sig0 = (i8x16) windows_fastcall
+block0(v0: i32, v1: i8x16):
+    call_indirect sig0, v0(v1)
+    return
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   subq    %rsp, $48, %rsp
+;   virtual_sp_offset_adjust 48
+;   lea     32(%rsp), %rcx
+;   movdqu  %xmm0, 0(%rcx)
+;   call    *%rdi
+;   addq    %rsp, $48, %rsp
+;   virtual_sp_offset_adjust -48
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   subq $0x30, %rsp
+;   leaq 0x20(%rsp), %rcx
+;   movdqu %xmm0, (%rcx)
+;   callq *%rdi
+;   addq $0x30, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fastcall_m128i_params_and_results(i32, i32, i8x16, i64, i8x16) -> i8x16 system_v {
+    sig0 = (i32, i8x16, i64, i8x16) -> i8x16 windows_fastcall
+block0(v0: i32, v1: i32, v2: i8x16, v3: i64, v4: i8x16):
+    v5 = call_indirect sig0, v0(v1, v2, v3, v4)
+    v6 = iadd v5, v5
+    return v6
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdx, %r8
+;   movq    %rsi, %rcx
+;   subq    %rsp, $64, %rsp
+;   virtual_sp_offset_adjust 64
+;   lea     32(%rsp), %rdx
+;   movdqu  %xmm0, 0(%rdx)
+;   lea     48(%rsp), %r9
+;   movdqu  %xmm1, 0(%r9)
+;   call    *%rdi
+;   addq    %rsp, $64, %rsp
+;   virtual_sp_offset_adjust -64
+;   paddb   %xmm0, %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdx, %r8
+;   movq %rsi, %rcx
+;   subq $0x40, %rsp
+;   leaq 0x20(%rsp), %rdx
+;   movdqu %xmm0, (%rdx)
+;   leaq 0x30(%rsp), %r9
+;   movdqu %xmm1, (%r9)
+;   callq *%rdi
+;   addq $0x40, %rsp
+;   paddb %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fastcall_m128i_one_stack_param(i32, i8x16) system_v {
+    sig0 = (i32, i32, i32, i32, i8x16) windows_fastcall
+block0(v0: i32, v1: i8x16):
+    call_indirect sig0, v0(v0, v0, v0, v0, v1)
+    return
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %r9
+;   subq    %rsp, $64, %rsp
+;   virtual_sp_offset_adjust 64
+;   lea     48(%rsp), %r8
+;   movdqu  %xmm0, 0(%r8)
+;   movq    %r8, 32(%rsp)
+;   movq    %r9, %rcx
+;   movq    %r9, %rdx
+;   movq    %r9, %r8
+;   call    *%r9
+;   addq    %rsp, $64, %rsp
+;   virtual_sp_offset_adjust -64
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %r9
+;   subq $0x40, %rsp
+;   leaq 0x30(%rsp), %r8
+;   movdqu %xmm0, (%r8)
+;   movq %r8, 0x20(%rsp)
+;   movq %r9, %rcx
+;   movq %r9, %rdx
+;   movq %r9, %r8
+;   callq *%r9
+;   addq $0x40, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fastcall_m128i_two_stack_param(i32, i8x16) system_v {
+    sig0 = (i32, i32, i32, i32, i8x16, i8x16) windows_fastcall
+block0(v0: i32, v1: i8x16):
+    call_indirect sig0, v0(v0, v0, v0, v0, v1, v1)
+    return
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %r9
+;   subq    %rsp, $80, %rsp
+;   virtual_sp_offset_adjust 80
+;   lea     48(%rsp), %r8
+;   movdqu  %xmm0, 0(%r8)
+;   movq    %r8, 32(%rsp)
+;   lea     64(%rsp), %rsi
+;   movdqu  %xmm0, 0(%rsi)
+;   movq    %rsi, 40(%rsp)
+;   movq    %r9, %rcx
+;   movq    %r9, %rdx
+;   movq    %r9, %r8
+;   call    *%r9
+;   addq    %rsp, $80, %rsp
+;   virtual_sp_offset_adjust -80
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %r9
+;   subq $0x50, %rsp
+;   leaq 0x30(%rsp), %r8
+;   movdqu %xmm0, (%r8)
+;   movq %r8, 0x20(%rsp)
+;   leaq 0x40(%rsp), %rsi
+;   movdqu %xmm0, (%rsi)
+;   movq %rsi, 0x28(%rsp)
+;   movq %r9, %rcx
+;   movq %r9, %rdx
+;   movq %r9, %r8
+;   callq *%r9
+;   addq $0x50, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fastcall_m128i_reg_and_stack_param(i32, i8x16) system_v {
+    sig0 = (i32, i8x16, i32, i32, i8x16, i8x16) windows_fastcall
+block0(v0: i32, v1: i8x16):
+    call_indirect sig0, v0(v0, v1, v0, v0, v1, v1)
+    return
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %r9
+;   subq    %rsp, $96, %rsp
+;   virtual_sp_offset_adjust 96
+;   lea     48(%rsp), %rdx
+;   movdqu  %xmm0, 0(%rdx)
+;   lea     64(%rsp), %r11
+;   movdqu  %xmm0, 0(%r11)
+;   movq    %r11, 32(%rsp)
+;   lea     80(%rsp), %rcx
+;   movdqu  %xmm0, 0(%rcx)
+;   movq    %rcx, 40(%rsp)
+;   movq    %r9, %rcx
+;   movq    %r9, %r8
+;   call    *%r9
+;   addq    %rsp, $96, %rsp
+;   virtual_sp_offset_adjust -96
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movq %rdi, %r9
+;   subq $0x60, %rsp
+;   leaq 0x30(%rsp), %rdx
+;   movdqu %xmm0, (%rdx)
+;   leaq 0x40(%rsp), %r11
+;   movdqu %xmm0, (%r11)
+;   movq %r11, 0x20(%rsp)
+;   leaq 0x50(%rsp), %rcx
+;   movdqu %xmm0, (%rcx)
+;   movq %rcx, 0x28(%rsp)
+;   movq %r9, %rcx
+;   movq %r9, %r8
+;   callq *%r9
+;   addq $0x60, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/runtests/simd-popcnt.clif b/cranelift/filetests/filetests/runtests/simd-popcnt.clif
index 327da79054..00a24d92f9 100644
--- a/cranelift/filetests/filetests/runtests/simd-popcnt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-popcnt.clif
@@ -2,7 +2,9 @@ test interpret
 test run
 target aarch64
 target s390x
+target x86_64 has_ssse3=false
 set enable_simd
+target x86_64
 target x86_64 sse42
 target x86_64 sse42 has_avx has_avx512vl has_avx512bitalg
 target riscv64 has_v
diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
index e6aa4b6af8..0a72254528 100644
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -2,10 +2,13 @@
 test run
 target aarch64
 target s390x
+target x86_64 has_ssse3=false
 set enable_simd
-target x86_64 has_sse3 has_ssse3 has_sse41
-target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
-target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
+target x86_64 sse42 has_avx has_avx512vl has_avx512vbmi
 target riscv64gc has_v
 
 function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif
index e1caad0419..6dd866f8df 100644
--- a/cranelift/filetests/filetests/runtests/simd-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-splat.clif
@@ -2,6 +2,7 @@
 test run
 target aarch64
 target s390x
+target x86_64 has_ssse3=false
 set enable_simd
 target x86_64
 target x86_64 sse41
diff --git a/cranelift/filetests/filetests/runtests/simd-swizzle.clif b/cranelift/filetests/filetests/runtests/simd-swizzle.clif
index 2592e65fb3..f188eaf18a 100644
--- a/cranelift/filetests/filetests/runtests/simd-swizzle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-swizzle.clif
@@ -2,9 +2,11 @@ test interpret
 test run
 target aarch64
 target s390x
+target x86_64 has_ssse3=false
 set enable_simd
-target x86_64 has_sse3 has_ssse3 has_sse41
-target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
+target x86_64
+target x86_64 sse41
+target x86_64 sse41 has_avx
 target riscv64gc has_v
 
 function %swizzle_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/src/function_runner.rs b/cranelift/filetests/src/function_runner.rs
index a786506439..dff1fda6cf 100644
--- a/cranelift/filetests/src/function_runner.rs
+++ b/cranelift/filetests/src/function_runner.rs
@@ -89,7 +89,18 @@ impl TestFileCompiler {
     /// host machine, this [TargetIsa] must match the host machine's ISA (see
     /// [TestFileCompiler::with_host_isa]).
     pub fn new(isa: OwnedTargetIsa) -> Self {
-        let builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
+        let mut builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
+        drop(&mut builder); // require mutability on all architectures
+        #[cfg(target_arch = "x86_64")]
+        {
+            builder.symbol_lookup_fn(Box::new(|name| {
+                if name == "__cranelift_x86_pshufb" {
+                    Some(__cranelift_x86_pshufb as *const u8)
+                } else {
+                    None
+                }
+            }));
+        }
         let module = JITModule::new(builder);
         let ctx = module.make_context();
 
@@ -500,6 +511,52 @@ fn make_trampoline(name: UserFuncName, signature: &ir::Signature, isa: &dyn Targ
     func
 }
 
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::__m128i;
+#[cfg(target_arch = "x86_64")]
+#[allow(improper_ctypes_definitions)]
+extern "C" fn __cranelift_x86_pshufb(a: __m128i, b: __m128i) -> __m128i {
+    union U {
+        reg: __m128i,
+        mem: [u8; 16],
+    }
+
+    unsafe {
+        let a = U { reg: a }.mem;
+        let b = U { reg: b }.mem;
+
+        let select = |arr: &[u8; 16], byte: u8| {
+            if byte & 0x80 != 0 {
+                0x00
+            } else {
+                arr[(byte & 0xf) as usize]
+            }
+        };
+
+        U {
+            mem: [
+                select(&a, b[0]),
+                select(&a, b[1]),
+                select(&a, b[2]),
+                select(&a, b[3]),
+                select(&a, b[4]),
+                select(&a, b[5]),
+                select(&a, b[6]),
+                select(&a, b[7]),
+                select(&a, b[8]),
+                select(&a, b[9]),
+                select(&a, b[10]),
+                select(&a, b[11]),
+                select(&a, b[12]),
+                select(&a, b[13]),
+                select(&a, b[14]),
+                select(&a, b[15]),
+            ],
+        }
+        .reg
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
diff --git a/cranelift/module/src/lib.rs b/cranelift/module/src/lib.rs
index 046204ecbf..f9fb9ad024 100644
--- a/cranelift/module/src/lib.rs
+++ b/cranelift/module/src/lib.rs
@@ -72,5 +72,6 @@ pub fn default_libcall_names() -> Box<dyn Fn(ir::LibCall) -> String + Send + Syn
 
         ir::LibCall::ElfTlsGetAddr => "__tls_get_addr".to_owned(),
         ir::LibCall::ElfTlsGetOffset => "__tls_get_offset".to_owned(),
+        ir::LibCall::X86Pshufb => "__cranelift_x86_pshufb".to_owned(),
     })
 }
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 103afeadd8..c12a24a607 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -2197,7 +2197,9 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::I8x16RelaxedSwizzle => {
             let (a, b) = pop2_with_bitcast(state, I8X16, builder);
             state.push1(
-                if environ.relaxed_simd_deterministic() || !environ.is_x86() {
+                if environ.relaxed_simd_deterministic()
+                    || !environ.use_x86_pshufb_for_relaxed_swizzle()
+                {
                     // Deterministic semantics match the `i8x16.swizzle`
                     // instruction which is the CLIF `swizzle`.
                     builder.ins().swizzle(a, b)
diff --git a/cranelift/wasm/src/environ/spec.rs b/cranelift/wasm/src/environ/spec.rs
index a632973baf..9623ce9284 100644
--- a/cranelift/wasm/src/environ/spec.rs
+++ b/cranelift/wasm/src/environ/spec.rs
@@ -569,6 +569,12 @@ pub trait FuncEnvironment: TargetEnvironment {
         let _ = ty;
         false
     }
+
+    /// Whether `i8x16.relaxed_swizzle` should be translated to the CLIF
+    /// `x86_pshufb` instruction instead of `swizzle`; defaults to `false`.
+    fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
+        false
+    }
 }
 
 /// An object satisfying the `ModuleEnvironment` trait can be passed as argument to the
diff --git a/crates/cranelift-shared/src/obj.rs b/crates/cranelift-shared/src/obj.rs
index 858e1cf9ac..7713b9f635 100644
--- a/crates/cranelift-shared/src/obj.rs
+++ b/crates/cranelift-shared/src/obj.rs
@@ -558,6 +558,7 @@ fn libcall_name(call: LibCall) -> &'static str {
         LibCall::TruncF64 => LC::TruncF64,
         LibCall::FmaF32 => LC::FmaF32,
         LibCall::FmaF64 => LC::FmaF64,
+        LibCall::X86Pshufb => LC::X86Pshufb,
         _ => panic!("unknown libcall to give a name to: {call:?}"),
     };
     other.symbol()
diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs
index f7556f8b9f..3cdc5da087 100644
--- a/crates/cranelift/src/func_environ.rs
+++ b/crates/cranelift/src/func_environ.rs
@@ -2203,4 +2203,8 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
     fn use_x86_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool {
         self.isa.has_x86_blendv_lowering(ty)
     }
+
+    fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
+        self.isa.has_x86_pshufb_lowering() // delegate to the target ISA backend
+    }
 }
diff --git a/crates/environ/src/obj.rs b/crates/environ/src/obj.rs
index 6e39cc319f..6ba410c507 100644
--- a/crates/environ/src/obj.rs
+++ b/crates/environ/src/obj.rs
@@ -168,4 +168,5 @@ libcalls! {
     TruncF64 = "libcall_truncf64"
     FmaF32 = "libcall_fmaf32"
     FmaF64 = "libcall_fmaf64"
+    X86Pshufb = "libcall_x86_pshufb"
 }
diff --git a/crates/jit/src/code_memory.rs b/crates/jit/src/code_memory.rs
index cdf45d005e..d4022bc01f 100644
--- a/crates/jit/src/code_memory.rs
+++ b/crates/jit/src/code_memory.rs
@@ -284,6 +284,10 @@ impl CodeMemory {
                 obj::LibCall::TruncF64 => libcalls::relocs::truncf64 as usize,
                 obj::LibCall::FmaF32 => libcalls::relocs::fmaf32 as usize,
                 obj::LibCall::FmaF64 => libcalls::relocs::fmaf64 as usize,
+                #[cfg(target_arch = "x86_64")]
+                obj::LibCall::X86Pshufb => libcalls::relocs::x86_pshufb as usize,
+                #[cfg(not(target_arch = "x86_64"))]
+                obj::LibCall::X86Pshufb => unreachable!(),
             };
             self.mmap
                 .as_mut_ptr()
diff --git a/crates/runtime/src/libcalls.rs b/crates/runtime/src/libcalls.rs
index bceb890144..9b133d3cc8 100644
--- a/crates/runtime/src/libcalls.rs
+++ b/crates/runtime/src/libcalls.rs
@@ -575,4 +575,52 @@ pub mod relocs {
     pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 {
         a.mul_add(b, c)
     }
+
+    // This intrinsic is only used on x86_64 platforms as an implementation of
+    // the `pshufb` instruction when SSSE3 is not available.
+    #[cfg(target_arch = "x86_64")]
+    use std::arch::x86_64::__m128i;
+    /// Software fallback for the x86 `pshufb` byte shuffle.
+    ///
+    /// Both operands are treated as sixteen independent bytes. Byte `i` of
+    /// the result is chosen by byte `i` of `b`:
+    ///
+    /// * if bit 7 (`0x80`) of the selector is set, the result byte is zero;
+    /// * otherwise the low four bits of the selector index into `a` and that
+    ///   byte of `a` is copied to the result.
+    ///
+    /// Bits 4 through 6 of each selector are ignored. For example a selector
+    /// of `0x03` yields `a[3]`, `0x13` also yields `a[3]`, and `0x83` yields `0`.
+    ///
+    /// `a` is the table of source bytes and `b` holds the sixteen selectors;
+    /// the shuffled bytes are returned as a new vector.
+    ///
+    /// The body uses only scalar byte operations and no `std::arch` intrinsics.
+    #[cfg(target_arch = "x86_64")]
+    // `__m128i` in an `extern "C"` signature trips rustc's FFI-safety lint.
+    // NOTE(review): presumably the compiled code calling this libcall passes
+    // both vectors the same way rustc does here -- confirm against the ABI.
+    #[allow(improper_ctypes_definitions)]
+    pub extern "C" fn x86_pshufb(a: __m128i, b: __m128i) -> __m128i {
+        // SAFETY: `__m128i` and `[u8; 16]` are both exactly 16 bytes of plain
+        // integer data for which every bit pattern is valid, so
+        // reinterpreting one as the other is sound.
+        let table: [u8; 16] = unsafe { std::mem::transmute(a) };
+        // SAFETY: same reasoning as for `a` above.
+        let selectors: [u8; 16] = unsafe { std::mem::transmute(b) };
+
+        let shuffled = selectors.map(|selector| {
+            let zero_lane = selector & 0x80 != 0;
+            if zero_lane {
+                0x00
+            } else {
+                // Only the low nibble participates, so the index is always
+                // within `0..16` and the lookup cannot go out of bounds.
+                table[usize::from(selector & 0xf)]
+            }
+        });
+
+        // SAFETY: see above; the array-to-vector direction is equally valid.
+        unsafe { std::mem::transmute(shuffled) }
+    }
 }