diff --git a/cranelift/codegen/src/ir/libcall.rs b/cranelift/codegen/src/ir/libcall.rs index e6b2d539d0..77a680c93b 100644 --- a/cranelift/codegen/src/ir/libcall.rs +++ b/cranelift/codegen/src/ir/libcall.rs @@ -56,6 +56,9 @@ pub enum LibCall { ElfTlsGetAddr, /// Elf __tls_get_offset ElfTlsGetOffset, + + /// Emulation of the x86 `pshufb` instruction, used when SSSE3 isn't available. + X86Pshufb, // When adding a new variant make sure to add it to `all_libcalls` too. } @@ -88,6 +91,8 @@ impl FromStr for LibCall { "ElfTlsGetAddr" => Ok(Self::ElfTlsGetAddr), "ElfTlsGetOffset" => Ok(Self::ElfTlsGetOffset), + + "X86Pshufb" => Ok(Self::X86Pshufb), _ => Err(()), } } @@ -115,6 +120,7 @@ impl LibCall { Memcmp, ElfTlsGetAddr, ElfTlsGetOffset, + X86Pshufb, ] } @@ -166,6 +172,11 @@ impl LibCall { LibCall::Probestack | LibCall::ElfTlsGetAddr | LibCall::ElfTlsGetOffset => { unimplemented!() } + LibCall::X86Pshufb => { + sig.params.push(AbiParam::new(I8X16)); + sig.params.push(AbiParam::new(I8X16)); + sig.returns.push(AbiParam::new(I8X16)); + } } sig diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index 87633accfb..0d34e07b4a 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -211,6 +211,10 @@ impl TargetIsa for AArch64Backend { fn has_x86_blendv_lowering(&self, _: Type) -> bool { false } + + fn has_x86_pshufb_lowering(&self) -> bool { + false + } } impl fmt::Display for AArch64Backend { diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 404d7982b4..1fe2b5e47b 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -344,6 +344,10 @@ pub trait TargetIsa: fmt::Display + Send + Sync { /// Returns whether the CLIF `x86_blendv` instruction is implemented for /// this ISA for the specified type. fn has_x86_blendv_lowering(&self, ty: Type) -> bool; + + /// Returns whether the CLIF `x86_pshufb` instruction is implemented for + /// this ISA. 
+ fn has_x86_pshufb_lowering(&self) -> bool; } /// Function alignment specifications as required by an ISA, returned by diff --git a/cranelift/codegen/src/isa/riscv64/mod.rs b/cranelift/codegen/src/isa/riscv64/mod.rs index 6844e1708f..af66580a0f 100644 --- a/cranelift/codegen/src/isa/riscv64/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/mod.rs @@ -186,6 +186,10 @@ impl TargetIsa for Riscv64Backend { fn has_x86_blendv_lowering(&self, _: Type) -> bool { false } + + fn has_x86_pshufb_lowering(&self) -> bool { + false + } } impl fmt::Display for Riscv64Backend { diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs index 8a08ba9b83..238f7d5a1f 100644 --- a/cranelift/codegen/src/isa/s390x/mod.rs +++ b/cranelift/codegen/src/isa/s390x/mod.rs @@ -186,6 +186,10 @@ impl TargetIsa for S390xBackend { fn has_x86_blendv_lowering(&self, _: Type) -> bool { false } + + fn has_x86_pshufb_lowering(&self) -> bool { + false + } } impl fmt::Display for S390xBackend { diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 28b2f7c4df..1e24ef10a3 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -154,6 +154,44 @@ impl ABIMachineSpec for X64ABIMachineSpec { ); } + // Windows fastcall dictates that `__m128i` parameters to a function + // are passed indirectly as pointers, so handle that as a special + // case before the loop below. 
+ if param.value_type.is_vector() + && param.value_type.bits() >= 128 + && args_or_rets == ArgsOrRets::Args + && is_fastcall + { + let pointer = match get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) { + Some(reg) => { + next_gpr += 1; + ABIArgSlot::Reg { + reg: reg.to_real_reg().unwrap(), + ty: ir::types::I64, + extension: ir::ArgumentExtension::None, + } + } + + None => { + next_stack = align_to(next_stack, 8) + 8; + ABIArgSlot::Stack { + offset: (next_stack - 8) as i64, + ty: ir::types::I64, + extension: param.extension, + } + } + }; + next_param_idx += 1; + args.push(ABIArg::ImplicitPtrArg { + // NB: this is filled in after this loop + offset: 0, + pointer, + ty: param.value_type, + purpose: param.purpose, + }); + continue; + } + let mut slots = ABIArgSlotVec::new(); for (rc, reg_ty) in rcs.iter().zip(reg_tys.iter()) { let intreg = *rc == RegClass::Int; @@ -221,6 +259,20 @@ impl ABIMachineSpec for X64ABIMachineSpec { }); } + // Fastcall's indirect 128+ bit vector arguments are all located on the + // stack, and stack space is reserved after all parameters are passed, + // so allocate from the space now. + if args_or_rets == ArgsOrRets::Args && is_fastcall { + for arg in args.args_mut() { + if let ABIArg::ImplicitPtrArg { offset, .. } = arg { + assert_eq!(*offset, 0); + next_stack = align_to(next_stack, 16); + *offset = next_stack as i64; + next_stack += 16; + } + } + } + let extra_arg = if add_ret_area_ptr { debug_assert!(args_or_rets == ArgsOrRets::Args); if let Some(reg) = get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) { @@ -348,8 +400,9 @@ impl ABIMachineSpec for X64ABIMachineSpec { } fn gen_load_base_offset(into_reg: Writable, base: Reg, offset: i32, ty: Type) -> Self::I { - // Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed. - assert_eq!(ty, I64); + // Only ever used for I64s and vectors; if that changes, see if the + // ExtKind below needs to be changed. 
+ assert!(ty == I64 || ty.is_vector()); let simm32 = offset as u32; let mem = Amode::imm_reg(simm32, base); Inst::load(ty, mem, into_reg, ExtKind::None) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 06fcbac98f..b1cb08c707 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -4957,6 +4957,7 @@ (convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned) (convert VCodeConstant SyntheticAmode const_to_synthetic_amode) (convert VCodeConstant XmmMem const_to_xmm_mem) +(convert VCodeConstant RegMem const_to_reg_mem) (convert IntCC CC intcc_to_cc) (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op) @@ -5010,6 +5011,8 @@ (extern constructor const_to_synthetic_amode const_to_synthetic_amode) (decl const_to_xmm_mem (VCodeConstant) XmmMem) (rule (const_to_xmm_mem c) (const_to_synthetic_amode c)) +(decl const_to_reg_mem (VCodeConstant) RegMem) +(rule (const_to_reg_mem c) (RegMem.Mem (const_to_synthetic_amode c))) (decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned) (rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg)) @@ -5054,10 +5057,14 @@ NearestF32 NearestF64 TruncF32 - TruncF64)) + TruncF64 + X86Pshufb)) (decl libcall_1 (LibCall Reg) Reg) (extern constructor libcall_1 libcall_1) +(decl libcall_2 (LibCall Reg Reg) Reg) +(extern constructor libcall_2 libcall_2) + (decl libcall_3 (LibCall Reg Reg Reg) Reg) (extern constructor libcall_3 libcall_3) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 3273bf903c..16d66326bd 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2095,11 +2095,11 @@ ;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 3 (lower (has_type (ty_32_or_64 ty) (popcnt src))) +(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src))) (if-let $true (use_popcnt)) (x64_popcnt 
ty src)) -(rule 2 (lower (has_type (ty_8_or_16 ty) (popcnt src))) +(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src))) (if-let $true (use_popcnt)) (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero)))) @@ -2192,7 +2192,7 @@ final)) -(rule 1 (lower (has_type $I8X16 (popcnt src))) +(rule 2 (lower (has_type $I8X16 (popcnt src))) (if-let $true (use_avx512vl_simd)) (if-let $true (use_avx512bitalg_simd)) (x64_vpopcntb src)) @@ -2218,8 +2218,8 @@ ;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); -(rule (lower (has_type $I8X16 - (popcnt src))) +(rule 1 (lower (has_type $I8X16 (popcnt src))) + (if-let $true (use_ssse3)) (let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f)) (low_nibbles Xmm (sse_and $I8X16 src low_mask)) ;; Note that this is a 16x8 shift, but that's OK; we mask @@ -2233,6 +2233,19 @@ (bit_counts_high Xmm (x64_pshufb lookup high_nibbles))) (x64_paddb bit_counts_low bit_counts_high))) +;; A modified version of the popcnt method from Hacker's Delight. +(rule (lower (has_type $I8X16 (popcnt src))) + (let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777)) + (src Xmm src) + (shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1)) + (src Xmm (x64_psubb src shifted)) + (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1)) + (src Xmm (x64_psubb src shifted)) + (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1)) + (src Xmm (x64_psubb src shifted)) + (src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4))))) + (x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f)))) + ;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8 (bitrev src))) @@ -4181,7 +4194,8 @@ ;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero ;; register. 
(rule 6 (lower (shuffle a _ (u128_from_immediate 0))) - (x64_pshufb a (xmm_zero $I8X16))) + (if-let $true (use_ssse3)) + (x64_pshufb a (xmm_zero $I8X16))) ;; Special case for the `shufps` instruction which will select two 32-bit values ;; from the first operand and two 32-bit values from the second operand. Note @@ -4209,7 +4223,8 @@ ;; indices (may not be completely necessary: verification could fail incorrect ;; mask values) and fix the indexes to all point to the `dst` vector. (rule 3 (lower (shuffle a a (vec_mask_from_immediate mask))) - (x64_pshufb a (shuffle_0_31_mask mask))) + (if-let $true (use_ssse3)) + (x64_pshufb a (shuffle_0_31_mask mask))) ;; For the case where the shuffle mask contains out-of-bounds values (values ;; greater than 31) we must mask off those resulting values in the result of @@ -4231,8 +4246,8 @@ ;; above, we build the `constructed_mask` for each case statically. (rule (lower (shuffle a b (vec_mask_from_immediate mask))) (x64_por - (x64_pshufb a (shuffle_0_15_mask mask)) - (x64_pshufb b (shuffle_16_31_mask mask)))) + (lower_pshufb a (shuffle_0_15_mask mask)) + (lower_pshufb b (shuffle_16_31_mask mask)))) ;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -4244,13 +4259,28 @@ ;; variables like: %dst = swizzle %src, %mask (rule (lower (swizzle src mask)) (let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070)))) - (x64_pshufb src mask))) + (lower_pshufb src mask))) ;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (x86_pshufb src mask)) + (if-let $true (use_ssse3)) (x64_pshufb src mask)) +;; A helper function to generate either the `pshufb` instruction or a libcall to +;; the `X86Pshufb` libcall. 
Note that the libcall is not exactly the most +;; performant thing in the world, so this is primarily here for completeness +;; of lowerings on all x86 cpus; rules should ideally be gated on the presence +;; of SSSE3 to use the `pshufb` instruction itself. +(decl lower_pshufb (Xmm RegMem) Xmm) +(rule 1 (lower_pshufb src mask) + (if-let $true (use_ssse3)) + (x64_pshufb src mask)) +(rule (lower_pshufb src (RegMem.Reg mask)) + (libcall_2 (LibCall.X86Pshufb) src mask)) +(rule (lower_pshufb src (RegMem.Mem addr)) + (lower_pshufb src (x64_movdqu_load addr))) + ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Remove the extractlane instruction, leaving the float where it is. The upper @@ -4343,14 +4373,18 @@ ;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts ;; with a mask of zero which is calculated with an xor-against-itself register. (rule 0 (lower (has_type $I8X16 (splat src))) - (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16))) + (let ((src Xmm (x64_movd_to_xmm src))) + (x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0))) (rule 1 (lower (has_type $I8X16 (splat src))) + (if-let $true (use_ssse3)) + (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16))) +(rule 2 (lower (has_type $I8X16 (splat src))) (if-let $true (use_avx2_simd)) (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src))) -(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) +(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) (if-let $true (use_sse41)) (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16))) -(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) +(rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) (if-let $true (use_avx2_simd)) (x64_vpbroadcastb addr)) @@ -4399,10 +4433,10 @@ ;; the register-based encoding is only available with AVX2. 
With the ;; `sinkable_load` extractor this should be guaranteed to use the memory-based ;; encoding hence the `use_avx_simd` test. -(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) +(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) (let ((tmp Xmm (x64_movss_load addr))) (x64_shufps tmp tmp 0))) -(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) +(rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) (if-let $true (use_avx_simd)) (x64_vbroadcastss addr)) @@ -4413,7 +4447,7 @@ (x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00)) (rule 0 (lower (has_type $F64X2 (splat src))) (x64_pshufd src 0b01_00_01_00)) -(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr)))) +(rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr)))) (if-let $true (use_ssse3)) (x64_movddup addr)) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index f7c5456d5c..6b1d9a14e3 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -645,6 +645,24 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { output_reg.to_reg() } + fn libcall_2(&mut self, libcall: &LibCall, a: Reg, b: Reg) -> Reg { + let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()); + let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type; + let output_reg = self.lower_ctx.alloc_tmp(ret_ty).only_reg().unwrap(); + + emit_vm_call( + self.lower_ctx, + &self.backend.flags, + &self.backend.triple, + libcall.clone(), + &[a, b], + &[output_reg], + ) + .expect("Failed to emit LibCall"); + + output_reg.to_reg() + } + fn libcall_3(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg { let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()); let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type; diff --git 
a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs index 0c4fd48554..bcdc397c2b 100644 --- a/cranelift/codegen/src/isa/x64/mod.rs +++ b/cranelift/codegen/src/isa/x64/mod.rs @@ -186,6 +186,10 @@ impl TargetIsa for X64Backend { // operation, so that always returns `false` self.x64_flags.use_sse41() && ty != types::I16X8 } + + fn has_x86_pshufb_lowering(&self) -> bool { + self.x64_flags.use_ssse3() + } } impl fmt::Display for X64Backend { diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 1889f144a9..df0a714e26 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -2193,7 +2193,7 @@ impl CallSite { from_regs: ValueRegs, ) { match &ctx.sigs().args(self.sig)[idx] { - &ABIArg::Slots { .. } => {} + &ABIArg::Slots { .. } | &ABIArg::ImplicitPtrArg { .. } => {} &ABIArg::StructArg { offset, size, .. } => { let src_ptr = from_regs.only_reg().unwrap(); let dst_ptr = ctx.alloc_tmp(M::word_type()).only_reg().unwrap(); @@ -2220,7 +2220,6 @@ impl CallSite { ctx.emit(insn); } } - &ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE. } } @@ -2260,6 +2259,7 @@ impl CallSite { &ABIArgSlot::Stack { .. } => 0, }) .sum(), + ABIArg::ImplicitPtrArg { .. } => 1, _ => 0, }; let mut temps: SmallVec<[Writable; 16]> = (0..needed_tmps) @@ -2355,7 +2355,36 @@ impl CallSite { &ABIArg::StructArg { pointer, .. } => { assert!(pointer.is_none()); // Only supported via ISLE. } - &ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE. + &ABIArg::ImplicitPtrArg { + offset, + pointer, + ty, + purpose: _, + } => { + assert_eq!(from_regs.len(), 1); + let vreg = from_regs.regs()[0]; + let amode = StackAMode::SPOffset(offset, ty); + let tmp = temps[0]; + insts.push(M::gen_get_stack_addr(amode, tmp, ty)); + let tmp = tmp.to_reg(); + insts.push(M::gen_store_base_offset(tmp, 0, vreg, ty)); + match pointer { + ABIArgSlot::Reg { reg, .. 
} => { + self.uses.push(CallArgPair { + vreg: tmp, + preg: reg.into(), + }); + } + ABIArgSlot::Stack { offset, .. } => { + let ty = M::word_type(); + insts.push(M::gen_store_stack( + StackAMode::SPOffset(offset, ty), + tmp, + ty, + )); + } + }; + } } insts } diff --git a/cranelift/filetests/filetests/isa/x64/call-conv.clif b/cranelift/filetests/filetests/isa/x64/call-conv.clif index 775637edfe..6040fa4675 100644 --- a/cranelift/filetests/filetests/isa/x64/call-conv.clif +++ b/cranelift/filetests/filetests/isa/x64/call-conv.clif @@ -660,3 +660,238 @@ block0(v0: f32, v1: i64, v2: i32, v3: f32): ; popq %rbp ; retq +function %fastcall_m128i_param(i32, i8x16) system_v { + sig0 = (i8x16) windows_fastcall +block0(v0: i32, v1: i8x16): + call_indirect sig0, v0(v1) + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; subq %rsp, $48, %rsp +; virtual_sp_offset_adjust 48 +; lea 32(%rsp), %rcx +; movdqu %xmm0, 0(%rcx) +; call *%rdi +; addq %rsp, $48, %rsp +; virtual_sp_offset_adjust -48 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; subq $0x30, %rsp +; leaq 0x20(%rsp), %rcx +; movdqu %xmm0, (%rcx) +; callq *%rdi +; addq $0x30, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fastcall_m128i_params_and_results(i32, i32, i8x16, i64, i8x16) -> i8x16 system_v { + sig0 = (i32, i8x16, i64, i8x16) -> i8x16 windows_fastcall +block0(v0: i32, v1: i32, v2: i8x16, v3: i64, v4: i8x16): + v5 = call_indirect sig0, v0(v1, v2, v3, v4) + v6 = iadd v5, v5 + return v6 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdx, %r8 +; movq %rsi, %rcx +; subq %rsp, $64, %rsp +; virtual_sp_offset_adjust 64 +; lea 32(%rsp), %rdx +; movdqu %xmm0, 0(%rdx) +; lea 48(%rsp), %r9 +; movdqu %xmm1, 0(%r9) +; call *%rdi +; addq %rsp, $64, %rsp +; virtual_sp_offset_adjust -64 +; paddb %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; 
offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdx, %r8 +; movq %rsi, %rcx +; subq $0x40, %rsp +; leaq 0x20(%rsp), %rdx +; movdqu %xmm0, (%rdx) +; leaq 0x30(%rsp), %r9 +; movdqu %xmm1, (%r9) +; callq *%rdi +; addq $0x40, %rsp +; paddb %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fastcall_m128i_one_stack_param(i32, i8x16) system_v { + sig0 = (i32, i32, i32, i32, i8x16) windows_fastcall +block0(v0: i32, v1: i8x16): + call_indirect sig0, v0(v0, v0, v0, v0, v1) + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r9 +; subq %rsp, $64, %rsp +; virtual_sp_offset_adjust 64 +; lea 48(%rsp), %r8 +; movdqu %xmm0, 0(%r8) +; movq %r8, 32(%rsp) +; movq %r9, %rcx +; movq %r9, %rdx +; movq %r9, %r8 +; call *%r9 +; addq %rsp, $64, %rsp +; virtual_sp_offset_adjust -64 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r9 +; subq $0x40, %rsp +; leaq 0x30(%rsp), %r8 +; movdqu %xmm0, (%r8) +; movq %r8, 0x20(%rsp) +; movq %r9, %rcx +; movq %r9, %rdx +; movq %r9, %r8 +; callq *%r9 +; addq $0x40, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fastcall_m128i_two_stack_param(i32, i8x16) system_v { + sig0 = (i32, i32, i32, i32, i8x16, i8x16) windows_fastcall +block0(v0: i32, v1: i8x16): + call_indirect sig0, v0(v0, v0, v0, v0, v1, v1) + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r9 +; subq %rsp, $80, %rsp +; virtual_sp_offset_adjust 80 +; lea 48(%rsp), %r8 +; movdqu %xmm0, 0(%r8) +; movq %r8, 32(%rsp) +; lea 64(%rsp), %rsi +; movdqu %xmm0, 0(%rsi) +; movq %rsi, 40(%rsp) +; movq %r9, %rcx +; movq %r9, %rdx +; movq %r9, %r8 +; call *%r9 +; addq %rsp, $80, %rsp +; virtual_sp_offset_adjust -80 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r9 +; subq $0x50, 
%rsp +; leaq 0x30(%rsp), %r8 +; movdqu %xmm0, (%r8) +; movq %r8, 0x20(%rsp) +; leaq 0x40(%rsp), %rsi +; movdqu %xmm0, (%rsi) +; movq %rsi, 0x28(%rsp) +; movq %r9, %rcx +; movq %r9, %rdx +; movq %r9, %r8 +; callq *%r9 +; addq $0x50, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fastcall_m128i_reg_and_stack_param(i32, i8x16) system_v { + sig0 = (i32, i8x16, i32, i32, i8x16, i8x16) windows_fastcall +block0(v0: i32, v1: i8x16): + call_indirect sig0, v0(v0, v1, v0, v0, v1, v1) + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r9 +; subq %rsp, $96, %rsp +; virtual_sp_offset_adjust 96 +; lea 48(%rsp), %rdx +; movdqu %xmm0, 0(%rdx) +; lea 64(%rsp), %r11 +; movdqu %xmm0, 0(%r11) +; movq %r11, 32(%rsp) +; lea 80(%rsp), %rcx +; movdqu %xmm0, 0(%rcx) +; movq %rcx, 40(%rsp) +; movq %r9, %rcx +; movq %r9, %r8 +; call *%r9 +; addq %rsp, $96, %rsp +; virtual_sp_offset_adjust -96 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r9 +; subq $0x60, %rsp +; leaq 0x30(%rsp), %rdx +; movdqu %xmm0, (%rdx) +; leaq 0x40(%rsp), %r11 +; movdqu %xmm0, (%r11) +; movq %r11, 0x20(%rsp) +; leaq 0x50(%rsp), %rcx +; movdqu %xmm0, (%rcx) +; movq %rcx, 0x28(%rsp) +; movq %r9, %rcx +; movq %r9, %r8 +; callq *%r9 +; addq $0x60, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-popcnt.clif b/cranelift/filetests/filetests/runtests/simd-popcnt.clif index 327da79054..00a24d92f9 100644 --- a/cranelift/filetests/filetests/runtests/simd-popcnt.clif +++ b/cranelift/filetests/filetests/runtests/simd-popcnt.clif @@ -2,7 +2,9 @@ test interpret test run target aarch64 target s390x +target x86_64 has_ssse3=false set enable_simd +target x86_64 target x86_64 sse42 target x86_64 sse42 has_avx has_avx512vl has_avx512bitalg target riscv64 has_v diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif 
b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index e6aa4b6af8..0a72254528 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -2,10 +2,13 @@ test run target aarch64 target s390x +target x86_64 has_ssse3=false set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target x86_64 sse42 has_avx has_avx512vl has_avx512vbmi target riscv64gc has_v function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif index e1caad0419..6dd866f8df 100644 --- a/cranelift/filetests/filetests/runtests/simd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-splat.clif @@ -2,6 +2,7 @@ test run target aarch64 target s390x +target x86_64 has_ssse3=false set enable_simd target x86_64 target x86_64 sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swizzle.clif b/cranelift/filetests/filetests/runtests/simd-swizzle.clif index 2592e65fb3..f188eaf18a 100644 --- a/cranelift/filetests/filetests/runtests/simd-swizzle.clif +++ b/cranelift/filetests/filetests/runtests/simd-swizzle.clif @@ -2,9 +2,11 @@ test interpret test run target aarch64 target s390x +target x86_64 has_ssse3=false set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target x86_64 +target x86_64 sse41 +target x86_64 sse41 has_avx target riscv64gc has_v function %swizzle_i8x16(i8x16, i8x16) -> i8x16 { diff --git a/cranelift/filetests/src/function_runner.rs b/cranelift/filetests/src/function_runner.rs index a786506439..dff1fda6cf 100644 --- a/cranelift/filetests/src/function_runner.rs +++ b/cranelift/filetests/src/function_runner.rs 
@@ -89,7 +89,18 @@ impl TestFileCompiler { /// host machine, this [TargetIsa] must match the host machine's ISA (see /// [TestFileCompiler::with_host_isa]). pub fn new(isa: OwnedTargetIsa) -> Self { - let builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names()); + let mut builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names()); + drop(&mut builder); // require mutability on all architectures + #[cfg(target_arch = "x86_64")] + { + builder.symbol_lookup_fn(Box::new(|name| { + if name == "__cranelift_x86_pshufb" { + Some(__cranelift_x86_pshufb as *const u8) + } else { + None + } + })); + } let module = JITModule::new(builder); let ctx = module.make_context(); @@ -500,6 +511,52 @@ fn make_trampoline(name: UserFuncName, signature: &ir::Signature, isa: &dyn Targ func } +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::__m128i; +#[cfg(target_arch = "x86_64")] +#[allow(improper_ctypes_definitions)] +extern "C" fn __cranelift_x86_pshufb(a: __m128i, b: __m128i) -> __m128i { + union U { + reg: __m128i, + mem: [u8; 16], + } + + unsafe { + let a = U { reg: a }.mem; + let b = U { reg: b }.mem; + + let select = |arr: &[u8; 16], byte: u8| { + if byte & 0x80 != 0 { + 0x00 + } else { + arr[(byte & 0xf) as usize] + } + }; + + U { + mem: [ + select(&a, b[0]), + select(&a, b[1]), + select(&a, b[2]), + select(&a, b[3]), + select(&a, b[4]), + select(&a, b[5]), + select(&a, b[6]), + select(&a, b[7]), + select(&a, b[8]), + select(&a, b[9]), + select(&a, b[10]), + select(&a, b[11]), + select(&a, b[12]), + select(&a, b[13]), + select(&a, b[14]), + select(&a, b[15]), + ], + } + .reg + } +} + #[cfg(test)] mod test { use super::*; diff --git a/cranelift/module/src/lib.rs b/cranelift/module/src/lib.rs index 046204ecbf..f9fb9ad024 100644 --- a/cranelift/module/src/lib.rs +++ b/cranelift/module/src/lib.rs @@ -72,5 +72,6 @@ pub fn default_libcall_names() -> Box String + Send + Syn ir::LibCall::ElfTlsGetAddr => "__tls_get_addr".to_owned(), 
ir::LibCall::ElfTlsGetOffset => "__tls_get_offset".to_owned(), + ir::LibCall::X86Pshufb => "__cranelift_x86_pshufb".to_owned(), }) } diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 103afeadd8..c12a24a607 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -2197,7 +2197,9 @@ pub fn translate_operator( Operator::I8x16RelaxedSwizzle => { let (a, b) = pop2_with_bitcast(state, I8X16, builder); state.push1( - if environ.relaxed_simd_deterministic() || !environ.is_x86() { + if environ.relaxed_simd_deterministic() + || !environ.use_x86_pshufb_for_relaxed_swizzle() + { // Deterministic semantics match the `i8x16.swizzle` // instruction which is the CLIF `swizzle`. builder.ins().swizzle(a, b) diff --git a/cranelift/wasm/src/environ/spec.rs b/cranelift/wasm/src/environ/spec.rs index a632973baf..9623ce9284 100644 --- a/cranelift/wasm/src/environ/spec.rs +++ b/cranelift/wasm/src/environ/spec.rs @@ -569,6 +569,12 @@ pub trait FuncEnvironment: TargetEnvironment { let _ = ty; false } + + /// Returns whether the CLIF `x86_pshufb` instruction should be used for the + /// `i8x16.relaxed_swizzle` instruction. 
+ fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool { + false + } } /// An object satisfying the `ModuleEnvironment` trait can be passed as argument to the diff --git a/crates/cranelift-shared/src/obj.rs b/crates/cranelift-shared/src/obj.rs index 858e1cf9ac..7713b9f635 100644 --- a/crates/cranelift-shared/src/obj.rs +++ b/crates/cranelift-shared/src/obj.rs @@ -558,6 +558,7 @@ fn libcall_name(call: LibCall) -> &'static str { LibCall::TruncF64 => LC::TruncF64, LibCall::FmaF32 => LC::FmaF32, LibCall::FmaF64 => LC::FmaF64, + LibCall::X86Pshufb => LC::X86Pshufb, _ => panic!("unknown libcall to give a name to: {call:?}"), }; other.symbol() diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index f7556f8b9f..3cdc5da087 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2203,4 +2203,8 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m fn use_x86_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool { self.isa.has_x86_blendv_lowering(ty) } + + fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool { + self.isa.has_x86_pshufb_lowering() + } } diff --git a/crates/environ/src/obj.rs b/crates/environ/src/obj.rs index 6e39cc319f..6ba410c507 100644 --- a/crates/environ/src/obj.rs +++ b/crates/environ/src/obj.rs @@ -168,4 +168,5 @@ libcalls! 
{ TruncF64 = "libcall_truncf64" FmaF32 = "libcall_fmaf32" FmaF64 = "libcall_fmaf64" + X86Pshufb = "libcall_x86_pshufb" } diff --git a/crates/jit/src/code_memory.rs b/crates/jit/src/code_memory.rs index cdf45d005e..d4022bc01f 100644 --- a/crates/jit/src/code_memory.rs +++ b/crates/jit/src/code_memory.rs @@ -284,6 +284,10 @@ impl CodeMemory { obj::LibCall::TruncF64 => libcalls::relocs::truncf64 as usize, obj::LibCall::FmaF32 => libcalls::relocs::fmaf32 as usize, obj::LibCall::FmaF64 => libcalls::relocs::fmaf64 as usize, + #[cfg(target_arch = "x86_64")] + obj::LibCall::X86Pshufb => libcalls::relocs::x86_pshufb as usize, + #[cfg(not(target_arch = "x86_64"))] + obj::LibCall::X86Pshufb => unreachable!(), }; self.mmap .as_mut_ptr() diff --git a/crates/runtime/src/libcalls.rs b/crates/runtime/src/libcalls.rs index bceb890144..9b133d3cc8 100644 --- a/crates/runtime/src/libcalls.rs +++ b/crates/runtime/src/libcalls.rs @@ -575,4 +575,52 @@ pub mod relocs { pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, c) } + + // This intrinsic is only used on x86_64 platforms as an implementation of + // the `pshufb` instruction when SSSE3 is not available. + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::__m128i; + #[cfg(target_arch = "x86_64")] + #[allow(improper_ctypes_definitions)] + pub extern "C" fn x86_pshufb(a: __m128i, b: __m128i) -> __m128i { + union U { + reg: __m128i, + mem: [u8; 16], + } + + unsafe { + let a = U { reg: a }.mem; + let b = U { reg: b }.mem; + + let select = |arr: &[u8; 16], byte: u8| { + if byte & 0x80 != 0 { + 0x00 + } else { + arr[(byte & 0xf) as usize] + } + }; + + U { + mem: [ + select(&a, b[0]), + select(&a, b[1]), + select(&a, b[2]), + select(&a, b[3]), + select(&a, b[4]), + select(&a, b[5]), + select(&a, b[6]), + select(&a, b[7]), + select(&a, b[8]), + select(&a, b[9]), + select(&a, b[10]), + select(&a, b[11]), + select(&a, b[12]), + select(&a, b[13]), + select(&a, b[14]), + select(&a, b[15]), + ], + } + .reg + } + } }