
x64: Add non-SSSE3 lowerings of `pshufb` (#6606)

* x64: Add non-SSSE3 lowerings of `pshufb`

Or, more accurately, add lowerings which don't use `pshufb`'s
functionality at all where possible, or otherwise fall back to a new
libcall. This particular instruction seemed uniquely difficult to
implement in the backend, so I decided to "cop out" and use a libcall
instead. The libcall will be used for `popcnt`, `shuffle`, and
`swizzle` instructions when SSSE3 isn't available (a scalar sketch of
the byte-select semantics the libcall implements follows this commit
message).

* Implement SSE2 popcnt with Hacker's Delight

* x64: Implement passing vector arguments in the fastcall convention

Windows specifies that vector arguments are passed indirectly, so handle that
here through the `ABIArg::ImplicitPtrArg` variant. Some additional
handling is also added to the general machinst backend.

* Update `gen_load_base_offset` for x64

* Fill out remaining bits of fastcall and vector parameters

* Remove now-unnecessary `Clone` bound
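
For reference, here is a scalar sketch (not part of the patch) of the
byte-select semantics that `pshufb` and the new `X86Pshufb` libcall implement:
each output byte is zero when the high bit of the corresponding mask byte is
set, and otherwise selects `src[mask[i] & 0xf]`.

fn pshufb(src: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    // Scalar model of `pshufb` / the `X86Pshufb` libcall; it mirrors the host
    // fallback added in crates/runtime/src/libcalls.rs below.
    let mut out = [0u8; 16];
    for i in 0..16 {
        // A set high bit zeroes the lane; otherwise the low 4 bits index `src`.
        out[i] = if mask[i] & 0x80 != 0 {
            0
        } else {
            src[(mask[i] & 0xf) as usize]
        };
    }
    out
}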
Alex Crichton, committed by GitHub (branch pull/6607/head, parent commit 3dfbfb61a9)
25 changed files:
   11  cranelift/codegen/src/ir/libcall.rs
    4  cranelift/codegen/src/isa/aarch64/mod.rs
    4  cranelift/codegen/src/isa/mod.rs
    4  cranelift/codegen/src/isa/riscv64/mod.rs
    4  cranelift/codegen/src/isa/s390x/mod.rs
   57  cranelift/codegen/src/isa/x64/abi.rs
    9  cranelift/codegen/src/isa/x64/inst.isle
   66  cranelift/codegen/src/isa/x64/lower.isle
   18  cranelift/codegen/src/isa/x64/lower/isle.rs
    4  cranelift/codegen/src/isa/x64/mod.rs
   35  cranelift/codegen/src/machinst/abi.rs
  235  cranelift/filetests/filetests/isa/x64/call-conv.clif
    2  cranelift/filetests/filetests/runtests/simd-popcnt.clif
    9  cranelift/filetests/filetests/runtests/simd-shuffle.clif
    1  cranelift/filetests/filetests/runtests/simd-splat.clif
    6  cranelift/filetests/filetests/runtests/simd-swizzle.clif
   59  cranelift/filetests/src/function_runner.rs
    1  cranelift/module/src/lib.rs
    4  cranelift/wasm/src/code_translator.rs
    6  cranelift/wasm/src/environ/spec.rs
    1  crates/cranelift-shared/src/obj.rs
    4  crates/cranelift/src/func_environ.rs
    1  crates/environ/src/obj.rs
    4  crates/jit/src/code_memory.rs
   48  crates/runtime/src/libcalls.rs

11
cranelift/codegen/src/ir/libcall.rs

@ -56,6 +56,9 @@ pub enum LibCall {
ElfTlsGetAddr,
/// Elf __tls_get_offset
ElfTlsGetOffset,
/// The `pshufb` on x86 when SSSE3 isn't available.
X86Pshufb,
// When adding a new variant make sure to add it to `all_libcalls` too.
}
@ -88,6 +91,8 @@ impl FromStr for LibCall {
"ElfTlsGetAddr" => Ok(Self::ElfTlsGetAddr),
"ElfTlsGetOffset" => Ok(Self::ElfTlsGetOffset),
"X86Pshufb" => Ok(Self::X86Pshufb),
_ => Err(()),
}
}
@ -115,6 +120,7 @@ impl LibCall {
Memcmp,
ElfTlsGetAddr,
ElfTlsGetOffset,
X86Pshufb,
]
}
@ -166,6 +172,11 @@ impl LibCall {
LibCall::Probestack | LibCall::ElfTlsGetAddr | LibCall::ElfTlsGetOffset => {
unimplemented!()
}
LibCall::X86Pshufb => {
sig.params.push(AbiParam::new(I8X16));
sig.params.push(AbiParam::new(I8X16));
sig.returns.push(AbiParam::new(I8X16));
}
}
sig

4
cranelift/codegen/src/isa/aarch64/mod.rs

@ -211,6 +211,10 @@ impl TargetIsa for AArch64Backend {
fn has_x86_blendv_lowering(&self, _: Type) -> bool {
false
}
fn has_x86_pshufb_lowering(&self) -> bool {
false
}
}
impl fmt::Display for AArch64Backend {

4
cranelift/codegen/src/isa/mod.rs

@ -344,6 +344,10 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
/// Returns whether the CLIF `x86_blendv` instruction is implemented for
/// this ISA for the specified type.
fn has_x86_blendv_lowering(&self, ty: Type) -> bool;
/// Returns whether the CLIF `x86_pshufb` instruction is implemented for
/// this ISA.
fn has_x86_pshufb_lowering(&self) -> bool;
}
/// Function alignment specifications as required by an ISA, returned by

4
cranelift/codegen/src/isa/riscv64/mod.rs

@ -186,6 +186,10 @@ impl TargetIsa for Riscv64Backend {
fn has_x86_blendv_lowering(&self, _: Type) -> bool {
false
}
fn has_x86_pshufb_lowering(&self) -> bool {
false
}
}
impl fmt::Display for Riscv64Backend {

4
cranelift/codegen/src/isa/s390x/mod.rs

@ -186,6 +186,10 @@ impl TargetIsa for S390xBackend {
fn has_x86_blendv_lowering(&self, _: Type) -> bool {
false
}
fn has_x86_pshufb_lowering(&self) -> bool {
false
}
}
impl fmt::Display for S390xBackend {

57
cranelift/codegen/src/isa/x64/abi.rs

@ -154,6 +154,44 @@ impl ABIMachineSpec for X64ABIMachineSpec {
);
}
// Windows fastcall dictates that `__m128i` parameters to a function
// are passed indirectly as pointers, so handle that as a special
// case before the loop below.
if param.value_type.is_vector()
&& param.value_type.bits() >= 128
&& args_or_rets == ArgsOrRets::Args
&& is_fastcall
{
let pointer = match get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) {
Some(reg) => {
next_gpr += 1;
ABIArgSlot::Reg {
reg: reg.to_real_reg().unwrap(),
ty: ir::types::I64,
extension: ir::ArgumentExtension::None,
}
}
None => {
next_stack = align_to(next_stack, 8) + 8;
ABIArgSlot::Stack {
offset: (next_stack - 8) as i64,
ty: ir::types::I64,
extension: param.extension,
}
}
};
next_param_idx += 1;
args.push(ABIArg::ImplicitPtrArg {
// NB: this is filled in after this loop
offset: 0,
pointer,
ty: param.value_type,
purpose: param.purpose,
});
continue;
}
let mut slots = ABIArgSlotVec::new();
for (rc, reg_ty) in rcs.iter().zip(reg_tys.iter()) {
let intreg = *rc == RegClass::Int;
@ -221,6 +259,20 @@ impl ABIMachineSpec for X64ABIMachineSpec {
});
}
// Fastcall's indirect 128+ bit vector arguments are all located on the
// stack, and stack space is reserved after all parameters are passed,
// so allocate from the space now.
if args_or_rets == ArgsOrRets::Args && is_fastcall {
for arg in args.args_mut() {
if let ABIArg::ImplicitPtrArg { offset, .. } = arg {
assert_eq!(*offset, 0);
next_stack = align_to(next_stack, 16);
*offset = next_stack as i64;
next_stack += 16;
}
}
}
let extra_arg = if add_ret_area_ptr {
debug_assert!(args_or_rets == ArgsOrRets::Args);
if let Some(reg) = get_intreg_for_arg(&call_conv, next_gpr, next_param_idx) {
@ -348,8 +400,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
}
fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I {
// Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed.
assert_eq!(ty, I64);
// Only ever used for I64s and vectors; if that changes, see if the
// ExtKind below needs to be changed.
assert!(ty == I64 || ty.is_vector());
let simm32 = offset as u32;
let mem = Amode::imm_reg(simm32, base);
Inst::load(ty, mem, into_reg, ExtKind::None)

9
cranelift/codegen/src/isa/x64/inst.isle

@ -4957,6 +4957,7 @@
(convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned)
(convert VCodeConstant SyntheticAmode const_to_synthetic_amode)
(convert VCodeConstant XmmMem const_to_xmm_mem)
(convert VCodeConstant RegMem const_to_reg_mem)
(convert IntCC CC intcc_to_cc)
(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
@ -5010,6 +5011,8 @@
(extern constructor const_to_synthetic_amode const_to_synthetic_amode)
(decl const_to_xmm_mem (VCodeConstant) XmmMem)
(rule (const_to_xmm_mem c) (const_to_synthetic_amode c))
(decl const_to_reg_mem (VCodeConstant) RegMem)
(rule (const_to_reg_mem c) (RegMem.Mem (const_to_synthetic_amode c)))
(decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned)
(rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg))
@ -5054,10 +5057,14 @@
NearestF32
NearestF64
TruncF32
TruncF64))
TruncF64
X86Pshufb))
(decl libcall_1 (LibCall Reg) Reg)
(extern constructor libcall_1 libcall_1)
(decl libcall_2 (LibCall Reg Reg) Reg)
(extern constructor libcall_2 libcall_2)
(decl libcall_3 (LibCall Reg Reg Reg) Reg)
(extern constructor libcall_3 libcall_3)

66
cranelift/codegen/src/isa/x64/lower.isle

@ -2095,11 +2095,11 @@
;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 3 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
(if-let $true (use_popcnt))
(x64_popcnt ty src))
(rule 2 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
(if-let $true (use_popcnt))
(x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
@ -2192,7 +2192,7 @@
final))
(rule 1 (lower (has_type $I8X16 (popcnt src)))
(rule 2 (lower (has_type $I8X16 (popcnt src)))
(if-let $true (use_avx512vl_simd))
(if-let $true (use_avx512bitalg_simd))
(x64_vpopcntb src))
@ -2218,8 +2218,8 @@
;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
(rule (lower (has_type $I8X16
(popcnt src)))
(rule 1 (lower (has_type $I8X16 (popcnt src)))
(if-let $true (use_ssse3))
(let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
(low_nibbles Xmm (sse_and $I8X16 src low_mask))
;; Note that this is a 16x8 shift, but that's OK; we mask
@ -2233,6 +2233,19 @@
(bit_counts_high Xmm (x64_pshufb lookup high_nibbles)))
(x64_paddb bit_counts_low bit_counts_high)))
;; A modified version of the popcnt method from Hacker's Delight.
(rule (lower (has_type $I8X16 (popcnt src)))
(let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777))
(src Xmm src)
(shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
(shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
(shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
(src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4)))))
(x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))))
;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8 (bitrev src)))
@ -4181,7 +4194,8 @@
;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
;; register.
(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
(x64_pshufb a (xmm_zero $I8X16)))
(if-let $true (use_ssse3))
(x64_pshufb a (xmm_zero $I8X16)))
;; Special case for the `shufps` instruction which will select two 32-bit values
;; from the first operand and two 32-bit values from the second operand. Note
@ -4209,7 +4223,8 @@
;; indices (may not be completely necessary: verification could fail incorrect
;; mask values) and fix the indexes to all point to the `dst` vector.
(rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
(x64_pshufb a (shuffle_0_31_mask mask)))
(if-let $true (use_ssse3))
(x64_pshufb a (shuffle_0_31_mask mask)))
;; For the case where the shuffle mask contains out-of-bounds values (values
;; greater than 31) we must mask off those resulting values in the result of
@ -4231,8 +4246,8 @@
;; above, we build the `constructed_mask` for each case statically.
(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
(x64_por
(x64_pshufb a (shuffle_0_15_mask mask))
(x64_pshufb b (shuffle_16_31_mask mask))))
(lower_pshufb a (shuffle_0_15_mask mask))
(lower_pshufb b (shuffle_16_31_mask mask))))
;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -4244,13 +4259,28 @@
;; variables like: %dst = swizzle %src, %mask
(rule (lower (swizzle src mask))
(let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070))))
(x64_pshufb src mask)))
(lower_pshufb src mask)))
;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (x86_pshufb src mask))
(if-let $true (use_ssse3))
(x64_pshufb src mask))
;; A helper function to generate either the `pshufb` instruction or a call to
;; the `X86Pshufb` libcall. Note that the libcall is not exactly the most
;; performant thing in the world, so this is primarily here for completeness
;; of lowerings on all x86 CPUs; rules are ideally gated on the presence
;; of SSSE3 so that the `pshufb` instruction itself is used.
(decl lower_pshufb (Xmm RegMem) Xmm)
(rule 1 (lower_pshufb src mask)
(if-let $true (use_ssse3))
(x64_pshufb src mask))
(rule (lower_pshufb src (RegMem.Reg mask))
(libcall_2 (LibCall.X86Pshufb) src mask))
(rule (lower_pshufb src (RegMem.Mem addr))
(lower_pshufb src (x64_movdqu_load addr)))
;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Remove the extractlane instruction, leaving the float where it is. The upper
@ -4343,14 +4373,18 @@
;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
;; with a mask of zero which is calculated with an xor-against-itself register.
(rule 0 (lower (has_type $I8X16 (splat src)))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(let ((src Xmm (x64_movd_to_xmm src)))
(x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0)))
(rule 1 (lower (has_type $I8X16 (splat src)))
(if-let $true (use_ssse3))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(rule 2 (lower (has_type $I8X16 (splat src)))
(if-let $true (use_avx2_simd))
(x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (use_sse41))
(x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (use_avx2_simd))
(x64_vpbroadcastb addr))
@ -4399,10 +4433,10 @@
;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `use_avx_simd` test.
(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(let ((tmp Xmm (x64_movss_load addr)))
(x64_shufps tmp tmp 0)))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(if-let $true (use_avx_simd))
(x64_vbroadcastss addr))
@ -4413,7 +4447,7 @@
(x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00))
(rule 0 (lower (has_type $F64X2 (splat src)))
(x64_pshufd src 0b01_00_01_00))
(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(if-let $true (use_ssse3))
(x64_movddup addr))
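
The SSE2 `popcnt` rule added to this file above follows the Hacker's Delight
shift-and-subtract method; for clarity, here is a scalar per-byte sketch of the
same computation (illustration only, not part of the patch):

fn popcnt_byte(x: u8) -> u8 {
    // Within each nibble, popcount(v) = v - (v>>1) - (v>>2) - (v>>3); the 0x77
    // mask keeps the shifted values from crossing nibble boundaries (this is
    // what the psrlq/pand/psubb sequence in the rule computes lane-wise).
    let mut n = (x >> 1) & 0x77;
    let mut x = x.wrapping_sub(n);
    n = (n >> 1) & 0x77;
    x = x.wrapping_sub(n);
    n = (n >> 1) & 0x77;
    x = x.wrapping_sub(n);
    // Fold the two nibble counts of each byte together and mask, matching the
    // final psrlw-by-4, paddb, and pand against 0x0f..0f in the rule.
    x = x.wrapping_add(x >> 4);
    x & 0x0f
}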

18
cranelift/codegen/src/isa/x64/lower/isle.rs

@ -645,6 +645,24 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
output_reg.to_reg()
}
fn libcall_2(&mut self, libcall: &LibCall, a: Reg, b: Reg) -> Reg {
let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs());
let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type;
let output_reg = self.lower_ctx.alloc_tmp(ret_ty).only_reg().unwrap();
emit_vm_call(
self.lower_ctx,
&self.backend.flags,
&self.backend.triple,
libcall.clone(),
&[a, b],
&[output_reg],
)
.expect("Failed to emit LibCall");
output_reg.to_reg()
}
fn libcall_3(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg {
let call_conv = self.lower_ctx.abi().call_conv(self.lower_ctx.sigs());
let ret_ty = libcall.signature(call_conv, I64).returns[0].value_type;

4
cranelift/codegen/src/isa/x64/mod.rs

@ -186,6 +186,10 @@ impl TargetIsa for X64Backend {
// operation, so that always returns `false`
self.x64_flags.use_sse41() && ty != types::I16X8
}
fn has_x86_pshufb_lowering(&self) -> bool {
self.x64_flags.use_ssse3()
}
}
impl fmt::Display for X64Backend {

35
cranelift/codegen/src/machinst/abi.rs

@ -2193,7 +2193,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
from_regs: ValueRegs<Reg>,
) {
match &ctx.sigs().args(self.sig)[idx] {
&ABIArg::Slots { .. } => {}
&ABIArg::Slots { .. } | &ABIArg::ImplicitPtrArg { .. } => {}
&ABIArg::StructArg { offset, size, .. } => {
let src_ptr = from_regs.only_reg().unwrap();
let dst_ptr = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
@ -2220,7 +2220,6 @@ impl<M: ABIMachineSpec> CallSite<M> {
ctx.emit(insn);
}
}
&ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE.
}
}
@ -2260,6 +2259,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
&ABIArgSlot::Stack { .. } => 0,
})
.sum(),
ABIArg::ImplicitPtrArg { .. } => 1,
_ => 0,
};
let mut temps: SmallVec<[Writable<Reg>; 16]> = (0..needed_tmps)
@ -2355,7 +2355,36 @@ impl<M: ABIMachineSpec> CallSite<M> {
&ABIArg::StructArg { pointer, .. } => {
assert!(pointer.is_none()); // Only supported via ISLE.
}
&ABIArg::ImplicitPtrArg { .. } => unimplemented!(), // Only supported via ISLE.
&ABIArg::ImplicitPtrArg {
offset,
pointer,
ty,
purpose: _,
} => {
assert_eq!(from_regs.len(), 1);
let vreg = from_regs.regs()[0];
let amode = StackAMode::SPOffset(offset, ty);
let tmp = temps[0];
insts.push(M::gen_get_stack_addr(amode, tmp, ty));
let tmp = tmp.to_reg();
insts.push(M::gen_store_base_offset(tmp, 0, vreg, ty));
match pointer {
ABIArgSlot::Reg { reg, .. } => {
self.uses.push(CallArgPair {
vreg: tmp,
preg: reg.into(),
});
}
ABIArgSlot::Stack { offset, .. } => {
let ty = M::word_type();
insts.push(M::gen_store_stack(
StackAMode::SPOffset(offset, ty),
tmp,
ty,
));
}
};
}
}
insts
}

235
cranelift/filetests/filetests/isa/x64/call-conv.clif

@ -660,3 +660,238 @@ block0(v0: f32, v1: i64, v2: i32, v3: f32):
; popq %rbp
; retq
function %fastcall_m128i_param(i32, i8x16) system_v {
sig0 = (i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; subq %rsp, $48, %rsp
; virtual_sp_offset_adjust 48
; lea 32(%rsp), %rcx
; movdqu %xmm0, 0(%rcx)
; call *%rdi
; addq %rsp, $48, %rsp
; virtual_sp_offset_adjust -48
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; subq $0x30, %rsp
; leaq 0x20(%rsp), %rcx
; movdqu %xmm0, (%rcx)
; callq *%rdi
; addq $0x30, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_params_and_results(i32, i32, i8x16, i64, i8x16) -> i8x16 system_v {
sig0 = (i32, i8x16, i64, i8x16) -> i8x16 windows_fastcall
block0(v0: i32, v1: i32, v2: i8x16, v3: i64, v4: i8x16):
v5 = call_indirect sig0, v0(v1, v2, v3, v4)
v6 = iadd v5, v5
return v6
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdx, %r8
; movq %rsi, %rcx
; subq %rsp, $64, %rsp
; virtual_sp_offset_adjust 64
; lea 32(%rsp), %rdx
; movdqu %xmm0, 0(%rdx)
; lea 48(%rsp), %r9
; movdqu %xmm1, 0(%r9)
; call *%rdi
; addq %rsp, $64, %rsp
; virtual_sp_offset_adjust -64
; paddb %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdx, %r8
; movq %rsi, %rcx
; subq $0x40, %rsp
; leaq 0x20(%rsp), %rdx
; movdqu %xmm0, (%rdx)
; leaq 0x30(%rsp), %r9
; movdqu %xmm1, (%r9)
; callq *%rdi
; addq $0x40, %rsp
; paddb %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_one_stack_param(i32, i8x16) system_v {
sig0 = (i32, i32, i32, i32, i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v0, v0, v0, v0, v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; subq %rsp, $64, %rsp
; virtual_sp_offset_adjust 64
; lea 48(%rsp), %r8
; movdqu %xmm0, 0(%r8)
; movq %r8, 32(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; call *%r9
; addq %rsp, $64, %rsp
; virtual_sp_offset_adjust -64
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; subq $0x40, %rsp
; leaq 0x30(%rsp), %r8
; movdqu %xmm0, (%r8)
; movq %r8, 0x20(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; callq *%r9
; addq $0x40, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_two_stack_param(i32, i8x16) system_v {
sig0 = (i32, i32, i32, i32, i8x16, i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v0, v0, v0, v0, v1, v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; subq %rsp, $80, %rsp
; virtual_sp_offset_adjust 80
; lea 48(%rsp), %r8
; movdqu %xmm0, 0(%r8)
; movq %r8, 32(%rsp)
; lea 64(%rsp), %rsi
; movdqu %xmm0, 0(%rsi)
; movq %rsi, 40(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; call *%r9
; addq %rsp, $80, %rsp
; virtual_sp_offset_adjust -80
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; subq $0x50, %rsp
; leaq 0x30(%rsp), %r8
; movdqu %xmm0, (%r8)
; movq %r8, 0x20(%rsp)
; leaq 0x40(%rsp), %rsi
; movdqu %xmm0, (%rsi)
; movq %rsi, 0x28(%rsp)
; movq %r9, %rcx
; movq %r9, %rdx
; movq %r9, %r8
; callq *%r9
; addq $0x50, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
function %fastcall_m128i_reg_and_stack_param(i32, i8x16) system_v {
sig0 = (i32, i8x16, i32, i32, i8x16, i8x16) windows_fastcall
block0(v0: i32, v1: i8x16):
call_indirect sig0, v0(v0, v1, v0, v0, v1, v1)
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; subq %rsp, $96, %rsp
; virtual_sp_offset_adjust 96
; lea 48(%rsp), %rdx
; movdqu %xmm0, 0(%rdx)
; lea 64(%rsp), %r11
; movdqu %xmm0, 0(%r11)
; movq %r11, 32(%rsp)
; lea 80(%rsp), %rcx
; movdqu %xmm0, 0(%rcx)
; movq %rcx, 40(%rsp)
; movq %r9, %rcx
; movq %r9, %r8
; call *%r9
; addq %rsp, $96, %rsp
; virtual_sp_offset_adjust -96
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; subq $0x60, %rsp
; leaq 0x30(%rsp), %rdx
; movdqu %xmm0, (%rdx)
; leaq 0x40(%rsp), %r11
; movdqu %xmm0, (%r11)
; movq %r11, 0x20(%rsp)
; leaq 0x50(%rsp), %rcx
; movdqu %xmm0, (%rcx)
; movq %rcx, 0x28(%rsp)
; movq %r9, %rcx
; movq %r9, %r8
; callq *%r9
; addq $0x60, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
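
To make the frame layout above concrete, take `%fastcall_m128i_one_stack_param`
(reading directly off the disassembly; the Windows x64 convention passes
`__m128i` arguments as pointers to 16-byte-aligned, caller-allocated memory):

  - bytes 0x00-0x1f of the 0x40-byte allocation are the fastcall shadow space
    for the four register arguments;
  - offset 0x20 is the fifth argument's stack slot, which receives a pointer
    rather than the vector itself;
  - offset 0x30 is the 16-byte-aligned slot holding the vector data, whose
    address (`lea 0x30(%rsp), %r8`) is what gets stored at 0x20.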

2
cranelift/filetests/filetests/runtests/simd-popcnt.clif

@ -2,7 +2,9 @@ test interpret
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64
target x86_64 sse42
target x86_64 sse42 has_avx has_avx512vl has_avx512bitalg
target riscv64 has_v

9
cranelift/filetests/filetests/runtests/simd-shuffle.clif

@ -2,10 +2,13 @@
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
target x86_64
target x86_64 sse41
target x86_64 sse42
target x86_64 sse42 has_avx
target x86_64 sse42 has_avx has_avx512vl has_avx512vbmi
target riscv64gc has_v
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {

1
cranelift/filetests/filetests/runtests/simd-splat.clif

@ -2,6 +2,7 @@
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64
target x86_64 sse41

6
cranelift/filetests/filetests/runtests/simd-swizzle.clif

@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 has_ssse3=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx
target riscv64gc has_v
function %swizzle_i8x16(i8x16, i8x16) -> i8x16 {

59
cranelift/filetests/src/function_runner.rs

@ -89,7 +89,18 @@ impl TestFileCompiler {
/// host machine, this [TargetIsa] must match the host machine's ISA (see
/// [TestFileCompiler::with_host_isa]).
pub fn new(isa: OwnedTargetIsa) -> Self {
let builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
let mut builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
drop(&mut builder); // require mutability on all architectures
#[cfg(target_arch = "x86_64")]
{
builder.symbol_lookup_fn(Box::new(|name| {
if name == "__cranelift_x86_pshufb" {
Some(__cranelift_x86_pshufb as *const u8)
} else {
None
}
}));
}
let module = JITModule::new(builder);
let ctx = module.make_context();
@ -500,6 +511,52 @@ fn make_trampoline(name: UserFuncName, signature: &ir::Signature, isa: &dyn Targ
func
}
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::__m128i;
#[cfg(target_arch = "x86_64")]
#[allow(improper_ctypes_definitions)]
extern "C" fn __cranelift_x86_pshufb(a: __m128i, b: __m128i) -> __m128i {
union U {
reg: __m128i,
mem: [u8; 16],
}
unsafe {
let a = U { reg: a }.mem;
let b = U { reg: b }.mem;
let select = |arr: &[u8; 16], byte: u8| {
if byte & 0x80 != 0 {
0x00
} else {
arr[(byte & 0xf) as usize]
}
};
U {
mem: [
select(&a, b[0]),
select(&a, b[1]),
select(&a, b[2]),
select(&a, b[3]),
select(&a, b[4]),
select(&a, b[5]),
select(&a, b[6]),
select(&a, b[7]),
select(&a, b[8]),
select(&a, b[9]),
select(&a, b[10]),
select(&a, b[11]),
select(&a, b[12]),
select(&a, b[13]),
select(&a, b[14]),
select(&a, b[15]),
],
}
.reg
}
}
#[cfg(test)]
mod test {
use super::*;

1
cranelift/module/src/lib.rs

@ -72,5 +72,6 @@ pub fn default_libcall_names() -> Box<dyn Fn(ir::LibCall) -> String + Send + Syn
ir::LibCall::ElfTlsGetAddr => "__tls_get_addr".to_owned(),
ir::LibCall::ElfTlsGetOffset => "__tls_get_offset".to_owned(),
ir::LibCall::X86Pshufb => "__cranelift_x86_pshufb".to_owned(),
})
}

4
cranelift/wasm/src/code_translator.rs

@ -2197,7 +2197,9 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
Operator::I8x16RelaxedSwizzle => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
if environ.relaxed_simd_deterministic()
|| !environ.use_x86_pshufb_for_relaxed_swizzle()
{
// Deterministic semantics match the `i8x16.swizzle`
// instruction which is the CLIF `swizzle`.
builder.ins().swizzle(a, b)
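
For context (not part of the patch): the deterministic branch shown above emits
CLIF `swizzle`, while the other branch (elided in this hunk) emits `x86_pshufb`;
the two differ only for mask bytes in the 16..=127 range, roughly:

// Deterministic semantics (CLIF `swizzle`): any index >= 16 selects zero.
fn swizzle_lane(a: &[u8; 16], idx: u8) -> u8 {
    if (idx as usize) < 16 { a[idx as usize] } else { 0 }
}

// x86 `pshufb` semantics: only bit 7 zeroes the lane; bits 4..6 are ignored.
fn pshufb_lane(a: &[u8; 16], idx: u8) -> u8 {
    if idx & 0x80 != 0 { 0 } else { a[(idx & 0xf) as usize] }
}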

6
cranelift/wasm/src/environ/spec.rs

@ -569,6 +569,12 @@ pub trait FuncEnvironment: TargetEnvironment {
let _ = ty;
false
}
/// Returns whether the CLIF `x86_pshufb` instruction should be used for the
/// `i8x16.relaxed_swizzle` instruction.
fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
false
}
}
/// An object satisfying the `ModuleEnvironment` trait can be passed as argument to the

1
crates/cranelift-shared/src/obj.rs

@ -558,6 +558,7 @@ fn libcall_name(call: LibCall) -> &'static str {
LibCall::TruncF64 => LC::TruncF64,
LibCall::FmaF32 => LC::FmaF32,
LibCall::FmaF64 => LC::FmaF64,
LibCall::X86Pshufb => LC::X86Pshufb,
_ => panic!("unknown libcall to give a name to: {call:?}"),
};
other.symbol()

4
crates/cranelift/src/func_environ.rs

@ -2203,4 +2203,8 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
fn use_x86_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool {
self.isa.has_x86_blendv_lowering(ty)
}
fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
self.isa.has_x86_pshufb_lowering()
}
}

1
crates/environ/src/obj.rs

@ -168,4 +168,5 @@ libcalls! {
TruncF64 = "libcall_truncf64"
FmaF32 = "libcall_fmaf32"
FmaF64 = "libcall_fmaf64"
X86Pshufb = "libcall_x86_pshufb"
}

4
crates/jit/src/code_memory.rs

@ -284,6 +284,10 @@ impl CodeMemory {
obj::LibCall::TruncF64 => libcalls::relocs::truncf64 as usize,
obj::LibCall::FmaF32 => libcalls::relocs::fmaf32 as usize,
obj::LibCall::FmaF64 => libcalls::relocs::fmaf64 as usize,
#[cfg(target_arch = "x86_64")]
obj::LibCall::X86Pshufb => libcalls::relocs::x86_pshufb as usize,
#[cfg(not(target_arch = "x86_64"))]
obj::LibCall::X86Pshufb => unreachable!(),
};
self.mmap
.as_mut_ptr()

48
crates/runtime/src/libcalls.rs

@ -575,4 +575,52 @@ pub mod relocs {
pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 {
a.mul_add(b, c)
}
// This intrinsic is only used on x86_64 platforms as an implementation of
// the `pshufb` instruction when SSSE3 is not available.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::__m128i;
#[cfg(target_arch = "x86_64")]
#[allow(improper_ctypes_definitions)]
pub extern "C" fn x86_pshufb(a: __m128i, b: __m128i) -> __m128i {
union U {
reg: __m128i,
mem: [u8; 16],
}
unsafe {
let a = U { reg: a }.mem;
let b = U { reg: b }.mem;
let select = |arr: &[u8; 16], byte: u8| {
if byte & 0x80 != 0 {
0x00
} else {
arr[(byte & 0xf) as usize]
}
};
U {
mem: [
select(&a, b[0]),
select(&a, b[1]),
select(&a, b[2]),
select(&a, b[3]),
select(&a, b[4]),
select(&a, b[5]),
select(&a, b[6]),
select(&a, b[7]),
select(&a, b[8]),
select(&a, b[9]),
select(&a, b[10]),
select(&a, b[11]),
select(&a, b[12]),
select(&a, b[13]),
select(&a, b[14]),
select(&a, b[15]),
],
}
.reg
}
}
}
