@@ -2095,11 +2095,11 @@
;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 32/64-bit `popcnt` lowers directly to the hardware `popcnt` instruction
;; when the target reports POPCNT support (`use_popcnt`).
;; NOTE(review): two rule-header lines appear back to back (priority 3, then
;; priority 4) — this looks like an unmarked old/new pair from a diff whose
;; +/- markers were stripped; only one header belongs in the real source.
(rule 3 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
(if-let $true (use_popcnt))
(x64_popcnt ty src))
;; 8/16-bit `popcnt`: zero-extend the narrow value into a 32-bit GPR first so
;; the upper bits are known-zero, then run a 32-bit hardware `popcnt`.
;; NOTE(review): duplicated rule headers (priority 2 and 3) — unmarked diff
;; old/new pair; only one should survive in the real source.
(rule 2 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
(if-let $true (use_popcnt))
(x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))
@@ -2192,7 +2192,7 @@
final))
;; Vector byte popcount via AVX-512 `vpopcntb`; requires both the VL (128-bit
;; encodings of EVEX instructions) and BITALG feature gates.
;; NOTE(review): duplicated rule headers (priority 1 and 2) — unmarked diff
;; old/new pair; only one belongs in the real source.
(rule 1 (lower (has_type $I8X16 (popcnt src)))
(rule 2 (lower (has_type $I8X16 (popcnt src)))
(if-let $true (use_avx512vl_simd))
(if-let $true (use_avx512bitalg_simd))
(x64_vpopcntb src))
@@ -2218,8 +2218,8 @@
;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
(rule (lower (has_type $I8X16
(popcnt src) ))
(rule 1 (lower (has_type $I8X16 (popcnt src)))
(if-let $true (use_ssse3 ))
(let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
(low_nibbles Xmm (sse_and $I8X16 src low_mask))
;; Note that this is a 16x8 shift, but that's OK; we mask
@@ -2233,6 +2233,19 @@
(bit_counts_high Xmm (x64_pshufb lookup high_nibbles)))
(x64_paddb bit_counts_low bit_counts_high)))
;; A modified version of the popcnt method from Hacker's Delight.
;;
;; SSE2-only fallback (no SSSE3/AVX-512 required). The three
;; shift/mask/subtract rounds compute, per byte,
;;   n - ((n>>1)&0x77..) - ((n>>2)&0x77..) - ((n>>3)&0x77..)
;; (note each round shifts the previous `shifted` value, so the effective
;; shift amounts are 1, 2, 3), which leaves the bit count of each 4-bit
;; nibble in that nibble. The closing paddb/psrlw-by-4 adds the high-nibble
;; count onto the low nibble and the final `pand` with 0x0f.. masks each byte
;; down to its total count (max 8, so 4 bits suffice).
;;
;; The 64-bit-lane shifts (`psrlq`) leak bits across byte boundaries, but the
;; 0x77.. mask discards exactly those leaked bits; likewise the 16-bit-lane
;; `psrlw` leak is cleaned up by the final 0x0f.. mask.
(rule (lower (has_type $I8X16 (popcnt src)))
(let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777))
(src Xmm src)
;; round 1: src -= (src >> 1) & mask1
(shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
;; round 2: src -= (src >> 2) & mask1  (shifts previous `shifted`)
(shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
;; round 3: src -= (src >> 3) & mask1
(shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
(src Xmm (x64_psubb src shifted))
;; fold the two nibble counts of each byte together
(src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4)))))
(x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))))
;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8 (bitrev src)))
@@ -4181,7 +4194,8 @@
;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
;; register.
;; Shuffle with an all-zero immediate mask: every output byte selects byte 0,
;; so a `pshufb` against a zeroed register does the whole job in one
;; instruction.
;; NOTE(review): both the pre-patch body (unguarded, L below the header) and
;; the post-patch body (gated on `use_ssse3`) appear here — unmarked diff
;; old/new pair; only the guarded version belongs in the real source.
(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
(x64_pshufb a (xmm_zero $I8X16)))
(if-let $true (use_ssse3))
(x64_pshufb a (xmm_zero $I8X16)))
;; Special case for the `shufps` instruction which will select two 32-bit values
;; from the first operand and two 32-bit values from the second operand. Note
@@ -4209,7 +4223,8 @@
;; indices (may not be completely necessary: verification could fail incorrect
;; mask values) and fix the indexes to all point to the `dst` vector.
;; Shuffle where both operands are the same vector: a single `pshufb` with a
;; mask whose indices are all remapped into 0..31 range suffices.
;; NOTE(review): unguarded pre-patch body and `use_ssse3`-gated post-patch
;; body both appear — unmarked diff old/new pair; only the guarded version
;; belongs in the real source.
(rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
(x64_pshufb a (shuffle_0_31_mask mask)))
(if-let $true (use_ssse3))
(x64_pshufb a (shuffle_0_31_mask mask)))
;; For the case where the shuffle mask contains out-of-bounds values (values
;; greater than 31) we must mask off those resulting values in the result of
@@ -4231,8 +4246,8 @@
;; above, we build the `constructed_mask` for each case statically.
;; General two-operand shuffle: mask each input's lanes independently
;; (`shuffle_0_15_mask` keeps indices 0-15 from `a`, `shuffle_16_31_mask`
;; keeps indices 16-31 from `b`, zeroing the rest), then OR the two halves
;; together. Uses `lower_pshufb` so that CPUs without SSSE3 fall back to the
;; `X86Pshufb` libcall instead of an unencodable `pshufb`.
;;
;; NOTE(review): the original lines here were garbled — the identifiers were
;; split mid-token (`x64 _pshufb` / `lower _pshufb`) and both the pre- and
;; post-patch bodies appeared back to back. This is the repaired post-patch
;; rule.
(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
      (x64_por
        (lower_pshufb a (shuffle_0_15_mask mask))
        (lower_pshufb b (shuffle_16_31_mask mask))))
;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -4244,13 +4259,28 @@
;; variables like: %dst = swizzle %src, %mask
;; `swizzle` lowering: saturating-add 0x70 to every mask byte so that any
;; index >= 16 saturates to >= 0x80; `pshufb` writes zero for mask bytes with
;; the high bit set, which matches swizzle's out-of-range-selects-zero
;; semantics. In-range indices (0-15) are unaffected modulo the bits `pshufb`
;; actually reads. Uses `lower_pshufb` so non-SSSE3 CPUs take the libcall
;; fallback.
;;
;; NOTE(review): the original lines were garbled — `lower _pshufb` was split
;; mid-token and the pre-patch `x64_pshufb` body line also appeared. This is
;; the repaired post-patch rule.
(rule (lower (swizzle src mask))
  (let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070))))
    (lower_pshufb src mask)))
;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `x86_pshufb` maps 1:1 onto the hardware instruction and is only available
;; when SSSE3 is present — there is no fallback rule visible here, so
;; presumably producers only emit `x86_pshufb` when SSSE3 is known to be
;; enabled (TODO confirm against the instruction's documented precondition).
(rule (lower (x86_pshufb src mask))
(if-let $true (use_ssse3))
(x64_pshufb src mask))
;; A helper function to generate either the `pshufb` instruction or a libcall
;; to the `X86Pshufb` libcall. Note that the libcall is not exactly the most
;; performant thing in the world, so this is primarily here for completeness
;; of lowerings on all x86 cpus, but rules are ideally gated on the presence
;; of SSSE3 to use the `pshufb` instruction itself.
(decl lower_pshufb (Xmm RegMem) Xmm)
;; Preferred (priority 1) case: SSSE3 available, emit `pshufb` directly.
(rule 1 (lower_pshufb src mask)
(if-let $true (use_ssse3))
(x64_pshufb src mask))
;; No SSSE3: call out to the `X86Pshufb` libcall with both operands in
;; registers.
(rule (lower_pshufb src (RegMem.Reg mask))
(libcall_2 (LibCall.X86Pshufb) src mask))
;; Memory-operand mask: load it into a register first, then retry, which
;; dispatches to one of the rules above.
(rule (lower_pshufb src (RegMem.Mem addr))
(lower_pshufb src (x64_movdqu_load addr)))
;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Remove the extractlane instruction, leaving the float where it is. The upper
@@ -4343,14 +4373,18 @@
;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
;; with a mask of zero which is calculated with an xor-against-itself register.
;; Base-case i8x16 splat.
;; NOTE(review): both diff versions of the body appear here (markers were
;; stripped): the pre-patch body uses `pshufb` against a zero mask; the
;; post-patch body is SSE2-only — `punpcklbw src src` duplicates the byte
;; into a 16-bit pair, `pshuflw ..., 0` broadcasts that pair across the low
;; four words, and `pshufd ..., 0` broadcasts the low dword lane-wide.
;; Only the SSE2 body belongs in the real (post-patch) source.
(rule 0 (lower (has_type $I8X16 (splat src)))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(let ((src Xmm (x64_movd_to_xmm src)))
(x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0)))
;; i8x16 splat with SSSE3: move the GPR byte into an XMM register and
;; broadcast it with `pshufb` against an all-zero mask (every output byte
;; selects byte 0). Higher priority than the SSE2 base case above.
(rule 1 (lower (has_type $I8X16 (splat src)))
(if-let $true (use_ssse3))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
;; i8x16 splat with AVX2: single-instruction `vpbroadcastb` from a register.
;; Highest-priority register-source rule.
(rule 2 (lower (has_type $I8X16 (splat src)))
(if-let $true (use_avx2_simd))
(x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
;; i8x16 splat of a sinkable byte load: `pinsrb` (SSE4.1) loads the byte
;; straight into lane 0, then `pshufb` broadcasts it (SSE4.1 implies SSSE3,
;; so `pshufb` is encodable here — presumably why no separate SSSE3 gate is
;; needed; confirm against the feature-implication model).
;; NOTE(review): duplicated rule headers (priority 2 and 3) — unmarked diff
;; old/new pair; only one belongs in the real source.
(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (use_sse41))
(x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
;; i8x16 splat of a sinkable byte load with AVX2: `vpbroadcastb` can take the
;; memory operand directly.
;; NOTE(review): duplicated rule headers (priority 3 and 4) — unmarked diff
;; old/new pair; only one belongs in the real source.
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (use_avx2_simd))
(x64_vpbroadcastb addr))
@@ -4399,10 +4433,10 @@
;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `use_avx_simd` test.
;; 32x4 splat of a sinkable load without AVX: `movss` loads the 32-bit value
;; into lane 0, then `shufps tmp, tmp, 0` broadcasts lane 0 to all lanes.
;; NOTE(review): duplicated rule headers (priority 4 and 5) — unmarked diff
;; old/new pair; only one belongs in the real source.
(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(let ((tmp Xmm (x64_movss_load addr)))
(x64_shufps tmp tmp 0)))
;; 32x4 splat of a sinkable load with AVX: `vbroadcastss` takes the memory
;; operand directly (per the surrounding comment, the memory-operand form is
;; what makes this valid with AVX1, hence the `use_avx_simd` gate).
;; NOTE(review): duplicated rule headers (priority 5 and 6) — unmarked diff
;; old/new pair; only one belongs in the real source.
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(if-let $true (use_avx_simd))
(x64_vbroadcastss addr))
@@ -4413,7 +4447,7 @@
(x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00))
;; f64x2 splat: the value is already in an XMM register, so `pshufd` with
;; immediate 0b01_00_01_00 copies dwords {1,0} into both 64-bit halves,
;; duplicating the low double.
(rule 0 (lower (has_type $F64X2 (splat src)))
(x64_pshufd src 0b01_00_01_00))
;; 64x2 splat of a sinkable load: `movddup` loads and duplicates the 64-bit
;; value in one instruction.
;; NOTE(review): `movddup` is an SSE3 instruction, yet the gate here is
;; `use_ssse3` — presumably the backend only models an SSSE3 predicate and
;; accepts the slightly stricter gate; confirm before relying on this.
;; NOTE(review): duplicated rule headers (priority 5 and 6) — unmarked diff
;; old/new pair; only one belongs in the real source.
(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(if-let $true (use_ssse3))
(x64_movddup addr))