Browse Source

x64: Implement some minor optimizations related to SIMD lowerings (#8839)

* Add tests for patterns I'm about to optimize

* x64: Optimize vector compare-and-branch

This commit implements lowering optimizations for the `vall_true` and
`vany_true` CLIF instructions when they feed directly into a `brif`. As
with `icmp` and `fcmp` combined with `brif`, the result of the comparison
is never materialized into a general-purpose register, which helps lower
register pressure and removes some instructions.

* x64: Optimize `vconst` with an all-ones pattern

An all-ones vector has a single-instruction lowering that doesn't load
from memory, so it's probably cheaper than loading all-ones from memory.
pull/8843/head
Alex Crichton 5 months ago
committed by GitHub
parent
commit
ee9e1ca545
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 35
      cranelift/codegen/src/isa/x64/lower.isle
  2. 125
      tests/disas/x64-simd-test-and-branch.wat
  3. 22
      tests/disas/x64-vector-patterns.wat

35
cranelift/codegen/src/isa/x64/lower.isle

@ -3308,6 +3308,12 @@
;; `brif` whose condition is an `fcmp` (optionally wrapped in a `uextend`):
;; the condition built by `emit_fcmp` is handed straight to the conditional
;; jump rather than being lowered as a standalone value first.
(rule 2 (lower_branch (brif (maybe_uextend (fcmp cc a b)) _ _) (two_targets then else))
(emit_side_effect (jmp_cond_fcmp (emit_fcmp cc a b) then else)))
;; Same fusion for `vany_true`: branch on the condition returned by
;; `emit_vany_true` so the boolean is never materialized in a
;; general-purpose register.
(rule 2 (lower_branch (brif (maybe_uextend (vany_true a)) _ _) (two_targets then else))
(emit_side_effect (jmp_cond_icmp (emit_vany_true a) then else)))
;; Same fusion for `vall_true`, using the condition from `emit_vall_true`.
(rule 2 (lower_branch (brif (maybe_uextend (vall_true a)) _ _) (two_targets then else))
(emit_side_effect (jmp_cond_icmp (emit_vall_true a) then else)))
;; `brif` on a 128-bit integer condition: the branch condition comes from
;; `cmp_zero_i128` applied to the value. This rule has priority 1, below the
;; fused compare-and-branch rules above (priority 2).
(rule 1 (lower_branch (brif val @ (value_type $I128) _ _)
(two_targets then else))
(emit_side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.Z) val) then else)))
@ -4263,10 +4269,9 @@
;; TODO use Inst::gen_constant() instead.
(x64_xmm_load_const ty (const_to_vconst const)))
;; Special case for a zero-vector: don't load, xor instead.
(rule 1 (lower (has_type ty (vconst (u128_from_constant 0))))
(let ((dst Xmm (xmm_uninit_value)))
(x64_pxor dst dst)))
;; Special cases for known constant patterns to skip a 16-byte load.
(rule 1 (lower (has_type ty (vconst (u128_from_constant 0)))) (xmm_zero ty))
(rule 1 (lower (has_type ty (vconst (u128_from_constant -1)))) (vector_all_ones))
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -4630,30 +4635,38 @@
;; 0xffff then every byte was equal to zero, so test if the comparison is
;; not-equal or NZ.
(rule (lower (vany_true val))
(lower_icmp_bool (emit_vany_true val)))
(decl emit_vany_true (Value) IcmpCondResult)
(rule (emit_vany_true val)
(let (
(any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16)))
(mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero))
)
(with_flags (x64_cmp_imm (OperandSize.Size32) mask 0xffff)
(x64_setcc (CC.NZ)))))
(icmp_cond_result (x64_cmp_imm (OperandSize.Size32) mask 0xffff)
(CC.NZ))))
;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 1 (lower (vall_true val @ (value_type ty)))
(rule (lower (vall_true val))
(lower_icmp_bool (emit_vall_true val)))
(decl emit_vall_true (Value) IcmpCondResult)
(rule 1 (emit_vall_true val @ (value_type ty))
(if-let $true (use_sse41))
(let ((src Xmm val)
(zeros Xmm (xmm_zero ty))
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
(icmp_cond_result (x64_ptest cmp cmp) (CC.Z))))
;; Perform an appropriately-sized lane-wise comparison with zero. If the
;; result is all 0s then all of them are true because nothing was equal to
;; zero.
(rule (lower (vall_true val @ (value_type ty)))
(rule (emit_vall_true val @ (value_type ty))
(let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty)))
(mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero)))
(with_flags (x64_test (OperandSize.Size32) mask mask)
(x64_setcc (CC.Z)))))
(icmp_cond_result (x64_test (OperandSize.Size32) mask mask)
(CC.Z))))
;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

125
tests/disas/x64-simd-test-and-branch.wat

@ -0,0 +1,125 @@
;;! target = "x86_64"
;;! test = "compile"
;;! flags = ["-Ccranelift-sse41"]
(module
  ;; Every function below feeds a SIMD boolean reduction of its single
  ;; `v128` parameter directly into an `if`, yielding 100 when the
  ;; reduction is true and 200 otherwise. The reduction result is used only
  ;; as a branch condition.

  ;; Branch on "all 16 byte lanes are non-zero".
  (func $i8x16.all_true (param v128) (result i32)
    (if (result i32) (i8x16.all_true (local.get 0))
      (then (i32.const 100))
      (else (i32.const 200))))

  ;; Branch on "all 8 16-bit lanes are non-zero".
  (func $i16x8.all_true (param v128) (result i32)
    (if (result i32) (i16x8.all_true (local.get 0))
      (then (i32.const 100))
      (else (i32.const 200))))

  ;; Branch on "all 4 32-bit lanes are non-zero".
  (func $i32x4.all_true (param v128) (result i32)
    (if (result i32) (i32x4.all_true (local.get 0))
      (then (i32.const 100))
      (else (i32.const 200))))

  ;; Branch on "both 64-bit lanes are non-zero".
  (func $i64x2.all_true (param v128) (result i32)
    (if (result i32) (i64x2.all_true (local.get 0))
      (then (i32.const 100))
      (else (i32.const 200))))

  ;; Branch on "any bit of the vector is set".
  (func $v128.any_true (param v128) (result i32)
    (if (result i32) (v128.any_true (local.get 0))
      (then (i32.const 100))
      (else (i32.const 200))))
)
;; wasm[0]::function[0]::i8x16.all_true:
;; pushq %rbp
;; movq %rsp, %rbp
;; pxor %xmm7, %xmm7
;; pcmpeqb %xmm7, %xmm0
;; ptest %xmm0, %xmm0
;; je 0x21
;; 17: movl $0xc8, %eax
;; jmp 0x26
;; 21: movl $0x64, %eax
;; movq %rbp, %rsp
;; popq %rbp
;; retq
;;
;; wasm[0]::function[1]::i16x8.all_true:
;; pushq %rbp
;; movq %rsp, %rbp
;; pxor %xmm7, %xmm7
;; pcmpeqw %xmm7, %xmm0
;; ptest %xmm0, %xmm0
;; je 0x61
;; 57: movl $0xc8, %eax
;; jmp 0x66
;; 61: movl $0x64, %eax
;; movq %rbp, %rsp
;; popq %rbp
;; retq
;;
;; wasm[0]::function[2]::i32x4.all_true:
;; pushq %rbp
;; movq %rsp, %rbp
;; pxor %xmm7, %xmm7
;; pcmpeqd %xmm7, %xmm0
;; ptest %xmm0, %xmm0
;; je 0xa1
;; 97: movl $0xc8, %eax
;; jmp 0xa6
;; a1: movl $0x64, %eax
;; movq %rbp, %rsp
;; popq %rbp
;; retq
;;
;; wasm[0]::function[3]::i64x2.all_true:
;; pushq %rbp
;; movq %rsp, %rbp
;; pxor %xmm7, %xmm7
;; pcmpeqq %xmm7, %xmm0
;; ptest %xmm0, %xmm0
;; je 0xe2
;; d8: movl $0xc8, %eax
;; jmp 0xe7
;; e2: movl $0x64, %eax
;; movq %rbp, %rsp
;; popq %rbp
;; retq
;;
;; wasm[0]::function[4]::v128.any_true:
;; pushq %rbp
;; movq %rsp, %rbp
;; pxor %xmm7, %xmm7
;; pcmpeqb %xmm7, %xmm0
;; pmovmskb %xmm0, %ecx
;; cmpl $0xffff, %ecx
;; jne 0x126
;; 11c: movl $0xc8, %eax
;; jmp 0x12b
;; 126: movl $0x64, %eax
;; movq %rbp, %rsp
;; popq %rbp
;; retq

22
tests/disas/x64-vector-patterns.wat

@ -0,0 +1,22 @@
;;! target = "x86_64"
;;! test = "compile"
(module
  ;; Returns a 128-bit constant with every bit clear.
  (func $zero (result v128)
    (v128.const i64x2 0 0))

  ;; Returns a 128-bit constant with every bit set (both 64-bit lanes -1).
  (func $ones (result v128)
    (v128.const i64x2 -1 -1))
)
;; wasm[0]::function[0]::zero:
;; pushq %rbp
;; movq %rsp, %rbp
;; pxor %xmm0, %xmm0
;; movq %rbp, %rsp
;; popq %rbp
;; retq
;;
;; wasm[0]::function[1]::ones:
;; pushq %rbp
;; movq %rsp, %rbp
;; pcmpeqd %xmm0, %xmm0
;; movq %rbp, %rsp
;; popq %rbp
;; retq
Loading…
Cancel
Save