From e6a77eecf0601be5466d8a9e6efbaa021685e2d3 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 1 May 2023 19:32:11 -0500 Subject: [PATCH] x64: Add non-SSE 4.1 lowerings of min/max instructions (#6291) * x64: Add non-SSE 4.1 lowerings of min/max instructions This commit updates the x64 backend to avoid using various `p{min,max}*` instructions if SSE 4.1 isn't enabled. These instructions are used for comparisons as well as the `{u,s}{min,max}` instructions. Alternative lowerings are primarily drawn from LLVM. Through this refactoring the x64 backend now has also grown (not the most efficient) lowerings for vector comparisons with `i64x2` types, which it previously largely didn't have. This enabled copying some non-x86_64 tests into the main test files for various operations. * Review comments --- cranelift/codegen/src/isa/x64/lower.isle | 281 +++++++++++++----- .../filetests/runtests/simd-icmp-ne.clif | 5 +- .../filetests/runtests/simd-icmp-sge.clif | 5 +- .../filetests/runtests/simd-icmp-sgt.clif | 2 + .../filetests/runtests/simd-icmp-sle.clif | 5 +- .../filetests/runtests/simd-icmp-slt.clif | 5 +- .../runtests/simd-icmp-uge-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-uge.clif | 17 +- .../runtests/simd-icmp-ugt-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-ugt.clif | 17 +- .../runtests/simd-icmp-ule-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-ule.clif | 17 +- .../runtests/simd-icmp-ult-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-ult.clif | 17 +- .../runtests/simd-min-max-aarch64.clif | 39 --- .../filetests/runtests/simd-min-max.clif | 41 ++- 16 files changed, 327 insertions(+), 192 deletions(-) delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif delete 
mode 100644 cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ff6585ef87..8fd18c2828 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1383,49 +1383,121 @@ (rule -1 (lower (has_type (fits_in_64 ty) (smax x y))) (cmp_and_choose ty (CC.NL) x y)) -;; SSE `smax`. +;; SSE helpers for determining if single-instruction lowerings are available. -(rule (lower (has_type $I8X16 (smax x y))) - (x64_pmaxsb x y)) +(decl pure has_pmins (Type) bool) +(rule 1 (has_pmins $I16X8) $true) +(rule 1 (has_pmins $I64X2) $false) +(rule (has_pmins _) (use_sse41)) -(rule (lower (has_type $I16X8 (smax x y))) - (x64_pmaxsw x y)) +(decl pure has_pmaxs (Type) bool) +(rule 1 (has_pmaxs $I16X8) $true) +(rule 1 (has_pmaxs $I64X2) $false) +(rule (has_pmaxs _) (use_sse41)) -(rule (lower (has_type $I32X4 (smax x y))) - (x64_pmaxsd x y)) +(decl pure has_pmaxu (Type) bool) +(rule 1 (has_pmaxu $I8X16) $true) +(rule 1 (has_pmaxu $I64X2) $false) +(rule (has_pmaxu _) (use_sse41)) -;; SSE `smin`. +(decl pure has_pminu (Type) bool) +(rule 1 (has_pminu $I8X16) $true) +(rule 1 (has_pminu $I64X2) $false) +(rule (has_pminu _) (use_sse41)) + +;; SSE `smax`. + +(rule 1 (lower (has_type (ty_vec128 ty) (smax x y))) + (if-let $true (has_pmaxs ty)) + (x64_pmaxs ty x y)) + +(rule (lower (has_type (ty_vec128 ty) (smax x y))) + (let ( + (x Xmm x) + (y Xmm y) + (cmp Xmm (x64_pcmpgt ty x y)) + (x_is_max Xmm (x64_pand cmp x)) + (y_is_max Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_max y_is_max))) -(rule (lower (has_type $I8X16 (smin x y))) - (x64_pminsb x y)) +;; SSE `smin`. 
-(rule (lower (has_type $I16X8 (smin x y))) - (x64_pminsw x y)) +(rule 1 (lower (has_type (ty_vec128 ty) (smin x y))) + (if-let $true (has_pmins ty)) + (x64_pmins ty x y)) -(rule (lower (has_type $I32X4 (smin x y))) - (x64_pminsd x y)) +(rule (lower (has_type (ty_vec128 ty) (smin x y))) + (let ( + (x Xmm x) + (y Xmm y) + (cmp Xmm (x64_pcmpgt ty y x)) + (x_is_min Xmm (x64_pand cmp x)) + (y_is_min Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_min y_is_min))) ;; SSE `umax`. -(rule (lower (has_type $I8X16 (umax x y))) - (x64_pmaxub x y)) +(rule 2 (lower (has_type (ty_vec128 ty) (umax x y))) + (if-let $true (has_pmaxu ty)) + (x64_pmaxu ty x y)) + +;; If y < x then the saturating subtraction will be zero, otherwise when added +;; back to x it'll return y. +(rule 1 (lower (has_type $I16X8 (umax x y))) + (let ((x Xmm x)) + (x64_paddw x (x64_psubusw y x)))) -(rule (lower (has_type $I16X8 (umax x y))) - (x64_pmaxuw x y)) +;; Flip the upper bits of each lane so the signed comparison has the same +;; result as an unsigned comparison, and then select the results with the output +;; mask. See `pcmpgt` lowering for info on flipping the upper bit. +(rule (lower (has_type (ty_vec128 ty) (umax x y))) + (let ( + (x Xmm x) + (y Xmm y) + (mask Xmm (flip_high_bit_mask ty)) + (x_masked Xmm (x64_pxor x mask)) + (y_masked Xmm (x64_pxor y mask)) + (cmp Xmm (x64_pcmpgt ty x_masked y_masked)) + (x_is_max Xmm (x64_pand cmp x)) + (y_is_max Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_max y_is_max))) -(rule (lower (has_type $I32X4 (umax x y))) - (x64_pmaxud x y)) +(decl flip_high_bit_mask (Type) Xmm) +(rule (flip_high_bit_mask $I16X8) + (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))) +(rule (flip_high_bit_mask $I32X4) + (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000))) +(rule (flip_high_bit_mask $I64X2) + (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000))) ;; SSE `umin`. 
-(rule (lower (has_type $I8X16 (umin x y))) - (x64_pminub x y)) +(rule 2 (lower (has_type (ty_vec128 ty) (umin x y))) + (if-let $true (has_pminu ty)) + (x64_pminu ty x y)) -(rule (lower (has_type $I16X8 (umin x y))) - (x64_pminuw x y)) +;; If x < y then the saturating subtraction will be 0. Otherwise if x > y then +;; the saturated result, when subtracted again, will go back to `y`. +(rule 1 (lower (has_type $I16X8 (umin x y))) + (let ((x Xmm x)) + (x64_psubw x (x64_psubusw x y)))) -(rule (lower (has_type $I32X4 (umin x y))) - (x64_pminud x y)) +;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit. +(rule (lower (has_type (ty_vec128 ty) (umin x y))) + (let ( + (x Xmm x) + (y Xmm y) + (mask Xmm (flip_high_bit_mask ty)) + (x_masked Xmm (x64_pxor x mask)) + (y_masked Xmm (x64_pxor y mask)) + (cmp Xmm (x64_pcmpgt ty y_masked x_masked)) + (x_is_max Xmm (x64_pand cmp x)) + (y_is_max Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_max y_is_max))) ;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1519,62 +1591,127 @@ (let ((checked Xmm (x64_pcmpeq ty a b)) (all_ones Xmm (vector_all_ones))) (x64_pxor checked all_ones))) -;; Signed comparisons have a single-instruction lowering, unlike their unsigned -;; counterparts. These latter instructions use the unsigned min/max -;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s). + +;; SSE `sgt` + (rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpgt ty a b)) + +;; SSE `slt` + (rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpgt ty b a)) + +;; SSE `ugt` + +;; N.B.: we must manually prevent load coalescing operands; the +;; register allocator gets confused otherwise. 
+(rule 1 (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmaxu ty)) + (let ((a Xmm a) + (b Xmm b) + (max Xmm (x64_pmaxu ty a b)) + (eq Xmm (x64_pcmpeq ty max b))) + (x64_pxor eq (vector_all_ones)))) + +;; Flip the upper bit of each lane so the result of a signed comparison is the +;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more) (rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) - ;; N.B.: we must manually prevent load coalescing of these operands; the - ;; register allocator gets confused otherwise. TODO: - ;; https://github.com/bytecodealliance/wasmtime/issues/3953. - (let ((xmm_a Xmm (put_in_xmm a)) - (xmm_b Xmm (put_in_xmm b)) - (max Xmm (x64_pmaxu ty xmm_a xmm_b)) - (eq Xmm (x64_pcmpeq ty max xmm_b)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor eq all_ones))) + (let ((mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask))) + (x64_pcmpgt ty a_masked b_masked))) + +;; SSE `ult` + +(rule 1 (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pminu ty)) + ;; N.B.: see note above. + (let ((a Xmm a) + (b Xmm b) + (min Xmm (x64_pminu ty a b)) + (eq Xmm (x64_pcmpeq ty min b))) + (x64_pxor eq (vector_all_ones)))) + +;; Flip the upper bit of `a` and `b` so the signed comparison result will +;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more). (rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) - ;; N.B.: see note above. - (let ((xmm_a Xmm (put_in_xmm a)) - (xmm_b Xmm (put_in_xmm b)) - (min Xmm (x64_pminu ty xmm_a xmm_b)) - (eq Xmm (x64_pcmpeq ty min xmm_b)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor eq all_ones))) -;; To lower signed and unsigned *-or-equals comparisons, we find the minimum -;; number (PMIN[U|S]*) and compare that to one of the terms (PCMPEQ*). 
Note that -;; there is no 64x2 version of this lowering (see below). + (let ((mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask))) + (x64_pcmpgt ty b_masked a_masked))) + +;; SSE `sge` + +;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`. +(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmaxs ty)) + (x64_pcmpeq ty a (x64_pmaxs ty a b))) + +;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the +;; result. (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((max Xmm (x64_pmaxs ty a b))) - (x64_pcmpeq ty a max))) + (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones))) + +;; SSE `sle` + +;; With `pmins*` use that and compare the result to `a`. +(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmins ty)) + (x64_pcmpeq ty a (x64_pmins ty a b))) + +;; Without `pmins*` perform a greater-than test and invert the result. (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((min Xmm (x64_pmins ty a b))) - (x64_pcmpeq ty a min))) + (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones))) + +;; SSE `uge` + +(rule 2 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmaxu ty)) + (x64_pcmpeq ty a (x64_pmaxu ty a b))) + +;; Perform a saturating subtract of `a` from `b` and if the result is zero then +;; `a` is greater or equal. +(rule 1 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b)) + (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8))) + +;; Flip the upper bit of each lane so the signed comparison is the same as +;; an unsigned one and then invert the result. See docs on `pcmpgt` for why +;; flipping the upper bit works. 
(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((max Xmm (x64_pmaxu ty a b))) - (x64_pcmpeq ty a max))) -(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((min Xmm (x64_pminu ty a b))) - (x64_pcmpeq ty a min))) -;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead -;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all -;; 1s), emitting one more instruction than the smaller-lane versions. -(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) - (let ((checked Xmm (x64_pcmpgt $I64X2 b a)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor checked all_ones))) -(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b)) - (let ((checked Xmm (x64_pcmpgt $I64X2 a b)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor checked all_ones))) -;; TODO: not used by WebAssembly translation -;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) -;; TODO: not used by WebAssembly translation -;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b)) + (let ( + (mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask)) + (cmp Xmm (x64_pcmpgt ty b_masked a_masked)) + ) + (x64_pxor cmp (vector_all_ones)))) +;; SSE `ule` + +(rule 2 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pminu ty)) + (x64_pcmpeq ty a (x64_pminu ty a b))) + +;; A saturating subtraction will produce zeros if `a` is less than `b`, so +;; compare that result to an all-zeros result to figure out lanes of `a` that +;; are <= to the lanes in `b` +(rule 1 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b)) + (let ((zeros_if_a_is_min Xmm (x64_psubusw a b))) + (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I8X16)))) + +;; Flip the upper bit of each lane in `a` and 
`b` so a signed comparison +;; produces the same result as an unsigned comparison. Then test for `gt` +;; and invert the result to get the `le` that is desired here. See docs on +;; `pcmpgt` for why flipping the upper bit works. (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (let ( + (mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask)) + (cmp Xmm (x64_pcmpgt ty a_masked b_masked)) + ) + (x64_pxor cmp (vector_all_ones)))) ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif index 2bd39fdce0..5dbb5369d5 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif index 989987c3c4..035c0afcee 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif index 6d9cbc8e5e..e2542ad711 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif @@ -1,7 +1,9 @@ test interpret test run +target x86_64 
has_sse41=false set enable_simd target x86_64 +target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx target aarch64 diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif index c2a73f1891..fa1e93f145 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif index e2cbdc0039..1eca10ccbf 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif deleted file mode 100644 index 74fe4c2814..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp uge v0, v1 - return v2 -} -; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1] -; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0] -; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0] -; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1] diff --git 
a/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif b/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif index 06df15db4c..8280fed7bf 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function %simd_icmp_uge_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -39,3 +42,13 @@ block0: return v8 } ; run: %icmp_uge_const_i32x4() == 1 + +function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp uge v0, v1 + return v2 +} +; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1] +; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0] +; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0] +; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif deleted file mode 100644 index d3ab7f3b5f..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp ugt v0, v1 - return v2 -} -; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1] -; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0] -; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0] -; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif index aa9d5f374e..fa5b12794d 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function %simd_icmp_ugt_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -38,3 +41,13 @@ block0: return v8 } ; run: %icmp_ugt_const_i8x16() == 1 + +function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ugt v0, v1 + return v2 +} +; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1] +; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0] +; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0] +; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif deleted file mode 100644 index c06136bcaa..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp ule v0, v1 - return v2 -} -; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0] -; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1] -; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1] -; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif index 6fe03d611a..680f02a228 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false 
set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function %simd_icmp_ule_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -40,3 +43,13 @@ block0: return v8 } ; run: %icmp_ule_const_i16x8() == 1 + +function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ule v0, v1 + return v2 +} +; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0] +; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1] +; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1] +; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif deleted file mode 100644 index 788de0b539..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp ult v0, v1 - return v2 -} -; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0] -; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1] -; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1] -; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif index ea3c3cd434..f99f8570e4 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function 
%simd_icmp_ult_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -53,3 +56,13 @@ block0: return v8 } ; run: %icmp_ult_const_i16x8() == 1 + +function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ult v0, v1 + return v2 +} +; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0] +; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1] +; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1] +; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif deleted file mode 100644 index d5cef288b0..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif +++ /dev/null @@ -1,39 +0,0 @@ -test run -test interpret -target aarch64 - -function %smin_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = smin v0, v1 - return v2 -} - -; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] -; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ] - -function %smax_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = smax v0, v1 - return v2 -} - -; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ] -; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ] - -function %umin_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = umin v0, v1 - return v2 -} - -; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ] -; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] - -function %umax_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = umax v0, v1 - return v2 -} - -; run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 
0xBADAB00F ] -; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ] diff --git a/cranelift/filetests/filetests/runtests/simd-min-max.clif b/cranelift/filetests/filetests/runtests/simd-min-max.clif index d4934615a6..d63baec4ed 100644 --- a/cranelift/filetests/filetests/runtests/simd-min-max.clif +++ b/cranelift/filetests/filetests/runtests/simd-min-max.clif @@ -1,9 +1,12 @@ test run test interpret +target x86_64 has_sse41=false set enable_simd target aarch64 target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target s390x function %smin_i8x16(i8x16, i8x16) -> i8x16 { @@ -109,3 +112,39 @@ block0(v0: i32x4, v1: i32x4): } ; run: %umax_i32x4([0xBAADF00D 0xDEADBEEF 0xC00FFFEE 0xBADAB00F], [0xCA11ACAB 0x12349876 0x98763210 0x43216789]) == [ 0xCA11ACAB 0xDEADBEEF 0xC00FFFEE 0xBADAB00F ] + +function %smin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smin v0, v1 + return v2 +} + +; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] +; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ] + +function %smax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smax v0, v1 + return v2 +} + +; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ] +; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ] + +function %umin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umin v0, v1 + return v2 +} + +; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ] +; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] + +function %umax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umax v0, v1 + return v2 +} + +; 
run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 0xBADAB00F ] +; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ]