
x64: Add non-SSE 4.1 lowerings of min/max instructions (#6291)

* x64: Add non-SSE 4.1 lowerings of min/max instructions

This commit updates the x64 backend to avoid using the various `p{min,max}*`
instructions when SSE 4.1 isn't enabled. These instructions are used when
lowering vector comparisons as well as the `{u,s}{min,max}` CLIF instructions.
The alternative lowerings are primarily drawn from LLVM.

Through this refactoring the x64 backend has also grown lowerings (not the
most efficient ones) for vector comparisons on `i64x2` types, which it
previously largely lacked. That made it possible to copy some previously
non-x86_64 tests into the main test files for the affected operations.
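
For intuition, here is a scalar sketch of the two building blocks the new
lowerings lean on: compare-and-blend (`pcmpgt` + `pand`/`pandn`/`por`) for
min/max, and flipping each lane's top bit so a signed compare answers an
unsigned question. This is illustrative Rust, not code from the commit, and
the function names are hypothetical:

    // Scalar models of the lane-wise tricks used by the new ISLE rules.

    /// smax via compare-and-blend: PCMPGT builds an all-ones/all-zeros mask,
    /// then PAND/PANDN/POR select x or y per lane.
    fn smax_blend(x: i32, y: i32) -> i32 {
        let mask: i32 = if x > y { !0 } else { 0 };
        (mask & x) | (!mask & y)
    }

    /// Unsigned compare via signed compare: XORing away the top bit of each
    /// lane (the `flip_high_bit_mask` constants) maps unsigned order onto
    /// signed order, so PCMPGT can be reused.
    fn ugt_via_sgt(a: u32, b: u32) -> bool {
        ((a ^ 0x8000_0000) as i32) > ((b ^ 0x8000_0000) as i32)
    }

    fn main() {
        assert_eq!(smax_blend(-3, 7), 7);
        assert_eq!(smax_blend(9, 2), 9);
        assert!(ugt_via_sgt(u32::MAX, 0) && !ugt_via_sgt(0, 1));
    }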

* Review comments
Branch: pull/6323/head
Author: Alex Crichton (2 years ago), committed by GitHub
Commit: e6a77eecf0
16 changed files:

 cranelift/codegen/src/isa/x64/lower.isle                         | 281
 cranelift/filetests/filetests/runtests/simd-icmp-ne.clif         |   5
 cranelift/filetests/filetests/runtests/simd-icmp-sge.clif        |   5
 cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif        |   2
 cranelift/filetests/filetests/runtests/simd-icmp-sle.clif        |   5
 cranelift/filetests/filetests/runtests/simd-icmp-slt.clif        |   5
 cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif  |  17
 cranelift/filetests/filetests/runtests/simd-icmp-uge.clif        |  17
 cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif  |  17
 cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif        |  17
 cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif  |  17
 cranelift/filetests/filetests/runtests/simd-icmp-ule.clif        |  17
 cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif  |  17
 cranelift/filetests/filetests/runtests/simd-icmp-ult.clif        |  17
 cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif |  39
 cranelift/filetests/filetests/runtests/simd-min-max.clif         |  41

cranelift/codegen/src/isa/x64/lower.isle (281 changed lines)

@@ -1383,49 +1383,121 @@
 (rule -1 (lower (has_type (fits_in_64 ty) (smax x y)))
       (cmp_and_choose ty (CC.NL) x y))
-;; SSE `smax`.
+;; SSE helpers for determining if single-instruction lowerings are available.
-(rule (lower (has_type $I8X16 (smax x y)))
-      (x64_pmaxsb x y))
+(decl pure has_pmins (Type) bool)
+(rule 1 (has_pmins $I16X8) $true)
+(rule 1 (has_pmins $I64X2) $false)
+(rule (has_pmins _) (use_sse41))
-(rule (lower (has_type $I16X8 (smax x y)))
-      (x64_pmaxsw x y))
+(decl pure has_pmaxs (Type) bool)
+(rule 1 (has_pmaxs $I16X8) $true)
+(rule 1 (has_pmaxs $I64X2) $false)
+(rule (has_pmaxs _) (use_sse41))
-(rule (lower (has_type $I32X4 (smax x y)))
-      (x64_pmaxsd x y))
+(decl pure has_pmaxu (Type) bool)
+(rule 1 (has_pmaxu $I8X16) $true)
+(rule 1 (has_pmaxu $I64X2) $false)
+(rule (has_pmaxu _) (use_sse41))
-;; SSE `smin`.
+(decl pure has_pminu (Type) bool)
+(rule 1 (has_pminu $I8X16) $true)
+(rule 1 (has_pminu $I64X2) $false)
+(rule (has_pminu _) (use_sse41))
+;; SSE `smax`.
+(rule 1 (lower (has_type (ty_vec128 ty) (smax x y)))
+        (if-let $true (has_pmaxs ty))
+        (x64_pmaxs ty x y))
+(rule (lower (has_type (ty_vec128 ty) (smax x y)))
+      (let ((x Xmm x)
+            (y Xmm y)
+            (cmp Xmm (x64_pcmpgt ty x y))
+            (x_is_max Xmm (x64_pand cmp x))
+            (y_is_max Xmm (x64_pandn cmp y)))
+        (x64_por x_is_max y_is_max)))
-(rule (lower (has_type $I8X16 (smin x y)))
-      (x64_pminsb x y))
+;; SSE `smin`.
-(rule (lower (has_type $I16X8 (smin x y)))
-      (x64_pminsw x y))
+(rule 1 (lower (has_type (ty_vec128 ty) (smin x y)))
+        (if-let $true (has_pmins ty))
+        (x64_pmins ty x y))
-(rule (lower (has_type $I32X4 (smin x y)))
-      (x64_pminsd x y))
+(rule (lower (has_type (ty_vec128 ty) (smin x y)))
+      (let ((x Xmm x)
+            (y Xmm y)
+            (cmp Xmm (x64_pcmpgt ty y x))
+            (x_is_min Xmm (x64_pand cmp x))
+            (y_is_min Xmm (x64_pandn cmp y)))
+        (x64_por x_is_min y_is_min)))
 ;; SSE `umax`.
-(rule (lower (has_type $I8X16 (umax x y)))
-      (x64_pmaxub x y))
+(rule 2 (lower (has_type (ty_vec128 ty) (umax x y)))
+        (if-let $true (has_pmaxu ty))
+        (x64_pmaxu ty x y))
+;; If y < x then the saturating subtraction will be zero, otherwise when added
+;; back to x it'll return y.
+(rule 1 (lower (has_type $I16X8 (umax x y)))
+        (let ((x Xmm x))
+          (x64_paddw x (x64_psubusw y x))))
-(rule (lower (has_type $I16X8 (umax x y)))
-      (x64_pmaxuw x y))
+;; Flip the upper bit of each lane so the signed comparison has the same
+;; result as an unsigned comparison, and then select the results with the
+;; output mask. See the `pcmpgt` lowering for info on flipping the upper bit.
+(rule (lower (has_type (ty_vec128 ty) (umax x y)))
+      (let ((x Xmm x)
+            (y Xmm y)
+            (mask Xmm (flip_high_bit_mask ty))
+            (x_masked Xmm (x64_pxor x mask))
+            (y_masked Xmm (x64_pxor y mask))
+            (cmp Xmm (x64_pcmpgt ty x_masked y_masked))
+            (x_is_max Xmm (x64_pand cmp x))
+            (y_is_max Xmm (x64_pandn cmp y)))
+        (x64_por x_is_max y_is_max)))
-(rule (lower (has_type $I32X4 (umax x y)))
-      (x64_pmaxud x y))
+(decl flip_high_bit_mask (Type) Xmm)
+(rule (flip_high_bit_mask $I16X8)
+      (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)))
+(rule (flip_high_bit_mask $I32X4)
+      (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000)))
+(rule (flip_high_bit_mask $I64X2)
+      (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000)))
 ;; SSE `umin`.
-(rule (lower (has_type $I8X16 (umin x y)))
-      (x64_pminub x y))
+(rule 2 (lower (has_type (ty_vec128 ty) (umin x y)))
+        (if-let $true (has_pminu ty))
+        (x64_pminu ty x y))
-(rule (lower (has_type $I16X8 (umin x y)))
-      (x64_pminuw x y))
+;; If x < y then the saturating subtraction will be 0. Otherwise if x > y then
+;; the saturated result, when subtracted again, will go back to `y`.
+(rule 1 (lower (has_type $I16X8 (umin x y)))
+        (let ((x Xmm x))
+          (x64_psubw x (x64_psubusw x y))))
-(rule (lower (has_type $I32X4 (umin x y)))
-      (x64_pminud x y))
+;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit.
+(rule (lower (has_type (ty_vec128 ty) (umin x y)))
+      (let ((x Xmm x)
+            (y Xmm y)
+            (mask Xmm (flip_high_bit_mask ty))
+            (x_masked Xmm (x64_pxor x mask))
+            (y_masked Xmm (x64_pxor y mask))
+            (cmp Xmm (x64_pcmpgt ty y_masked x_masked))
+            (x_is_max Xmm (x64_pand cmp x))
+            (y_is_max Xmm (x64_pandn cmp y)))
+        (x64_por x_is_max y_is_max)))
 ;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
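
The `$I16X8` rules above avoid SSE 4.1's `pmaxuw`/`pminuw` with a
saturating-subtraction identity. A scalar sketch of why it works, in
illustrative Rust (the function names are hypothetical, not from the commit):

    /// umax(x, y) == x + sat_sub(y, x): PADDW(x, PSUBUSW(y, x)).
    /// If y <= x the saturating subtraction is 0 and x wins; otherwise it is
    /// y - x, and adding x back yields y. The add can never overflow.
    fn umax_i16_lane(x: u16, y: u16) -> u16 {
        x + y.saturating_sub(x)
    }

    /// umin(x, y) == x - sat_sub(x, y): PSUBW(x, PSUBUSW(x, y)).
    /// If x <= y the saturating subtraction is 0 and x wins; otherwise it is
    /// x - y, and subtracting it from x yields y. Never underflows.
    fn umin_i16_lane(x: u16, y: u16) -> u16 {
        x - x.saturating_sub(y)
    }

    fn main() {
        for &(x, y) in &[(0u16, 1u16), (5, 3), (u16::MAX, 7), (9, 9)] {
            assert_eq!(umax_i16_lane(x, y), x.max(y));
            assert_eq!(umin_i16_lane(x, y), x.min(y));
        }
    }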
@@ -1519,62 +1591,127 @@
       (let ((checked Xmm (x64_pcmpeq ty a b))
             (all_ones Xmm (vector_all_ones)))
         (x64_pxor checked all_ones)))
-;; Signed comparisons have a single-instruction lowering, unlike their unsigned
-;; counterparts. These latter instructions use the unsigned min/max
-;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s).
+;; SSE `sgt`
 (rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
       (x64_pcmpgt ty a b))
+;; SSE `slt`
 (rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b))
       (x64_pcmpgt ty b a))
+;; SSE `ugt`
+;; N.B.: we must manually prevent load coalescing of these operands; the
+;; register allocator gets confused otherwise.
+(rule 1 (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
+        (if-let $true (has_pmaxu ty))
+        (let ((a Xmm a)
+              (b Xmm b)
+              (max Xmm (x64_pmaxu ty a b))
+              (eq Xmm (x64_pcmpeq ty max b)))
+          (x64_pxor eq (vector_all_ones))))
+;; Flip the upper bit of each lane so the result of a signed comparison is the
+;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more).
 (rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
-      ;; N.B.: we must manually prevent load coalescing of these operands; the
-      ;; register allocator gets confused otherwise. TODO:
-      ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
-      (let ((xmm_a Xmm (put_in_xmm a))
-            (xmm_b Xmm (put_in_xmm b))
-            (max Xmm (x64_pmaxu ty xmm_a xmm_b))
-            (eq Xmm (x64_pcmpeq ty max xmm_b))
-            (all_ones Xmm (vector_all_ones)))
-        (x64_pxor eq all_ones)))
+      (let ((mask Xmm (flip_high_bit_mask ty))
+            (a_masked Xmm (x64_pxor a mask))
+            (b_masked Xmm (x64_pxor b mask)))
+        (x64_pcmpgt ty a_masked b_masked)))
+;; SSE `ult`
+(rule 1 (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
+        (if-let $true (has_pminu ty))
+        ;; N.B.: see note above.
+        (let ((a Xmm a)
+              (b Xmm b)
+              (min Xmm (x64_pminu ty a b))
+              (eq Xmm (x64_pcmpeq ty min b)))
+          (x64_pxor eq (vector_all_ones))))
+;; Flip the upper bit of `a` and `b` so the signed comparison result will
+;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more).
 (rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
-      ;; N.B.: see note above.
-      (let ((xmm_a Xmm (put_in_xmm a))
-            (xmm_b Xmm (put_in_xmm b))
-            (min Xmm (x64_pminu ty xmm_a xmm_b))
-            (eq Xmm (x64_pcmpeq ty min xmm_b))
-            (all_ones Xmm (vector_all_ones)))
-        (x64_pxor eq all_ones)))
-;; To lower signed and unsigned *-or-equals comparisons, we find the minimum
-;; number (PMIN[U|S]*) and compare that to one of the terms (PCMPEQ*). Note that
-;; there is no 64x2 version of this lowering (see below).
+      (let ((mask Xmm (flip_high_bit_mask ty))
+            (a_masked Xmm (x64_pxor a mask))
+            (b_masked Xmm (x64_pxor b mask)))
+        (x64_pcmpgt ty b_masked a_masked)))
+;; SSE `sge`
+;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`.
+(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+        (if-let $true (has_pmaxs ty))
+        (x64_pcmpeq ty a (x64_pmaxs ty a b)))
+;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the
+;; result.
 (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((max Xmm (x64_pmaxs ty a b)))
-        (x64_pcmpeq ty a max)))
+      (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones)))
+;; SSE `sle`
+;; With `pmins*` use that and compare the result to `a`.
+(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+        (if-let $true (has_pmins ty))
+        (x64_pcmpeq ty a (x64_pmins ty a b)))
+;; Without `pmins*` perform a greater-than test and invert the result.
 (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((min Xmm (x64_pmins ty a b)))
-        (x64_pcmpeq ty a min)))
+      (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones)))
+;; SSE `uge`
+(rule 2 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+        (if-let $true (has_pmaxu ty))
+        (x64_pcmpeq ty a (x64_pmaxu ty a b)))
+;; Perform a saturating subtract of `a` from `b`, and if the result is zero
+;; then `a` is greater than or equal to `b`.
+(rule 1 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b))
+        (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8)))
+;; Flip the upper bit of each lane so the signed comparison is the same as
+;; an unsigned one and then invert the result. See docs on `pcmpgt` for why
+;; flipping the upper bit works.
 (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((max Xmm (x64_pmaxu ty a b)))
-        (x64_pcmpeq ty a max)))
-(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((min Xmm (x64_pminu ty a b)))
-        (x64_pcmpeq ty a min)))
-;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead
-;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all
-;; 1s), emitting one more instruction than the smaller-lane versions.
-(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
-      (let ((checked Xmm (x64_pcmpgt $I64X2 b a))
-            (all_ones Xmm (vector_all_ones)))
-        (x64_pxor checked all_ones)))
-(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b))
-      (let ((checked Xmm (x64_pcmpgt $I64X2 a b))
-            (all_ones Xmm (vector_all_ones)))
-        (x64_pxor checked all_ones)))
-;; TODO: not used by WebAssembly translation
-;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
-;; TODO: not used by WebAssembly translation
-;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b))
+      (let ((mask Xmm (flip_high_bit_mask ty))
+            (a_masked Xmm (x64_pxor a mask))
+            (b_masked Xmm (x64_pxor b mask))
+            (cmp Xmm (x64_pcmpgt ty b_masked a_masked)))
+        (x64_pxor cmp (vector_all_ones))))
+;; SSE `ule`
+(rule 2 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+        (if-let $true (has_pminu ty))
+        (x64_pcmpeq ty a (x64_pminu ty a b)))
+;; A saturating subtraction will produce zeros in the lanes where `a` is <=
+;; `b`, so compare that result against an all-zeros vector to find the lanes
+;; of `a` that are <= the lanes in `b`.
+(rule 1 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b))
+        (let ((zeros_if_a_is_min Xmm (x64_psubusw a b)))
+          (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I8X16))))
+;; Flip the upper bit of each lane in `a` and `b` so a signed comparison
+;; produces the same result as an unsigned comparison. Then test for `gt`
+;; and invert the result to get the `le` that is desired here. See docs on
+;; `pcmpgt` for why flipping the upper bit works.
+(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+      (let ((mask Xmm (flip_high_bit_mask ty))
+            (a_masked Xmm (x64_pxor a mask))
+            (b_masked Xmm (x64_pxor b mask))
+            (cmp Xmm (x64_pcmpgt ty a_masked b_masked)))
+        (x64_pxor cmp (vector_all_ones))))
 ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
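
The *-or-equal rules in this hunk rest on three small identities. A scalar
sketch in illustrative Rust (function names are hypothetical, not from the
commit):

    /// uge via PMAXU + PCMPEQ: a >= b exactly when max(a, b) == a.
    fn uge_via_max(a: u8, b: u8) -> bool {
        a.max(b) == a
    }

    /// uge via PSUBUSW + PCMPEQW (the i16x8 path): a >= b exactly when
    /// b - a saturates to zero.
    fn uge_via_satsub(a: u16, b: u16) -> bool {
        b.saturating_sub(a) == 0
    }

    /// sge via PCMPGT + PXOR with all ones: a >= b is the negation of b > a.
    fn sge_via_not_gt(a: i64, b: i64) -> bool {
        !(b > a)
    }

    fn main() {
        assert!(uge_via_max(5, 5) && !uge_via_max(4, 5));
        assert!(uge_via_satsub(5, 5) && !uge_via_satsub(4, 5));
        assert!(sge_via_not_gt(-1, -1) && !sge_via_not_gt(i64::MIN, 0));
    }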

cranelift/filetests/filetests/runtests/simd-icmp-ne.clif (5 changed lines)

@@ -1,8 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x

cranelift/filetests/filetests/runtests/simd-icmp-sge.clif (5 changed lines)

@@ -1,8 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x

cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif (2 changed lines)

@@ -1,7 +1,9 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
+target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64

cranelift/filetests/filetests/runtests/simd-icmp-sle.clif (5 changed lines)

@@ -1,8 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x

cranelift/filetests/filetests/runtests/simd-icmp-slt.clif (5 changed lines)

@@ -1,8 +1,11 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x

cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif (deleted; 17 lines)

@@ -1,17 +0,0 @@
-test interpret
-test run
-target aarch64
-target s390x
-
-; TODO: Move this to the main file once x86_64 supports this operation
-; See: #5529
-
-function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = icmp uge v0, v1
-    return v2
-}
-; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1]
-; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0]
-; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0]
-; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1]

cranelift/filetests/filetests/runtests/simd-icmp-uge.clif (17 changed lines)

@@ -1,10 +1,13 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx

 function %simd_icmp_uge_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@@ -39,3 +42,13 @@ block0:
     return v8
 }
 ; run: %icmp_uge_const_i32x4() == 1
+
+function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = icmp uge v0, v1
+    return v2
+}
+; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1]
+; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0]
+; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0]
+; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1]

cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif (deleted; 17 lines)

@@ -1,17 +0,0 @@
-test interpret
-test run
-target aarch64
-target s390x
-
-; TODO: Move this to the main file once x86_64 supports this operation
-; See: #5529
-
-function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = icmp ugt v0, v1
-    return v2
-}
-; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1]
-; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0]
-; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0]
-; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0]

cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif (17 changed lines)

@@ -1,10 +1,13 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx

 function %simd_icmp_ugt_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@@ -38,3 +41,13 @@ block0:
     return v8
 }
 ; run: %icmp_ugt_const_i8x16() == 1
+
+function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = icmp ugt v0, v1
+    return v2
+}
+; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1]
+; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0]
+; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0]
+; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0]

cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif (deleted; 17 lines)

@@ -1,17 +0,0 @@
-test interpret
-test run
-target aarch64
-target s390x
-
-; TODO: Move this to the main file once x86_64 supports this operation
-; See: #5529
-
-function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = icmp ule v0, v1
-    return v2
-}
-; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0]
-; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1]
-; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1]
-; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1]

cranelift/filetests/filetests/runtests/simd-icmp-ule.clif (17 changed lines)

@@ -1,10 +1,13 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx

 function %simd_icmp_ule_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@@ -40,3 +43,13 @@ block0:
     return v8
 }
 ; run: %icmp_ule_const_i16x8() == 1
+
+function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = icmp ule v0, v1
+    return v2
+}
+; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0]
+; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1]
+; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1]
+; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1]

cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif (deleted; 17 lines)

@@ -1,17 +0,0 @@
-test interpret
-test run
-target aarch64
-target s390x
-
-; TODO: Move this to the main file once x86_64 supports this operation
-; See: #5529
-
-function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = icmp ult v0, v1
-    return v2
-}
-; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0]
-; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1]
-; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1]
-; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0]

cranelift/filetests/filetests/runtests/simd-icmp-ult.clif (17 changed lines)

@@ -1,10 +1,13 @@
 test interpret
 test run
+target x86_64 has_sse41=false
 set enable_simd
+target x86_64
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target x86_64
-target x86_64 has_avx

 function %simd_icmp_ult_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@@ -53,3 +56,13 @@ block0:
     return v8
 }
 ; run: %icmp_ult_const_i16x8() == 1
+
+function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = icmp ult v0, v1
+    return v2
+}
+; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0]
+; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1]
+; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1]
+; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0]

cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif (deleted; 39 lines)

@@ -1,39 +0,0 @@
-test run
-test interpret
-target aarch64
-
-function %smin_i64x2(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = smin v0, v1
-    return v2
-}
-
-; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
-; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ]
-
-function %smax_i64x2(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = smax v0, v1
-    return v2
-}
-
-; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ]
-; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ]
-
-function %umin_i64x2(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = umin v0, v1
-    return v2
-}
-
-; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ]
-; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
-
-function %umax_i64x2(i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2):
-    v2 = umax v0, v1
-    return v2
-}
-
-; run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 0xBADAB00F ]
-; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ]

cranelift/filetests/filetests/runtests/simd-min-max.clif (41 changed lines)

@@ -1,9 +1,12 @@
 test run
 test interpret
+target x86_64 has_sse41=false
 set enable_simd
 target aarch64
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
+target x86_64 sse42
+target x86_64 sse42 has_avx
 target s390x
@@ -109,3 +112,39 @@ block0(v0: i32x4, v1: i32x4):
 }
 ; run: %umax_i32x4([0xBAADF00D 0xDEADBEEF 0xC00FFFEE 0xBADAB00F], [0xCA11ACAB 0x12349876 0x98763210 0x43216789]) == [ 0xCA11ACAB 0xDEADBEEF 0xC00FFFEE 0xBADAB00F ]
+
+function %smin_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = smin v0, v1
+    return v2
+}
+
+; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
+; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ]
+
+function %smax_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = smax v0, v1
+    return v2
+}
+
+; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ]
+; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ]
+
+function %umin_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = umin v0, v1
+    return v2
+}
+
+; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ]
+; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
+
+function %umax_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = umax v0, v1
+    return v2
+}
+
+; run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 0xBADAB00F ]
+; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ]
