From e6a77eecf0601be5466d8a9e6efbaa021685e2d3 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 1 May 2023 19:32:11 -0500 Subject: [PATCH] x64: Add non-SSE 4.1 lowerings of min/max instructions (#6291) * x64: Add non-SSE 4.1 lowerings of min/max instructions This commit updates the x64 backend to avoid using various `p{min,max}*` instructions if SSE 4.1 isn't enabled. These instructions are used for comparisons as well as the `{u,s}{min,max}` instructions. Alternative lowerings are primarily drawn from LLVM. Through this refactoring the x64 backend now has also grown (not the most efficient) lowerings for vector comparisons with `i64x2` types, which it previously largely didn't have. This enabled copying some non-x86_64 tests into the main test files for various operations. * Review comments --- cranelift/codegen/src/isa/x64/lower.isle | 281 +++++++++++++----- .../filetests/runtests/simd-icmp-ne.clif | 5 +- .../filetests/runtests/simd-icmp-sge.clif | 5 +- .../filetests/runtests/simd-icmp-sgt.clif | 2 + .../filetests/runtests/simd-icmp-sle.clif | 5 +- .../filetests/runtests/simd-icmp-slt.clif | 5 +- .../runtests/simd-icmp-uge-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-uge.clif | 17 +- .../runtests/simd-icmp-ugt-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-ugt.clif | 17 +- .../runtests/simd-icmp-ule-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-ule.clif | 17 +- .../runtests/simd-icmp-ult-i64x2.clif | 17 -- .../filetests/runtests/simd-icmp-ult.clif | 17 +- .../runtests/simd-min-max-aarch64.clif | 39 --- .../filetests/runtests/simd-min-max.clif | 41 ++- 16 files changed, 327 insertions(+), 192 deletions(-) delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif delete 
mode 100644 cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ff6585ef87..8fd18c2828 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1383,49 +1383,121 @@ (rule -1 (lower (has_type (fits_in_64 ty) (smax x y))) (cmp_and_choose ty (CC.NL) x y)) -;; SSE `smax`. +;; SSE helpers for determining if single-instruction lowerings are available. -(rule (lower (has_type $I8X16 (smax x y))) - (x64_pmaxsb x y)) +(decl pure has_pmins (Type) bool) +(rule 1 (has_pmins $I16X8) $true) +(rule 1 (has_pmins $I64X2) $false) +(rule (has_pmins _) (use_sse41)) -(rule (lower (has_type $I16X8 (smax x y))) - (x64_pmaxsw x y)) +(decl pure has_pmaxs (Type) bool) +(rule 1 (has_pmaxs $I16X8) $true) +(rule 1 (has_pmaxs $I64X2) $false) +(rule (has_pmaxs _) (use_sse41)) -(rule (lower (has_type $I32X4 (smax x y))) - (x64_pmaxsd x y)) +(decl pure has_pmaxu (Type) bool) +(rule 1 (has_pmaxu $I8X16) $true) +(rule 1 (has_pmaxu $I64X2) $false) +(rule (has_pmaxu _) (use_sse41)) -;; SSE `smin`. +(decl pure has_pminu (Type) bool) +(rule 1 (has_pminu $I8X16) $true) +(rule 1 (has_pminu $I64X2) $false) +(rule (has_pminu _) (use_sse41)) + +;; SSE `smax`. + +(rule 1 (lower (has_type (ty_vec128 ty) (smax x y))) + (if-let $true (has_pmaxs ty)) + (x64_pmaxs ty x y)) + +(rule (lower (has_type (ty_vec128 ty) (smax x y))) + (let ( + (x Xmm x) + (y Xmm y) + (cmp Xmm (x64_pcmpgt ty x y)) + (x_is_max Xmm (x64_pand cmp x)) + (y_is_max Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_max y_is_max))) -(rule (lower (has_type $I8X16 (smin x y))) - (x64_pminsb x y)) +;; SSE `smin`. 
-(rule (lower (has_type $I16X8 (smin x y))) - (x64_pminsw x y)) +(rule 1 (lower (has_type (ty_vec128 ty) (smin x y))) + (if-let $true (has_pmins ty)) + (x64_pmins ty x y)) -(rule (lower (has_type $I32X4 (smin x y))) - (x64_pminsd x y)) +(rule (lower (has_type (ty_vec128 ty) (smin x y))) + (let ( + (x Xmm x) + (y Xmm y) + (cmp Xmm (x64_pcmpgt ty y x)) + (x_is_min Xmm (x64_pand cmp x)) + (y_is_min Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_min y_is_min))) ;; SSE `umax`. -(rule (lower (has_type $I8X16 (umax x y))) - (x64_pmaxub x y)) +(rule 2 (lower (has_type (ty_vec128 ty) (umax x y))) + (if-let $true (has_pmaxu ty)) + (x64_pmaxu ty x y)) + +;; If y < x then the saturating subtraction will be zero, otherwise when added +;; back to x it'll return y. +(rule 1 (lower (has_type $I16X8 (umax x y))) + (let ((x Xmm x)) + (x64_paddw x (x64_psubusw y x)))) -(rule (lower (has_type $I16X8 (umax x y))) - (x64_pmaxuw x y)) +;; Flip the upper bits of each lane so the signed comparison has the same +;; result as an unsigned comparison, and then select the results with the output +;; mask. See `pcmpgt` lowering for info on flipping the upper bit. +(rule (lower (has_type (ty_vec128 ty) (umax x y))) + (let ( + (x Xmm x) + (y Xmm y) + (mask Xmm (flip_high_bit_mask ty)) + (x_masked Xmm (x64_pxor x mask)) + (y_masked Xmm (x64_pxor y mask)) + (cmp Xmm (x64_pcmpgt ty x_masked y_masked)) + (x_is_max Xmm (x64_pand cmp x)) + (y_is_max Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_max y_is_max))) -(rule (lower (has_type $I32X4 (umax x y))) - (x64_pmaxud x y)) +(decl flip_high_bit_mask (Type) Xmm) +(rule (flip_high_bit_mask $I16X8) + (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))) +(rule (flip_high_bit_mask $I32X4) + (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000))) +(rule (flip_high_bit_mask $I64X2) + (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000))) ;; SSE `umin`. 
-(rule (lower (has_type $I8X16 (umin x y))) - (x64_pminub x y)) +(rule 2 (lower (has_type (ty_vec128 ty) (umin x y))) + (if-let $true (has_pminu ty)) + (x64_pminu ty x y)) -(rule (lower (has_type $I16X8 (umin x y))) - (x64_pminuw x y)) +;; If x < y then the saturating subtraction will be 0. Otherwise if x > y then +;; the saturated result, when subtracted again, will go back to `y`. +(rule 1 (lower (has_type $I16X8 (umin x y))) + (let ((x Xmm x)) + (x64_psubw x (x64_psubusw x y)))) -(rule (lower (has_type $I32X4 (umin x y))) - (x64_pminud x y)) +;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit. +(rule (lower (has_type (ty_vec128 ty) (umin x y))) + (let ( + (x Xmm x) + (y Xmm y) + (mask Xmm (flip_high_bit_mask ty)) + (x_masked Xmm (x64_pxor x mask)) + (y_masked Xmm (x64_pxor y mask)) + (cmp Xmm (x64_pcmpgt ty y_masked x_masked)) + (x_is_max Xmm (x64_pand cmp x)) + (y_is_max Xmm (x64_pandn cmp y)) + ) + (x64_por x_is_max y_is_max))) ;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1519,62 +1591,127 @@ (let ((checked Xmm (x64_pcmpeq ty a b)) (all_ones Xmm (vector_all_ones))) (x64_pxor checked all_ones))) -;; Signed comparisons have a single-instruction lowering, unlike their unsigned -;; counterparts. These latter instructions use the unsigned min/max -;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s). + +;; SSE `sgt` + (rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpgt ty a b)) + +;; SSE `slt` + (rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpgt ty b a)) + +;; SSE `ugt` + +;; N.B.: we must manually prevent load coalescing operands; the +;; register allocator gets confused otherwise. 
+(rule 1 (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmaxu ty)) + (let ((a Xmm a) + (b Xmm b) + (max Xmm (x64_pmaxu ty a b)) + (eq Xmm (x64_pcmpeq ty max b))) + (x64_pxor eq (vector_all_ones)))) + +;; Flip the upper bit of each lane so the result of a signed comparison is the +;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more) (rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) - ;; N.B.: we must manually prevent load coalescing of these operands; the - ;; register allocator gets confused otherwise. TODO: - ;; https://github.com/bytecodealliance/wasmtime/issues/3953. - (let ((xmm_a Xmm (put_in_xmm a)) - (xmm_b Xmm (put_in_xmm b)) - (max Xmm (x64_pmaxu ty xmm_a xmm_b)) - (eq Xmm (x64_pcmpeq ty max xmm_b)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor eq all_ones))) + (let ((mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask))) + (x64_pcmpgt ty a_masked b_masked))) + +;; SSE `ult` + +(rule 1 (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pminu ty)) + ;; N.B.: see note above. + (let ((a Xmm a) + (b Xmm b) + (min Xmm (x64_pminu ty a b)) + (eq Xmm (x64_pcmpeq ty min b))) + (x64_pxor eq (vector_all_ones)))) + +;; Flip the upper bit of `a` and `b` so the signed comparison result will +;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more). (rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) - ;; N.B.: see note above. - (let ((xmm_a Xmm (put_in_xmm a)) - (xmm_b Xmm (put_in_xmm b)) - (min Xmm (x64_pminu ty xmm_a xmm_b)) - (eq Xmm (x64_pcmpeq ty min xmm_b)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor eq all_ones))) -;; To lower signed and unsigned *-or-equals comparisons, we find the minimum -;; number (PMIN[U|S]*) and compare that to one of the terms (PCMPEQ*). 
Note that -;; there is no 64x2 version of this lowering (see below). + (let ((mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask))) + (x64_pcmpgt ty b_masked a_masked))) + +;; SSE `sge` + +;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`. +(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmaxs ty)) + (x64_pcmpeq ty a (x64_pmaxs ty a b))) + +;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the +;; result. (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((max Xmm (x64_pmaxs ty a b))) - (x64_pcmpeq ty a max))) + (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones))) + +;; SSE `sle` + +;; With `pmins*` use that and compare the result to `a`. +(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmins ty)) + (x64_pcmpeq ty a (x64_pmins ty a b))) + +;; Without `pmins*` perform a greater-than test and invert the result. (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((min Xmm (x64_pmins ty a b))) - (x64_pcmpeq ty a min))) + (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones))) + +;; SSE `uge` + +(rule 2 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pmaxu ty)) + (x64_pcmpeq ty a (x64_pmaxu ty a b))) + +;; Perform a saturating subtract of `a` from `b` and if the result is zero then +;; `a` is greater or equal. +(rule 1 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b)) + (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8))) + +;; Flip the upper bit of each lane so the signed comparison is the same as +;; an unsigned one and then invert the result. See docs on `pcmpgt` for why +;; flipping the upper bit works. 
(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((max Xmm (x64_pmaxu ty a b))) - (x64_pcmpeq ty a max))) -(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) - (let ((min Xmm (x64_pminu ty a b))) - (x64_pcmpeq ty a min))) -;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead -;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all -;; 1s), emitting one more instruction than the smaller-lane versions. -(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) - (let ((checked Xmm (x64_pcmpgt $I64X2 b a)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor checked all_ones))) -(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b)) - (let ((checked Xmm (x64_pcmpgt $I64X2 a b)) - (all_ones Xmm (vector_all_ones))) - (x64_pxor checked all_ones))) -;; TODO: not used by WebAssembly translation -;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) -;; TODO: not used by WebAssembly translation -;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b)) + (let ( + (mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask)) + (cmp Xmm (x64_pcmpgt ty b_masked a_masked)) + ) + (x64_pxor cmp (vector_all_ones)))) +;; SSE `ule` + +(rule 2 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (if-let $true (has_pminu ty)) + (x64_pcmpeq ty a (x64_pminu ty a b))) + +;; A saturating subtraction will produce zeros if `a` is less than `b`, so +;; compare that result to an all-zeros result to figure out lanes of `a` that +;; are <= to the lanes in `b` +(rule 1 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b)) + (let ((zeros_if_a_is_min Xmm (x64_psubusw a b))) + (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I8X16)))) + +;; Flip the upper bit of each lane in `a` and 
`b` so a signed comparison +;; produces the same result as an unsigned comparison. Then test for `gt` +;; and invert the result to get the `le` that is desired here. See docs on +;; `pcmpgt` for why flipping the upper bit works. (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) + (let ( + (mask Xmm (flip_high_bit_mask ty)) + (a_masked Xmm (x64_pxor a mask)) + (b_masked Xmm (x64_pxor b mask)) + (cmp Xmm (x64_pcmpgt ty a_masked b_masked)) + ) + (x64_pxor cmp (vector_all_ones)))) ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif index 2bd39fdce0..5dbb5369d5 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif index 989987c3c4..035c0afcee 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif index 6d9cbc8e5e..e2542ad711 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif @@ -1,7 +1,9 @@ test interpret test run +target x86_64 
has_sse41=false set enable_simd target x86_64 +target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx target aarch64 diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif b/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif index c2a73f1891..fa1e93f145 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif index e2cbdc0039..1eca10ccbf 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif @@ -1,8 +1,11 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif deleted file mode 100644 index 74fe4c2814..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp uge v0, v1 - return v2 -} -; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1] -; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0] -; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0] -; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1] diff --git 
a/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif b/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif index 06df15db4c..8280fed7bf 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function %simd_icmp_uge_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -39,3 +42,13 @@ block0: return v8 } ; run: %icmp_uge_const_i32x4() == 1 + +function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp uge v0, v1 + return v2 +} +; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1] +; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0] +; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0] +; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif deleted file mode 100644 index d3ab7f3b5f..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp ugt v0, v1 - return v2 -} -; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1] -; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0] -; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0] -; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif index aa9d5f374e..fa5b12794d 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function %simd_icmp_ugt_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -38,3 +41,13 @@ block0: return v8 } ; run: %icmp_ugt_const_i8x16() == 1 + +function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ugt v0, v1 + return v2 +} +; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1] +; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0] +; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0] +; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif deleted file mode 100644 index c06136bcaa..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp ule v0, v1 - return v2 -} -; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0] -; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1] -; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1] -; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif index 6fe03d611a..680f02a228 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false 
set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function %simd_icmp_ule_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -40,3 +43,13 @@ block0: return v8 } ; run: %icmp_ule_const_i16x8() == 1 + +function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ule v0, v1 + return v2 +} +; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0] +; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1] +; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1] +; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif deleted file mode 100644 index 788de0b539..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif +++ /dev/null @@ -1,17 +0,0 @@ -test interpret -test run -target aarch64 -target s390x - -; TODO: Move this to the main file once x86_64 supports this operation -; See: #5529 - -function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = icmp ult v0, v1 - return v2 -} -; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0] -; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1] -; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1] -; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif b/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif index ea3c3cd434..f99f8570e4 100644 --- a/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif +++ b/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif @@ -1,10 +1,13 @@ test interpret test run +target x86_64 has_sse41=false set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target aarch64 target s390x -target x86_64 -target x86_64 has_avx function 
%simd_icmp_ult_i8(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -53,3 +56,13 @@ block0: return v8 } ; run: %icmp_ult_const_i16x8() == 1 + +function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ult v0, v1 + return v2 +} +; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0] +; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1] +; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1] +; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif deleted file mode 100644 index d5cef288b0..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif +++ /dev/null @@ -1,39 +0,0 @@ -test run -test interpret -target aarch64 - -function %smin_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = smin v0, v1 - return v2 -} - -; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] -; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ] - -function %smax_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = smax v0, v1 - return v2 -} - -; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ] -; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ] - -function %umin_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = umin v0, v1 - return v2 -} - -; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ] -; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] - -function %umax_i64x2(i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2): - v2 = umax v0, v1 - return v2 -} - -; run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 
0xBADAB00F ] -; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ] diff --git a/cranelift/filetests/filetests/runtests/simd-min-max.clif b/cranelift/filetests/filetests/runtests/simd-min-max.clif index d4934615a6..d63baec4ed 100644 --- a/cranelift/filetests/filetests/runtests/simd-min-max.clif +++ b/cranelift/filetests/filetests/runtests/simd-min-max.clif @@ -1,9 +1,12 @@ test run test interpret +target x86_64 has_sse41=false set enable_simd target aarch64 target x86_64 -target x86_64 has_avx +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx target s390x function %smin_i8x16(i8x16, i8x16) -> i8x16 { @@ -109,3 +112,39 @@ block0(v0: i32x4, v1: i32x4): } ; run: %umax_i32x4([0xBAADF00D 0xDEADBEEF 0xC00FFFEE 0xBADAB00F], [0xCA11ACAB 0x12349876 0x98763210 0x43216789]) == [ 0xCA11ACAB 0xDEADBEEF 0xC00FFFEE 0xBADAB00F ] + +function %smin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smin v0, v1 + return v2 +} + +; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] +; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ] + +function %smax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smax v0, v1 + return v2 +} + +; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ] +; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ] + +function %umin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umin v0, v1 + return v2 +} + +; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ] +; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ] + +function %umax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umax v0, v1 + return v2 +} + +; 
run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 0xBADAB00F ] +; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ]