x64: Add non-SSE 4.1 lowerings of min/max instructions (#6291)

* x64: Add non-SSE 4.1 lowerings of min/max instructions This commit updates the x64 backend to avoid using various `p{min,max}*` instructions if SSE 4.1 isn't enabled. These instructions are used for comparisons as well as the `{u,s}{min,max}` instructions. Alternative lowerings are primarily drawn from LLVM. Through this refactoring the x64 backend now has also grown (not the most efficient) lowerings for vector comparisons with `i64x2` types, which it previously largely didn't have. This enabled copying some non-x86_64 tests into the main test files for various operations. * Review comments
2 years ago · e6a77eecf0
16 changed files with 327 additions and 192 deletions
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@ -1383,49 +1383,121 @@
 (rule -1 (lower (has_type (fits_in_64 ty) (smax x y)))
      (cmp_and_choose ty (CC.NL) x y))
-;; SSE `smax`.
+;; SSE helpers for determining if single-instruction lowerings are available.
-(rule (lower (has_type $I8X16 (smax x y)))
+(decl pure has_pmins (Type) bool)
-      (x64_pmaxsb x y))
+(rule 1 (has_pmins $I16X8) $true)
 (rule 1 (has_pmins $I64X2) $false)
 (rule (has_pmins _) (use_sse41))
-(rule (lower (has_type $I16X8 (smax x y)))
+(decl pure has_pmaxs (Type) bool)
-      (x64_pmaxsw x y))
+(rule 1 (has_pmaxs $I16X8) $true)
 (rule 1 (has_pmaxs $I64X2) $false)
 (rule (has_pmaxs _) (use_sse41))
-(rule (lower (has_type $I32X4 (smax x y)))
+(decl pure has_pmaxu (Type) bool)
-      (x64_pmaxsd x y))
+(rule 1 (has_pmaxu $I8X16) $true)
 (rule 1 (has_pmaxu $I64X2) $false)
 (rule (has_pmaxu _) (use_sse41))
-;; SSE `smin`.
+(decl pure has_pminu (Type) bool)
 (rule 1 (has_pminu $I8X16) $true)
 (rule 1 (has_pminu $I64X2) $false)
 (rule (has_pminu _) (use_sse41))
 ;; SSE `smax`.
 (rule 1 (lower (has_type (ty_vec128 ty) (smax x y)))
        (if-let $true (has_pmaxs ty))
        (x64_pmaxs ty x y))
 (rule (lower (has_type (ty_vec128 ty) (smax x y))) ;; fallback when `has_pmaxs` is false
       (let (
           (x Xmm x)
           (y Xmm y)
           (cmp Xmm (x64_pcmpgt ty x y)) ;; per-lane signed `x > y` mask
           (x_is_max Xmm (x64_pand cmp x)) ;; lanes of `x` where the mask is set
           (y_is_max Xmm (x64_pandn cmp y)) ;; presumably `!cmp & y`: lanes of `y` elsewhere
         )
         (x64_por x_is_max y_is_max))) ;; merge the two disjoint selections
-(rule (lower (has_type $I8X16 (smin x y)))
+;; SSE `smin`.
      (x64_pminsb x y))
-(rule (lower (has_type $I16X8 (smin x y)))
+(rule 1 (lower (has_type (ty_vec128 ty) (smin x y)))
-      (x64_pminsw x y))
+        (if-let $true (has_pmins ty))
        (x64_pmins ty x y))
-(rule (lower (has_type $I32X4 (smin x y)))
+(rule (lower (has_type (ty_vec128 ty) (smin x y)))
-      (x64_pminsd x y))
+      (let (
          (x Xmm x)
          (y Xmm y)
          (cmp Xmm (x64_pcmpgt ty y x))
          (x_is_min Xmm (x64_pand cmp x))
          (y_is_min Xmm (x64_pandn cmp y))
        )
        (x64_por x_is_min y_is_min)))
 ;; SSE `umax`.
-(rule (lower (has_type $I8X16 (umax x y)))
+(rule 2 (lower (has_type (ty_vec128 ty) (umax x y)))
-      (x64_pmaxub x y))
+        (if-let $true (has_pmaxu ty))
        (x64_pmaxu ty x y))
 ;; If y < x then the saturating subtraction will be zero, otherwise when added
 ;; back to x it'll return y.
 (rule 1 (lower (has_type $I16X8 (umax x y)))
        (let ((x Xmm x))
          (x64_paddw x (x64_psubusw y x))))
-(rule (lower (has_type $I16X8 (umax x y)))
+;; Flip the upper bits of each lane so the signed comparison has the same
-      (x64_pmaxuw x y))
+;; result as an unsigned comparison, and then select the results with the output
 ;; mask. See `pcmpgt` lowering for info on flipping the upper bit.
 (rule (lower (has_type (ty_vec128 ty) (umax x y)))
      (let (
          (x Xmm x)
          (y Xmm y)
          (mask Xmm (flip_high_bit_mask ty))
          (x_masked Xmm (x64_pxor x mask))
          (y_masked Xmm (x64_pxor y mask))
          (cmp Xmm (x64_pcmpgt ty x_masked y_masked))
          (x_is_max Xmm (x64_pand cmp x))
          (y_is_max Xmm (x64_pandn cmp y))
        )
        (x64_por x_is_max y_is_max)))
-(rule (lower (has_type $I32X4 (umax x y)))
+(decl flip_high_bit_mask (Type) Xmm)
-      (x64_pmaxud x y))
+(rule (flip_high_bit_mask $I16X8)
      (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)))
 (rule (flip_high_bit_mask $I32X4)
      (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000)))
 (rule (flip_high_bit_mask $I64X2)
      (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000)))
 ;; SSE `umin`.
-(rule (lower (has_type $I8X16 (umin x y)))
+(rule 2 (lower (has_type (ty_vec128 ty) (umin x y)))
-      (x64_pminub x y))
+        (if-let $true (has_pminu ty))
        (x64_pminu ty x y))
-(rule (lower (has_type $I16X8 (umin x y)))
+;; If x < y then the saturating subtraction will be 0. Otherwise if x > y then
-      (x64_pminuw x y))
+;; the saturated result, when subtracted again, will go back to `y`.
 (rule 1 (lower (has_type $I16X8 (umin x y)))
        (let ((x Xmm x))
          (x64_psubw x (x64_psubusw x y))))
-(rule (lower (has_type $I32X4 (umin x y)))
+;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit.
-      (x64_pminud x y))
+(rule (lower (has_type (ty_vec128 ty) (umin x y))) ;; fallback when `has_pminu` is false
       (let (
           (x Xmm x)
           (y Xmm y)
           (mask Xmm (flip_high_bit_mask ty))
           (x_masked Xmm (x64_pxor x mask)) ;; flip each lane's high bit so that a
           (y_masked Xmm (x64_pxor y mask)) ;; signed compare orders lanes as unsigned
           (cmp Xmm (x64_pcmpgt ty y_masked x_masked)) ;; per-lane unsigned `y > x` mask
           (x_is_min Xmm (x64_pand cmp x)) ;; lanes of `x` where `x` is the smaller value
           (y_is_min Xmm (x64_pandn cmp y)) ;; remaining lanes are taken from `y`
         )
         (x64_por x_is_min y_is_min)))
 ;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -1519,62 +1591,127 @@
      (let ((checked Xmm (x64_pcmpeq ty a b))
            (all_ones Xmm (vector_all_ones)))
           (x64_pxor checked all_ones)))
-;; Signed comparisons have a single-instruction lowering, unlike their unsigned
+
-;; counterparts. These latter instructions use the unsigned min/max
+;; SSE `sgt`
-;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s).
+
 (rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpgt ty a b))
 ;; SSE `slt`
 (rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpgt ty b a))
 ;; SSE `ugt`
 ;; N.B.: we must manually prevent load coalescing of these operands; the
 ;; register allocator gets confused otherwise.
 (rule 1 (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
        (if-let $true (has_pmaxu ty))
        (let ((a Xmm a)
              (b Xmm b)
              (max Xmm (x64_pmaxu ty a b))
              (eq Xmm (x64_pcmpeq ty max b)))
             (x64_pxor eq (vector_all_ones))))
 ;; Flip the upper bit of each lane so the result of a signed comparison is the
 ;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more)
 (rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
-      ;; N.B.: we must manually prevent load coalescing of these operands; the
+      (let ((mask Xmm (flip_high_bit_mask ty))
-      ;; register allocator gets confused otherwise. TODO:
+            (a_masked Xmm (x64_pxor a mask))
-      ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
+            (b_masked Xmm (x64_pxor b mask)))
-      (let ((xmm_a Xmm (put_in_xmm a))
+           (x64_pcmpgt ty a_masked b_masked)))
-            (xmm_b Xmm (put_in_xmm b))
+
-            (max Xmm (x64_pmaxu ty xmm_a xmm_b))
+;; SSE `ult`
-            (eq Xmm (x64_pcmpeq ty max xmm_b))
+
-            (all_ones Xmm (vector_all_ones)))
+(rule 1 (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
-           (x64_pxor eq all_ones)))
+        (if-let $true (has_pminu ty))
        ;; N.B.: see note above.
        (let ((a Xmm a)
              (b Xmm b)
              (min Xmm (x64_pminu ty a b))
              (eq Xmm (x64_pcmpeq ty min b)))
             (x64_pxor eq (vector_all_ones))))
 ;; Flip the upper bit of `a` and `b` so the signed comparison result will
 ;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more).
 (rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
-      ;; N.B.: see note above.
+      (let ((mask Xmm (flip_high_bit_mask ty))
-      (let ((xmm_a Xmm (put_in_xmm a))
+            (a_masked Xmm (x64_pxor a mask))
-            (xmm_b Xmm (put_in_xmm b))
+            (b_masked Xmm (x64_pxor b mask)))
-            (min Xmm (x64_pminu ty xmm_a xmm_b))
+           (x64_pcmpgt ty b_masked a_masked)))
-            (eq Xmm (x64_pcmpeq ty min xmm_b))
+
-            (all_ones Xmm (vector_all_ones)))
+;; SSE `sge`
-           (x64_pxor eq all_ones)))
+
-;; To lower signed and unsigned *-or-equals comparisons, we find the minimum
+;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`.
-;; number (PMIN[U|S]*) and compare that to one of the terms (PCMPEQ*). Note that
+(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-;; there is no 64x2 version of this lowering (see below).
+        (if-let $true (has_pmaxs ty))
        (x64_pcmpeq ty a (x64_pmaxs ty a b)))
 ;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the
 ;; result.
 (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((max Xmm (x64_pmaxs ty a b)))
+      (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones)))
-           (x64_pcmpeq ty a max)))
+
 ;; SSE `sle`
 ;; With `pmins*` use that and compare the result to `a`.
 (rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let $true (has_pmins ty))
        (x64_pcmpeq ty a (x64_pmins ty a b)))
 ;; Without `pmins*` perform a greater-than test and invert the result.
 (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((min Xmm (x64_pmins ty a b)))
+      (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones)))
-           (x64_pcmpeq ty a min)))
+
 ;; SSE `uge`
 (rule 2 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let $true (has_pmaxu ty))
        (x64_pcmpeq ty a (x64_pmaxu ty a b)))
 ;; Perform a saturating subtract of `a` from `b` and if the result is zero then
 ;; `a` is greater or equal.
 (rule 1 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b))
         (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8)))
 ;; Flip the upper bit of each lane so the signed comparison is the same as
 ;; an unsigned one and then invert the result. See docs on `pcmpgt` for why
 ;; flipping the upper bit works.
 (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
-      (let ((max Xmm (x64_pmaxu ty a b)))
+      (let (
-           (x64_pcmpeq ty a max)))
+          (mask Xmm (flip_high_bit_mask ty))
-(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
+          (a_masked Xmm (x64_pxor a mask))
-      (let ((min Xmm (x64_pminu ty a b)))
+          (b_masked Xmm (x64_pxor b mask))
-           (x64_pcmpeq ty a min)))
+          (cmp Xmm (x64_pcmpgt ty b_masked a_masked))
-;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead
+        )
-;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all
+        (x64_pxor cmp (vector_all_ones))))
 ;; 1s), emitting one more instruction than the smaller-lane versions.
 (rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
      (let ((checked Xmm (x64_pcmpgt $I64X2 b a))
            (all_ones Xmm (vector_all_ones)))
           (x64_pxor checked all_ones)))
 (rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b))
      (let ((checked Xmm (x64_pcmpgt $I64X2 a b))
            (all_ones Xmm (vector_all_ones)))
           (x64_pxor checked all_ones)))
 ;; TODO: not used by WebAssembly translation
 ;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
 ;; TODO: not used by WebAssembly translation
 ;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b))
 ;; SSE `ule`
 (rule 2 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let $true (has_pminu ty))
        (x64_pcmpeq ty a (x64_pminu ty a b)))
 ;; A saturating subtraction `a - b` produces zero in every lane where `a` is
 ;; less than or equal to `b`, so compare that result against an all-zeros
 ;; vector to find the lanes of `a` that are <= the lanes of `b`.
 (rule 1 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b))
        (let ((zeros_if_a_is_min Xmm (x64_psubusw a b)))
            (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I16X8))))
 ;; Flip the upper bit of each lane in `a` and `b` so a signed comparison
 ;; produces the same result as an unsigned comparison. Then test for `gt`
 ;; and invert the result to get the `le` that is desired here. See docs on
 ;; `pcmpgt` for why flipping the upper bit works.
 (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let (
          (mask Xmm (flip_high_bit_mask ty))
          (a_masked Xmm (x64_pxor a mask))
          (b_masked Xmm (x64_pxor b mask))
          (cmp Xmm (x64_pcmpgt ty a_masked b_masked))
        )
        (x64_pxor cmp (vector_all_ones))))
 ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ne.clif
@ -1,8 +1,11 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
--- a/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-sge.clif
@ -1,8 +1,11 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
--- a/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-sgt.clif
@ -1,7 +1,9 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
--- a/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-sle.clif
@ -1,8 +1,11 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
--- a/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-slt.clif
@ -1,8 +1,11 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
--- a/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-uge-i64x2.clif
@ -1,17 +0,0 @@
 test interpret
 test run
 target aarch64
 target s390x
 ; TODO: Move this to the main file once x86_64 supports this operation
 ; See: #5529
 function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp uge v0, v1
    return v2
 }
 ; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1]
 ; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0]
 ; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0]
 ; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-uge.clif
@ -1,10 +1,13 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
 target x86_64
 target x86_64 has_avx
 function %simd_icmp_uge_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@ -39,3 +42,13 @@ block0:
    return v8
 }
 ; run: %icmp_uge_const_i32x4() == 1
 function %simd_icmp_uge_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp uge v0, v1
    return v2
 }
 ; run: %simd_icmp_uge_i64([0 1], [0 0]) == [-1 -1]
 ; run: %simd_icmp_uge_i64([-1 0], [-1 1]) == [-1 0]
 ; run: %simd_icmp_uge_i64([-5 1], [-1 -1]) == [0 0]
 ; run: %simd_icmp_uge_i64([0 0], [0 0]) == [-1 -1]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ugt-i64x2.clif
@ -1,17 +0,0 @@
 test interpret
 test run
 target aarch64
 target s390x
 ; TODO: Move this to the main file once x86_64 supports this operation
 ; See: #5529
 function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp ugt v0, v1
    return v2
 }
 ; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1]
 ; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0]
 ; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0]
 ; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ugt.clif
@ -1,10 +1,13 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
 target x86_64
 target x86_64 has_avx
 function %simd_icmp_ugt_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@ -38,3 +41,13 @@ block0:
    return v8
 }
 ; run: %icmp_ugt_const_i8x16() == 1
 function %simd_icmp_ugt_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp ugt v0, v1
    return v2
 }
 ; run: %simd_icmp_ugt_i64([0 1], [0 0]) == [0 -1]
 ; run: %simd_icmp_ugt_i64([-1 0], [-1 1]) == [0 0]
 ; run: %simd_icmp_ugt_i64([-5 1], [-1 -1]) == [0 0]
 ; run: %simd_icmp_ugt_i64([0 0], [0 0]) == [0 0]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ule-i64x2.clif
@ -1,17 +0,0 @@
 test interpret
 test run
 target aarch64
 target s390x
 ; TODO: Move this to the main file once x86_64 supports this operation
 ; See: #5529
 function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp ule v0, v1
    return v2
 }
 ; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0]
 ; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1]
 ; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1]
 ; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ule.clif
@ -1,10 +1,13 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
 target x86_64
 target x86_64 has_avx
 function %simd_icmp_ule_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@ -40,3 +43,13 @@ block0:
    return v8
 }
 ; run: %icmp_ule_const_i16x8() == 1
 function %simd_icmp_ule_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp ule v0, v1
    return v2
 }
 ; run: %simd_icmp_ule_i64([0 1], [0 0]) == [-1 0]
 ; run: %simd_icmp_ule_i64([-1 0], [-1 1]) == [-1 -1]
 ; run: %simd_icmp_ule_i64([-5 1], [-1 -1]) == [-1 -1]
 ; run: %simd_icmp_ule_i64([0 0], [0 0]) == [-1 -1]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ult-i64x2.clif
@ -1,17 +0,0 @@
 test interpret
 test run
 target aarch64
 target s390x
 ; TODO: Move this to the main file once x86_64 supports this operation
 ; See: #5529
 function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp ult v0, v1
    return v2
 }
 ; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0]
 ; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1]
 ; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1]
 ; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0]
--- a/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif
+++ b/cranelift/filetests/filetests/runtests/simd-icmp-ult.clif
@ -1,10 +1,13 @@
 test interpret
 test run
 target x86_64 has_sse41=false
 set enable_simd
 target x86_64
 target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
 target x86_64
 target x86_64 has_avx
 function %simd_icmp_ult_i8(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@ -53,3 +56,13 @@ block0:
    return v8
 }
 ; run: %icmp_ult_const_i16x8() == 1
 function %simd_icmp_ult_i64(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
    v2 = icmp ult v0, v1
    return v2
 }
 ; run: %simd_icmp_ult_i64([0 1], [0 0]) == [0 0]
 ; run: %simd_icmp_ult_i64([-1 0], [-1 1]) == [0 -1]
 ; run: %simd_icmp_ult_i64([-5 1], [-1 -1]) == [-1 -1]
 ; run: %simd_icmp_ult_i64([0 0], [0 0]) == [0 0]
--- a/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif
+++ b/cranelift/filetests/filetests/runtests/simd-min-max-aarch64.clif
@ -1,39 +0,0 @@
 test run
 test interpret
 target aarch64
 function %smin_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = smin v0, v1
  return v2
 }
 ; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
 ; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ]
 function %smax_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = smax v0, v1
  return v2
 }
 ; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ]
 ; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ]
 function %umin_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = umin v0, v1
  return v2
 }
 ; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ]
 ; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
 function %umax_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = umax v0, v1
  return v2
 }
 ; run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 0xBADAB00F ]
 ; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ]
--- a/cranelift/filetests/filetests/runtests/simd-min-max.clif
+++ b/cranelift/filetests/filetests/runtests/simd-min-max.clif
@ -1,9 +1,12 @@
 test run
 test interpret
 target x86_64 has_sse41=false
 set enable_simd
 target aarch64
 target x86_64
-target x86_64 has_avx
+target x86_64 sse41
 target x86_64 sse42
 target x86_64 sse42 has_avx
 target s390x
 function %smin_i8x16(i8x16, i8x16) -> i8x16 {
@ -109,3 +112,39 @@ block0(v0: i32x4, v1: i32x4):
 }
 ; run: %umax_i32x4([0xBAADF00D 0xDEADBEEF 0xC00FFFEE 0xBADAB00F], [0xCA11ACAB 0x12349876 0x98763210 0x43216789]) == [ 0xCA11ACAB 0xDEADBEEF 0xC00FFFEE 0xBADAB00F ]
 function %smin_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = smin v0, v1
  return v2
 }
 ; run: %smin_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
 ; run: %smin_i64x2([0x80000000C00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0x80000000C00FFFEE 0x43216789 ]
 function %smax_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = smax v0, v1
  return v2
 }
 ; run: %smax_i64x2([0xC00FFFEE 0xBADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0xBADAB00F ]
 ; run: %smax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x43216789 ]
 function %umin_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = umin v0, v1
  return v2
 }
 ; run: %umin_i64x2([0xDEADBEEF 0xBADAB00F], [0x12349876 0x43216789]) == [ 0x12349876 0x43216789 ]
 ; run: %umin_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0x98763210 0x43216789 ]
 function %umax_i64x2(i64x2, i64x2) -> i64x2 {
 block0(v0: i64x2, v1: i64x2):
  v2 = umax v0, v1
  return v2
 }
 ; run: %umax_i64x2([0xBAADF00D 0xBADAB00F], [0xCA11ACAB 0x43216789]) == [ 0xCA11ACAB 0xBADAB00F ]
 ; run: %umax_i64x2([0xC00FFFEE 0x80000000BADAB00F], [0x98763210 0x43216789]) == [ 0xC00FFFEE 0x80000000BADAB00F ]