Cranelift AArch64: Migrate Splat to ISLE (#4521)

Copyright (c) 2022, Arm Limited.
pull/4533/head
Anton Kirilov, 2 years ago (committed by GitHub)
commit ead6edb0c5
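This commit moves the lowering of the CLIF `splat` instruction on AArch64 from open-coded Rust in `lower_inst.rs` to declarative ISLE rules. For orientation: an ISLE lowering rule matches a typed CLIF instruction on the left-hand side and builds machine instructions through helper constructors on the right. The two central rules added by this change (quoted from the `lower.isle` hunk below) are:

    ;; Splat of a 64-bit-or-narrower scalar integer/bool/ref: GPR-to-vector dup.
    (rule (lower (has_type ty (splat x @ (value_type in_ty))))
          (if (ty_int_bool_ref_scalar_64 in_ty))
          (vec_dup x (vector_size ty)))

    ;; Splat of a scalar float: dup from a vector/FPU lane.
    (rule (lower (has_type ty (splat x @ (value_type (ty_scalar_float _)))))
          (vec_dup_from_fpu x (vector_size ty)))

Further rules below handle constant inputs via `splat_const` and sinkable loads via `ld1r`.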
Changed files:

 1. cranelift/codegen/src/ir/types.rs (4)
 2. cranelift/codegen/src/isa/aarch64/inst.isle (45)
 3. cranelift/codegen/src/isa/aarch64/inst/emit.rs (13)
 4. cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (16)
 5. cranelift/codegen/src/isa/aarch64/inst/mod.rs (11)
 6. cranelift/codegen/src/isa/aarch64/inst/regs.rs (51)
 7. cranelift/codegen/src/isa/aarch64/lower.isle (43)
 8. cranelift/codegen/src/isa/aarch64/lower/isle.rs (26)
 9. cranelift/codegen/src/isa/aarch64/lower_inst.rs (75)
10. cranelift/codegen/src/isa/s390x/inst.isle (9)
11. cranelift/codegen/src/isa/s390x/lower.isle (13)
12. cranelift/codegen/src/isa/s390x/lower/isle.rs (5)
13. cranelift/codegen/src/machinst/isle.rs (28)
14. cranelift/codegen/src/prelude.isle (24)
15. cranelift/filetests/filetests/isa/aarch64/bitops.clif (11)
16. cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif (91)
17. cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif (114)
18. cranelift/filetests/filetests/isa/aarch64/dynamic-simd-widen.clif (44)
19. cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif (12)
20. cranelift/filetests/filetests/isa/aarch64/prologue.clif (87)
21. cranelift/filetests/filetests/runtests/simd-splat.clif (193)
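A recurring, easy-to-miss change in the `lower.isle` and `prelude.isle` hunks below: `ty_int_bool_ref_scalar_64` turns from an extractor into a pure constructor, so rules now test it with an `if` clause instead of matching it inside the pattern. A before/after sketch of that rewrite, using the `ireduce` rule from the AArch64 `lower.isle` hunk:

    ;; Before: the type predicate is an extractor, matched inside the pattern.
    (rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (ireduce src)))
          (value_regs_get src 0))

    ;; After: the predicate is a pure constructor, checked with an `if` clause.
    (rule (lower (has_type ty (ireduce src)))
          (if (ty_int_bool_ref_scalar_64 ty))
          (value_regs_get src 0))

The s390x rules for `insertlane`, `extractlane`, `splat`, and `scalar_to_vector` get the same treatment.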

cranelift/codegen/src/ir/types.rs (4)

@ -171,8 +171,8 @@ impl Type {
self.replace_lanes(match self.lane_type() {
I8 | B1 | B8 => I8,
I16 | B16 => I16,
I32 | B32 => I32,
I64 | B64 => I64,
I32 | B32 | F32 => I32,
I64 | B64 | F64 => I64,
I128 | B128 => I128,
_ => unimplemented!(),
})

cranelift/codegen/src/isa/aarch64/inst.isle (45)

@ -627,7 +627,8 @@
(VecLoadReplicate
(rd WritableReg)
(rn Reg)
(size VectorSize))
(size VectorSize)
(flags MemFlags))
;; Vector conditional select, 128 bit. A synthetic instruction, which generates a 4-insn
;; control-flow diamond.
@ -1376,6 +1377,16 @@
(decl cond_br_cond (Cond) CondBrKind)
(extern constructor cond_br_cond cond_br_cond)
;; Lower the address of a load or a store.
(decl amode (Type Inst u32) AMode)
;; TODO: Port lower_address() to ISLE.
(extern constructor amode amode)
;; Matches an `AMode` that is just a register.
(decl pure amode_is_reg (AMode) Reg)
;; TODO: Implement in ISLE.
(extern constructor amode_is_reg amode_is_reg)
;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper for creating the zero register.
@ -1481,6 +1492,13 @@
(_ Unit (emit (MInst.VecDup dst src size))))
dst))
;; Helper for emitting `MInst.VecDupFromFpu` instructions.
(decl vec_dup_from_fpu (Reg VectorSize) Reg)
(rule (vec_dup_from_fpu src size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecDupFromFpu dst src size))))
dst))
;; Helper for emitting `MInst.AluRRImm12` instructions.
(decl alu_rr_imm12 (ALUOp Type Reg Imm12) Reg)
(rule (alu_rr_imm12 op ty src imm)
@ -2167,7 +2185,7 @@
(decl sinkable_atomic_load (SinkableAtomicLoad) Value)
(extern extractor sinkable_atomic_load sinkable_atomic_load)
;; Sink a `SinkableLoad` into a `Reg`.
;; Sink a `SinkableAtomicLoad` into a `Reg`.
;;
;; This is a side-effectful operation that notifies the context that the
;; instruction that produced the `SinkableAtomicLoad` has been sunk into another
@ -2230,6 +2248,29 @@
(alu_rrr op ty x_lo y_lo)
(alu_rrr op ty x_hi y_hi))))
;; Helper for emitting `MInst.VecLoadReplicate` instructions.
(decl ld1r (Reg VectorSize MemFlags) Reg)
(rule (ld1r src size flags)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecLoadReplicate dst src size flags))))
dst))
;; Helper for emitting `MInst.LoadAddr` instructions.
(decl load_addr (AMode) Reg)
(rule (load_addr addr)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.LoadAddr dst addr))))
dst))
(rule (load_addr addr)
(if-let addr_reg (amode_is_reg addr))
addr_reg)
;; Lower a vector splat with a constant parameter.
(decl splat_const (u64 VectorSize) Reg)
;; TODO: Port lower_splat_const() to ISLE.
(extern constructor splat_const splat_const)
;; Generate comparison to zero operator from input condition code
(decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2)
(extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op)

cranelift/codegen/src/isa/aarch64/inst/emit.rs (13)

@ -2258,10 +2258,10 @@ impl MachInstEmit for Inst {
ScalarSize::Size16 => 0b00010,
ScalarSize::Size32 => 0b00100,
ScalarSize::Size64 => 0b01000,
_ => unimplemented!("Unexpected VectorSize: {:?}", size),
_ => unreachable!(),
};
sink.put4(
0b000_01110000_00000_000011_00000_00000
0b0_0_0_01110000_00000_000011_00000_00000
| (q << 30)
| (imm5 << 16)
| (machreg_to_gpr(rn) << 5)
@ -2625,13 +2625,18 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate { rd, rn, size } => {
&Inst::VecLoadReplicate {
rd,
rn,
size,
flags,
} => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let (q, size) = size.enc_size();
let srcloc = state.cur_srcloc();
if srcloc != SourceLoc::default() {
if srcloc != SourceLoc::default() && !flags.notrap() {
// Register the offset at which the actual load instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (16)

@ -2351,10 +2351,10 @@ fn test_aarch64_binemit() {
Inst::VecDup {
rd: writable_vreg(25),
rn: xreg(7),
size: VectorSize::Size8x16,
size: VectorSize::Size8x8,
},
"F90C014E",
"dup v25.16b, w7",
"F90C010E",
"dup v25.8b, w7",
));
insns.push((
Inst::VecDup {
@ -2387,10 +2387,10 @@ fn test_aarch64_binemit() {
Inst::VecDup {
rd: writable_vreg(0),
rn: xreg(28),
size: VectorSize::Size32x4,
size: VectorSize::Size32x2,
},
"800F044E",
"dup v0.4s, w28",
"800F040E",
"dup v0.2s, w28",
));
insns.push((
Inst::VecDup {
@ -5199,8 +5199,8 @@ fn test_aarch64_binemit() {
Inst::VecLoadReplicate {
rd: writable_vreg(31),
rn: xreg(0),
size: VectorSize::Size64x2,
flags: MemFlags::trusted(),
},
"1FCC404D",
"ld1r { v31.2d }, [x0]",
@ -5210,8 +5210,8 @@ fn test_aarch64_binemit() {
Inst::VecLoadReplicate {
rd: writable_vreg(0),
rn: xreg(25),
size: VectorSize::Size8x8,
flags: MemFlags::trusted(),
},
"20C3400D",
"ld1r { v0.8b }, [x25]",

cranelift/codegen/src/isa/aarch64/inst/mod.rs (11)

@ -530,17 +530,6 @@ impl Inst {
}
}
}
/// Generate a LoadAddr instruction (load address of an amode into
/// register). Elides when possible (when amode is just a register). Returns
/// destination register: either `rd` or a register directly from the amode.
pub fn gen_load_addr(rd: Writable<Reg>, mem: AMode) -> (Reg, Option<Inst>) {
if let Some(r) = mem.is_reg() {
(r, None)
} else {
(rd.to_reg(), Some(Inst::LoadAddr { rd, mem }))
}
}
}
//=============================================================================

cranelift/codegen/src/isa/aarch64/inst/regs.rs (51)

@ -165,6 +165,8 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
preg(xreg(14)),
preg(xreg(15)),
// x16 and x17 are spilltmp and tmp2 (see above).
// x18 could be used by the platform to carry inter-procedural state;
// conservatively assume so and make it not allocatable.
// x19-28 are callee-saved and so not preferred.
// x21 is the pinned register (if enabled) and not allocatable if so.
// x29 is FP, x30 is LR, x31 is SP/ZR.
@ -178,30 +180,7 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
preg(vreg(5)),
preg(vreg(6)),
preg(vreg(7)),
preg(vreg(8)),
preg(vreg(9)),
preg(vreg(10)),
preg(vreg(11)),
preg(vreg(12)),
preg(vreg(13)),
preg(vreg(14)),
preg(vreg(15)),
],
],
non_preferred_regs_by_class: [
vec![
preg(xreg(19)),
preg(xreg(20)),
// x21 is pinned reg if enabled; we add to this list below if not.
preg(xreg(22)),
preg(xreg(23)),
preg(xreg(24)),
preg(xreg(25)),
preg(xreg(26)),
preg(xreg(27)),
preg(xreg(28)),
],
vec![
// v8-15 are callee-saved and so not preferred.
preg(vreg(16)),
preg(vreg(17)),
preg(vreg(18)),
@ -220,6 +199,30 @@ pub fn create_reg_env(flags: &settings::Flags) -> MachineEnv {
preg(vreg(31)),
],
],
non_preferred_regs_by_class: [
vec![
preg(xreg(19)),
preg(xreg(20)),
// x21 is pinned reg if enabled; we add to this list below if not.
preg(xreg(22)),
preg(xreg(23)),
preg(xreg(24)),
preg(xreg(25)),
preg(xreg(26)),
preg(xreg(27)),
preg(xreg(28)),
],
vec![
preg(vreg(8)),
preg(vreg(9)),
preg(vreg(10)),
preg(vreg(11)),
preg(vreg(12)),
preg(vreg(13)),
preg(vreg(14)),
preg(vreg(15)),
],
],
fixed_stack_slots: vec![],
};

cranelift/codegen/src/isa/aarch64/lower.isle (43)

@ -1423,7 +1423,8 @@
;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (bitselect c x y)))
(rule (lower (has_type ty (bitselect c x y)))
(if (ty_int_bool_ref_scalar_64 ty))
(let ((tmp1 Reg (and_reg ty x c))
(tmp2 Reg (bic ty y c)))
(orr ty tmp1 tmp2)))
@ -1441,12 +1442,14 @@
;; T -> I{64,32,16,8}: We can simply pass through the value: values
;; are always stored with high bits undefined, so we can just leave
;; them be.
(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (ireduce src)))
(rule (lower (has_type ty (ireduce src)))
(if (ty_int_bool_ref_scalar_64 ty))
(value_regs_get src 0))
;; Likewise for breduce.
(rule (lower (has_type (ty_int_bool_ref_scalar_64 ty) (breduce src)))
(rule (lower (has_type ty (breduce src)))
(if (ty_int_bool_ref_scalar_64 ty))
(value_regs_get src 0))
@ -1515,6 +1518,39 @@
(let ((use_allocated_encoding bool (is_not_baldrdash_call_conv)))
(side_effect (udf use_allocated_encoding trap_code))))
;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty (splat x @ (value_type in_ty))))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_dup x (vector_size ty)))
(rule (lower (has_type ty (splat x @ (value_type (ty_scalar_float _)))))
(vec_dup_from_fpu x (vector_size ty)))
(rule (lower (has_type ty (splat (bconst (u64_from_bool n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (breduce (bconst (u64_from_bool n))))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (f64const (u64_from_ieee64 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (iconst (u64_from_imm64 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (ireduce (iconst (u64_from_imm64 n))))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat x @ (load flags _addr offset))))
(if-let mem_op (is_sinkable_inst x))
(let ((_ Unit (sink_inst mem_op))
(addr AMode (amode (lane_type ty) mem_op offset))
(address Reg (load_addr addr)))
(ld1r address (vector_size ty) flags)))
;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (valid_atomic_transaction ty) (atomic_load flags addr)))
@ -1527,7 +1563,6 @@
addr))
(side_effect (store_release ty src addr)))
;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 1 (lower (and (use_lse)

cranelift/codegen/src/isa/aarch64/lower/isle.rs (26)

@ -5,12 +5,13 @@ pub mod generated_code;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo,
CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
insn_inputs, writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget,
CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode,
Opcode, OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize,
NZCV,
};
use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
use crate::isa::aarch64::settings::Flags as IsaFlags;
use crate::machinst::{isle::*, InputSourceInst};
use crate::settings::Flags;
@ -442,4 +443,25 @@ where
_ => panic!(),
}
}
fn amode(&mut self, ty: Type, mem_op: Inst, offset: u32) -> AMode {
lower_address(
self.lower_ctx,
ty,
&insn_inputs(self.lower_ctx, mem_op)[..],
offset as i32,
)
}
fn amode_is_reg(&mut self, address: &AMode) -> Option<Reg> {
address.is_reg()
}
fn splat_const(&mut self, value: u64, size: &VectorSize) -> Reg {
let rd = self.temp_writable_reg(I8X16);
lower_splat_const(self.lower_ctx, rd, value, *size);
rd.to_reg()
}
}

cranelift/codegen/src/isa/aarch64/lower_inst.rs (75)

@ -741,80 +741,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
Opcode::Splat => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
// TODO: Handle SVE Dup.
let ty = if ty.is_dynamic_vector() {
dynamic_to_fixed(ty)
} else {
ty
};
let size = VectorSize::from_ty(ty);
if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
inputs[0],
&[
Opcode::Bconst,
Opcode::F32const,
Opcode::F64const,
Opcode::Iconst,
],
) {
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some(insn) =
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some(insn) =
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
inputs[0],
&[
Opcode::Uload8,
Opcode::Sload8,
Opcode::Uload16,
Opcode::Sload16,
Opcode::Uload32,
Opcode::Sload32,
Opcode::Load,
],
) {
ctx.sink_inst(insn);
let load_inputs = insn_inputs(ctx, insn);
let load_outputs = insn_outputs(ctx, insn);
lower_load(
ctx,
insn,
&load_inputs[..],
load_outputs[0],
|ctx, _rd, _elem_ty, mem| {
let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
if let Some(addr_inst) = addr_inst {
ctx.emit(addr_inst);
}
ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
Ok(())
},
)?;
} else {
let input_ty = ctx.input_ty(insn, 0);
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let inst = if ty_has_int_representation(input_ty) {
Inst::VecDup { rd, rn, size }
} else {
Inst::VecDupFromFpu { rd, rn, size }
};
ctx.emit(inst);
}
}
Opcode::Splat => implemented_in_isle(ctx),
Opcode::ScalarToVector => implemented_in_isle(ctx),

cranelift/codegen/src/isa/s390x/inst.isle (9)

@ -1641,15 +1641,6 @@
(decl sinkable_inst (Inst) Value)
(extern extractor sinkable_inst sinkable_inst)
;; Sink a sinkable instruction.
;;
;; This is a side-effectful operation that notifies the context that the
;; sinkable instruction has been sunk into another instruction, and no longer
;; needs to be lowered.
(decl sink_inst (Inst) Unit)
(extern constructor sink_inst sink_inst)
;; Sinkable big-endian load instruction.
(decl sinkable_load (Inst) Value)
(extractor (sinkable_load inst)

cranelift/codegen/src/isa/s390x/lower.isle (13)

@ -1656,8 +1656,9 @@
;; Insert vector lane from general-purpose register.
(rule (lower (insertlane x @ (value_type ty)
y @ (value_type (ty_int_bool_ref_scalar_64 _))
y @ (value_type in_ty)
(u8_from_uimm8 idx)))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg)))
;; Insert vector lane from floating-point register.
@ -1771,8 +1772,9 @@
;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Extract vector lane to general-purpose register.
(rule (lower (has_type (ty_int_bool_ref_scalar_64 _)
(rule (lower (has_type out_ty
(extractlane x @ (value_type ty) (u8_from_uimm8 idx))))
(if (ty_int_bool_ref_scalar_64 out_ty))
(vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg)))
;; Extract vector lane to floating-point register.
@ -1828,8 +1830,8 @@
;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load replicated value from general-purpose register.
(rule (lower (has_type ty (splat
x @ (value_type (ty_int_bool_ref_scalar_64 _)))))
(rule (lower (has_type ty (splat x @ (value_type in_ty))))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0))
;; Load replicated value from floating-point register.
@ -1888,7 +1890,8 @@
;; Load scalar value from general-purpose register.
(rule (lower (has_type ty (scalar_to_vector
x @ (value_type (ty_int_bool_ref_scalar_64 _)))))
x @ (value_type in_ty))))
(if (ty_int_bool_ref_scalar_64 in_ty))
(vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg)))
;; Load scalar value from floating-point register.

cranelift/codegen/src/isa/s390x/lower/isle.rs (5)

@ -666,11 +666,6 @@ where
None
}
#[inline]
fn sink_inst(&mut self, inst: Inst) -> Unit {
self.lower_ctx.sink_inst(inst);
}
#[inline]
fn emit(&mut self, inst: &MInst) -> Unit {
self.lower_ctx.emit(inst.clone());

cranelift/codegen/src/machinst/isle.rs (28)

@ -11,7 +11,9 @@ pub use crate::ir::{
SigRef, StackSlot,
};
pub use crate::isa::unwind::UnwindInst;
pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable};
pub use crate::machinst::{
ABIArg, ABIArgSlot, ABISig, InputSourceInst, RealReg, Reg, RelocDistance, Writable,
};
pub type Unit = ();
pub type ValueSlice = (ValueList, usize);
@ -425,6 +427,15 @@ macro_rules! isle_prelude_methods {
imm.bits() as u64
}
#[inline]
fn u64_from_bool(&mut self, b: bool) -> u64 {
if b {
u64::MAX
} else {
0
}
}
#[inline]
fn inst_results(&mut self, inst: Inst) -> ValueSlice {
(self.lower_ctx.dfg().inst_results_list(inst), 0)
@ -854,6 +865,21 @@ macro_rules! isle_prelude_methods {
fn real_reg_to_writable_reg(&mut self, reg: RealReg) -> WritableReg {
Writable::from_reg(Reg::from(reg))
}
fn is_sinkable_inst(&mut self, val: Value) -> Option<Inst> {
let input = self.lower_ctx.get_value_as_source_or_const(val);
if let InputSourceInst::UniqueUse(inst, _) = input.inst {
Some(inst)
} else {
None
}
}
#[inline]
fn sink_inst(&mut self, inst: Inst) {
self.lower_ctx.sink_inst(inst);
}
};
}

cranelift/codegen/src/prelude.isle (24)

@ -308,10 +308,10 @@
(decl fits_in_64 (Type) Type)
(extern extractor fits_in_64 fits_in_64)
;; An extractor that only matches scalar booleans, integers, and references that
;; can fit in 64 bits.
(decl ty_int_bool_ref_scalar_64 (Type) Type)
(extern extractor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64)
;; A pure constructor that only matches scalar booleans, integers, and
;; references that can fit in 64 bits.
(decl pure ty_int_bool_ref_scalar_64 (Type) Type)
(extern constructor ty_int_bool_ref_scalar_64 ty_int_bool_ref_scalar_64)
;; An extractor that matches 32- and 64-bit types only.
(decl ty_32_or_64 (Type) Type)
@ -407,6 +407,10 @@
(decl u8_from_uimm8 (u8) Uimm8)
(extern extractor infallible u8_from_uimm8 u8_from_uimm8)
;; Extract a `u64` from a `bool`.
(decl u64_from_bool (u64) bool)
(extern extractor infallible u64_from_bool u64_from_bool)
;; Extract a `u64` from an `Imm64`.
(decl u64_from_imm64 (u64) Imm64)
(extern extractor infallible u64_from_imm64 u64_from_imm64)
@ -498,6 +502,10 @@
(decl pure zero_value (Value) Value)
(extern constructor zero_value zero_value)
;; Match a sinkable instruction from a value operand.
(decl pure is_sinkable_inst (Value) Inst)
(extern constructor is_sinkable_inst is_sinkable_inst)
;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit an instruction.
@ -508,6 +516,14 @@
(decl emit (MInst) Unit)
(extern constructor emit emit)
;; Sink an instruction.
;;
;; This is a side-effectful operation that notifies the context that the
;; instruction has been sunk into another instruction, and no longer needs to
;; be lowered.
(decl sink_inst (Inst) Unit)
(extern constructor sink_inst sink_inst)
;; Constant pool emission.
(type VCodeConstant (primitive VCodeConstant))

cranelift/filetests/filetests/isa/aarch64/bitops.clif (11)

@ -244,18 +244,13 @@ block0(v0: i128):
return v1
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; stp d11, d13, [sp, #-16]!
; block0:
; fmov d6, x0
; mov v6.d[1], x1
; cnt v11.16b, v6.16b
; addv b13, v11.16b
; umov w0, v13.b[0]
; cnt v19.16b, v6.16b
; addv b21, v19.16b
; umov w0, v21.b[0]
; movz w1, #0
; ldp d11, d13, [sp], #16
; ldp fp, lr, [sp], #16
; ret
function %d(i64) -> i64 {

cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif (91)

@ -15,9 +15,9 @@ block0(v0: i16):
}
; block0:
; dup v2.4h, w0
; mov v7.16b, v2.16b
; mov v7.d[1], v2.d[0]
; dup v6.4h, w0
; mov v7.16b, v6.16b
; mov v7.d[1], v6.d[0]
; sqxtn v0.8b, v7.8h
; ret
@ -35,9 +35,9 @@ block0(v0: i16):
}
; block0:
; dup v2.8h, w0
; sqxtn v0.8b, v2.8h
; sqxtn2 v0.16b, v2.8h
; dup v6.8h, w0
; sqxtn v0.8b, v6.8h
; sqxtn2 v0.16b, v6.8h
; ret
function %snarrow_i32x2(i32) -> i16x4 {
@ -54,9 +54,9 @@ block0(v0: i32):
}
; block0:
; dup v2.2s, w0
; mov v7.16b, v2.16b
; mov v7.d[1], v2.d[0]
; dup v6.2s, w0
; mov v7.16b, v6.16b
; mov v7.d[1], v6.d[0]
; sqxtn v0.4h, v7.4s
; ret
@ -74,9 +74,9 @@ block0(v0: i32):
}
; block0:
; dup v2.4s, w0
; sqxtn v0.4h, v2.4s
; sqxtn2 v0.8h, v2.4s
; dup v6.4s, w0
; sqxtn v0.4h, v6.4s
; sqxtn2 v0.8h, v6.4s
; ret
function %snarrow_i64x2(i64) -> i32x4 {
@ -93,9 +93,9 @@ block0(v0: i64):
}
; block0:
; dup v2.2d, x0
; sqxtn v0.2s, v2.2d
; sqxtn2 v0.4s, v2.2d
; dup v6.2d, x0
; sqxtn v0.2s, v6.2d
; sqxtn2 v0.4s, v6.2d
; ret
function %unarrow_i16x4(i16) -> i8x8 {
@ -112,9 +112,9 @@ block0(v0: i16):
}
; block0:
; dup v2.4h, w0
; mov v7.16b, v2.16b
; mov v7.d[1], v2.d[0]
; dup v6.4h, w0
; mov v7.16b, v6.16b
; mov v7.d[1], v6.d[0]
; sqxtun v0.8b, v7.8h
; ret
@ -132,9 +132,9 @@ block0(v0: i16):
}
; block0:
; dup v2.8h, w0
; sqxtun v0.8b, v2.8h
; sqxtun2 v0.16b, v2.8h
; dup v6.8h, w0
; sqxtun v0.8b, v6.8h
; sqxtun2 v0.16b, v6.8h
; ret
function %unarrow_i32x2(i32) -> i16x4 {
@ -151,9 +151,9 @@ block0(v0: i32):
}
; block0:
; dup v2.2s, w0
; mov v7.16b, v2.16b
; mov v7.d[1], v2.d[0]
; dup v6.2s, w0
; mov v7.16b, v6.16b
; mov v7.d[1], v6.d[0]
; sqxtun v0.4h, v7.4s
; ret
@ -171,9 +171,9 @@ block0(v0: i32):
}
; block0:
; dup v2.4s, w0
; sqxtun v0.4h, v2.4s
; sqxtun2 v0.8h, v2.4s
; dup v6.4s, w0
; sqxtun v0.4h, v6.4s
; sqxtun2 v0.8h, v6.4s
; ret
function %unarrow_i64x2(i64) -> i32x4 {
@ -190,9 +190,9 @@ block0(v0: i64):
}
; block0:
; dup v2.2d, x0
; sqxtun v0.2s, v2.2d
; sqxtun2 v0.4s, v2.2d
; dup v6.2d, x0
; sqxtun v0.2s, v6.2d
; sqxtun2 v0.4s, v6.2d
; ret
function %uunarrow_i16x4(i16) -> i8x8 {
@ -209,9 +209,9 @@ block0(v0: i16):
}
; block0:
; dup v2.4h, w0
; mov v7.16b, v2.16b
; mov v7.d[1], v2.d[0]
; dup v6.4h, w0
; mov v7.16b, v6.16b
; mov v7.d[1], v6.d[0]
; uqxtn v0.8b, v7.8h
; ret
@ -229,9 +229,9 @@ block0(v0: i16):
}
; block0:
; dup v2.8h, w0
; uqxtn v0.8b, v2.8h
; uqxtn2 v0.16b, v2.8h
; dup v6.8h, w0
; uqxtn v0.8b, v6.8h
; uqxtn2 v0.16b, v6.8h
; ret
function %uunarrow_i32x2(i32) -> i16x4 {
@ -248,9 +248,9 @@ block0(v0: i32):
}
; block0:
; dup v2.2s, w0
; mov v7.16b, v2.16b
; mov v7.d[1], v2.d[0]
; dup v6.2s, w0
; mov v7.16b, v6.16b
; mov v7.d[1], v6.d[0]
; uqxtn v0.4h, v7.4s
; ret
@ -268,9 +268,9 @@ block0(v0: i32):
}
; block0:
; dup v2.4s, w0
; uqxtn v0.4h, v2.4s
; uqxtn2 v0.8h, v2.4s
; dup v6.4s, w0
; uqxtn v0.4h, v6.4s
; uqxtn2 v0.8h, v6.4s
; ret
function %uunarrow_i64x2(i64) -> i32x4 {
@ -287,8 +287,7 @@ block0(v0: i64):
}
; block0:
; dup v2.2d, x0
; uqxtn v0.2s, v2.2d
; uqxtn2 v0.4s, v2.2d
; dup v6.2d, x0
; uqxtn v0.2s, v6.2d
; uqxtn2 v0.4s, v6.2d
; ret

cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif (114)

@ -1,4 +1,4 @@
test compile
test compile precise-output
target aarch64
function %i8x16_splat_add(i8, i8) -> i8x16 {
@ -13,10 +13,11 @@ block0(v0: i8, v1: i8):
return v5
}
; check: dup v4.16b, w0
; nextln: dup v6.16b, w1
; nextln: add v0.16b, v4.16b, v6.16b
; nextln: ret
; block0:
; dup v16.16b, w0
; dup v17.16b, w1
; add v0.16b, v16.16b, v17.16b
; ret
function %i16x8_splat_add(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
@ -30,10 +31,11 @@ block0(v0: i16, v1: i16):
return v5
}
; check: dup v4.8h, w0
; nextln: dup v6.8h, w1
; nextln: add v0.8h, v4.8h, v6.8h
; nextln: ret
; block0:
; dup v16.8h, w0
; dup v17.8h, w1
; add v0.8h, v16.8h, v17.8h
; ret
function %i32x4_splat_mul(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
@ -47,10 +49,11 @@ block0(v0: i32, v1: i32):
return v5
}
; check: dup v4.4s, w0
; nextln: dup v6.4s, w1
; nextln: mul v0.4s, v4.4s, v6.4s
; nextln: ret
; block0:
; dup v16.4s, w0
; dup v17.4s, w1
; mul v0.4s, v16.4s, v17.4s
; ret
function %i64x2_splat_sub(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
@ -64,10 +67,11 @@ block0(v0: i64, v1: i64):
return v5
}
; check: dup v4.2d, x0
; nextln: dup v6.2d, x1
; nextln: sub v0.2d, v4.2d, v6.2d
; nextln: ret
; block0:
; dup v16.2d, x0
; dup v17.2d, x1
; sub v0.2d, v16.2d, v17.2d
; ret
function %f32x4_splat_add(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
@ -81,10 +85,11 @@ block0(v0: f32, v1: f32):
return v5
}
; check: dup v4.4s, v0.s[0]
; nextln: dup v6.4s, v1.s[0]
; nextln: fadd v0.4s, v4.4s, v6.4s
; nextln: ret
; block0:
; dup v16.4s, v0.s[0]
; dup v17.4s, v1.s[0]
; fadd v0.4s, v16.4s, v17.4s
; ret
function %f64x2_splat_sub(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -98,10 +103,11 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fsub v0.2d, v4.2d, v6.2d
; nextln: ret
; block0:
; dup v16.2d, v0.d[0]
; dup v17.2d, v1.d[0]
; fsub v0.2d, v16.2d, v17.2d
; ret
function %f64x2_splat_mul(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -115,10 +121,11 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fmul v0.2d, v4.2d, v6.2d
; nextln: ret
; block0:
; dup v16.2d, v0.d[0]
; dup v17.2d, v1.d[0]
; fmul v0.2d, v16.2d, v17.2d
; ret
function %f64x2_splat_div(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -132,10 +139,11 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fdiv v0.2d, v4.2d, v6.2d
; nextln: ret
; block0:
; dup v16.2d, v0.d[0]
; dup v17.2d, v1.d[0]
; fdiv v0.2d, v16.2d, v17.2d
; ret
function %f64x2_splat_min(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -149,10 +157,11 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fmin v0.2d, v4.2d, v6.2d
; nextln: ret
; block0:
; dup v16.2d, v0.d[0]
; dup v17.2d, v1.d[0]
; fmin v0.2d, v16.2d, v17.2d
; ret
function %f64x2_splat_max(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -166,10 +175,11 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fmax v0.2d, v4.2d, v6.2d
; nextln: ret
; block0:
; dup v16.2d, v0.d[0]
; dup v17.2d, v1.d[0]
; fmax v0.2d, v16.2d, v17.2d
; ret
function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -183,11 +193,12 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fcmgt v0.2d, v4.2d, v6.2d
; nextln: bsl v0.16b, v6.16b, v4.16b
; nextln: ret
; block0:
; dup v17.2d, v0.d[0]
; dup v18.2d, v1.d[0]
; fcmgt v0.2d, v17.2d, v18.2d
; bsl v0.16b, v18.16b, v17.16b
; ret
function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
@ -201,8 +212,9 @@ block0(v0: f64, v1: f64):
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fcmgt v0.2d, v6.2d, v4.2d
; nextln: bsl v0.16b, v6.16b, v4.16b
; nextln: ret
; block0:
; dup v17.2d, v0.d[0]
; dup v18.2d, v1.d[0]
; fcmgt v0.2d, v18.2d, v17.2d
; bsl v0.16b, v18.16b, v17.16b
; ret

cranelift/filetests/filetests/isa/aarch64/dynamic-simd-widen.clif (44)

@ -1,4 +1,4 @@
test compile
test compile precise-output
target aarch64
function %swidenhigh_i8x16(i8) -> i16x8 {
@ -14,9 +14,10 @@ block0(v0: i8):
return v3
}
; check: dup v2.16b, w0
; nextln: sxtl2 v0.8h, v2.16b
; nextln: ret
; block0:
; dup v5.16b, w0
; sxtl2 v0.8h, v5.16b
; ret
function %swidenhigh_i16x8(i16) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
@ -31,9 +32,10 @@ block0(v0: i16):
return v3
}
; check: dup v2.8h, w0
; nextln: sxtl2 v0.4s, v2.8h
; nextln: ret
; block0:
; dup v5.8h, w0
; sxtl2 v0.4s, v5.8h
; ret
function %swidenhigh_i32x4(i32) -> i64x2 {
gv0 = dyn_scale_target_const.i32x4
@ -48,9 +50,10 @@ block0(v0: i32):
return v3
}
; check: dup v2.4s, w0
; nextln: sxtl2 v0.2d, v2.4s
; nextln: ret
; block0:
; dup v5.4s, w0
; sxtl2 v0.2d, v5.4s
; ret
function %swidenlow_i8x16(i8) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
@ -65,9 +68,10 @@ block0(v0: i8):
return v3
}
; check: dup v2.16b, w0
; nextln: sxtl v0.8h, v2.8b
; nextln: ret
; block0:
; dup v5.16b, w0
; sxtl v0.8h, v5.8b
; ret
function %swidenlow_i16x8(i16) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
@ -82,9 +86,10 @@ block0(v0: i16):
return v3
}
; check: dup v2.8h, w0
; nextln: sxtl v0.4s, v2.4h
; nextln: ret
; block0:
; dup v5.8h, w0
; sxtl v0.4s, v5.4h
; ret
function %swidenlow_i32x4(i32) -> i64x2 {
gv0 = dyn_scale_target_const.i32x4
@ -99,6 +104,7 @@ block0(v0: i32):
return v3
}
; check: dup v2.4s, w0
; nextln: sxtl v0.2d, v2.2s
; nextln: ret
; block0:
; dup v5.4s, w0
; sxtl v0.2d, v5.2s
; ret

cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif (12)

@ -58,9 +58,9 @@ block0(v0: i32):
; mov fp, sp
; sub sp, sp, #16
; block0:
; dup v2.4s, w0
; mov x4, sp
; str q2, [x4]
; dup v3.4s, w0
; mov x3, sp
; str q3, [x3]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
@ -101,9 +101,9 @@ block0(v0: i32):
; mov fp, sp
; sub sp, sp, #16
; block0:
; dup v2.4s, w0
; mov x4, sp
; str q2, [x4]
; dup v3.4s, w0
; mov x3, sp
; str q3, [x3]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret

cranelift/filetests/filetests/isa/aarch64/prologue.clif (87)

@ -82,6 +82,14 @@ block0(v0: f64):
; stp d10, d11, [sp, #-16]!
; stp d8, d9, [sp, #-16]!
; block0:
; fadd d24, d0, d0
; fadd d25, d0, d0
; fadd d26, d0, d0
; fadd d27, d0, d0
; fadd d28, d0, d0
; fadd d29, d0, d0
; fadd d30, d0, d0
; fadd d31, d0, d0
; fadd d1, d0, d0
; fadd d2, d0, d0
; fadd d3, d0, d0
@ -89,14 +97,6 @@ block0(v0: f64):
; fadd d5, d0, d0
; fadd d6, d0, d0
; fadd d7, d0, d0
; fadd d8, d0, d0
; fadd d9, d0, d0
; fadd d10, d0, d0
; fadd d11, d0, d0
; fadd d12, d0, d0
; fadd d13, d0, d0
; fadd d14, d0, d0
; fadd d15, d0, d0
; fadd d16, d0, d0
; fadd d17, d0, d0
; fadd d18, d0, d0
@ -105,45 +105,45 @@ block0(v0: f64):
; fadd d21, d0, d0
; fadd d22, d0, d0
; fadd d23, d0, d0
; fadd d24, d0, d0
; fadd d25, d0, d0
; fadd d26, d0, d0
; fadd d27, d0, d0
; fadd d28, d0, d0
; fadd d29, d0, d0
; fadd d30, d0, d0
; fadd d31, d0, d0
; fadd d0, d0, d1
; fadd d1, d2, d3
; fadd d2, d4, d5
; fadd d3, d6, d7
; fadd d4, d8, d9
; fadd d5, d10, d11
; fadd d6, d12, d13
; fadd d7, d14, d15
; fadd d8, d16, d17
; fadd d9, d18, d19
; fadd d10, d20, d21
; fadd d11, d22, d23
; fadd d12, d24, d25
; fadd d13, d26, d27
; fadd d14, d28, d29
; fadd d15, d30, d31
; fadd d0, d0, d1
; fadd d1, d2, d3
; fadd d2, d4, d5
; fadd d3, d6, d7
; fadd d8, d0, d0
; fadd d9, d0, d0
; fadd d10, d0, d0
; fadd d11, d0, d0
; fadd d12, d0, d0
; fadd d13, d0, d0
; fadd d14, d0, d0
; fadd d15, d0, d0
; fadd d24, d0, d24
; fadd d25, d25, d26
; fadd d26, d27, d28
; fadd d27, d29, d30
; fadd d28, d31, d1
; fadd d29, d2, d3
; fadd d30, d4, d5
; fadd d31, d6, d7
; fadd d0, d16, d17
; fadd d1, d18, d19
; fadd d2, d20, d21
; fadd d3, d22, d23
; fadd d4, d8, d9
; fadd d5, d10, d11
; fadd d6, d12, d13
; fadd d7, d14, d15
; fadd d0, d0, d1
; fadd d1, d2, d3
; fadd d2, d4, d5
; fadd d3, d6, d7
; fadd d0, d0, d1
; fadd d1, d2, d3
; fadd d0, d0, d1
; fadd d24, d24, d25
; fadd d25, d26, d27
; fadd d26, d28, d29
; fadd d27, d30, d31
; fadd d28, d0, d1
; fadd d29, d2, d3
; fadd d30, d4, d5
; fadd d31, d6, d7
; fadd d24, d24, d25
; fadd d25, d26, d27
; fadd d26, d28, d29
; fadd d27, d30, d31
; fadd d24, d24, d25
; fadd d25, d26, d27
; fadd d0, d24, d25
; ldp d8, d9, [sp], #16
; ldp d10, d11, [sp], #16
; ldp d12, d13, [sp], #16
@ -242,4 +242,3 @@ block0(v0: i64):
; ldr x28, [sp], #16
; ldp fp, lr, [sp], #16
; ret

cranelift/filetests/filetests/runtests/simd-splat.clif (193)

@ -1,4 +1,4 @@
test interpret
; test interpret TODO: Not yet implemented
test run
target aarch64
target s390x
@ -10,6 +10,8 @@ block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; run: %splat_i8x16(-1) == [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
; run: %splat_i8x16(0) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
; run: %splat_i8x16(1) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %splat_i16x8(i16) -> i16x8 {
@ -17,6 +19,8 @@ block0(v0: i16):
v1 = splat.i16x8 v0
return v1
}
; run: %splat_i16x8(-1) == [-1 -1 -1 -1 -1 -1 -1 -1]
; run: %splat_i16x8(0) == [0 0 0 0 0 0 0 0]
; run: %splat_i16x8(512) == [512 512 512 512 512 512 512 512]
function %splat_i32x4(i32) -> i32x4 {
@ -24,6 +28,8 @@ block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; run: %splat_i32x4(-1) == [-1 -1 -1 -1]
; run: %splat_i32x4(0) == [0 0 0 0]
; run: %splat_i32x4(2000000) == [2000000 2000000 2000000 2000000]
function %splat_i64x2(i64) -> i64x2 {
@ -31,4 +37,189 @@ block0(v0: i64):
v1 = splat.i64x2 v0
return v1
}
; run: %splat_i64x2(-1) == [-1 -1]
; run: %splat_i64x2(0) == [0 0]
; run: %splat_i64x2(5000000000) == [5000000000 5000000000]
function %splat_f32x4(f32) -> f32x4 {
block0(v0: f32):
v1 = splat.f32x4 v0
return v1
}
; run: %splat_f32x4(-0x0.0) == [-0x0.0 -0x0.0 -0x0.0 -0x0.0]
; run: %splat_f32x4(0x1.0) == [0x1.0 0x1.0 0x1.0 0x1.0]
; run: %splat_f32x4(NaN) == [NaN NaN NaN NaN]
function %splat_f64x2(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; run: %splat_f64x2(0x0.0) == [0x0.0 0x0.0]
; run: %splat_f64x2(0x2.0) == [0x2.0 0x2.0]
; run: %splat_f64x2(NaN) == [NaN NaN]
; TODO: Test combinations of `bconst` and `splat`, potentially with `breduce` in
; the middle
function %splat_i8x16_2(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i8 116
v2 = splat.i8x16 v1
v3 = iadd v0, v2
return v3
}
; run: %splat_i8x16_2([-128 -101 -75 -59 -22 -12 -7 -1 0 3 17 34 68 92 111 127]) == [-12 15 41 57 94 104 109 115 116 119 -123 -106 -72 -48 -29 -13]
function %splat_i8x16_3(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i16 116
v2 = ireduce.i8 v1
v3 = splat.i8x16 v2
v4 = iadd v0, v3
return v4
}
; run: %splat_i8x16_3([-128 -101 -75 -59 -22 -12 -7 -1 0 3 17 34 68 92 111 127]) == [-12 15 41 57 94 104 109 115 116 119 -123 -106 -72 -48 -29 -13]
function %splat_i16x8_2(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i16 42
v2 = splat.i16x8 v1
v3 = iadd v0, v2
return v3
}
; run: %splat_i16x8_2([-32768 -1500 -1 0 42 200 8576 32767]) == [-32726 -1458 41 42 84 242 8618 -32727]
function %splat_i16x8_3(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i64 42
v2 = ireduce.i16 v1
v3 = splat.i16x8 v2
v4 = iadd v0, v3
return v4
}
; run: %splat_i16x8_3([-32768 -1500 -1 0 42 200 8576 32767]) == [-32726 -1458 41 42 84 242 8618 -32727]
function %splat_i32x4_2(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 1024
v2 = splat.i32x4 v1
v3 = iadd v0, v2
return v3
}
; run: %splat_i32x4_2([-2147483648 -1 0 2147483647]) == [-2147482624 1023 1024 -2147482625]
function %splat_i32x4_3(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i64 1024
v2 = ireduce.i32 v1
v3 = splat.i32x4 v2
v4 = iadd v0, v3
return v4
}
; run: %splat_i32x4_3([-2147483648 -1 0 2147483647]) == [-2147482624 1023 1024 -2147482625]
function %splat_i64x2_2(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i64 -1
v2 = splat.i64x2 v1
v3 = iadd v0, v2
return v3
}
; run: %splat_i64x2_2([-1 0]) == [-2 -1]
function %splat_f32x4_2(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = f32const 0x1.5
v2 = splat.f32x4 v1
v3 = fadd v0, v2
return v3
}
; run: %splat_f32x4_2([0x0.0 NaN 0x1.0 0x2.0]) == [0x1.5 NaN 0x2.5 0x3.5]
function %splat_f64x2_2(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = f64const 0x7.5
v2 = splat.f64x2 v1
v3 = fadd v0, v2
return v3
}
; run: %splat_f64x2_2([0x0.0 0x1.0]) == [0x7.5 0x8.5]
function %load_splat_i8x16(i8) -> i8x16 {
ss0 = explicit_slot 8
block0(v0: i8):
stack_store.i8 v0, ss0
v1 = stack_load.i8 ss0
v2 = splat.i8x16 v1
return v2
}
; run: %load_splat_i8x16(-1) == [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
; run: %load_splat_i8x16(0) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
; run: %load_splat_i8x16(1) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %load_splat_i16x8(i16) -> i16x8 {
ss0 = explicit_slot 8
block0(v0: i16):
stack_store.i16 v0, ss0
v1 = stack_load.i16 ss0
v2 = splat.i16x8 v1
return v2
}
; run: %load_splat_i16x8(-1) == [-1 -1 -1 -1 -1 -1 -1 -1]
; run: %load_splat_i16x8(0) == [0 0 0 0 0 0 0 0]
; run: %load_splat_i16x8(512) == [512 512 512 512 512 512 512 512]
function %load_splat_i32x4(i32) -> i32x4 {
ss0 = explicit_slot 8
block0(v0: i32):
stack_store.i32 v0, ss0
v1 = stack_load.i32 ss0
v2 = splat.i32x4 v1
return v2
}
; run: %load_splat_i32x4(-1) == [-1 -1 -1 -1]
; run: %load_splat_i32x4(0) == [0 0 0 0]
; run: %load_splat_i32x4(2000000) == [2000000 2000000 2000000 2000000]
function %load_splat_i64x2(i64) -> i64x2 {
ss0 = explicit_slot 8
block0(v0: i64):
stack_store.i64 v0, ss0
v1 = stack_load.i64 ss0
v2 = splat.i64x2 v1
return v2
}
; run: %load_splat_i64x2(-1) == [-1 -1]
; run: %load_splat_i64x2(0) == [0 0]
; run: %load_splat_i64x2(5000000000) == [5000000000 5000000000]
function %load_splat_f32x4(f32) -> f32x4 {
ss0 = explicit_slot 8
block0(v0: f32):
stack_store.f32 v0, ss0
v1 = stack_load.f32 ss0
v2 = splat.f32x4 v1
return v2
}
; run: %load_splat_f32x4(-0x0.0) == [-0x0.0 -0x0.0 -0x0.0 -0x0.0]
; run: %load_splat_f32x4(0x1.0) == [0x1.0 0x1.0 0x1.0 0x1.0]
; run: %load_splat_f32x4(NaN) == [NaN NaN NaN NaN]
function %load_splat_f64x2(f64) -> f64x2 {
ss0 = explicit_slot 8
block0(v0: f64):
stack_store.f64 v0, ss0
v1 = stack_load.f64 ss0
v2 = splat.f64x2 v1
return v2
}
; run: %load_splat_f64x2(0x0.0) == [0x0.0 0x0.0]
; run: %load_splat_f64x2(0x2.0) == [0x2.0 0x2.0]
; run: %load_splat_f64x2(NaN) == [NaN NaN]
