Browse Source

`cranelift-frontend`: Support moving GCs with user stack maps (#8978)

* cranelift-frontend: Support moving GCs with user stack maps

This refactors the way that user stack maps work such that they are compatible
with moving GCs. We now keep every GC value in a stack slot, spill to the stack
slot on the value's definition, and reload from the stack slot on every
use. This means that we can generate a bunch of redundant loads, but we let the
mid-end's alias analysis clean that up. This has the added benefit of reducing
the length of live ranges for GC values and also means we don't need to
proactively spill every single live GC value at each safepoint, because we know
they are already spilled. This latter bit actually helps us avoid an
accidentally quadratic issue with many, long, overlapping live ranges where we
would do `O(n^2)` spills for a series of safepoints that keep creating more GC
refs.

We also implement two further optimizations:

1. We lazily allocate slots for live GC values, which means that if a GC value
is never live across a safepoint, then we never allocate a stack slot for
it and never spill it to the stack slot. If we didn't do this, then frames would
be larger and we'd have a dead store to the stack slot that would otherwise
require the mid-end to grow a dead-store-elimination pass.

2. We keep a free list of available slots that will never be used again, and we
reuse slots from here when possible. This means that a chain of non-overlapping
live ranges, each of which still needs to appear in some safepoint's stack map,
will all reuse the same stack slot, keeping frames from being bloated.

Finally, this commit also introduces some logs for the liveness analysis to ease
future debugging.

Co-Authored-By: Trevor Elliott <telliott@fastly.com>

* review feedback

---------

Co-authored-by: Trevor Elliott <telliott@fastly.com>
pull/8999/head
Nick Fitzgerald 4 months ago
committed by GitHub
parent
commit
2028224fd1
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 1
      Cargo.lock
  2. 1
      cranelift/frontend/Cargo.toml
  3. 734
      cranelift/frontend/src/frontend.rs

1
Cargo.lock

@ -674,6 +674,7 @@ name = "cranelift-frontend"
version = "0.111.0"
dependencies = [
"cranelift-codegen",
"env_logger",
"hashbrown 0.14.3",
"log",
"similar",

1
cranelift/frontend/Cargo.toml

@ -21,6 +21,7 @@ hashbrown = { workspace = true, optional = true }
smallvec = { workspace = true }
[dev-dependencies]
env_logger = { workspace = true }
similar = { workspace = true }
cranelift-codegen = { workspace = true, features = ['x86'] }

734
cranelift/frontend/src/frontend.rs

@ -2,8 +2,9 @@
use crate::ssa::{SSABuilder, SideEffects};
use crate::variable::Variable;
use alloc::collections::BTreeSet;
use alloc::vec::Vec;
use core::fmt::{self, Debug};
use cranelift_codegen::cursor::{Cursor, FuncCursor};
use cranelift_codegen::cursor::{Cursor, CursorPosition, FuncCursor};
use cranelift_codegen::entity::{EntityRef, EntitySet, SecondaryMap};
use cranelift_codegen::ir;
use cranelift_codegen::ir::condcodes::IntCC;
@ -272,8 +273,6 @@ impl fmt::Display for DefVariableError {
}
}
const LOG2_SIZE_CAPACITY: usize = (16u8.ilog2() as usize) + 1;
/// This module allows you to create a function in Cranelift IR in a straightforward way, hiding
/// all the complexity of its internal representation.
///
@ -504,6 +503,11 @@ impl<'a> FunctionBuilder<'a> {
return Err(DefVariableError::TypeMismatch(var, val));
}
// If `var` needs inclusion in stack maps, then `val` does too.
if self.func_ctx.stack_map_vars.contains(var) {
self.declare_value_needs_stack_map(val);
}
self.func_ctx.ssa.def_var(var, val, self.position.unwrap());
Ok(())
}
@ -704,45 +708,46 @@ impl<'a> FunctionBuilder<'a> {
/// Third, and finally, we spill the live needs-stack-map values at each
/// safepoint into those stack slots.
fn insert_safepoint_spills(&mut self) {
// Find all the GC values that are live across each safepoint.
let (safepoints, max_vals_in_stack_map_by_log2_size) =
self.find_live_stack_map_values_at_each_safepoint();
log::trace!(
"before inserting safepoint spills and reloads:\n{}",
self.func.display()
);
// Create the stack slots to spill them into.
let stack_slots = self.create_safepoint_slots(max_vals_in_stack_map_by_log2_size);
// Find all the GC values that are live across each safepoint and add
// stack map entries for them.
let stack_slots = self.find_live_stack_map_values_at_each_safepoint();
// Insert spills to our new stack slots before each safepoint
// instruction.
self.insert_safepoint_spills_to_stack_slots(safepoints, stack_slots);
self.insert_safepoint_spills_and_reloads(&stack_slots);
log::trace!(
"after inserting safepoint spills and reloads:\n{}",
self.func.display()
);
}
/// Find the live GC references for each safepoint instruction in this
/// function.
///
/// Returns a pair of
///
/// 1. A map from each safepoint instruction to the set of GC references
/// that are live across it
///
/// 2. The maximum number of values we need to store in a stack map at the
/// same time, bucketed by their type's size. This array is indexed by
/// the log2 of the type's size.
/// Returns a map from each safepoint instruction to the set of GC
/// references that are live across it
fn find_live_stack_map_values_at_each_safepoint(
&mut self,
) -> (
crate::HashMap<ir::Inst, SmallVec<[ir::Value; 4]>>,
[usize; LOG2_SIZE_CAPACITY],
) {
// A map from each safepoint to the set of GC references that are live
// across it.
let mut safepoints: crate::HashMap<ir::Inst, SmallVec<[ir::Value; 4]>> =
crate::HashMap::new();
// The maximum number of values we need to store in a stack map at the
// same time, bucketed by their type's size. This array is indexed by
// the log2 of the type's size. We do not support recording values whose
// size is greater than 16 in stack maps.
let mut max_vals_in_stack_map_by_log2_size = [0; LOG2_SIZE_CAPACITY];
) -> crate::HashMap<ir::Value, ir::StackSlot> {
// A mapping from each needs-stack-map value that is live across some
// safepoint to the stack slot that it resides within. Note that if a
// needs-stack-map value is never live across a safepoint, then we won't
// ever add it to this map, it can remain in a virtual register for the
// duration of its lifetime, and we won't replace all its uses with
// reloads and all that stuff.
let mut stack_slots: crate::HashMap<ir::Value, ir::StackSlot> = Default::default();
// A map from size/align to free stack slots that are not being used
// anymore. This allows us to reuse stack slots across multiple values and
// helps cut down on the ultimate size of our stack frames.
let mut free_stack_slots: crate::HashMap<u32, SmallVec<[ir::StackSlot; 4]>> =
Default::default();
// The set of needs-stack-maps values that are currently live in our
// traversal.
@ -760,9 +765,8 @@ impl<'a> FunctionBuilder<'a> {
// 1. The definition of a value removes it from our `live` set. Values
// are not live before they are defined.
//
// 2. When we see any instruction that requires a safepoint (aka
// non-tail calls) we record the current live set of needs-stack-map
// values.
// 2. When we see any instruction that is a safepoint (aka non-tail
// calls) we record the current live set of needs-stack-map values.
//
// We ignore tail calls because this caller and its frame won't exist
// by the time the callee is executing and potentially triggers a GC;
@ -791,28 +795,117 @@ impl<'a> FunctionBuilder<'a> {
// simplest thing. Besides, none of our mid-end optimization passes
// have run at this point in time yet, so there probably isn't much,
// if any, dead code.
for block in self.func_ctx.dfs.post_order_iter(&self.func) {
// Helper for (1)
let process_def = |func: &Function,
stack_slots: &crate::HashMap<_, _>,
free_stack_slots: &mut crate::HashMap<u32, SmallVec<_>>,
live: &mut BTreeSet<ir::Value>,
val: ir::Value| {
log::trace!("stack map liveness: defining {val:?}, removing it from the live set");
live.remove(&val);
// This value's stack slot, if any, is now available for reuse.
if let Some(slot) = stack_slots.get(&val) {
log::trace!("stack map liveness: returning {slot:?} to the free list");
let ty = func.dfg.value_type(val);
free_stack_slots.entry(ty.bytes()).or_default().push(*slot);
}
};
// Helper for (2)
let process_safepoint = |func: &mut Function,
stack_slots: &mut crate::HashMap<Value, StackSlot>,
free_stack_slots: &mut crate::HashMap<u32, SmallVec<_>>,
live: &BTreeSet<_>,
inst: Inst| {
log::trace!(
"stack map liveness: found safepoint: {inst:?}: {}",
func.dfg.display_inst(inst)
);
log::trace!("stack map liveness: live set = {live:?}");
for val in live {
let ty = func.dfg.value_type(*val);
let slot = *stack_slots.entry(*val).or_insert_with(|| {
log::trace!("stack map liveness: {val:?} needs a stack slot");
let size = func.dfg.value_type(*val).bytes();
free_stack_slots
.get_mut(&size)
.and_then(|list| list.pop().inspect(|slot| {
log::trace!(
"stack map liveness: reusing free stack slot {slot:?} for {val:?}"
)
}))
.unwrap_or_else(|| {
debug_assert!(size.is_power_of_two());
let log2_size = size.ilog2();
let slot = func.create_sized_stack_slot(ir::StackSlotData::new(
ir::StackSlotKind::ExplicitSlot,
size,
log2_size.try_into().unwrap(),
));
log::trace!(
"stack map liveness: created new stack slot {slot:?} for {val:?}"
);
slot
})
});
func.dfg.append_user_stack_map_entry(
inst,
ir::UserStackMapEntry {
ty,
slot,
offset: 0,
},
);
}
};
// Helper for (3)
let process_use = |func: &Function, live: &mut BTreeSet<_>, inst: Inst, val: Value| {
if live.insert(val) {
log::trace!(
"stack map liveness: found use of {val:?}, marking it live: {inst:?}: {}",
func.dfg.display_inst(inst)
);
}
};
for block in self
.func_ctx
.dfs
.post_order_iter(&self.func)
// We have to `collect` here to release the borrow on `self.func` so
// we can add the stack map entries below.
.collect::<Vec<_>>()
{
log::trace!("stack map liveness: traversing {block:?}");
let mut option_inst = self.func.layout.last_inst(block);
while let Some(inst) = option_inst {
// (1) Remove values defined by this instruction from the `live`
// set.
for val in self.func.dfg.inst_results(inst) {
live.remove(val);
process_def(
&self.func,
&stack_slots,
&mut free_stack_slots,
&mut live,
*val,
);
}
// (2) If this instruction is a call, then we need to add a
// safepoint to record any values in `live`.
// (2) If this instruction is a safepoint, then we need to add
// stack map entries to record the values in `live`.
let opcode = self.func.dfg.insts[inst].opcode();
if opcode.is_call() && !opcode.is_return() {
let mut live: SmallVec<[_; 4]> = live.iter().copied().collect();
for chunk in live_vals_by_size(&self.func.dfg, &mut live) {
let index = log2_size(&self.func.dfg, chunk[0]);
max_vals_in_stack_map_by_log2_size[index] =
core::cmp::max(max_vals_in_stack_map_by_log2_size[index], chunk.len());
}
let old_val = safepoints.insert(inst, live);
debug_assert!(old_val.is_none());
process_safepoint(
&mut self.func,
&mut stack_slots,
&mut free_stack_slots,
&live,
inst,
);
}
// (3) Add all needs-stack-map values that are operands to this
@ -821,7 +914,7 @@ impl<'a> FunctionBuilder<'a> {
for val in self.func.dfg.inst_values(inst) {
let val = self.func.dfg.resolve_aliases(val);
if self.func_ctx.stack_map_values.contains(val) {
live.insert(val);
process_use(&self.func, &mut live, inst, val);
}
}
@ -831,74 +924,90 @@ impl<'a> FunctionBuilder<'a> {
// After we've processed this block's instructions, remove its
// parameters from the live set. This is part of step (1).
for val in self.func.dfg.block_params(block) {
live.remove(val);
process_def(
&self.func,
&stack_slots,
&mut free_stack_slots,
&mut live,
*val,
);
}
}
(safepoints, max_vals_in_stack_map_by_log2_size)
stack_slots
}
/// Create a stack slot for each size of needs-stack-map value.
/// This function does a forwards pass over the IR and does two things:
///
/// These slots are arrays capable of holding the maximum number of
/// same-sized values that must appear in the same stack map at the same
/// time.
/// 1. Insert spills to a needs-stack-map value's associated stack slot just
/// after its definition.
///
/// The resulting array of stack slots is indexed by the log2 of the type
/// size.
fn create_safepoint_slots(
/// 2. Replace all uses of the needs-stack-map value with loads from that
/// stack slot. This will introduce many redundant loads, but the alias
/// analysis pass in the mid-end should clean most of these up when not
/// actually needed.
fn insert_safepoint_spills_and_reloads(
&mut self,
max_vals_in_stack_map_by_log2_size: [usize; LOG2_SIZE_CAPACITY],
) -> [PackedOption<ir::StackSlot>; LOG2_SIZE_CAPACITY] {
let mut stack_slots = [PackedOption::<ir::StackSlot>::default(); LOG2_SIZE_CAPACITY];
for (log2_size, capacity) in max_vals_in_stack_map_by_log2_size.into_iter().enumerate() {
if capacity == 0 {
continue;
}
let size = 1usize << log2_size;
let slot = self.func.create_sized_stack_slot(ir::StackSlotData::new(
ir::StackSlotKind::ExplicitSlot,
u32::try_from(size * capacity).unwrap(),
u8::try_from(log2_size).unwrap(),
));
stack_slots[log2_size] = Some(slot).into();
}
stack_slots
}
/// Insert spills to the given stack slots before each safepoint
/// instruction.
fn insert_safepoint_spills_to_stack_slots(
&mut self,
safepoints: crate::HashMap<ir::Inst, SmallVec<[ir::Value; 4]>>,
stack_slots: [PackedOption<ir::StackSlot>; LOG2_SIZE_CAPACITY],
stack_slots: &crate::HashMap<ir::Value, ir::StackSlot>,
) {
let mut cursor = FuncCursor::new(self.func);
for (inst, live_vals) in safepoints {
cursor = cursor.at_inst(inst);
let mut pos = FuncCursor::new(self.func);
let mut vals: SmallVec<[_; 8]> = Default::default();
// The offset within each stack slot for the next spill to that
while let Some(block) = pos.next_block() {
// Spill needs-stack-map values defined by block parameters to their
// associated stack slot.
let mut stack_slot_offsets = [0; LOG2_SIZE_CAPACITY];
for val in live_vals {
let ty = cursor.func.dfg.value_type(val);
let size_of_val = ty.bytes();
vals.extend_from_slice(pos.func.dfg.block_params(block));
pos.next_inst();
let mut spilled_any = false;
for val in vals.drain(..) {
if let Some(slot) = stack_slots.get(&val) {
pos.ins().stack_store(val, *slot, 0);
spilled_any = true;
}
}
let index = log2_size(&cursor.func.dfg, val);
let slot = stack_slots[index].unwrap();
// The cursor needs to point just *before* the first original
// instruction in the block that we didn't introduce just above, so
// that when we loop over `next_inst()` below we are processing the
// block's original instructions. If we inserted spills, then it
// already does point there. If we did not insert spills, then it is
// currently pointing at the first original instruction, but that
// means that the upcoming `pos.next_inst()` call will skip over the
// first original instruction, so in this case we need to back up
// the cursor.
if !spilled_any {
pos = pos.at_position(CursorPosition::Before(block));
}
let offset = stack_slot_offsets[index];
stack_slot_offsets[index] += size_of_val;
while let Some(mut inst) = pos.next_inst() {
// Replace all uses of needs-stack-map values with loads from
// the value's associated stack slot.
vals.extend(pos.func.dfg.inst_values(inst));
let mut replaced_any = false;
for val in &mut vals {
if let Some(slot) = stack_slots.get(val) {
replaced_any = true;
let ty = pos.func.dfg.value_type(*val);
*val = pos.ins().stack_load(ty, *slot, 0);
}
}
if replaced_any {
pos.func.dfg.overwrite_inst_values(inst, vals.drain(..));
} else {
vals.clear();
}
cursor
.ins()
.stack_store(val, slot, i32::try_from(offset).unwrap());
// If this instruction defines a needs-stack-map value, then
// spill it to its stack slot.
pos = pos.after_inst(inst);
vals.extend_from_slice(pos.func.dfg.inst_results(inst));
for val in vals.drain(..) {
if let Some(slot) = stack_slots.get(&val) {
inst = pos.ins().stack_store(val, *slot, 0);
}
}
cursor
.func
.dfg
.append_user_stack_map_entry(inst, ir::UserStackMapEntry { ty, slot, offset });
pos = pos.at_inst(inst);
}
}
}
@ -952,25 +1061,6 @@ impl<'a> FunctionBuilder<'a> {
}
}
/// Sort `live` by size and return an iterable of subslices grouped by size.
fn live_vals_by_size<'a, 'b>(
dfg: &'a ir::DataFlowGraph,
live: &'b mut [ir::Value],
) -> impl Iterator<Item = &'b [ir::Value]>
where
'a: 'b,
{
live.sort_by_key(|val| dfg.value_type(*val).bytes());
live.chunk_by(|a, b| dfg.value_type(*a).bytes() == dfg.value_type(*b).bytes())
}
/// Get `log2(sizeof(val))` as a `usize`.
fn log2_size(dfg: &ir::DataFlowGraph, val: ir::Value) -> usize {
let size = dfg.value_type(val).bytes();
debug_assert!(size.is_power_of_two());
usize::try_from(size.ilog2()).unwrap()
}
/// All the functions documented in the previous block are write-only and help you build a valid
/// Cranelift IR functions via multiple debug asserts. However, you might need to improve the
/// performance of your translation perform more complex transformations to your Cranelift IR
@ -2221,18 +2311,20 @@ block0:
func.display().to_string().trim(),
r#"
function %sample(i32, i32) system_v {
ss0 = explicit_slot 8, align = 4
ss0 = explicit_slot 4, align = 4
ss1 = explicit_slot 4, align = 4
sig0 = (i32) system_v
fn0 = colocated u0:0 sig0
block0(v0: i32, v1: i32):
stack_store v0, ss0
stack_store v1, ss0+4
call fn0(v0), stack_map=[i32 @ ss0+0, i32 @ ss0+4]
jump block0(v0, v1)
}
"#
.trim()
stack_store v1, ss1
call fn0(v0), stack_map=[i32 @ ss0+0, i32 @ ss1+0]
v2 = stack_load.i32 ss0
v3 = stack_load.i32 ss1
jump block0(v2, v3)
} "#
.trim()
);
}
@ -2261,17 +2353,18 @@ block0(v0: i32, v1: i32):
// At each `call` we are losing one more value as no longer live, so
// each stack map should be one smaller than the last. `v3` is never
// live, so should never appear in a stack map. Note that a value that
// is an argument to the call, but is not live after the call, should
// not appear in the stack map. This is why `v0` appears in the first
// call's stack map, but not the second call's stack map.
// live across a safepoint, so should never appear in a stack map. Note
// that a value that is an argument to the call, but is not live after
// the call, should not appear in the stack map. This is why `v0`
// appears in the first call's stack map, but not the second call's
// stack map.
//
// block0:
// v0 = needs stack map
// v1 = needs stack map
// v2 = needs stack map
// v3 = needs stack map
// call $foo(v0)
// call $foo(v3)
// call $foo(v0)
// call $foo(v1)
// call $foo(v2)
@ -2287,7 +2380,7 @@ block0(v0: i32, v1: i32):
builder.declare_value_needs_stack_map(v2);
let v3 = builder.ins().iconst(ir::types::I32, 3);
builder.declare_value_needs_stack_map(v3);
builder.ins().call(func_ref, &[v0]);
builder.ins().call(func_ref, &[v3]);
builder.ins().call(func_ref, &[v0]);
builder.ins().call(func_ref, &[v1]);
builder.ins().call(func_ref, &[v2]);
@ -2300,25 +2393,27 @@ block0(v0: i32, v1: i32):
func.display().to_string().trim(),
r#"
function %sample() system_v {
ss0 = explicit_slot 12, align = 4
ss0 = explicit_slot 4, align = 4
ss1 = explicit_slot 4, align = 4
ss2 = explicit_slot 4, align = 4
sig0 = (i32) system_v
fn0 = colocated u0:0 sig0
block0:
v0 = iconst.i32 0
stack_store v0, ss2 ; v0 = 0
v1 = iconst.i32 1
stack_store v1, ss1 ; v1 = 1
v2 = iconst.i32 2
v3 = iconst.i32 3
stack_store v0, ss0 ; v0 = 0
stack_store v1, ss0+4 ; v1 = 1
stack_store v2, ss0+8 ; v2 = 2
call fn0(v0), stack_map=[i32 @ ss0+0, i32 @ ss0+4, i32 @ ss0+8] ; v0 = 0
stack_store v1, ss0 ; v1 = 1
stack_store v2, ss0+4 ; v2 = 2
call fn0(v0), stack_map=[i32 @ ss0+0, i32 @ ss0+4] ; v0 = 0
stack_store v2, ss0 ; v2 = 2
call fn0(v1), stack_map=[i32 @ ss0+0] ; v1 = 1
call fn0(v2) ; v2 = 2
v3 = iconst.i32 3
call fn0(v3), stack_map=[i32 @ ss2+0, i32 @ ss1+0, i32 @ ss0+0] ; v3 = 3
v4 = stack_load.i32 ss2
call fn0(v4), stack_map=[i32 @ ss1+0, i32 @ ss0+0]
v5 = stack_load.i32 ss1
call fn0(v5), stack_map=[i32 @ ss0+0]
v6 = stack_load.i32 ss0
call fn0(v6)
return
}
"#
@ -2580,14 +2675,15 @@ function u0:0(i32) system_v {
block0(v0: i32):
v1 = iconst.i64 0x1234_5678
stack_store v1, ss0 ; v1 = 0x1234_5678
brif v0, block1, block2
block1:
v2 = iadd_imm.i64 v1, 0 ; v1 = 0x1234_5678
v3 = stack_load.i64 ss0
v2 = iadd_imm v3, 0
return
block2:
stack_store.i64 v1, ss0 ; v1 = 0x1234_5678
call fn0(), stack_map=[i64 @ ss0+0]
return
}
@ -2826,6 +2922,8 @@ block2:
builder.ins().jump(block3, &[v3, v3]);
builder.switch_to_block(block3);
builder.append_block_param(block3, ir::types::I64);
builder.append_block_param(block3, ir::types::I64);
builder.ins().call(func_ref, &[]);
// NB: Our simplistic liveness analysis conservatively treats any use of
// a value as keeping it live, regardless if the use has side effects or
@ -2842,7 +2940,8 @@ block2:
func.display().to_string().trim(),
r#"
function %sample(i32) system_v {
ss0 = explicit_slot 16, align = 8
ss0 = explicit_slot 8, align = 8
ss1 = explicit_slot 8, align = 8
sig0 = () system_v
fn0 = colocated u0:0 sig0
@ -2851,23 +2950,27 @@ block0(v0: i32):
block1:
v1 = iconst.i64 1
v2 = iconst.i64 2
stack_store v1, ss0 ; v1 = 1
stack_store v2, ss0+8 ; v2 = 2
call fn0(), stack_map=[i64 @ ss0+0, i64 @ ss0+8]
jump block3(v1, v2) ; v1 = 1, v2 = 2
v2 = iconst.i64 2
stack_store v2, ss1 ; v2 = 2
call fn0(), stack_map=[i64 @ ss0+0, i64 @ ss1+0]
v8 = stack_load.i64 ss0
v9 = stack_load.i64 ss1
jump block3(v8, v9)
block2:
v3 = iconst.i64 3
v4 = iconst.i64 4
stack_store v3, ss0 ; v3 = 3
v4 = iconst.i64 4
call fn0(), stack_map=[i64 @ ss0+0]
jump block3(v3, v3) ; v3 = 3, v3 = 3
v10 = stack_load.i64 ss0
v11 = stack_load.i64 ss0
jump block3(v10, v11)
block3:
stack_store.i64 v1, ss0 ; v1 = 1
block3(v5: i64, v6: i64):
call fn0(), stack_map=[i64 @ ss0+0]
v5 = iadd_imm.i64 v1, 0 ; v1 = 1
v12 = stack_load.i64 ss0
v7 = iadd_imm v12, 0
return
}
"#
@ -2939,9 +3042,13 @@ block3:
function %sample(i8, i16, i32, i64, i128, f32, f64, i8x16, i16x8) -> i8, i16, i32, i64, i128, f32, f64, i8x16, i16x8 system_v {
ss0 = explicit_slot 1
ss1 = explicit_slot 2, align = 2
ss2 = explicit_slot 8, align = 4
ss3 = explicit_slot 16, align = 8
ss4 = explicit_slot 48, align = 16
ss2 = explicit_slot 4, align = 4
ss3 = explicit_slot 8, align = 8
ss4 = explicit_slot 16, align = 16
ss5 = explicit_slot 4, align = 4
ss6 = explicit_slot 8, align = 8
ss7 = explicit_slot 16, align = 16
ss8 = explicit_slot 16, align = 16
sig0 = () system_v
fn0 = colocated u0:0 sig0
@ -2949,14 +3056,308 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64, v4: i128, v5: f32, v6: f64, v7: i8x16,
stack_store v0, ss0
stack_store v1, ss1
stack_store v2, ss2
stack_store v5, ss2+4
stack_store v3, ss3
stack_store v6, ss3+8
stack_store v4, ss4
stack_store v7, ss4+16
stack_store v8, ss4+32
call fn0(), stack_map=[i8 @ ss0+0, i16 @ ss1+0, i32 @ ss2+0, f32 @ ss2+4, i64 @ ss3+0, f64 @ ss3+8, i128 @ ss4+0, i8x16 @ ss4+16, i16x8 @ ss4+32]
return v0, v1, v2, v3, v4, v5, v6, v7, v8
stack_store v5, ss5
stack_store v6, ss6
stack_store v7, ss7
stack_store v8, ss8
call fn0(), stack_map=[i8 @ ss0+0, i16 @ ss1+0, i32 @ ss2+0, i64 @ ss3+0, i128 @ ss4+0, f32 @ ss5+0, f64 @ ss6+0, i8x16 @ ss7+0, i16x8 @ ss8+0]
v9 = stack_load.i8 ss0
v10 = stack_load.i16 ss1
v11 = stack_load.i32 ss2
v12 = stack_load.i64 ss3
v13 = stack_load.i128 ss4
v14 = stack_load.f32 ss5
v15 = stack_load.f64 ss6
v16 = stack_load.i8x16 ss7
v17 = stack_load.i16x8 ss8
return v9, v10, v11, v12, v13, v14, v15, v16, v17
}
"#
.trim()
);
}
#[test]
fn series_of_non_overlapping_live_ranges_needs_stack_map() {
let sig = Signature::new(CallConv::SystemV);
let mut fn_ctx = FunctionBuilderContext::new();
let mut func = Function::with_name_signature(UserFuncName::testcase("sample"), sig);
let mut builder = FunctionBuilder::new(&mut func, &mut fn_ctx);
let name = builder
.func
.declare_imported_user_function(ir::UserExternalName {
namespace: 0,
index: 0,
});
let signature = builder
.func
.import_signature(Signature::new(CallConv::SystemV));
let foo_func_ref = builder.import_function(ir::ExtFuncData {
name: ir::ExternalName::user(name),
signature,
colocated: true,
});
let name = builder
.func
.declare_imported_user_function(ir::UserExternalName {
namespace: 0,
index: 1,
});
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(ir::types::I32));
let signature = builder.func.import_signature(sig);
let consume_func_ref = builder.import_function(ir::ExtFuncData {
name: ir::ExternalName::user(name),
signature,
colocated: true,
});
// Create a series of needs-stack-map values that do not have
// overlapping live ranges, but which do appear in stack maps for calls
// to `$foo`:
//
// block0:
// v0 = needs stack map
// call $foo()
// call consume(v0)
// v1 = needs stack map
// call $foo()
// call consume(v1)
// v2 = needs stack map
// call $foo()
// call consume(v2)
// v3 = needs stack map
// call $foo()
// call consume(v3)
// return
let block0 = builder.create_block();
builder.append_block_params_for_function_params(block0);
builder.switch_to_block(block0);
let v0 = builder.ins().iconst(ir::types::I32, 0);
builder.declare_value_needs_stack_map(v0);
builder.ins().call(foo_func_ref, &[]);
builder.ins().call(consume_func_ref, &[v0]);
let v1 = builder.ins().iconst(ir::types::I32, 1);
builder.declare_value_needs_stack_map(v1);
builder.ins().call(foo_func_ref, &[]);
builder.ins().call(consume_func_ref, &[v1]);
let v2 = builder.ins().iconst(ir::types::I32, 2);
builder.declare_value_needs_stack_map(v2);
builder.ins().call(foo_func_ref, &[]);
builder.ins().call(consume_func_ref, &[v2]);
let v3 = builder.ins().iconst(ir::types::I32, 3);
builder.declare_value_needs_stack_map(v3);
builder.ins().call(foo_func_ref, &[]);
builder.ins().call(consume_func_ref, &[v3]);
builder.ins().return_(&[]);
builder.seal_all_blocks();
builder.finalize();
eprintln!("Actual = {}", func.display());
assert_eq!(
func.display().to_string().trim(),
r#"
function %sample() system_v {
ss0 = explicit_slot 4, align = 4
sig0 = () system_v
sig1 = (i32) system_v
fn0 = colocated u0:0 sig0
fn1 = colocated u0:1 sig1
block0:
v0 = iconst.i32 0
stack_store v0, ss0 ; v0 = 0
call fn0(), stack_map=[i32 @ ss0+0]
v4 = stack_load.i32 ss0
call fn1(v4)
v1 = iconst.i32 1
stack_store v1, ss0 ; v1 = 1
call fn0(), stack_map=[i32 @ ss0+0]
v5 = stack_load.i32 ss0
call fn1(v5)
v2 = iconst.i32 2
stack_store v2, ss0 ; v2 = 2
call fn0(), stack_map=[i32 @ ss0+0]
v6 = stack_load.i32 ss0
call fn1(v6)
v3 = iconst.i32 3
stack_store v3, ss0 ; v3 = 3
call fn0(), stack_map=[i32 @ ss0+0]
v7 = stack_load.i32 ss0
call fn1(v7)
return
}
"#
.trim()
);
}
#[test]
fn vars_block_params_and_needs_stack_map() {
let _ = env_logger::try_init();
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(ir::types::I32));
sig.returns.push(AbiParam::new(ir::types::I32));
let mut fn_ctx = FunctionBuilderContext::new();
let mut func = Function::with_name_signature(UserFuncName::testcase("sample"), sig);
let mut builder = FunctionBuilder::new(&mut func, &mut fn_ctx);
let name = builder
.func
.declare_imported_user_function(ir::UserExternalName {
namespace: 0,
index: 0,
});
let mut sig = Signature::new(CallConv::SystemV);
sig.params.push(AbiParam::new(ir::types::I32));
let signature = builder.func.import_signature(sig);
let func_ref = builder.import_function(ir::ExtFuncData {
name: ir::ExternalName::user(name),
signature,
colocated: true,
});
// Use a variable, create a control flow diamond so that the variable
// forces a block parameter on the control join point, and make sure
// that we get stack maps for all the appropriate uses of the variable
// in all blocks, as well as that we are reusing stack slots for each of
// the values.
//
// block0:
// x := needs stack map
// call $foo(x)
// br_if v0, block1, block2
//
//
// block1: block2:
// call $foo(x) call $foo(x)
// call $foo(x) call $foo(x)
// x := new needs stack map x := new needs stack map
// call $foo(x) call $foo(x)
// jump block3 jump block3
//
//
// block3:
// call $foo(x)
// return x
let x = Variable::from_u32(0);
builder.declare_var(x, ir::types::I32);
builder.declare_var_needs_stack_map(x);
let block0 = builder.create_block();
let block1 = builder.create_block();
let block2 = builder.create_block();
let block3 = builder.create_block();
builder.append_block_params_for_function_params(block0);
builder.switch_to_block(block0);
let v0 = builder.func.dfg.block_params(block0)[0];
let val = builder.ins().iconst(ir::types::I32, 42);
builder.def_var(x, val);
{
let x = builder.use_var(x);
builder.ins().call(func_ref, &[x]);
}
builder.ins().brif(v0, block1, &[], block2, &[]);
builder.switch_to_block(block1);
{
let x = builder.use_var(x);
builder.ins().call(func_ref, &[x]);
builder.ins().call(func_ref, &[x]);
}
let val = builder.ins().iconst(ir::types::I32, 36);
builder.def_var(x, val);
{
let x = builder.use_var(x);
builder.ins().call(func_ref, &[x]);
}
builder.ins().jump(block3, &[]);
builder.switch_to_block(block2);
{
let x = builder.use_var(x);
builder.ins().call(func_ref, &[x]);
builder.ins().call(func_ref, &[x]);
}
let val = builder.ins().iconst(ir::types::I32, 36);
builder.def_var(x, val);
{
let x = builder.use_var(x);
builder.ins().call(func_ref, &[x]);
}
builder.ins().jump(block3, &[]);
builder.switch_to_block(block3);
let x = builder.use_var(x);
builder.ins().call(func_ref, &[x]);
builder.ins().return_(&[x]);
builder.seal_all_blocks();
builder.finalize();
eprintln!("Actual = {}", func.display());
// Because our liveness analysis is very simple, and visit blocks in the
// order 3->1->2->0, we see uses of `v2` in block1 and mark it live
// across all of block2 because we haven't reached the def in block0
// yet, even though it isn't technically live out of block2, only live
// in. This means that it shows up in the stack map for block2's second
// call to `foo()` when it technically needn't, and additionally means
// that we have two stack slots instead of a single one below. This
// could all be improved and cleaned up by improving the liveness
// analysis.
assert_eq!(
func.display().to_string().trim(),
r#"
function %sample(i32) -> i32 system_v {
ss0 = explicit_slot 4, align = 4
ss1 = explicit_slot 4, align = 4
sig0 = (i32) system_v
fn0 = colocated u0:0 sig0
block0(v0: i32):
v1 = iconst.i32 42
v2 -> v1
v4 -> v1
stack_store v1, ss0 ; v1 = 42
v7 = stack_load.i32 ss0
call fn0(v7), stack_map=[i32 @ ss0+0]
brif v0, block1, block2
block1:
call fn0(v2), stack_map=[i32 @ ss0+0] ; v2 = 42
call fn0(v2) ; v2 = 42
v3 = iconst.i32 36
stack_store v3, ss0 ; v3 = 36
v8 = stack_load.i32 ss0
call fn0(v8), stack_map=[i32 @ ss0+0]
v9 = stack_load.i32 ss0
jump block3(v9)
block2:
call fn0(v4), stack_map=[i32 @ ss0+0] ; v4 = 42
call fn0(v4), stack_map=[i32 @ ss0+0] ; v4 = 42
v5 = iconst.i32 36
stack_store v5, ss1 ; v5 = 36
v10 = stack_load.i32 ss1
call fn0(v10), stack_map=[i32 @ ss0+0, i32 @ ss1+0]
v11 = stack_load.i32 ss1
jump block3(v11)
block3(v6: i32):
stack_store v6, ss0
call fn0(v6), stack_map=[i32 @ ss0+0]
v12 = stack_load.i32 ss0
return v12
}
"#
.trim()
@ -3021,7 +3422,8 @@ function %sample(i32) -> i32 system_v {
block0(v0: i32):
stack_store v0, ss0
call fn0(), stack_map=[i32 @ ss0+0]
return v0
v1 = stack_load.i32 ss0
return v1
}
"#
.trim()

Loading…
Cancel
Save