
Cranelift: fix #3953: rework single/multiple-use logic in lowering. (#4061)

* Cranelift: fix #3953: rework single/multiple-use logic in lowering.

This PR addresses the longstanding issue with loads trying to merge
into compares on x86-64, and more generally, with the lowering
framework falsely recognizing "single uses" of one op by
another (which would normally allow merging of side-effecting ops like
loads) when there is *indirect* duplication.

To fix this, we replace the direct `value_uses` count with a
transitive notion of uniqueness (not unlike Rust's `&`/`&mut` and how
a `&mut` downgrades to `&` when accessed through another `&`!). A
value is used multiple times transitively if it has multiple direct
uses, or is used by another op that is used multiple times
transitively.

The canonical example of badness is:

```
    v1 := load
    v2 := ifcmp v1, ...
    v3 := selectif v2, ...
    v4 := selectif v2, ...
```

Both `v3` and `v4` effectively merge the `ifcmp` (`v2`), so even though
the use of `v1` is nominally "unique", the compare (and any load merged
into it) would be codegenned twice. This is why we ~~can't have nice
things~~ can't merge loads into compares (#3953). The sketch below walks
through how the new analysis classifies this example.
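
For concreteness, here is a condensed, standalone sketch of the coarsened
use-count lattice this PR introduces. The `ValueUseState` enum and its
saturating `inc()` mirror the ones added to `machinst/lower.rs` below; the
`main` function simply hand-walks the example above instead of running the
real fixpoint analysis over a `Function`.

```rust
/// Coarsened use count: the analysis only distinguishes zero, one, many.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ValueUseState {
    Unused,
    Once,
    Multiple,
}

impl ValueUseState {
    /// Record one more direct use; saturates at `Multiple`.
    fn inc(&mut self) {
        *self = match self {
            Self::Unused => Self::Once,
            Self::Once | Self::Multiple => Self::Multiple,
        };
    }
}

fn main() {
    // Direct uses in the example: the ifcmp uses v1 once; both selectifs
    // use v2.
    let mut v1 = ValueUseState::Unused; // load result
    let mut v2 = ValueUseState::Unused; // ifcmp result
    v1.inc(); // ifcmp uses v1
    v2.inc(); // first selectif uses v2
    v2.inc(); // second selectif uses v2
    assert_eq!(v2, ValueUseState::Multiple);

    // Multiple is contagious: once v2 is Multiple, everything in its
    // operand tree (here v1, the load's result) is forced to Multiple as
    // well, so the load is not a candidate for merging into the compare.
    if v2 == ValueUseState::Multiple {
        v1 = ValueUseState::Multiple;
    }
    assert_eq!(v1, ValueUseState::Multiple);
}
```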

There is quite a subtle and interesting design space around this
problem and how we might solve it. See the long doc-comment on
`ValueUseState` in this PR for more justification for the particular
design here. In particular, this design deliberately simplifies a bit
relative to an "optimal" solution: some uses can *become* unique
depending on merging, but we don't design our data structures for such
updates because that would require significant extra costly
tracking (some sort of transitive refcounting). For example, in the
above, if `selectif` somehow did not merge `ifcmp`, then we would only
codegen the `ifcmp` once into its result register (and use that
register twice); then the load *is* uniquely used, and could be
merged. But that requires transitioning from "multiple use" back to
"unique use" with careful tracking as we do pattern-matching, which
I've chosen to make out-of-scope here for now. In practice, I don't
think it will matter too much (and we can always improve later).

With this PR, we can now re-enable load-op merging for compares. A
subsequent commit does this.
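
Backends observe this analysis through the new `InputSourceInst` enum in
`machinst/lower.rs` (see the diff below): a producing instruction is
reported as a `UniqueUse`, a shared `Use`, or `None`. The following is a
minimal consumer-side sketch of the "only sink a load on a unique use of
result 0" rule that the x64 and aarch64 lowerings in this PR apply; `Inst`
is stubbed out as a plain index type here, since the real handle lives in
`cranelift-codegen`.

```rust
/// Stand-in for Cranelift's instruction handle (the real type lives in
/// `cranelift-codegen`).
type Inst = u32;

/// Mirrors the `InputSourceInst` enum introduced by this PR.
#[derive(Clone, Copy, Debug)]
enum InputSourceInst {
    /// Sole transitive user of the producing instruction: safe to sink.
    UniqueUse(Inst, usize),
    /// One of several users: still matchable, but a side-effecting
    /// producer (e.g. a load) must not be sunk, or it would be duplicated.
    Use(Inst, usize),
    /// Producer unknown or not eligible for sinking.
    None,
}

/// Decide whether a (mergeable) load may be sunk into its consumer,
/// mirroring the `UniqueUse(src_insn, 0)` checks in the x64
/// `input_to_reg_mem`/`sinkable_load` helpers touched by this PR.
fn sinkable_load(src: InputSourceInst) -> Option<Inst> {
    match src {
        // Only a unique use of result 0 may be merged.
        InputSourceInst::UniqueUse(inst, 0) => Some(inst),
        _ => None,
    }
}

fn main() {
    assert_eq!(sinkable_load(InputSourceInst::UniqueUse(7, 0)), Some(7));
    assert_eq!(sinkable_load(InputSourceInst::Use(7, 0)), None);
    assert_eq!(sinkable_load(InputSourceInst::None), None);
}
```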

* Update x64 backend to allow load-op merging for `cmp`.

* Update filetests.

* Add test for cmp-mem merging on x64.

* Comment fixes.

* Rework ValueUseState analysis for better performance.

* Update s390x filetest: iadd_ifcout cannot merge loads anymore because it has multiple outputs (ValueUseState limitation)

* Address review comments.
Chris Fallin 3 years ago
committed by GitHub
commit e4b7c8a737
13 changed files:
   11  cranelift/codegen/src/isa/aarch64/lower.rs
    4  cranelift/codegen/src/isa/aarch64/lower/isle.rs
    2  cranelift/codegen/src/isa/s390x/lower/isle.rs
   23  cranelift/codegen/src/isa/x64/lower.isle
   26  cranelift/codegen/src/isa/x64/lower.rs
    8  cranelift/codegen/src/isa/x64/lower/isle.rs
    2  cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
  424  cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
  307  cranelift/codegen/src/machinst/lower.rs
   12  cranelift/filetests/filetests/isa/s390x/arithmetic.clif
   12  cranelift/filetests/filetests/isa/s390x/heap_addr.clif
   11  cranelift/filetests/filetests/isa/x64/load-op.clif
    3  cranelift/filetests/filetests/isa/x64/select-i128.clif

cranelift/codegen/src/isa/aarch64/lower.rs (11 changed lines)

@@ -306,7 +306,8 @@ fn put_input_in_rs<C: LowerCtx<I = Inst>>(
narrow_mode: NarrowValueMode,
) -> ResultRS {
let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
if let Some((insn, 0)) = inputs.inst {
// Unique or non-unique use is fine for merging here.
if let Some((insn, 0)) = inputs.inst.as_inst() {
let op = ctx.data(insn).opcode();
if op == Opcode::Ishl {
@@ -353,7 +354,7 @@ fn get_as_extended_value<C: LowerCtx<I = Inst>>(
narrow_mode: NarrowValueMode,
) -> Option<(Value, ExtendOp)> {
let inputs = ctx.get_value_as_source_or_const(val);
let (insn, n) = inputs.inst?;
let (insn, n) = inputs.inst.as_inst()?;
if n != 0 {
return None;
}
@@ -1125,7 +1126,7 @@ pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
inputs,
op
);
if let Some((src_inst, _)) = inputs.inst {
if let Some((src_inst, _)) = inputs.inst.as_inst() {
let data = c.data(src_inst);
log::trace!(" -> input inst {:?}", data);
if data.opcode() == op {
@@ -1161,14 +1162,14 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
conv: Opcode,
) -> Option<IRInst> {
let inputs = c.get_input_as_source_or_const(input.insn, input.input);
if let Some((src_inst, _)) = inputs.inst {
if let Some((src_inst, _)) = inputs.inst.as_inst() {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
if data.opcode() == conv {
let inputs = c.get_input_as_source_or_const(src_inst, 0);
if let Some((src_inst, _)) = inputs.inst {
if let Some((src_inst, _)) = inputs.inst.as_inst() {
let data = c.data(src_inst);
if data.opcode() == op {
return Some(src_inst);

cranelift/codegen/src/isa/aarch64/lower/isle.rs (4 changed lines)

@@ -12,7 +12,7 @@ use super::{
NZCV,
};
use crate::isa::aarch64::settings::Flags as IsaFlags;
use crate::machinst::isle::*;
use crate::machinst::{isle::*, InputSourceInst};
use crate::settings::Flags;
use crate::{
binemit::CodeOffset,
@@ -245,7 +245,7 @@ where
fn sinkable_atomic_load(&mut self, val: Value) -> Option<SinkableAtomicLoad> {
let input = self.lower_ctx.get_value_as_source_or_const(val);
if let Some((atomic_load, 0)) = input.inst {
if let InputSourceInst::UniqueUse(atomic_load, 0) = input.inst {
if self.lower_ctx.data(atomic_load).opcode() == Opcode::AtomicLoad {
let atomic_addr = self.lower_ctx.input_as_value(atomic_load, 0);
return Some(SinkableAtomicLoad {

cranelift/codegen/src/isa/s390x/lower/isle.rs (2 changed lines)

@@ -497,7 +497,7 @@ where
#[inline]
fn sinkable_inst(&mut self, val: Value) -> Option<Inst> {
let input = self.lower_ctx.get_value_as_source_or_const(val);
if let Some((inst, 0)) = input.inst {
if let Some((inst, 0)) = input.inst.as_inst() {
return Some(inst);
}
None

cranelift/codegen/src/isa/x64/lower.isle (23 changed lines)

@@ -1367,11 +1367,9 @@
(decl cmp_and_choose (Type CC Value Value) ValueRegs)
(rule (cmp_and_choose (fits_in_64 ty) cc x y)
(let ((x_reg Gpr x)
(y_reg Gpr y)
(size OperandSize (raw_operand_size_of_type ty)))
(with_flags_reg (x64_cmp size x_reg y_reg)
(cmove ty cc y_reg x_reg))))
(let ((size OperandSize (raw_operand_size_of_type ty)))
(with_flags_reg (x64_cmp size x y)
(cmove ty cc y x))))
(rule (lower (has_type (fits_in_64 ty) (umin x y)))
(cmp_and_choose ty (CC.B) x y))
@@ -1751,19 +1749,8 @@
;; than one instruction for certain types (e.g., XMM-held, I128).
(rule (lower (has_type ty (select (icmp cc a @ (value_type (fits_in_64 a_ty)) b) x y)))
;; N.B.: we force the comparison operators into registers, and disallow
;; load-op fusion, because we do not have a transitive guarantee that this
;; cmp-site will be the sole user of the value. Consider: the `icmp` might
;; be the only user of a load, but there may be multiple users of the
;; `icmp` (e.g., `select` or `bint` instructions) that each invoke emit a
;; comparison. If we were to allow a load to sink to the *latest* one, but
;; other sites did not permit sinking, then we would be missing the load
;; for other cmp-sites. TODO:
;; https://github.com/bytecodealliance/wasmtime/issues/3953.
(let ((gpr_a Gpr (put_in_gpr a))
(gpr_b Gpr (put_in_gpr b))
(size OperandSize (raw_operand_size_of_type a_ty)))
(with_flags (x64_cmp size gpr_b gpr_a) (cmove_from_values ty cc x y))))
(let ((size OperandSize (raw_operand_size_of_type a_ty)))
(with_flags (x64_cmp size b a) (cmove_from_values ty cc x y))))
;; Finally, we lower `select` from a condition value `c`. These rules are meant
;; to be the final, default lowerings if no other patterns matched above.

cranelift/codegen/src/isa/x64/lower.rs (26 changed lines)

@@ -61,7 +61,7 @@ fn matches_input<C: LowerCtx<I = Inst>>(
op: Opcode,
) -> Option<IRInst> {
let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
inputs.inst.and_then(|(src_inst, _)| {
inputs.inst.as_inst().and_then(|(src_inst, _)| {
let data = ctx.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
@@ -172,7 +172,7 @@ fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegM
return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
}
if let Some((src_insn, 0)) = inputs.inst {
if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
ctx.sink_inst(src_insn);
let amode = lower_to_amode(ctx, addr_input, offset);
@@ -479,22 +479,11 @@ fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntC
} else {
// TODO Try to commute the operands (and invert the condition) if one is an immediate.
let lhs = put_input_in_reg(ctx, inputs[0]);
// We force the RHS into a register, and disallow load-op fusion, because we
// do not have a transitive guarantee that this cmp-site will be the sole
// user of the value. Consider: the icmp might be the only user of a load,
// but there may be multiple users of the icmp (e.g. select or bint
// instructions) that each invoke `emit_cmp()`. If we were to allow a load
// to sink to the *latest* one, but other sites did not permit sinking, then
// we would be missing the load for other cmp-sites.
let rhs = put_input_in_reg(ctx, inputs[1]);
let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
// Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
// us dst - src at the machine instruction level, so invert operands.
ctx.emit(Inst::cmp_rmi_r(
OperandSize::from_ty(ty),
RegMemImm::reg(rhs),
lhs,
));
ctx.emit(Inst::cmp_rmi_r(OperandSize::from_ty(ty), rhs, lhs));
cc
}
}
@@ -578,10 +567,8 @@ fn emit_fcmp<C: LowerCtx<I = Inst>>(
(inputs[0], inputs[1])
};
let lhs = put_input_in_reg(ctx, lhs_input);
// See above in `emit_cmp()`. We must only use the reg/reg form of the
// comparison in order to avoid issues with merged loads.
let rhs = put_input_in_reg(ctx, rhs_input);
ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs));
let rhs = input_to_reg_mem(ctx, rhs_input);
ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
let cond_result = match cond_code {
FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
@@ -2406,6 +2393,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let cmp_insn = ctx
.get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
.inst
.as_inst()
.unwrap()
.0;
debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);

cranelift/codegen/src/isa/x64/lower/isle.rs (8 changed lines)

@@ -2,7 +2,7 @@
// Pull in the ISLE generated code.
pub(crate) mod generated_code;
use crate::machinst::{Reg, Writable};
use crate::machinst::{InputSourceInst, Reg, Writable};
use generated_code::MInst;
// Types that the generated ISLE code uses via `use super::*`.
@@ -84,7 +84,7 @@ where
return RegMemImm::reg(generated_code::constructor_imm(self, ty, c).unwrap());
}
if let Some((src_insn, 0)) = inputs.inst {
if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
self.lower_ctx.sink_inst(src_insn);
let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
@@ -105,7 +105,7 @@ where
return RegMem::reg(generated_code::constructor_imm(self, ty, c).unwrap());
}
if let Some((src_insn, 0)) = inputs.inst {
if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
self.lower_ctx.sink_inst(src_insn);
let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
@@ -237,7 +237,7 @@ where
fn sinkable_load(&mut self, val: Value) -> Option<SinkableLoad> {
let input = self.lower_ctx.get_value_as_source_or_const(val);
if let Some((inst, 0)) = input.inst {
if let InputSourceInst::UniqueUse(inst, 0) = input.inst {
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, inst) {
return Some(SinkableLoad {
inst,

cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest (2 changed lines)

@@ -1,4 +1,4 @@
src/clif.isle 443b34b797fc8ace
src/prelude.isle afd037c4d91c875c
src/isa/x64/inst.isle cad03431447aca1b
src/isa/x64/lower.isle 42bd3982a6132a2f
src/isa/x64/lower.isle 803aac716f6f4c39

cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs (424 changed lines)

File diff suppressed because it is too large

cranelift/codegen/src/machinst/lower.rs (307 changed lines)

@@ -197,13 +197,49 @@ pub struct NonRegInput {
/// computation (and side-effect if applicable) could occur at the
/// current instruction's location instead.
///
/// If this instruction's operation is merged into the current instruction,
/// the backend must call [LowerCtx::sink_inst()].
pub inst: Option<(Inst, usize)>,
/// If this instruction's operation is merged into the current
/// instruction, the backend must call [LowerCtx::sink_inst()].
///
/// This enum indicates whether this use of the source instruction
/// is unique or not.
pub inst: InputSourceInst,
/// The value is a known constant.
pub constant: Option<u64>,
}
/// When examining an input to an instruction, this enum provides one
/// of several options: there is or isn't a single instruction (that
/// we can see and merge with) that produces that input's value, and
/// we are or aren't the single user of that instruction.
#[derive(Clone, Copy, Debug)]
pub enum InputSourceInst {
/// The input in question is the single, unique use of the given
/// instruction and output index, and it can be sunk to the
/// location of this input.
UniqueUse(Inst, usize),
/// The input in question is one of multiple uses of the given
/// instruction. It can still be sunk to the location of this
/// input.
Use(Inst, usize),
/// We cannot determine which instruction produced the input, or
/// it is one of several instructions (e.g., due to a control-flow
/// merge and blockparam), or the source instruction cannot be
/// allowed to sink to the current location due to side-effects.
None,
}
impl InputSourceInst {
/// Get the instruction and output index for this source, whether
/// we are its single or one of many users.
pub fn as_inst(&self) -> Option<(Inst, usize)> {
match self {
&InputSourceInst::UniqueUse(inst, output_idx)
| &InputSourceInst::Use(inst, output_idx) => Some((inst, output_idx)),
&InputSourceInst::None => None,
}
}
}
/// A machine backend.
pub trait LowerBackend {
/// The machine instruction type.
@@ -271,8 +307,13 @@ pub struct Lower<'func, I: VCodeInst> {
/// Instruction constant values, if known.
inst_constants: FxHashMap<Inst, u64>,
/// Use-counts per SSA value, as counted in the input IR.
value_uses: SecondaryMap<Value, u32>,
/// Use-counts per SSA value, as counted in the input IR. These
/// are "coarsened", in the abstract-interpretation sense: we only
/// care about "0, 1, many" states, as this is all we need and
/// this lets us do an efficient fixpoint analysis.
///
/// See doc comment on `ValueUseState` for more details.
value_ir_uses: SecondaryMap<Value, ValueUseState>,
/// Actual uses of each SSA value so far, incremented while lowering.
value_lowered_uses: SecondaryMap<Value, u32>,
@@ -295,6 +336,108 @@ pub struct Lower<'func, I: VCodeInst> {
vm_context: Option<Reg>,
}
/// How is a value used in the IR?
///
/// This can be seen as a coarsening of an integer count. We only need
/// distinct states for zero, one, or many.
///
/// This analysis deserves further explanation. The basic idea is that
/// we want to allow instruction lowering to know whether a value that
/// an instruction references is *only* referenced by that one use, or
/// by others as well. This is necessary to know when we might want to
/// move a side-effect: we cannot, for example, duplicate a load, so
/// we cannot let instruction lowering match a load as part of a
/// subpattern and potentially incorporate it.
///
/// Note that a lot of subtlety comes into play once we have
/// *indirect* uses. The classical example of this in our development
/// history was the x86 compare instruction, which is incorporated
/// into flags users (e.g. `selectif`, `trueif`, branches) and can
/// subsequently incorporate loads, or at least we would like it
/// to. However, danger awaits: the compare might be the only user of
/// a load, so we might think we can just move the load (and nothing
/// is duplicated -- success!), except that the compare itself is
/// codegen'd in multiple places, where it is incorporated as a
/// subpattern itself.
///
/// So we really want a notion of "unique all the way along the
/// matching path". Rust's `&T` and `&mut T` offer a partial analogy
/// to the semantics that we want here: we want to know when we've
/// matched a unique use of an instruction, and that instruction's
/// unique use of another instruction, etc, just as `&mut T` can only
/// be obtained by going through a chain of `&mut T`. If one has a
/// `&T` to a struct containing `&mut T` (one of several uses of an
/// instruction that itself has a unique use of an instruction), one
/// can only get a `&T` (one can only get a "I am one of several users
/// of this instruction" result).
///
/// We could track these paths, either dynamically as one "looks up
/// the operand tree" or precomputed. But the former requires state
/// and means that the `LowerCtx` API carries that state implicitly,
/// which we'd like to avoid if we can. And the latter implies O(n^2)
/// storage: it is an all-pairs property (is inst `i` unique from the
/// point of view of `j`).
///
/// To make matters even a little more complex still, a value that is
/// not uniquely used when initially viewing the IR can *become*
/// uniquely used, at least as a root allowing further unique uses of
/// e.g. loads to merge, if no other instruction actually merges
/// it. To be more concrete, if we have `v1 := load; v2 := op v1; v3
/// := op v2; v4 := op v2` then `v2` is non-uniquely used, so from the
/// point of view of lowering `v4` or `v3`, we cannot merge the load
/// at `v1`. But if we decide just to use the assigned register for
/// `v2` at both `v3` and `v4`, then we only actually codegen `v2`
/// once, so it *is* a unique root at that point and we *can* merge
/// the load.
///
/// Note also that the color scheme is not sufficient to give us this
/// information, for various reasons: reasoning about side-effects
/// does not tell us about potential duplication of uses through pure
/// ops.
///
/// To keep things simple and avoid error-prone lowering APIs that
/// would extract more information about whether instruction merging
/// happens or not (we don't have that info now, and it would be
/// difficult to refactor to get it and make that refactor 100%
/// correct), we give up on the above "can become unique if not
/// actually merged" point. Instead, we compute a
/// transitive-uniqueness. That is what this enum represents.
///
/// To define it plainly: a value is `Unused` if no references exist
/// to it; `Once` if only one other op refers to it, *and* that other
/// op is `Unused` or `Once`; and `Multiple` otherwise. In other
/// words, `Multiple` is contagious: even if an op's result value is
/// directly used only once in the CLIF, that value is `Multiple` if
/// the op that uses it is itself used multiple times (hence could be
/// codegen'd multiple times). In brief, this analysis tells us
/// whether, if every op merged all of its operand tree, a given op
/// could be codegen'd in more than one place.
///
/// To compute this, we first consider direct uses. At this point
/// `Unused` answers are correct, `Multiple` answers are correct, but
/// some `Once`s may change to `Multiple`s. Then we propagate
/// `Multiple` transitively using a workqueue/fixpoint algorithm.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ValueUseState {
/// Not used at all.
Unused,
/// Used exactly once.
Once,
/// Used multiple times.
Multiple,
}
impl ValueUseState {
/// Add one use.
fn inc(&mut self) {
let new = match self {
Self::Unused => Self::Once,
Self::Once | Self::Multiple => Self::Multiple,
};
*self = new;
}
}
/// Notion of "relocation distance". This gives an estimate of how far away a symbol will be from a
/// reference.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -408,7 +551,6 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
let mut block_end_colors = SecondaryMap::with_default(InstColor::new(0));
let mut side_effect_inst_entry_colors = FxHashMap::default();
let mut inst_constants = FxHashMap::default();
let mut value_uses = SecondaryMap::with_default(0);
for bb in f.layout.blocks() {
cur_color += 1;
for inst in f.layout.block_insts(bb) {
@@ -426,17 +568,13 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
log::trace!(" -> constant: {}", c);
inst_constants.insert(inst, c);
}
// Count uses of all arguments.
for arg in f.dfg.inst_args(inst) {
let arg = f.dfg.resolve_aliases(*arg);
value_uses[arg] += 1;
}
}
block_end_colors[bb] = InstColor::new(cur_color);
}
let value_ir_uses = Self::compute_use_states(f);
Ok(Lower {
f,
vcode,
@@ -446,7 +584,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
side_effect_inst_entry_colors,
inst_constants,
next_vreg,
value_uses,
value_ir_uses,
value_lowered_uses: SecondaryMap::default(),
inst_sunk: FxHashSet::default(),
cur_scan_entry_color: None,
@@ -457,6 +595,114 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
})
}
/// Pre-analysis: compute `value_ir_uses`. See comment on
/// `ValueUseState` for a description of what this analysis
/// computes.
fn compute_use_states<'a>(f: &'a Function) -> SecondaryMap<Value, ValueUseState> {
// We perform the analysis without recursion, so we don't
// overflow the stack on long chains of ops in the input.
//
// This is sort of a hybrid of a "shallow use-count" pass and
// a DFS. We iterate over all instructions and mark their args
// as used. However when we increment a use-count to
// "Multiple" we push its args onto the stack and do a DFS,
// immediately marking the whole dependency tree as
// Multiple. Doing both (shallow use-counting over all insts,
// and deep Multiple propagation) lets us trim both
// traversals, stopping recursion when a node is already at
// the appropriate state.
//
// In particular, note that the *coarsening* into {Unused,
// Once, Multiple} is part of what makes this pass more
// efficient than a full indirect-use-counting pass.
let mut value_ir_uses: SecondaryMap<Value, ValueUseState> =
SecondaryMap::with_default(ValueUseState::Unused);
// Stack of iterators over Values as we do DFS to mark
// Multiple-state subtrees.
type StackVec<'a> = SmallVec<[std::slice::Iter<'a, Value>; 16]>;
let mut stack: StackVec = smallvec![];
// Push args for a given inst onto the DFS stack.
let push_args_on_stack = |stack: &mut StackVec<'a>, value| {
log::trace!(" -> pushing args for {} onto stack", value);
if let ValueDef::Result(src_inst, _) = f.dfg.value_def(value) {
stack.push(f.dfg.inst_args(src_inst).iter());
}
};
// Do a DFS through `value_ir_uses` to mark a subtree as
// Multiple.
let mark_all_uses_as_multiple =
|value_ir_uses: &mut SecondaryMap<Value, ValueUseState>, stack: &mut StackVec<'a>| {
while let Some(iter) = stack.last_mut() {
if let Some(&value) = iter.next() {
let value = f.dfg.resolve_aliases(value);
log::trace!(" -> DFS reaches {}", value);
if value_ir_uses[value] == ValueUseState::Multiple {
// Truncate DFS here: no need to go further,
// as whole subtree must already be Multiple.
#[cfg(debug_assertions)]
{
// With debug asserts, check one level
// of that invariant at least.
if let ValueDef::Result(src_inst, _) = f.dfg.value_def(value) {
debug_assert!(f.dfg.inst_args(src_inst).iter().all(|&arg| {
let arg = f.dfg.resolve_aliases(arg);
value_ir_uses[arg] == ValueUseState::Multiple
}));
}
}
continue;
}
value_ir_uses[value] = ValueUseState::Multiple;
log::trace!(" -> became Multiple");
push_args_on_stack(stack, value);
} else {
// Empty iterator, discard.
stack.pop();
}
}
};
for inst in f
.layout
.blocks()
.flat_map(|block| f.layout.block_insts(block))
{
// If this inst produces multiple values, we must mark all
// of its args as Multiple, because otherwise two uses
// could come in as Once on our two different results.
let force_multiple = f.dfg.inst_results(inst).len() > 1;
// Iterate over all args of all instructions, noting an
// additional use on each operand. If an operand becomes Multiple,
// the DFS below marks its whole operand tree as Multiple.
for &arg in f.dfg.inst_args(inst) {
let arg = f.dfg.resolve_aliases(arg);
let old = value_ir_uses[arg];
if force_multiple {
log::trace!(
"forcing arg {} to Multiple because of multiple results of user inst",
arg
);
value_ir_uses[arg] = ValueUseState::Multiple;
} else {
value_ir_uses[arg].inc();
}
let new = value_ir_uses[arg];
log::trace!("arg {} used, old state {:?}, new {:?}", arg, old, new,);
// On transition to Multiple, do DFS.
if old != ValueUseState::Multiple && new == ValueUseState::Multiple {
push_args_on_stack(&mut stack, arg);
mark_all_uses_as_multiple(&mut value_ir_uses, &mut stack);
}
}
}
value_ir_uses
}
fn gen_arg_setup(&mut self) {
if let Some(entry_bb) = self.f.layout.entry_block() {
log::trace!(
@@ -1050,9 +1296,11 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
// OK to merge source instruction if (i) we have a source
// instruction, and:
// - It has no side-effects, OR
// - It has a side-effect, has one output value, that one output has
// only one use (this one), and the instruction's color is *one less
// than* the current scan color.
// - It has a side-effect, has one output value, that one
// output has only one use, directly or indirectly (so
// cannot be duplicated -- see comment on
// `ValueUseState`), and the instruction's color is *one
// less than* the current scan color.
//
// This latter set of conditions is testing whether a
// side-effecting instruction can sink to the current scan
@@ -1071,15 +1319,26 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
log::trace!(" -> src inst {}", src_inst);
log::trace!(" -> has lowering side effect: {}", src_side_effect);
if !src_side_effect {
// Pure instruction: always possible to sink.
Some((src_inst, result_idx))
// Pure instruction: always possible to
// sink. Let's determine whether we are the only
// user or not.
if self.value_ir_uses[val] == ValueUseState::Once {
InputSourceInst::UniqueUse(src_inst, result_idx)
} else {
InputSourceInst::Use(src_inst, result_idx)
}
} else {
// Side-effect: test whether this is the only use of the
// only result of the instruction, and whether colors allow
// the code-motion.
log::trace!(
" -> side-effecting op {} for val {}: use state {:?}",
src_inst,
val,
self.value_ir_uses[val]
);
if self.cur_scan_entry_color.is_some()
&& self.value_uses[val] == 1
&& self.value_lowered_uses[val] == 0
&& self.value_ir_uses[val] == ValueUseState::Once
&& self.num_outputs(src_inst) == 1
&& self
.side_effect_inst_entry_colors
@@ -1089,15 +1348,15 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
+ 1
== self.cur_scan_entry_color.unwrap().get()
{
Some((src_inst, 0))
InputSourceInst::UniqueUse(src_inst, 0)
} else {
None
InputSourceInst::None
}
}
}
_ => None,
_ => InputSourceInst::None,
};
let constant = inst.and_then(|(inst, _)| self.get_constant(inst));
let constant = inst.as_inst().and_then(|(inst, _)| self.get_constant(inst));
NonRegInput { inst, constant }
}

cranelift/filetests/filetests/isa/s390x/arithmetic.clif (12 changed lines)

@@ -262,7 +262,8 @@ block0(v0: i64, v1: i64):
}
; block0:
; alg %r2, 0(%r3)
; lg %r4, 0(%r3)
; algr %r2, %r4
; br %r14
function %iadd_i64_mem_ext32(i64, i64) -> i64 {
@@ -273,7 +274,8 @@ block0(v0: i64, v1: i64):
}
; block0:
; algf %r2, 0(%r3)
; llgf %r4, 0(%r3)
; algr %r2, %r4
; br %r14
function %iadd_i32(i32, i32) -> i32 {
@@ -305,7 +307,8 @@ block0(v0: i32, v1: i64):
}
; block0:
; al %r2, 0(%r3)
; l %r4, 0(%r3)
; alr %r2, %r4
; br %r14
function %iadd_i32_memoff(i32, i64) -> i32 {
@@ -316,7 +319,8 @@ block0(v0: i32, v1: i64):
}
; block0:
; aly %r2, 4096(%r3)
; ly %r4, 4096(%r3)
; alr %r2, %r4
; br %r14
function %isub_i64(i64, i64) -> i64 {

cranelift/filetests/filetests/isa/s390x/heap_addr.clif (12 changed lines)

@@ -13,15 +13,15 @@ block0(v0: i64, v1: i32):
; block0:
; llgfr %r4, %r3
; lghi %r3, 0
; ag %r3, 0(%r2)
; clgr %r4, %r3
; lg %r5, 0(%r2)
; aghi %r5, 0
; clgr %r4, %r5
; jgnh label1 ; jg label2
; block1:
; agr %r2, %r4
; lghi %r5, 0
; clgr %r4, %r3
; locgrh %r2, %r5
; lghi %r3, 0
; clgr %r4, %r5
; locgrh %r2, %r3
; br %r14
; block2:
; trap

cranelift/filetests/filetests/isa/x64/load-op.clif (11 changed lines)

@@ -68,3 +68,14 @@ block0(v0: i64):
block1:
return v2
}
function %cmp_mem(i64) -> i64 {
block0(v0: i64):
v1 = load.i64 v0
v2 = icmp eq v0, v1
v3 = bint.i64 v2
return v3
; check: cmpq 0(%rdi), %rdi
; nextln: setz %al
}

cranelift/filetests/filetests/isa/x64/select-i128.clif (3 changed lines)

@@ -13,8 +13,7 @@ block0(v0: i32, v1: i128, v2: i128):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $42, %r9d
; cmpl %r9d, %edi
; cmpl $42, %edi
; cmovzq %rsi, %rcx, %rcx
; cmovzq %rdx, %r8, %r8
; movq %rcx, %rax
