
Cranelift: Use a fixpoint loop to compute the best value for each eclass (#7859)

* Cranelift: Use a fixpoint loop to compute the best value for each eclass

Fixes #7857

* Remove fixpoint loop early-continue optimization

* Add document describing optimization rule invariants

* Make select optimizations use subsume

* Remove invalid debug assert

* Remove now-unused methods

* Add commutative adds to cost tests
Branch: pull/7880/head
Author: Nick Fitzgerald (committed by GitHub, 9 months ago)
Commit: 5b2ae8365e
Changed files:

1. cranelift/codegen/src/egraph.rs (1 line changed)
2. cranelift/codegen/src/egraph/cost.rs (78 lines changed)
3. cranelift/codegen/src/egraph/elaborate.rs (149 lines changed)
4. cranelift/codegen/src/opts/README.md (86 lines changed)
5. cranelift/codegen/src/opts/cprop.isle (10 lines changed)
6. cranelift/filetests/filetests/egraph/issue-7875.clif (37 lines changed)

cranelift/codegen/src/egraph.rs (1 line changed)

@@ -701,4 +701,5 @@ pub(crate) struct Stats {
     pub(crate) elaborate_func: u64,
     pub(crate) elaborate_func_pre_insts: u64,
     pub(crate) elaborate_func_post_insts: u64,
+    pub(crate) elaborate_best_cost_fixpoint_iters: u64,
 }

cranelift/codegen/src/egraph/cost.rs (78 lines changed)

@@ -74,7 +74,7 @@ impl Cost {
     const DEPTH_BITS: u8 = 8;
     const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
     const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
-    const MAX_OP_COST: u32 = (Self::OP_COST_MASK >> Self::DEPTH_BITS) - 1;
+    const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;
 
     pub(crate) fn infinity() -> Cost {
         // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
@@ -86,14 +86,16 @@ impl Cost {
         Cost(0)
     }
 
-    /// Construct a new finite cost from the given parts.
+    /// Construct a new `Cost` from the given parts.
     ///
-    /// The opcode cost is clamped to the maximum value representable.
-    fn new_finite(opcode_cost: u32, depth: u8) -> Cost {
-        let opcode_cost = std::cmp::min(opcode_cost, Self::MAX_OP_COST);
-        let cost = Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth));
-        debug_assert_ne!(cost, Cost::infinity());
-        cost
+    /// If the opcode cost is greater than or equal to the maximum representable
+    /// opcode cost, then the resulting `Cost` saturates to infinity.
+    fn new(opcode_cost: u32, depth: u8) -> Cost {
+        if opcode_cost >= Self::MAX_OP_COST {
+            Self::infinity()
+        } else {
+            Cost(opcode_cost << Self::DEPTH_BITS | u32::from(depth))
+        }
     }
 
     fn depth(&self) -> u8 {
@@ -111,7 +113,7 @@ impl Cost {
     /// that satisfies `inst_predicates::is_pure_for_egraph()`.
     pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
         let c = pure_op_cost(op) + operand_costs.into_iter().sum();
-        Cost::new_finite(c.op_cost(), c.depth().saturating_add(1))
+        Cost::new(c.op_cost(), c.depth().saturating_add(1))
     }
 }
@@ -131,12 +133,9 @@ impl std::ops::Add<Cost> for Cost {
     type Output = Cost;
 
     fn add(self, other: Cost) -> Cost {
-        let op_cost = std::cmp::min(
-            self.op_cost().saturating_add(other.op_cost()),
-            Self::MAX_OP_COST,
-        );
+        let op_cost = self.op_cost().saturating_add(other.op_cost());
         let depth = std::cmp::max(self.depth(), other.depth());
-        Cost::new_finite(op_cost, depth)
+        Cost::new(op_cost, depth)
     }
 }
@@ -147,11 +146,11 @@ impl std::ops::Add<Cost> for Cost {
 fn pure_op_cost(op: Opcode) -> Cost {
     match op {
         // Constants.
-        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new_finite(1, 0),
+        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0),
 
         // Extends/reduces.
         Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => {
-            Cost::new_finite(2, 0)
+            Cost::new(2, 0)
         }
 
         // "Simple" arithmetic.
@@ -163,9 +162,52 @@ fn pure_op_cost(op: Opcode) -> Cost {
         | Opcode::Bnot
         | Opcode::Ishl
         | Opcode::Ushr
-        | Opcode::Sshr => Cost::new_finite(3, 0),
+        | Opcode::Sshr => Cost::new(3, 0),
 
         // Everything else (pure.)
-        _ => Cost::new_finite(4, 0),
+        _ => Cost::new(4, 0),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn add_cost() {
+        let a = Cost::new(5, 2);
+        let b = Cost::new(37, 3);
+        assert_eq!(a + b, Cost::new(42, 3));
+        assert_eq!(b + a, Cost::new(42, 3));
+    }
+
+    #[test]
+    fn add_infinity() {
+        let a = Cost::new(5, 2);
+        let b = Cost::infinity();
+        assert_eq!(a + b, Cost::infinity());
+        assert_eq!(b + a, Cost::infinity());
+    }
+
+    #[test]
+    fn op_cost_saturates_to_infinity() {
+        let a = Cost::new(Cost::MAX_OP_COST - 10, 2);
+        let b = Cost::new(11, 2);
+        assert_eq!(a + b, Cost::infinity());
+        assert_eq!(b + a, Cost::infinity());
+    }
+
+    #[test]
+    fn depth_saturates_to_max_depth() {
+        let a = Cost::new(10, u8::MAX);
+        let b = Cost::new(10, 1);
+        assert_eq!(
+            Cost::of_pure_op(Opcode::Iconst, [a, b]),
+            Cost::new(21, u8::MAX)
+        );
+        assert_eq!(
+            Cost::of_pure_op(Opcode::Iconst, [b, a]),
+            Cost::new(21, u8::MAX)
+        );
+    }
+}
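
For intuition, the packing that these tests exercise puts the op cost in the high 24 bits and the depth in the low 8 bits, so the derived `Ord` compares op cost first and breaks ties on depth. Here is a minimal standalone sketch of that scheme (a simplified mirror of the `Cost` type above, not the crate's actual API):

    // Simplified mirror of `Cost`: op cost in the high 24 bits, depth in the
    // low 8 bits, with `u32::MAX` reserved as infinity.
    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
    struct Cost(u32);

    impl Cost {
        const DEPTH_BITS: u8 = 8;
        const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
        const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
        const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;

        fn infinity() -> Cost {
            Cost(u32::MAX)
        }

        fn new(opcode_cost: u32, depth: u8) -> Cost {
            if opcode_cost >= Self::MAX_OP_COST {
                Self::infinity()
            } else {
                Cost(opcode_cost << Self::DEPTH_BITS | u32::from(depth))
            }
        }
    }

    fn main() {
        // Op cost dominates the comparison because it occupies the high bits...
        assert!(Cost::new(3, 200) < Cost::new(4, 0));
        // ...and depth breaks ties between equal op costs, preferring
        // shallower expressions.
        assert!(Cost::new(3, 0) < Cost::new(3, 1));
        // At or above MAX_OP_COST, construction saturates to infinity, which
        // is what `op_cost_saturates_to_infinity` above checks end to end.
        assert_eq!(Cost::new(Cost::MAX_OP_COST, 0), Cost::infinity());
    }

Reserving `u32::MAX` as infinity is what lets `Cost::new` saturate rather than wrap around into a valid-looking finite cost.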

cranelift/codegen/src/egraph/elaborate.rs (149 lines changed)

@@ -7,6 +7,7 @@ use super::Stats;
 use crate::dominator_tree::DominatorTree;
 use crate::fx::{FxHashMap, FxHashSet};
 use crate::hash_map::Entry as HashEntry;
+use crate::inst_predicates::is_pure_for_egraph;
 use crate::ir::{Block, Function, Inst, Value, ValueDef};
 use crate::loop_analysis::{Loop, LoopAnalysis};
 use crate::scoped_hash_map::ScopedHashMap;
@@ -216,46 +217,112 @@ impl<'a> Elaborator<'a> {
     fn compute_best_values(&mut self) {
         let best = &mut self.value_to_best_value;
-        for (value, def) in self.func.dfg.values_and_defs() {
-            trace!("computing best for value {:?} def {:?}", value, def);
-            match def {
-                ValueDef::Union(x, y) => {
-                    // Pick the best of the two options based on
-                    // min-cost. This works because each element of `best`
-                    // is a `(cost, value)` tuple; `cost` comes first so
-                    // the natural comparison works based on cost, and
-                    // breaks ties based on value number.
-                    trace!(" -> best of {:?} and {:?}", best[x], best[y]);
-                    best[value] = std::cmp::min(best[x], best[y]);
-                    trace!(" -> {:?}", best[value]);
-                }
-                ValueDef::Param(_, _) => {
-                    best[value] = BestEntry(Cost::zero(), value);
-                }
-                // If the Inst is inserted into the layout (which is,
-                // at this point, only the side-effecting skeleton),
-                // then it must be computed and thus we give it zero
-                // cost.
-                ValueDef::Result(inst, _) => {
-                    if let Some(_) = self.func.layout.inst_block(inst) {
-                        best[value] = BestEntry(Cost::zero(), value);
-                    } else {
-                        trace!(" -> value {}: result, computing cost", value);
-                        let inst_data = &self.func.dfg.insts[inst];
-                        // N.B.: at this point we know that the opcode is
-                        // pure, so `pure_op_cost`'s precondition is
-                        // satisfied.
-                        let cost = Cost::of_pure_op(
-                            inst_data.opcode(),
-                            self.func.dfg.inst_values(inst).map(|value| best[value].0),
-                        );
-                        best[value] = BestEntry(cost, value);
-                    }
-                }
-            };
-            debug_assert_ne!(best[value].0, Cost::infinity());
-            debug_assert_ne!(best[value].1, Value::reserved_value());
-            trace!("best for eclass {:?}: {:?}", value, best[value]);
-        }
+
+        // Do a fixpoint loop to compute the best value for each eclass.
+        //
+        // The maximum number of iterations is the length of the longest chain
+        // of `vNN -> vMM` edges in the dataflow graph where `NN < MM`, so this
+        // is *technically* quadratic, but `cranelift-frontend` won't construct
+        // any such edges. NaN canonicalization will introduce some of these
+        // edges, but they are chains of only two or three edges. So in
+        // practice, we *never* do more than a handful of iterations here unless
+        // (a) we parsed the CLIF from text and the text was funkily numbered,
+        // which we don't really care about, or (b) the CLIF producer did
+        // something weird, in which case it is their responsibility to stop
+        // doing that.
+        trace!("Entering fixpoint loop to compute the best values for each eclass");
+        let mut keep_going = true;
+        while keep_going {
+            keep_going = false;
+            trace!(
+                "fixpoint iteration {}",
+                self.stats.elaborate_best_cost_fixpoint_iters
+            );
+            self.stats.elaborate_best_cost_fixpoint_iters += 1;
+
+            for (value, def) in self.func.dfg.values_and_defs() {
+                trace!("computing best for value {:?} def {:?}", value, def);
+                let orig_best_value = best[value];
+
+                match def {
+                    ValueDef::Union(x, y) => {
+                        // Pick the best of the two options based on
+                        // min-cost. This works because each element of `best`
+                        // is a `(cost, value)` tuple; `cost` comes first so
+                        // the natural comparison works based on cost, and
+                        // breaks ties based on value number.
+                        best[value] = std::cmp::min(best[x], best[y]);
+                        trace!(
+                            " -> best of union({:?}, {:?}) = {:?}",
+                            best[x],
+                            best[y],
+                            best[value]
+                        );
+                    }
+                    ValueDef::Param(_, _) => {
+                        best[value] = BestEntry(Cost::zero(), value);
+                    }
+                    // If the Inst is inserted into the layout (which is,
+                    // at this point, only the side-effecting skeleton),
+                    // then it must be computed and thus we give it zero
+                    // cost.
+                    ValueDef::Result(inst, _) => {
+                        if let Some(_) = self.func.layout.inst_block(inst) {
+                            best[value] = BestEntry(Cost::zero(), value);
+                        } else {
+                            let inst_data = &self.func.dfg.insts[inst];
+                            // N.B.: at this point we know that the opcode is
+                            // pure, so `pure_op_cost`'s precondition is
+                            // satisfied.
+                            let cost = Cost::of_pure_op(
+                                inst_data.opcode(),
+                                self.func.dfg.inst_values(inst).map(|value| best[value].0),
+                            );
+                            best[value] = BestEntry(cost, value);
+                            trace!(" -> cost of value {} = {:?}", value, cost);
+                        }
+                    }
+                };
+
+                // Keep on iterating the fixpoint loop while we are finding new
+                // best values.
+                keep_going |= orig_best_value != best[value];
+            }
+        }
+
+        if cfg!(any(feature = "trace-log", debug_assertions)) {
+            trace!("finished fixpoint loop to compute best value for each eclass");
+            for value in self.func.dfg.values() {
+                trace!("-> best for eclass {:?}: {:?}", value, best[value]);
+                debug_assert_ne!(best[value].1, Value::reserved_value());
+                // You might additionally be expecting an assert that the best
+                // cost is not infinity, however infinite cost *can* happen in
+                // practice. First, note that our cost function doesn't know
+                // about any shared structure in the dataflow graph, it only
+                // sums operand costs. (And trying to avoid that by deduping a
+                // single operation's operands is a losing game because you can
+                // always just add one indirection and go from `add(x, x)` to
+                // `add(foo(x), bar(x))` to hide the shared structure.) Given
+                // that blindness to sharing, we can make cost grow
+                // exponentially with a linear sequence of operations:
+                //
+                //     v0 = iconst.i32 1    ;; cost = 1
+                //     v1 = iadd v0, v0     ;; cost = 3 + 1 + 1
+                //     v2 = iadd v1, v1     ;; cost = 3 + 5 + 5
+                //     v3 = iadd v2, v2     ;; cost = 3 + 13 + 13
+                //     v4 = iadd v3, v3     ;; cost = 3 + 29 + 29
+                //     v5 = iadd v4, v4     ;; cost = 3 + 61 + 61
+                //     v6 = iadd v5, v5     ;; cost = 3 + 125 + 125
+                //     ;; etc...
+                //
+                // Such a chain can cause cost to saturate to infinity. How do
+                // we choose which e-node is best when there are multiple that
+                // have saturated to infinity? It doesn't matter. As long as
+                // invariant (2) for optimization rules is upheld by our rule
+                // set (see `cranelift/codegen/src/opts/README.md`) it is safe
+                // to choose *any* e-node in the e-class. At worst we will
+                // produce suboptimal code, but never an incorrectness.
+            }
+        }
     }
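
The loop above is a standard "iterate until nothing changes" fixpoint. A minimal sketch of the idea, assuming plain `u32` costs and an explicit union-edge list rather than Cranelift's actual `DataFlowGraph` types:

    // `unions[v]` lists the values that value `v` is union'd with; `cost[v]`
    // starts at the value's own cost (`u32::MAX` = not yet known).
    fn fixpoint_best_costs(unions: &[Vec<usize>], mut cost: Vec<u32>) -> Vec<u32> {
        let mut keep_going = true;
        while keep_going {
            keep_going = false;
            for v in 0..cost.len() {
                let orig = cost[v];
                for &u in &unions[v] {
                    // A union node is as cheap as its cheapest member.
                    cost[v] = cost[v].min(cost[u]);
                }
                // Iterate again whenever any entry improved this pass.
                keep_going |= cost[v] != orig;
            }
        }
        cost
    }

    fn main() {
        // v0 refers forward to v2 (an edge from a lower- to a higher-numbered
        // value), which is exactly what forces a second pass.
        let unions = vec![vec![2], vec![], vec![1]];
        let cost = vec![u32::MAX, 7, u32::MAX];
        assert_eq!(fixpoint_best_costs(&unions, cost), vec![7, 7, 7]);
    }

A single forward pass would leave `cost[0]` unknown, because `cost[2]` has not yet been improved when `v0` is visited; the second pass propagates it. That is the `vNN -> vMM` situation the comment above describes, and why iteration count is bounded by the longest such chain.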
@@ -606,7 +673,13 @@
             }
             inst
         };
 
         // Place the inst just before `before`.
+        debug_assert!(
+            is_pure_for_egraph(self.func, inst),
+            "something has gone very wrong if we are elaborating effectful \
+             instructions, they should have remained in the skeleton"
+        );
         self.func.layout.insert_inst(inst, before);
 
         // Update the inst's arguments.
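
The infinite-cost comment in `compute_best_values` above follows the recurrence `cost(n) = 3 + 2 * cost(n - 1)` with `cost(0) = 1`, so cost roughly doubles per instruction. A throwaway calculation (assuming the 24-bit op-cost budget implied by `MAX_OP_COST` in `cost.rs` above) shows how few instructions it takes to saturate:

    fn main() {
        // cost(0) = 1 for the iconst; each iadd that uses the previous value
        // twice costs 3 plus both operand costs: cost(n) = 3 + 2 * cost(n-1).
        const MAX_OP_COST: u64 = (1 << 24) - 1; // 24 bits of op cost
        let mut cost: u64 = 1;
        let mut n = 0;
        while cost < MAX_OP_COST {
            cost = 3 + 2 * cost;
            n += 1;
        }
        // Prints n = 23: a couple dozen chained iadds already saturate the
        // cost to infinity, so elaboration must tolerate infinite costs.
        println!("cost saturates after {n} iadds (cost = {cost})");
    }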

cranelift/codegen/src/opts/README.md (86 lines changed)

@@ -1,5 +1,81 @@
-Rules here are allowed to rewrite pure expressions arbitrarily,
-using the same inputs as the original, or fewer. In other words, we
-cannot pull a new eclass id out of thin air and refer to it, other
-than a piece of the input or a new node that we construct; but we
-can freely rewrite e.g. `x+y-y` to `x`.
+# Rules for Writing Optimization Rules
+
+For both correctness and compile speed, we must be careful with our rules. A lot
+of it boils down to the fact that, unlike traditional e-graphs, our rules are
+*directional*.
+
+1. Rules should not rewrite to worse code: the right-hand side should be at
+   least as good as the left-hand side or better.
+
+   For example, the rule
+
+       x => (add x 0)
+
+   is disallowed, but swapping its left- and right-hand sides produces a rule
+   that is allowed.
+
+   Any kind of canonicalizing rule that intends to help subsequent rules match
+   and unlock further optimizations (e.g. floating constants to the right side
+   for our constant-propagation rules to match) must produce canonicalized
+   output that is no worse than its noncanonical input.
+
+   We assume this invariant as a heuristic to break ties between two
+   otherwise-equal-cost expressions in various places, making up for some
+   limitations of our explicit cost function.
+
+2. Any rule that removes value-uses in its right-hand side that previously
+   existed in its left-hand side MUST use `subsume`.
+
+   For example, the rule
+
+       (select 1 x y) => x
+
+   MUST use `subsume`.
+
+   This is required for correctness because, once a value-use is removed, some
+   e-nodes in the e-class are more equal than others. There might be uses of `x`
+   in a scope where `y` is not available, and so emitting `(select 1 x y)` in
+   place of `x` in such cases would introduce uses of `y` where it is not
+   defined.
+
+3. Avoid overly general rewrites like commutativity and associativity. Instead,
+   prefer targeted instances of the rewrite (for example, canonicalizing adds
+   where one operand is a constant such that the constant is always the add's
+   second operand, rather than general commutativity for adds) or even writing
+   the "same" optimization rule multiple times.
+
+   For example, the commutativity in the first rule in the following snippet is
+   bad because it will match even when the first operand is not an add:
+
+       ;; Commute to allow `(foo (add ...) x)`, when we see it, to match.
+       (foo x y) => (foo y x)
+
+       ;; Optimize.
+       (foo x (add ...)) => (bar x)
+
+   Better is to commute only when we know that canonicalizing in this way will
+   definitely allow the subsequent optimization rule to match:
+
+       ;; Canonicalize all adds to `foo`'s second operand.
+       (foo (add ...) x) => (foo x (add ...))
+
+       ;; Optimize.
+       (foo x (add ...)) => (bar x)
+
+   But even better in this case is to write the "same" optimization multiple
+   times:
+
+       (foo (add ...) x) => (bar x)
+       (foo x (add ...)) => (bar x)
+
+   The cost of rule-matching is amortized by the ISLE compiler, whereas the
+   intermediate result of each rewrite allocates new e-nodes and requires
+   storage in the dataflow graph. Therefore, additional rules are cheaper than
+   additional e-nodes.
+
+   Commutativity and associativity in particular can cause huge amounts of
+   e-graph bloat.
+
+   One day we intend to extend ISLE with built-in support for commutativity, so
+   we don't need to author the redundant commutations ourselves:
+   https://github.com/bytecodealliance/wasmtime/issues/6128

cranelift/codegen/src/opts/cprop.isle (10 lines changed)

@@ -167,12 +167,10 @@
         (bxor ty (bxor ty x k1 @ (iconst ty _)) k2 @ (iconst ty _)))
       (bxor ty x (bxor ty k1 k2)))
 
-(rule (simplify
-       (select ty (iconst_u _ (u64_nonzero _)) x y))
-      x)
-
-(rule (simplify
-       (select ty (iconst_u _ 0) x y))
-      y)
+(rule (simplify (select ty (iconst_u _ (u64_nonzero _)) x _))
+      (subsume x))
+
+(rule (simplify (select ty (iconst_u _ 0) _ y))
+      (subsume y))
 
 ;; Replace subtraction by a "negative" constant with addition.
 ;; Notably, this gives `x - (-1) == x + 1`, so other patterns don't have to

cranelift/filetests/filetests/egraph/issue-7875.clif (37 lines added)

@@ -0,0 +1,37 @@
+test optimize
+set enable_verifier=true
+set opt_level=speed
+target x86_64
+
+;; This test case should optimize just fine, and should definitely not produce
+;; CLIF that has verifier errors like
+;;
+;;     error: inst10 (v12 = select.f32 v11, v4, v10 ; v11 = 1): uses value arg
+;;     from non-dominating block4
+
+function %foo() {
+block0:
+    v0 = iconst.i64 0
+    v2 = f32const 0.0
+    v9 = f32const 0.0
+    v20 = fneg v2
+    v18 = fcmp eq v20, v20
+    v4 = select v18, v2, v20
+    v8 = iconst.i32 0
+    v11 = iconst.i32 1
+    brif v0, block2, block3
+
+block2:
+    brif.i32 v8, block4(v2), block4(v9)
+
+block4(v10: f32):
+    v12 = select.f32 v11, v4, v10
+    v13 = bitcast.i32 v12
+    store v13, v0
+    trap user0
+
+block3:
+    v15 = bitcast.i32 v4
+    store v15, v0
+    return
+}