/src/wasmtime/cranelift/codegen/src/remove_constant_phis.rs
Line | Count | Source |
1 | | //! A Constant-Phi-Node removal pass. |
2 | | |
3 | | use crate::dominator_tree::DominatorTree; |
4 | | use crate::ir; |
5 | | use crate::ir::Function; |
6 | | use crate::ir::{Block, BlockArg, BlockCall, Inst, Value}; |
7 | | use crate::timing; |
8 | | use crate::{FxHashMap, FxHashSet}; |
9 | | use bumpalo::Bump; |
10 | | use cranelift_entity::SecondaryMap; |
11 | | use smallvec::SmallVec; |
12 | | |
13 | | // A note on notation. For the sake of clarity, this file uses the phrase |
14 | | // "formal parameters" to mean the `Value`s listed in the block head, and |
15 | | // "actual parameters" to mean the `Value`s passed in a branch or a jump: |
16 | | // |
17 | | // block4(v16: i32, v18: i32): <-- formal parameters |
18 | | // ... |
19 | | // brif v27, block7(v22, v24), block6 <-- actual parameters |
20 | | |
21 | | // This transformation pass (conceptually) partitions all values in the |
22 | | // function into two groups: |
23 | | // |
24 | | // * Group A: values defined by block formal parameters, except for the entry block. |
25 | | // |
26 | | // * Group B: All other values: that is, values defined by instructions, |
27 | | // and the formals of the entry block. |
28 | | // |
29 | | // For each value in Group A, it attempts to establish whether it will have |
30 | | // the value of exactly one member of Group B. If so, the formal parameter is |
31 | | // deleted, all corresponding actual parameters (in jumps/branches to the |
32 | | // defining block) are deleted, and a rename is inserted. |
33 | | // |
34 | | // The entry block is special-cased because (1) we don't know what values flow |
35 | | // to its formals and (2) in any case we can't change its formals. |
36 | | // |
37 | | // Work proceeds in three phases. |
38 | | // |
39 | | // * Phase 1: examine all instructions. For each block, make up a useful |
40 | | // grab-bag of information, `BlockSummary`, that summarises the block's |
41 | | // formals and jump/branch instruction. This is used by Phases 2 and 3. |
42 | | // |
43 | | // * Phase 2: for each value in Group A, try to find a single Group B value |
44 | | // that flows to it. This is done using a classical iterative forward |
45 | | // dataflow analysis over a simple constant-propagation style lattice. It |
46 | | // converges quickly in practice -- I have seen at most 4 iterations. This |
47 | | // is relatively cheap because the iteration is done over the |
48 | | // `BlockSummary`s, and does not visit each instruction. The resulting |
49 | | // fixed point is stored in a `SolverState`. |
50 | | // |
51 | | // * Phase 3: using the `SolverState` and `BlockSummary`, edit the function to |
52 | | // remove redundant formals and actuals, and to insert suitable renames. |
53 | | // |
54 | | // Note that the effectiveness of the analysis depends on on the fact that |
55 | | // there are no copy instructions in Cranelift's IR. If there were, the |
56 | | // computation of `actual_absval` in Phase 2 would have to be extended to |
57 | | // chase through such copies. |
58 | | // |
59 | | // For large functions, the analysis cost using the new AArch64 backend is about |
60 | | // 0.6% of the non-optimising compile time, as measured by instruction counts. |
61 | | // This transformation usually pays for itself several times over, though, by |
62 | | // reducing the isel/regalloc cost downstream. Gains of up to 7% have been |
63 | | // seen for large functions. |
64 | | |
65 | | /// The `Value`s (Group B) that can flow to a formal parameter (Group A). |
66 | | #[derive(Clone, Copy, Debug, PartialEq)] |
67 | | enum AbstractValue { |
68 | | /// Two or more values flow to this formal. |
69 | | Many, |
70 | | |
71 | | /// Exactly one value, as stated, flows to this formal. The `Value`s that |
72 | | /// can appear here are exactly: `Value`s defined by `Inst`s, plus the |
73 | | /// `Value`s defined by the formals of the entry block. Note that this is |
74 | | /// exactly the set of `Value`s that are *not* tracked in the solver below |
75 | | /// (see `SolverState`). |
76 | | One(Value /*Group B*/), |
77 | | |
78 | | /// No value flows to this formal. |
79 | | None, |
80 | | } |
81 | | |
82 | | impl AbstractValue { |
83 | 40.6M | fn join(self, other: AbstractValue) -> AbstractValue { |
84 | 40.6M | match (self, other) { |
85 | | // Joining with `None` has no effect |
86 | 4.45M | (AbstractValue::None, p2) => p2, |
87 | 0 | (p1, AbstractValue::None) => p1, |
88 | | // Joining with `Many` produces `Many` |
89 | 15.8M | (AbstractValue::Many, _p2) => AbstractValue::Many, |
90 | 290k | (_p1, AbstractValue::Many) => AbstractValue::Many, |
91 | | // The only interesting case |
92 | 20.0M | (AbstractValue::One(v1), AbstractValue::One(v2)) => { |
93 | 20.0M | if v1 == v2 { |
94 | 19.1M | AbstractValue::One(v1) |
95 | | } else { |
96 | 896k | AbstractValue::Many |
97 | | } |
98 | | } |
99 | | } |
100 | 40.6M | } |
101 | | |
102 | 19.1M | fn is_one(self) -> bool { |
103 | 19.1M | matches!(self, AbstractValue::One(_)) |
104 | 19.1M | } |
105 | | } |
106 | | |
107 | | #[derive(Clone, Copy, Debug)] |
108 | | struct OutEdge<'a> { |
109 | | /// An instruction that transfers control. |
110 | | inst: Inst, |
111 | | /// The index into branch_destinations for this instruction that corresponds |
112 | | /// to this edge. |
113 | | branch_index: u32, |
114 | | /// The block that control is transferred to. |
115 | | block: Block, |
116 | | /// The arguments to that block. |
117 | | /// |
118 | | /// These values can be from both groups A and B. |
119 | | args: &'a [BlockArg], |
120 | | } |
121 | | |
122 | | impl<'a> OutEdge<'a> { |
123 | | /// Construct a new `OutEdge` for the given instruction. |
124 | | /// |
125 | | /// Returns `None` if this is an edge without any block arguments, which |
126 | | /// means we can ignore it for this analysis's purposes. |
127 | | #[inline] |
128 | 13.0M | fn new( |
129 | 13.0M | bump: &'a Bump, |
130 | 13.0M | dfg: &ir::DataFlowGraph, |
131 | 13.0M | inst: Inst, |
132 | 13.0M | branch_index: usize, |
133 | 13.0M | block: BlockCall, |
134 | 13.0M | ) -> Option<Self> { |
135 | 13.0M | let inst_var_args = block.args(&dfg.value_lists); |
136 | | |
137 | | // Skip edges without params. |
138 | 13.0M | if inst_var_args.len() == 0 { |
139 | 9.13M | return None; |
140 | 3.94M | } |
141 | | |
142 | | Some(OutEdge { |
143 | 3.94M | inst, |
144 | 3.94M | branch_index: branch_index as u32, |
145 | 3.94M | block: block.block(&dfg.value_lists), |
146 | 3.94M | args: bump.alloc_slice_fill_iter( |
147 | 17.7M | inst_var_args.map(|arg| arg.map_value(|value| dfg.resolve_aliases(value))), |
148 | | ), |
149 | | }) |
150 | 13.0M | } |
151 | | } |
152 | | |
153 | | /// For some block, a useful bundle of info. The `Block` itself is not stored |
154 | | /// here since it will be the key in the associated `FxHashMap` -- see |
155 | | /// `summaries` below. For the `SmallVec` tuning params: most blocks have |
156 | | /// few parameters, hence `4`. And almost all blocks have either one or two |
157 | | /// successors, hence `2`. |
158 | | #[derive(Clone, Debug, Default)] |
159 | | struct BlockSummary<'a> { |
160 | | /// Formal parameters for this `Block`. |
161 | | /// |
162 | | /// These values are from group A. |
163 | | formals: &'a [Value], |
164 | | |
165 | | /// Each outgoing edge from this block. |
166 | | /// |
167 | | /// We don't bother to include transfers that pass zero parameters |
168 | | /// since that makes more work for the solver for no purpose. |
169 | | /// |
170 | | /// We optimize for the case where a branch instruction has up to two |
171 | | /// outgoing edges, as unconditional jumps and conditional branches are |
172 | | /// more prominent than br_table. |
173 | | dests: SmallVec<[OutEdge<'a>; 2]>, |
174 | | } |
175 | | |
176 | | impl<'a> BlockSummary<'a> { |
177 | | /// Construct a new `BlockSummary`, using `values` as its backing storage. |
178 | | #[inline] |
179 | 8.52M | fn new(bump: &'a Bump, formals: &[Value]) -> Self { |
180 | 8.52M | Self { |
181 | 8.52M | formals: bump.alloc_slice_copy(formals), |
182 | 8.52M | dests: Default::default(), |
183 | 8.52M | } |
184 | 8.52M | } |
185 | | } |
186 | | |
187 | | /// Solver state. This holds a AbstractValue for each formal parameter, except |
188 | | /// for those from the entry block. |
189 | | struct SolverState { |
190 | | absvals: FxHashMap<Value /*Group A*/, AbstractValue>, |
191 | | } |
192 | | |
193 | | impl SolverState { |
194 | 1.07M | fn new() -> Self { |
195 | 1.07M | Self { |
196 | 1.07M | absvals: FxHashMap::default(), |
197 | 1.07M | } |
198 | 1.07M | } |
199 | | |
200 | 58.3M | fn get(&self, actual: Value) -> AbstractValue { |
201 | 58.3M | *self |
202 | 58.3M | .absvals |
203 | 58.3M | .get(&actual) |
204 | 58.3M | .unwrap_or_else(|| panic!("SolverState::get: formal param {actual:?} is untracked?!")) |
205 | 58.3M | } |
206 | | |
207 | 39.9M | fn maybe_get(&self, actual: Value) -> Option<&AbstractValue> { |
208 | 39.9M | self.absvals.get(&actual) |
209 | 39.9M | } |
210 | | |
211 | 5.64M | fn set(&mut self, actual: Value, lp: AbstractValue) { |
212 | 5.64M | match self.absvals.insert(actual, lp) { |
213 | 5.64M | Some(_old_lp) => {} |
214 | 0 | None => panic!("SolverState::set: formal param {actual:?} is untracked?!"), |
215 | | } |
216 | 5.64M | } |
217 | | } |
218 | | |
219 | | /// Detect phis in `func` that will only ever produce one value, using a |
220 | | /// classic forward dataflow analysis. Then remove them. |
221 | | #[inline(never)] |
222 | 1.07M | pub fn do_remove_constant_phis(func: &mut Function, domtree: &mut DominatorTree) { |
223 | 1.07M | let _tt = timing::remove_constant_phis(); |
224 | 1.07M | debug_assert!(domtree.is_valid()); |
225 | | |
226 | | // Phase 1 of 3: for each block, make a summary containing all relevant |
227 | | // info. The solver will iterate over the summaries, rather than having |
228 | | // to inspect each instruction in each block. |
229 | 1.07M | let bump = |
230 | 1.07M | Bump::with_capacity(domtree.cfg_postorder().len() * 4 * core::mem::size_of::<Value>()); |
231 | 1.07M | let mut summaries = |
232 | 1.07M | SecondaryMap::<Block, BlockSummary>::with_capacity(domtree.cfg_postorder().len()); |
233 | | |
234 | 8.52M | for b in domtree.cfg_rpo().copied() { |
235 | 8.52M | let formals = func.dfg.block_params(b); |
236 | 8.52M | let mut summary = BlockSummary::new(&bump, formals); |
237 | | |
238 | 76.3M | for inst in func.layout.block_insts(b) { |
239 | 76.3M | for (ix, dest) in func.dfg.insts[inst] |
240 | 76.3M | .branch_destination(&func.dfg.jump_tables, &func.dfg.exception_tables) |
241 | 76.3M | .iter() |
242 | 76.3M | .enumerate() |
243 | | { |
244 | 13.0M | if let Some(edge) = OutEdge::new(&bump, &func.dfg, inst, ix, *dest) { |
245 | 3.94M | summary.dests.push(edge); |
246 | 9.13M | } |
247 | | } |
248 | | } |
249 | | |
250 | | // Ensure the invariant that all blocks (except for the entry) appear |
251 | | // in the summary, *unless* they have neither formals nor any |
252 | | // param-carrying branches/jumps. |
253 | 8.52M | if formals.len() > 0 || summary.dests.len() > 0 { |
254 | 3.75M | summaries[b] = summary; |
255 | 4.77M | } |
256 | | } |
257 | | |
258 | | // Phase 2 of 3: iterate over the summaries in reverse postorder, |
259 | | // computing new `AbstractValue`s for each tracked `Value`. The set of |
260 | | // tracked `Value`s is exactly Group A as described above. |
261 | | |
262 | 1.07M | let entry_block = func |
263 | 1.07M | .layout |
264 | 1.07M | .entry_block() |
265 | 1.07M | .expect("remove_constant_phis: entry block unknown"); |
266 | | |
267 | | // Set up initial solver state |
268 | 1.07M | let mut state = SolverState::new(); |
269 | | |
270 | 8.52M | for b in domtree.cfg_rpo().copied() { |
271 | | // For each block, get the formals |
272 | 8.52M | if b == entry_block { |
273 | 1.07M | continue; |
274 | 7.45M | } |
275 | 7.45M | let formals = func.dfg.block_params(b); |
276 | 7.45M | for formal in formals { |
277 | 4.45M | let mb_old_absval = state.absvals.insert(*formal, AbstractValue::None); |
278 | 4.45M | assert!(mb_old_absval.is_none()); |
279 | | } |
280 | | } |
281 | | |
282 | | // Solve: repeatedly traverse the blocks in reverse postorder, until there |
283 | | // are no changes. |
284 | 1.07M | let mut iter_no = 0; |
285 | | loop { |
286 | 1.46M | iter_no += 1; |
287 | 1.46M | let mut changed = false; |
288 | | |
289 | 15.3M | for src in domtree.cfg_rpo().copied() { |
290 | 15.3M | let src_summary = &summaries[src]; |
291 | 15.3M | for edge in &src_summary.dests { |
292 | 8.48M | assert!(edge.block != entry_block); |
293 | | // By contrast, the dst block must have a summary. Phase 1 |
294 | | // will have only included an entry in `src_summary.dests` if |
295 | | // that branch/jump carried at least one parameter. So the |
296 | | // dst block does take parameters, so it must have a summary. |
297 | 8.48M | let dst_summary = &summaries[edge.block]; |
298 | 8.48M | let dst_formals = &dst_summary.formals; |
299 | 8.48M | assert_eq!(edge.args.len(), dst_formals.len()); |
300 | 40.6M | for (formal, actual) in dst_formals.iter().zip(edge.args) { |
301 | | // Find the abstract value for `actual`. If it is a block |
302 | | // formal parameter then the most recent abstract value is |
303 | | // to be found in the solver state. If not, then it's a |
304 | | // real value defining point (not a phi), in which case |
305 | | // return it itself. |
306 | 40.6M | let actual_absval = match actual { |
307 | 39.9M | BlockArg::Value(actual) => match state.maybe_get(*actual) { |
308 | 26.8M | Some(pt) => *pt, |
309 | 13.1M | None => AbstractValue::One(*actual), |
310 | | }, |
311 | 668k | _ => AbstractValue::Many, |
312 | | }; |
313 | | |
314 | | // And `join` the new value with the old. |
315 | 40.6M | let formal_absval_old = state.get(*formal); |
316 | 40.6M | let formal_absval_new = formal_absval_old.join(actual_absval); |
317 | 40.6M | if formal_absval_new != formal_absval_old { |
318 | 5.64M | changed = true; |
319 | 5.64M | state.set(*formal, formal_absval_new); |
320 | 34.9M | } |
321 | | } |
322 | | } |
323 | | } |
324 | | |
325 | 1.46M | if !changed { |
326 | 1.07M | break; |
327 | 397k | } |
328 | | } |
329 | | |
330 | 1.07M | let mut n_consts = 0; |
331 | 4.45M | for absval in state.absvals.values() { |
332 | 4.45M | if absval.is_one() { |
333 | 2.53M | n_consts += 1; |
334 | 2.53M | } |
335 | | } |
336 | | |
337 | | // Phase 3 of 3: edit the function to remove constant formals, using the |
338 | | // summaries and the final solver state as a guide. |
339 | | |
340 | | // Make up a set of blocks that need editing. |
341 | 1.07M | let mut need_editing = FxHashSet::<Block>::default(); |
342 | 7.31M | for (block, summary) in summaries.iter() { |
343 | 7.31M | if block == entry_block { |
344 | 1.05M | continue; |
345 | 6.26M | } |
346 | 6.26M | for formal in summary.formals { |
347 | 2.17M | let formal_absval = state.get(*formal); |
348 | 2.17M | if formal_absval.is_one() { |
349 | 528k | need_editing.insert(block); |
350 | 528k | break; |
351 | 1.64M | } |
352 | | } |
353 | | } |
354 | | |
355 | | // Firstly, deal with the formals. For each formal which is redundant, |
356 | | // remove it, and also add a reroute from it to the constant value which |
357 | | // it we know it to be. |
358 | 1.07M | for b in &need_editing { |
359 | 528k | let mut del_these = SmallVec::<[(Value, Value); 32]>::new(); |
360 | 528k | let formals: &[Value] = func.dfg.block_params(*b); |
361 | 3.03M | for formal in formals { |
362 | | // The state must give an absval for `formal`. |
363 | 3.03M | if let AbstractValue::One(replacement_val) = state.get(*formal) { |
364 | 2.53M | del_these.push((*formal, replacement_val)); |
365 | 2.53M | } |
366 | | } |
367 | | // We can delete the formals in any order. However, |
368 | | // `remove_block_param` works by sliding backwards all arguments to |
369 | | // the right of the value it is asked to delete. Hence when removing more |
370 | | // than one formal, it is significantly more efficient to ask it to |
371 | | // remove the rightmost formal first, and hence this `rev()`. |
372 | 2.53M | for (redundant_formal, replacement_val) in del_these.into_iter().rev() { |
373 | 2.53M | func.dfg.remove_block_param(redundant_formal); |
374 | 2.53M | func.dfg.change_to_alias(redundant_formal, replacement_val); |
375 | 2.53M | } |
376 | | } |
377 | | |
378 | | // Secondly, visit all branch insns. If the destination has had its |
379 | | // formals changed, change the actuals accordingly. Don't scan all insns, |
380 | | // rather just visit those as listed in the summaries we prepared earlier. |
381 | 1.07M | let mut old_actuals = alloc::vec::Vec::new(); |
382 | 7.31M | for summary in summaries.values() { |
383 | 7.31M | for edge in &summary.dests { |
384 | 3.94M | if !need_editing.contains(&edge.block) { |
385 | 1.90M | continue; |
386 | 2.04M | } |
387 | | |
388 | 2.04M | let dfg = &mut func.dfg; |
389 | 2.04M | let dests = dfg.insts[edge.inst] |
390 | 2.04M | .branch_destination_mut(&mut dfg.jump_tables, &mut dfg.exception_tables); |
391 | 2.04M | let block = &mut dests[edge.branch_index as usize]; |
392 | | |
393 | 2.04M | old_actuals.extend(block.args(&dfg.value_lists)); |
394 | | |
395 | | // Check that the numbers of arguments make sense. |
396 | 2.04M | let formals = &summaries[edge.block].formals; |
397 | 2.04M | assert_eq!(formals.len(), old_actuals.len()); |
398 | | |
399 | | // Filter out redundant block arguments. |
400 | 2.04M | let mut formals = formals.iter(); |
401 | 12.5M | old_actuals.retain(|_| { |
402 | 12.5M | let formal_i = formals.next().unwrap(); |
403 | 12.5M | !state.get(*formal_i).is_one() |
404 | 12.5M | }); |
405 | | |
406 | | // Replace the block with a new one that only includes the non-redundant arguments. |
407 | | // This leaks the value list from the old block, |
408 | | // https://github.com/bytecodealliance/wasmtime/issues/5451 for more information. |
409 | 2.04M | let destination = block.block(&dfg.value_lists); |
410 | 2.04M | *block = BlockCall::new(destination, old_actuals.drain(..), &mut dfg.value_lists); |
411 | | } |
412 | | } |
413 | | |
414 | 1.07M | log::debug!( |
415 | | "do_remove_constant_phis: done, {} iters. {} formals, of which {} const.", |
416 | | iter_no, |
417 | 0 | state.absvals.len(), |
418 | | n_consts |
419 | | ); |
420 | 1.07M | } |