/src/regex/regex-automata/src/dfa/minimize.rs
Line | Count | Source |
1 | | use core::{cell::RefCell, fmt, mem}; |
2 | | |
3 | | use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; |
4 | | |
5 | | use crate::{ |
6 | | dfa::{automaton::Automaton, dense, DEAD}, |
7 | | util::{ |
8 | | alphabet, |
9 | | primitives::{PatternID, StateID}, |
10 | | }, |
11 | | }; |
12 | | |
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in a way
///    that can create a lot of redundant states. (Some work has been done on
///    this point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    // The dense DFA being minimized in place.
    dfa: &'a mut dense::OwnedDFA,
    // For each state (by index) and each alphabet unit (by its usize value),
    // the list of states with a transition into that state on that unit.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    // The current partitioning of states into equivalence-class candidates.
    partitions: Vec<StateSet>,
    // The work list of sets still to be used as "splitters" by Hopcroft's
    // algorithm.
    waiting: Vec<StateSet>,
}
50 | | |
51 | | impl<'a> fmt::Debug for Minimizer<'a> { |
52 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
53 | 0 | f.debug_struct("Minimizer") |
54 | 0 | .field("dfa", &self.dfa) |
55 | 0 | .field("in_transitions", &self.in_transitions) |
56 | 0 | .field("partitions", &self.partitions) |
57 | 0 | .field("waiting", &self.waiting) |
58 | 0 | .finish() |
59 | 0 | } |
60 | | } |
61 | | |
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // Sorted, deduplicated state IDs (after `canonicalize`). `Rc<RefCell>`
    // gives cheap shared ownership; `clone` shares storage, `deep_clone`
    // copies it.
    ids: Rc<RefCell<Vec<StateID>>>,
}
78 | | |
79 | | impl<'a> Minimizer<'a> { |
80 | 0 | pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { |
81 | 0 | let in_transitions = Minimizer::incoming_transitions(dfa); |
82 | 0 | let partitions = Minimizer::initial_partitions(dfa); |
83 | 0 | let waiting = partitions.clone(); |
84 | 0 | Minimizer { dfa, in_transitions, partitions, waiting } |
85 | 0 | } |
86 | | |
    /// Run Hopcroft's algorithm and then rewrite the underlying DFA in place
    /// so that each equivalence class of states is collapsed into a single
    /// representative state. Consumes the minimizer.
    pub fn run(mut self) {
        // State IDs in a dense DFA are state indexes shifted left by stride2,
        // so these closures convert between the two forms.
        let stride2 = self.dfa.stride2();
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        // Scratch sets reused across iterations to avoid re-allocating.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be rather large optimization. On
                // the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // This partition is split by 'incoming' into the
                    // non-empty pieces 'x' and 'y'.
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    // Hopcroft's splitting rule: if the old partition is
                    // still waiting, replace it with both pieces; otherwise
                    // only the smaller piece needs to be processed.
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap in the refined partitioning and reuse the old Vec's
                // allocation for the next round.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimal_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                // Start states remapped to DEAD don't count toward the
                // special start range.
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }
278 | | |
279 | 0 | fn find_waiting(&self, set: &StateSet) -> Option<usize> { |
280 | 0 | self.waiting.iter().position(|s| s == set) |
281 | 0 | } |
282 | | |
    /// Compute the set of all states with a transition on alphabet unit `b`
    /// into any state in `set`, writing the result to `incoming`.
    ///
    /// `incoming` is cleared first, and canonicalized (sorted and deduped)
    /// before returning, since different states in `set` may share
    /// predecessors.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            // Look up the precomputed reverse transitions for this state
            // and this alphabet unit.
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }
299 | | |
300 | 0 | fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> { |
301 | | // For match states, we know that two match states with different |
302 | | // pattern ID lists will *always* be distinct, so we can partition them |
303 | | // initially based on that. |
304 | 0 | let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new(); |
305 | 0 | let mut is_quit = StateSet::empty(); |
306 | 0 | let mut no_match = StateSet::empty(); |
307 | 0 | for state in dfa.states() { |
308 | 0 | if dfa.is_match_state(state.id()) { |
309 | 0 | let mut pids = vec![]; |
310 | 0 | for i in 0..dfa.match_len(state.id()) { |
311 | 0 | pids.push(dfa.match_pattern(state.id(), i)); |
312 | 0 | } |
313 | 0 | matching |
314 | 0 | .entry(pids) |
315 | 0 | .or_insert(StateSet::empty()) |
316 | 0 | .add(state.id()); |
317 | 0 | } else if dfa.is_quit_state(state.id()) { |
318 | 0 | is_quit.add(state.id()); |
319 | 0 | } else { |
320 | 0 | no_match.add(state.id()); |
321 | 0 | } |
322 | | } |
323 | | |
324 | 0 | let mut sets: Vec<StateSet> = |
325 | 0 | matching.into_iter().map(|(_, set)| set).collect(); |
326 | 0 | sets.push(no_match); |
327 | 0 | sets.push(is_quit); |
328 | 0 | sets |
329 | 0 | } |
330 | | |
331 | 0 | fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> { |
332 | 0 | let mut incoming = vec![]; |
333 | 0 | for _ in dfa.states() { |
334 | 0 | incoming.push(vec![vec![]; dfa.alphabet_len()]); |
335 | 0 | } |
336 | 0 | for state in dfa.states() { |
337 | 0 | for (b, next) in state.transitions() { |
338 | 0 | incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); |
339 | 0 | } |
340 | | } |
341 | 0 | incoming |
342 | 0 | } |
343 | | } |
344 | | |
345 | | impl StateSet { |
346 | 0 | fn empty() -> StateSet { |
347 | 0 | StateSet { ids: Rc::new(RefCell::new(vec![])) } |
348 | 0 | } |
349 | | |
350 | 0 | fn add(&mut self, id: StateID) { |
351 | 0 | self.ids.borrow_mut().push(id); |
352 | 0 | } |
353 | | |
354 | 0 | fn min(&self) -> StateID { |
355 | 0 | self.ids.borrow()[0] |
356 | 0 | } |
357 | | |
358 | 0 | fn canonicalize(&mut self) { |
359 | 0 | self.ids.borrow_mut().sort(); |
360 | 0 | self.ids.borrow_mut().dedup(); |
361 | 0 | } |
362 | | |
363 | 0 | fn clear(&mut self) { |
364 | 0 | self.ids.borrow_mut().clear(); |
365 | 0 | } |
366 | | |
367 | 0 | fn len(&self) -> usize { |
368 | 0 | self.ids.borrow().len() |
369 | 0 | } |
370 | | |
371 | 0 | fn is_empty(&self) -> bool { |
372 | 0 | self.len() == 0 |
373 | 0 | } |
374 | | |
375 | 0 | fn deep_clone(&self) -> StateSet { |
376 | 0 | let ids = self.ids.borrow().iter().cloned().collect(); |
377 | 0 | StateSet { ids: Rc::new(RefCell::new(ids)) } |
378 | 0 | } |
379 | | |
380 | 0 | fn iter<F: FnMut(StateID)>(&self, mut f: F) { |
381 | 0 | for &id in self.ids.borrow().iter() { |
382 | 0 | f(id); |
383 | 0 | } |
384 | 0 | } Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::StateSet>::subtract::{closure#0}> Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::Minimizer>::find_incoming_to::{closure#0}> Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::Minimizer>::run::{closure#2}> |
385 | | |
386 | 0 | fn intersection(&self, other: &StateSet, dest: &mut StateSet) { |
387 | 0 | dest.clear(); |
388 | 0 | if self.is_empty() || other.is_empty() { |
389 | 0 | return; |
390 | 0 | } |
391 | | |
392 | 0 | let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
393 | 0 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
394 | 0 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
395 | | loop { |
396 | 0 | if a == b { |
397 | 0 | dest.add(a); |
398 | 0 | a = match ita.next() { |
399 | 0 | None => break, |
400 | 0 | Some(a) => a, |
401 | | }; |
402 | 0 | b = match itb.next() { |
403 | 0 | None => break, |
404 | 0 | Some(b) => b, |
405 | | }; |
406 | 0 | } else if a < b { |
407 | 0 | a = match ita.next() { |
408 | 0 | None => break, |
409 | 0 | Some(a) => a, |
410 | | }; |
411 | | } else { |
412 | 0 | b = match itb.next() { |
413 | 0 | None => break, |
414 | 0 | Some(b) => b, |
415 | | }; |
416 | | } |
417 | | } |
418 | 0 | } |
419 | | |
420 | 0 | fn subtract(&self, other: &StateSet, dest: &mut StateSet) { |
421 | 0 | dest.clear(); |
422 | 0 | if self.is_empty() || other.is_empty() { |
423 | 0 | self.iter(|s| dest.add(s)); |
424 | 0 | return; |
425 | 0 | } |
426 | | |
427 | 0 | let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
428 | 0 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
429 | 0 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
430 | | loop { |
431 | 0 | if a == b { |
432 | 0 | a = match ita.next() { |
433 | 0 | None => break, |
434 | 0 | Some(a) => a, |
435 | | }; |
436 | 0 | b = match itb.next() { |
437 | | None => { |
438 | 0 | dest.add(a); |
439 | 0 | break; |
440 | | } |
441 | 0 | Some(b) => b, |
442 | | }; |
443 | 0 | } else if a < b { |
444 | 0 | dest.add(a); |
445 | 0 | a = match ita.next() { |
446 | 0 | None => break, |
447 | 0 | Some(a) => a, |
448 | | }; |
449 | | } else { |
450 | 0 | b = match itb.next() { |
451 | | None => { |
452 | 0 | dest.add(a); |
453 | 0 | break; |
454 | | } |
455 | 0 | Some(b) => b, |
456 | | }; |
457 | | } |
458 | | } |
459 | 0 | for a in ita { |
460 | 0 | dest.add(a); |
461 | 0 | } |
462 | 0 | } |
463 | | } |