Coverage Report

Created: 2025-09-27 07:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/dfa/minimize.rs
Line
Count
Source
1
use core::{cell::RefCell, fmt, mem};
2
3
use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
4
5
use crate::{
6
    dfa::{automaton::Automaton, dense, DEAD},
7
    util::{
8
        alphabet,
9
        primitives::{PatternID, StateID},
10
    },
11
};
12
13
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
14
///
15
/// The algorithm implemented here is mostly taken from Wikipedia:
16
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
17
///
18
/// This code has had some light optimization attention paid to it,
19
/// particularly in the form of reducing allocation as much as possible.
20
/// However, it is still generally slow. Future optimization work should
21
/// probably focus on the bigger picture rather than micro-optimizations. For
22
/// example:
23
///
24
/// 1. Figure out how to more intelligently create initial partitions. That is,
25
///    Hopcroft's algorithm starts by creating two partitions of DFA states
26
///    that are known to NOT be equivalent: match states and non-match states.
27
///    The algorithm proceeds by progressively refining these partitions into
28
///    smaller partitions. If we could start with more partitions, then we
29
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
30
/// 2. For every partition that we visit, we find all incoming transitions to
31
///    every state in the partition for *every* element in the alphabet. (This
32
///    is why using byte classes can significantly decrease minimization times,
33
///    since byte classes shrink the alphabet.) This is quite costly and there
34
///    is perhaps some redundant work being performed depending on the specific
35
///    states in the set. For example, we might be able to only visit some
36
///    elements of the alphabet based on the transitions.
37
/// 3. Move parts of minimization into determinization. If minimization has
38
///    fewer states to deal with, then it should run faster. A prime example
39
///    of this might be large Unicode classes, which are generated in way that
40
///    can create a lot of redundant states. (Some work has been done on this
41
///    point during NFA compilation via the algorithm described in the
42
///    "Incremental Construction of MinimalAcyclic Finite-State Automata"
43
///    paper.)
44
pub(crate) struct Minimizer<'a> {
45
    dfa: &'a mut dense::OwnedDFA,
46
    in_transitions: Vec<Vec<Vec<StateID>>>,
47
    partitions: Vec<StateSet>,
48
    waiting: Vec<StateSet>,
49
}
50
51
impl<'a> fmt::Debug for Minimizer<'a> {
52
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
53
0
        f.debug_struct("Minimizer")
54
0
            .field("dfa", &self.dfa)
55
0
            .field("in_transitions", &self.in_transitions)
56
0
            .field("partitions", &self.partitions)
57
0
            .field("waiting", &self.waiting)
58
0
            .finish()
59
0
    }
60
}
61
62
/// A set of states. A state set makes up a single partition in Hopcroft's
63
/// algorithm.
64
///
65
/// It is represented by an ordered set of state identifiers. We use shared
66
/// ownership so that a single state set can be in both the set of partitions
67
/// and in the set of waiting sets simultaneously without an additional
68
/// allocation. Generally, once a state set is built, it becomes immutable.
69
///
70
/// We use this representation because it avoids the overhead of more
71
/// traditional set data structures (HashSet/BTreeSet), and also because
72
/// computing intersection/subtraction on this representation is especially
73
/// fast.
74
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
75
struct StateSet {
76
    ids: Rc<RefCell<Vec<StateID>>>,
77
}
78
79
impl<'a> Minimizer<'a> {
80
0
    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
81
0
        let in_transitions = Minimizer::incoming_transitions(dfa);
82
0
        let partitions = Minimizer::initial_partitions(dfa);
83
0
        let waiting = partitions.clone();
84
0
        Minimizer { dfa, in_transitions, partitions, waiting }
85
0
    }
86
87
0
    pub fn run(mut self) {
88
0
        let stride2 = self.dfa.stride2();
89
0
        let as_state_id = |index: usize| -> StateID {
90
0
            StateID::new(index << stride2).unwrap()
91
0
        };
92
0
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
93
94
0
        let mut incoming = StateSet::empty();
95
0
        let mut scratch1 = StateSet::empty();
96
0
        let mut scratch2 = StateSet::empty();
97
0
        let mut newparts = vec![];
98
99
        // This loop is basically Hopcroft's algorithm. Everything else is just
100
        // shuffling data around to fit our representation.
101
0
        while let Some(set) = self.waiting.pop() {
102
0
            for b in self.dfa.byte_classes().iter() {
103
0
                self.find_incoming_to(b, &set, &mut incoming);
104
                // If incoming is empty, then the intersection with any other
105
                // set must also be empty. So 'newparts' just ends up being
106
                // 'self.partitions'. So there's no need to go through the loop
107
                // below.
108
                //
109
                // This actually turns out to be rather large optimization. On
110
                // the order of making minimization 4-5x faster. It's likely
111
                // that the vast majority of all states have very few incoming
112
                // transitions.
113
0
                if incoming.is_empty() {
114
0
                    continue;
115
0
                }
116
117
0
                for p in 0..self.partitions.len() {
118
0
                    self.partitions[p].intersection(&incoming, &mut scratch1);
119
0
                    if scratch1.is_empty() {
120
0
                        newparts.push(self.partitions[p].clone());
121
0
                        continue;
122
0
                    }
123
124
0
                    self.partitions[p].subtract(&incoming, &mut scratch2);
125
0
                    if scratch2.is_empty() {
126
0
                        newparts.push(self.partitions[p].clone());
127
0
                        continue;
128
0
                    }
129
130
0
                    let (x, y) =
131
0
                        (scratch1.deep_clone(), scratch2.deep_clone());
132
0
                    newparts.push(x.clone());
133
0
                    newparts.push(y.clone());
134
0
                    match self.find_waiting(&self.partitions[p]) {
135
0
                        Some(i) => {
136
0
                            self.waiting[i] = x;
137
0
                            self.waiting.push(y);
138
0
                        }
139
                        None => {
140
0
                            if x.len() <= y.len() {
141
0
                                self.waiting.push(x);
142
0
                            } else {
143
0
                                self.waiting.push(y);
144
0
                            }
145
                        }
146
                    }
147
                }
148
0
                newparts = mem::replace(&mut self.partitions, newparts);
149
0
                newparts.clear();
150
            }
151
        }
152
153
        // At this point, we now have a minimal partitioning of states, where
154
        // each partition is an equivalence class of DFA states. Now we need to
155
        // use this partitioning to update the DFA to only contain one state for
156
        // each partition.
157
158
        // Create a map from DFA state ID to the representative ID of the
159
        // equivalence class to which it belongs. The representative ID of an
160
        // equivalence class of states is the minimum ID in that class.
161
0
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
162
0
        for p in &self.partitions {
163
0
            p.iter(|id| state_to_part[as_index(id)] = p.min());
164
        }
165
166
        // Generate a new contiguous sequence of IDs for minimal states, and
167
        // create a map from equivalence IDs to the new IDs. Thus, the new
168
        // minimal ID of *any* state in the unminimized DFA can be obtained
169
        // with minimals_ids[state_to_part[old_id]].
170
0
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
171
0
        let mut new_index = 0;
172
0
        for state in self.dfa.states() {
173
0
            if state_to_part[as_index(state.id())] == state.id() {
174
0
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
175
0
                new_index += 1;
176
0
            }
177
        }
178
        // The total number of states in the minimal DFA.
179
0
        let minimal_count = new_index;
180
        // Convenience function for remapping state IDs. This takes an old ID,
181
        // looks up its Hopcroft partition and then maps that to the new ID
182
        // range.
183
0
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];
184
185
        // Re-map this DFA in place such that the only states remaining
186
        // correspond to the representative states of every equivalence class.
187
0
        for id in (0..self.dfa.state_len()).map(as_state_id) {
188
            // If this state isn't a representative for an equivalence class,
189
            // then we skip it since it won't appear in the minimal DFA.
190
0
            if state_to_part[as_index(id)] != id {
191
0
                continue;
192
0
            }
193
0
            self.dfa.remap_state(id, remap);
194
0
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
195
        }
196
        // Trim off all unused states from the pre-minimized DFA. This
197
        // represents all states that were merged into a non-singleton
198
        // equivalence class of states, and appeared after the first state
199
        // in each such class. (Because the state with the smallest ID in each
200
        // equivalence class is its representative ID.)
201
0
        self.dfa.truncate_states(minimal_count);
202
203
        // Update the new start states, which is now just the minimal ID of
204
        // whatever state the old start state was collapsed into. Also, we
205
        // collect everything before-hand to work around the borrow checker.
206
        // We're already allocating so much that this is probably fine. If this
207
        // turns out to be costly, then I guess add a `starts_mut` iterator.
208
0
        let starts: Vec<_> = self.dfa.starts().collect();
209
0
        for (old_start_id, anchored, start_type) in starts {
210
0
            self.dfa.set_start_state(
211
0
                anchored,
212
0
                start_type,
213
0
                remap(old_start_id),
214
0
            );
215
0
        }
216
217
        // Update the match state pattern ID list for multi-regexes. All we
218
        // need to do is remap the match state IDs. The pattern ID lists are
219
        // always the same as they were since match states with distinct
220
        // pattern ID lists are always considered distinct states.
221
0
        let mut pmap = BTreeMap::new();
222
0
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
223
0
            let new_id = remap(match_id);
224
0
            pmap.insert(new_id, pattern_ids);
225
0
        }
226
        // This unwrap is OK because minimization never increases the number of
227
        // match states or patterns in those match states. Since minimization
228
        // runs after the pattern map has already been set at least once, we
229
        // know that our match states cannot error.
230
0
        self.dfa.set_pattern_map(&pmap).unwrap();
231
232
        // In order to update the ID of the maximum match state, we need to
233
        // find the maximum ID among all of the match states in the minimized
234
        // DFA. This is not necessarily the new ID of the unminimized maximum
235
        // match state, since that could have been collapsed with a much
236
        // earlier match state. Therefore, to find the new max match state,
237
        // we iterate over all previous match states, find their corresponding
238
        // new minimal ID, and take the maximum of those.
239
0
        let old = self.dfa.special().clone();
240
0
        let new = self.dfa.special_mut();
241
        // ... but only remap if we had match states.
242
0
        if old.matches() {
243
0
            new.min_match = StateID::MAX;
244
0
            new.max_match = StateID::ZERO;
245
0
            for i in as_index(old.min_match)..=as_index(old.max_match) {
246
0
                let new_id = remap(as_state_id(i));
247
0
                if new_id < new.min_match {
248
0
                    new.min_match = new_id;
249
0
                }
250
0
                if new_id > new.max_match {
251
0
                    new.max_match = new_id;
252
0
                }
253
            }
254
0
        }
255
        // ... same, but for start states.
256
0
        if old.starts() {
257
0
            new.min_start = StateID::MAX;
258
0
            new.max_start = StateID::ZERO;
259
0
            for i in as_index(old.min_start)..=as_index(old.max_start) {
260
0
                let new_id = remap(as_state_id(i));
261
0
                if new_id == DEAD {
262
0
                    continue;
263
0
                }
264
0
                if new_id < new.min_start {
265
0
                    new.min_start = new_id;
266
0
                }
267
0
                if new_id > new.max_start {
268
0
                    new.max_start = new_id;
269
0
                }
270
            }
271
0
            if new.max_start == DEAD {
272
0
                new.min_start = DEAD;
273
0
            }
274
0
        }
275
0
        new.quit_id = remap(new.quit_id);
276
0
        new.set_max();
277
0
    }
278
279
0
    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
280
0
        self.waiting.iter().position(|s| s == set)
281
0
    }
282
283
0
    fn find_incoming_to(
284
0
        &self,
285
0
        b: alphabet::Unit,
286
0
        set: &StateSet,
287
0
        incoming: &mut StateSet,
288
0
    ) {
289
0
        incoming.clear();
290
0
        set.iter(|id| {
291
0
            for &inid in
292
0
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
293
0
            {
294
0
                incoming.add(inid);
295
0
            }
296
0
        });
297
0
        incoming.canonicalize();
298
0
    }
299
300
0
    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
301
        // For match states, we know that two match states with different
302
        // pattern ID lists will *always* be distinct, so we can partition them
303
        // initially based on that.
304
0
        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
305
0
        let mut is_quit = StateSet::empty();
306
0
        let mut no_match = StateSet::empty();
307
0
        for state in dfa.states() {
308
0
            if dfa.is_match_state(state.id()) {
309
0
                let mut pids = vec![];
310
0
                for i in 0..dfa.match_len(state.id()) {
311
0
                    pids.push(dfa.match_pattern(state.id(), i));
312
0
                }
313
0
                matching
314
0
                    .entry(pids)
315
0
                    .or_insert(StateSet::empty())
316
0
                    .add(state.id());
317
0
            } else if dfa.is_quit_state(state.id()) {
318
0
                is_quit.add(state.id());
319
0
            } else {
320
0
                no_match.add(state.id());
321
0
            }
322
        }
323
324
0
        let mut sets: Vec<StateSet> =
325
0
            matching.into_iter().map(|(_, set)| set).collect();
326
0
        sets.push(no_match);
327
0
        sets.push(is_quit);
328
0
        sets
329
0
    }
330
331
0
    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
332
0
        let mut incoming = vec![];
333
0
        for _ in dfa.states() {
334
0
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
335
0
        }
336
0
        for state in dfa.states() {
337
0
            for (b, next) in state.transitions() {
338
0
                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
339
0
            }
340
        }
341
0
        incoming
342
0
    }
343
}
344
345
impl StateSet {
346
0
    fn empty() -> StateSet {
347
0
        StateSet { ids: Rc::new(RefCell::new(vec![])) }
348
0
    }
349
350
0
    fn add(&mut self, id: StateID) {
351
0
        self.ids.borrow_mut().push(id);
352
0
    }
353
354
0
    fn min(&self) -> StateID {
355
0
        self.ids.borrow()[0]
356
0
    }
357
358
0
    fn canonicalize(&mut self) {
359
0
        self.ids.borrow_mut().sort();
360
0
        self.ids.borrow_mut().dedup();
361
0
    }
362
363
0
    fn clear(&mut self) {
364
0
        self.ids.borrow_mut().clear();
365
0
    }
366
367
0
    fn len(&self) -> usize {
368
0
        self.ids.borrow().len()
369
0
    }
370
371
0
    fn is_empty(&self) -> bool {
372
0
        self.len() == 0
373
0
    }
374
375
0
    fn deep_clone(&self) -> StateSet {
376
0
        let ids = self.ids.borrow().iter().cloned().collect();
377
0
        StateSet { ids: Rc::new(RefCell::new(ids)) }
378
0
    }
379
380
0
    fn iter<F: FnMut(StateID)>(&self, mut f: F) {
381
0
        for &id in self.ids.borrow().iter() {
382
0
            f(id);
383
0
        }
384
0
    }
Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::StateSet>::subtract::{closure#0}>
Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::Minimizer>::find_incoming_to::{closure#0}>
Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::Minimizer>::run::{closure#2}>
385
386
0
    fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
387
0
        dest.clear();
388
0
        if self.is_empty() || other.is_empty() {
389
0
            return;
390
0
        }
391
392
0
        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
393
0
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
394
0
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
395
        loop {
396
0
            if a == b {
397
0
                dest.add(a);
398
0
                a = match ita.next() {
399
0
                    None => break,
400
0
                    Some(a) => a,
401
                };
402
0
                b = match itb.next() {
403
0
                    None => break,
404
0
                    Some(b) => b,
405
                };
406
0
            } else if a < b {
407
0
                a = match ita.next() {
408
0
                    None => break,
409
0
                    Some(a) => a,
410
                };
411
            } else {
412
0
                b = match itb.next() {
413
0
                    None => break,
414
0
                    Some(b) => b,
415
                };
416
            }
417
        }
418
0
    }
419
420
0
    fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
421
0
        dest.clear();
422
0
        if self.is_empty() || other.is_empty() {
423
0
            self.iter(|s| dest.add(s));
424
0
            return;
425
0
        }
426
427
0
        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
428
0
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
429
0
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
430
        loop {
431
0
            if a == b {
432
0
                a = match ita.next() {
433
0
                    None => break,
434
0
                    Some(a) => a,
435
                };
436
0
                b = match itb.next() {
437
                    None => {
438
0
                        dest.add(a);
439
0
                        break;
440
                    }
441
0
                    Some(b) => b,
442
                };
443
0
            } else if a < b {
444
0
                dest.add(a);
445
0
                a = match ita.next() {
446
0
                    None => break,
447
0
                    Some(a) => a,
448
                };
449
            } else {
450
0
                b = match itb.next() {
451
                    None => {
452
0
                        dest.add(a);
453
0
                        break;
454
                    }
455
0
                    Some(b) => b,
456
                };
457
            }
458
        }
459
0
        for a in ita {
460
0
            dest.add(a);
461
0
        }
462
0
    }
463
}