/src/regex/regex-automata/src/dfa/minimize.rs
Line | Count | Source |
1 | | use core::{cell::RefCell, fmt, mem}; |
2 | | |
3 | | use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; |
4 | | |
5 | | use crate::{ |
6 | | dfa::{automaton::Automaton, dense, DEAD}, |
7 | | util::{ |
8 | | alphabet, |
9 | | primitives::{PatternID, StateID}, |
10 | | }, |
11 | | }; |
12 | | |
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in a way
///    that can create a lot of redundant states. (Some work has been done on
///    this point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    // The dense DFA being minimized in place.
    dfa: &'a mut dense::OwnedDFA,
    // For each state (by index) and each alphabet unit (by its usize value),
    // the list of states with a transition into that state on that unit.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    // The current partitioning of states into equivalence-class candidates.
    partitions: Vec<StateSet>,
    // The work list of sets still to be used as "splitters" by Hopcroft's
    // algorithm.
    waiting: Vec<StateSet>,
}
50 | | |
51 | | impl<'a> fmt::Debug for Minimizer<'a> { |
52 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
53 | 0 | f.debug_struct("Minimizer") |
54 | 0 | .field("dfa", &self.dfa) |
55 | 0 | .field("in_transitions", &self.in_transitions) |
56 | 0 | .field("partitions", &self.partitions) |
57 | 0 | .field("waiting", &self.waiting) |
58 | 0 | .finish() |
59 | 0 | } |
60 | | } |
61 | | |
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // Sorted, deduplicated state IDs (after `canonicalize`). `Rc<RefCell>`
    // gives cheap shared ownership; `clone` shares storage, `deep_clone`
    // copies it.
    ids: Rc<RefCell<Vec<StateID>>>,
}
78 | | |
79 | | impl<'a> Minimizer<'a> { |
80 | 0 | pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { |
81 | 0 | let in_transitions = Minimizer::incoming_transitions(dfa); |
82 | 0 | let partitions = Minimizer::initial_partitions(dfa); |
83 | 0 | let waiting = partitions.clone(); |
84 | 0 | Minimizer { dfa, in_transitions, partitions, waiting } |
85 | 0 | } |
86 | | |
    /// Run Hopcroft's algorithm and then rewrite the underlying DFA in place
    /// so that each equivalence class of states is collapsed into a single
    /// representative state. Consumes the minimizer.
    pub fn run(mut self) {
        // State IDs in a dense DFA are state indexes shifted left by stride2,
        // so these closures convert between the two forms.
        let stride2 = self.dfa.stride2();
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        // Scratch sets reused across iterations to avoid re-allocating.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be rather large optimization. On
                // the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // This partition is split by 'incoming' into the
                    // non-empty pieces 'x' and 'y'.
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    // Hopcroft's splitting rule: if the old partition is
                    // still waiting, replace it with both pieces; otherwise
                    // only the smaller piece needs to be processed.
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap in the refined partitioning and reuse the old Vec's
                // allocation for the next round.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimal_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                // Start states remapped to DEAD don't count toward the
                // special start range.
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }
278 | | |
279 | 0 | fn find_waiting(&self, set: &StateSet) -> Option<usize> { |
280 | 0 | self.waiting.iter().position(|s| s == set) |
281 | 0 | } |
282 | | |
    /// Compute the set of all states with a transition on alphabet unit `b`
    /// into any state in `set`, writing the result to `incoming`.
    ///
    /// `incoming` is cleared first, and canonicalized (sorted and deduped)
    /// before returning, since different states in `set` may share
    /// predecessors.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            // Look up the precomputed reverse transitions for this state
            // and this alphabet unit.
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }
299 | | |
300 | 0 | fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> { |
301 | | // For match states, we know that two match states with different |
302 | | // pattern ID lists will *always* be distinct, so we can partition them |
303 | | // initially based on that. |
304 | 0 | let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new(); |
305 | 0 | let mut is_quit = StateSet::empty(); |
306 | 0 | let mut no_match = StateSet::empty(); |
307 | 0 | for state in dfa.states() { |
308 | 0 | if dfa.is_match_state(state.id()) { |
309 | 0 | let mut pids = vec![]; |
310 | 0 | for i in 0..dfa.match_len(state.id()) { |
311 | 0 | pids.push(dfa.match_pattern(state.id(), i)); |
312 | 0 | } |
313 | 0 | matching |
314 | 0 | .entry(pids) |
315 | 0 | .or_insert(StateSet::empty()) |
316 | 0 | .add(state.id()); |
317 | 0 | } else if dfa.is_quit_state(state.id()) { |
318 | 0 | is_quit.add(state.id()); |
319 | 0 | } else { |
320 | 0 | no_match.add(state.id()); |
321 | 0 | } |
322 | | } |
323 | | |
324 | 0 | let mut sets: Vec<StateSet> = |
325 | 0 | matching.into_iter().map(|(_, set)| set).collect(); |
326 | 0 | sets.push(no_match); |
327 | 0 | sets.push(is_quit); |
328 | 0 | sets |
329 | 0 | } |
330 | | |
331 | 0 | fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> { |
332 | 0 | let mut incoming = vec![]; |
333 | 0 | for _ in dfa.states() { |
334 | 0 | incoming.push(vec![vec![]; dfa.alphabet_len()]); |
335 | 0 | } |
336 | 0 | for state in dfa.states() { |
337 | 0 | for (b, next) in state.transitions() { |
338 | 0 | incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); |
339 | 0 | } |
340 | | } |
341 | 0 | incoming |
342 | 0 | } |
343 | | } |
344 | | |
345 | | impl StateSet { |
346 | 0 | fn empty() -> StateSet { |
347 | 0 | StateSet { ids: Rc::new(RefCell::new(vec![])) } |
348 | 0 | } |
349 | | |
350 | 0 | fn add(&mut self, id: StateID) { |
351 | 0 | self.ids.borrow_mut().push(id); |
352 | 0 | } |
353 | | |
354 | 0 | fn min(&self) -> StateID { |
355 | 0 | self.ids.borrow()[0] |
356 | 0 | } |
357 | | |
358 | 0 | fn canonicalize(&mut self) { |
359 | 0 | self.ids.borrow_mut().sort(); |
360 | 0 | self.ids.borrow_mut().dedup(); |
361 | 0 | } |
362 | | |
363 | 0 | fn clear(&mut self) { |
364 | 0 | self.ids.borrow_mut().clear(); |
365 | 0 | } |
366 | | |
367 | 0 | fn len(&self) -> usize { |
368 | 0 | self.ids.borrow().len() |
369 | 0 | } |
370 | | |
371 | 0 | fn is_empty(&self) -> bool { |
372 | 0 | self.len() == 0 |
373 | 0 | } |
374 | | |
375 | 0 | fn deep_clone(&self) -> StateSet { |
376 | 0 | let ids = self.ids.borrow().iter().cloned().collect(); |
377 | 0 | StateSet { ids: Rc::new(RefCell::new(ids)) } |
378 | 0 | } |
379 | | |
380 | 0 | fn iter<F: FnMut(StateID)>(&self, mut f: F) { |
381 | 0 | for &id in self.ids.borrow().iter() { |
382 | 0 | f(id); |
383 | 0 | } |
384 | 0 | } Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::StateSet>::subtract::{closure#0}> Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::Minimizer>::find_incoming_to::{closure#0}> Unexecuted instantiation: <regex_automata::dfa::minimize::StateSet>::iter::<<regex_automata::dfa::minimize::Minimizer>::run::{closure#2}> |
385 | | |
386 | 0 | fn intersection(&self, other: &StateSet, dest: &mut StateSet) { |
387 | 0 | dest.clear(); |
388 | 0 | if self.is_empty() || other.is_empty() { |
389 | 0 | return; |
390 | 0 | } |
391 | | |
392 | 0 | let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
393 | 0 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
394 | 0 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
395 | | loop { |
396 | 0 | if a == b { |
397 | 0 | dest.add(a); |
398 | 0 | a = match ita.next() { |
399 | 0 | None => break, |
400 | 0 | Some(a) => a, |
401 | | }; |
402 | 0 | b = match itb.next() { |
403 | 0 | None => break, |
404 | 0 | Some(b) => b, |
405 | | }; |
406 | 0 | } else if a < b { |
407 | 0 | a = match ita.next() { |
408 | 0 | None => break, |
409 | 0 | Some(a) => a, |
410 | | }; |
411 | | } else { |
412 | 0 | b = match itb.next() { |
413 | 0 | None => break, |
414 | 0 | Some(b) => b, |
415 | | }; |
416 | | } |
417 | | } |
418 | 0 | } |
419 | | |
420 | 0 | fn subtract(&self, other: &StateSet, dest: &mut StateSet) { |
421 | 0 | dest.clear(); |
422 | 0 | if self.is_empty() || other.is_empty() { |
423 | 0 | self.iter(|s| dest.add(s)); |
424 | 0 | return; |
425 | 0 | } |
426 | | |
427 | 0 | let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
428 | 0 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
429 | 0 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
430 | | loop { |
431 | 0 | if a == b { |
432 | 0 | a = match ita.next() { |
433 | 0 | None => break, |
434 | 0 | Some(a) => a, |
435 | | }; |
436 | 0 | b = match itb.next() { |
437 | | None => { |
438 | 0 | dest.add(a); |
439 | 0 | break; |
440 | | } |
441 | 0 | Some(b) => b, |
442 | | }; |
443 | 0 | } else if a < b { |
444 | 0 | dest.add(a); |
445 | 0 | a = match ita.next() { |
446 | 0 | None => break, |
447 | 0 | Some(a) => a, |
448 | | }; |
449 | | } else { |
450 | 0 | b = match itb.next() { |
451 | | None => { |
452 | 0 | dest.add(a); |
453 | 0 | break; |
454 | | } |
455 | 0 | Some(b) => b, |
456 | | }; |
457 | | } |
458 | | } |
459 | 0 | for a in ita { |
460 | 0 | dest.add(a); |
461 | 0 | } |
462 | 0 | } |
463 | | } |