/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.1.10/src/nfa/mod.rs

Source (jump to first uncovered line)
use std::fmt;

use classes::ByteClasses;
pub use nfa::compiler::Builder;

mod compiler;
mod map;
mod range_trie;

/// The representation for an NFA state identifier.
pub type StateID = usize;

/// A final compiled NFA.
///
/// The states of the NFA are indexed by state IDs, which are how transitions
/// are expressed.
#[derive(Clone)]
pub struct NFA {
    /// Whether this NFA can only match at the beginning of input or not.
    ///
    /// When true, a match should only be reported if it begins at the 0th
    /// index of the haystack.
    anchored: bool,
    /// The starting state of this NFA.
    start: StateID,
    /// The state list. This list is guaranteed to be indexable by the starting
    /// state ID, and it is also guaranteed to contain exactly one `Match`
    /// state.
    states: Vec<State>,
    /// A mapping from any byte value to its corresponding equivalence class
    /// identifier. Two bytes in the same equivalence class cannot discriminate
    /// between a match or a non-match. This map can be used to shrink the
    /// total size of a DFA's transition table with a small match-time cost.
    ///
    /// Note that the NFA's transitions are *not* defined in terms of these
    /// equivalence classes. The NFA's transitions are defined on the original
    /// byte values. For the most part, this is because they wouldn't really
    /// help the NFA much since the NFA already uses a sparse representation
    /// to represent transitions. Byte classes are most effective in a dense
    /// representation.
    byte_classes: ByteClasses,
}

impl NFA {
    /// Returns an NFA that always matches at every position.
    pub fn always_match() -> NFA {
        NFA {
            anchored: false,
            start: 0,
            states: vec![State::Match],
            byte_classes: ByteClasses::empty(),
        }
    }

    /// Returns an NFA that never matches at any position.
    pub fn never_match() -> NFA {
        NFA {
            anchored: false,
            start: 0,
            states: vec![State::Fail],
            byte_classes: ByteClasses::empty(),
        }
    }

    /// Returns true if and only if this NFA is anchored.
    pub fn is_anchored(&self) -> bool {
        self.anchored
    }

    /// Return the number of states in this NFA.
    pub fn len(&self) -> usize {
        self.states.len()
    }

    /// Return the ID of the initial state of this NFA.
    pub fn start(&self) -> StateID {
        self.start
    }

    /// Return the NFA state corresponding to the given ID.
    pub fn state(&self, id: StateID) -> &State {
        &self.states[id]
    }

    /// Return the set of equivalence classes for this NFA. The slice returned
    /// always has length 256 and maps each possible byte value to its
    /// corresponding equivalence class ID (which is never more than 255).
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }
}

impl fmt::Debug for NFA {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for (i, state) in self.states.iter().enumerate() {
            let status = if i == self.start { '>' } else { ' ' };
            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
        }
        Ok(())
    }
}

/// A state in a final compiled NFA.
#[derive(Clone, Eq, PartialEq)]
pub enum State {
    /// A state that transitions to `next` if and only if the current input
    /// byte is in the range `[start, end]` (inclusive).
    ///
    /// This is a special case of Sparse in that it encodes only one transition
    /// (and therefore avoids the allocation).
    Range { range: Transition },
    /// A state with possibly many transitions, represented in a sparse
    /// fashion. Transitions are ordered lexicographically by input range.
    /// As such, this may only be used when every transition has equal
    /// priority. (In practice, this is only used for encoding large UTF-8
    /// automata.)
    Sparse { ranges: Box<[Transition]> },
    /// An alternation such that there exists an epsilon transition to all
    /// states in `alternates`, where matches found via earlier transitions
    /// are preferred over later transitions.
    Union { alternates: Box<[StateID]> },
    /// A fail state. When encountered, the automaton is guaranteed to never
    /// reach a match state.
    Fail,
    /// A match state. There is exactly one such occurrence of this state in
    /// an NFA.
    Match,
}

/// A transition to another state, only if the given byte falls in the
/// inclusive range specified.
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct Transition {
    pub start: u8,
    pub end: u8,
    pub next: StateID,
}

impl State {
    /// Returns true if and only if this state contains one or more epsilon
    /// transitions.
    pub fn is_epsilon(&self) -> bool {
        match *self {
            State::Range { .. }
            | State::Sparse { .. }
            | State::Fail
            | State::Match => false,
            State::Union { .. } => true,
        }
    }

    /// Remap the transitions in this state using the given map. Namely, the
    /// given map should be indexed according to the transitions currently
    /// in this state.
    ///
    /// This is used during the final phase of the NFA compiler, which turns
    /// its intermediate NFA into the final NFA.
    fn remap(&mut self, remap: &[StateID]) {
        match *self {
            State::Range { ref mut range } => range.next = remap[range.next],
            State::Sparse { ref mut ranges } => {
                for r in ranges.iter_mut() {
                    r.next = remap[r.next];
                }
            }
            State::Union { ref mut alternates } => {
                for alt in alternates.iter_mut() {
                    *alt = remap[*alt];
                }
            }
            State::Fail => {}
            State::Match => {}
        }
    }
}

impl fmt::Debug for State {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            State::Range { ref range } => range.fmt(f),
            State::Sparse { ref ranges } => {
                let rs = ranges
                    .iter()
                    .map(|t| format!("{:?}", t))
                    .collect::<Vec<String>>()
                    .join(", ");
                write!(f, "sparse({})", rs)
            }
            State::Union { ref alternates } => {
                let alts = alternates
                    .iter()
                    .map(|id| format!("{}", id))
                    .collect::<Vec<String>>()
                    .join(", ");
                write!(f, "alt({})", alts)
            }
            State::Fail => write!(f, "FAIL"),
            State::Match => write!(f, "MATCH"),
        }
    }
}

impl fmt::Debug for Transition {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Transition { start, end, next } = *self;
        if self.start == self.end {
            write!(f, "{} => {}", escape(start), next)
        } else {
            write!(f, "{}-{} => {}", escape(start), escape(end), next)
        }
    }
}

/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;

    String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}

#[cfg(test)]
mod tests {
    use super::*;
    use dense;
    use dfa::DFA;

    #[test]
    fn always_match() {
        let nfa = NFA::always_match();
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();

        assert_eq!(Some(0), dfa.find_at(b"", 0));
        assert_eq!(Some(0), dfa.find_at(b"a", 0));
        assert_eq!(Some(1), dfa.find_at(b"a", 1));
        assert_eq!(Some(0), dfa.find_at(b"ab", 0));
        assert_eq!(Some(1), dfa.find_at(b"ab", 1));
        assert_eq!(Some(2), dfa.find_at(b"ab", 2));
    }

    #[test]
    fn never_match() {
        let nfa = NFA::never_match();
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();

        assert_eq!(None, dfa.find_at(b"", 0));
        assert_eq!(None, dfa.find_at(b"a", 0));
        assert_eq!(None, dfa.find_at(b"a", 1));
        assert_eq!(None, dfa.find_at(b"ab", 0));
        assert_eq!(None, dfa.find_at(b"ab", 1));
        assert_eq!(None, dfa.find_at(b"ab", 2));
    }
}

Coverage Report

Created: 2024-12-17 06:15

Line	Count	Source (jump to first uncovered line)
1		use std::fmt;
2
3		use classes::ByteClasses;
4		pub use nfa::compiler::Builder;
5
6		mod compiler;
7		mod map;
8		mod range_trie;
9
10		/// The representation for an NFA state identifier.
11		pub type StateID = usize;
12
13		/// A final compiled NFA.
14		///
15		/// The states of the NFA are indexed by state IDs, which are how transitions
16		/// are expressed.
17		#[derive(Clone)]
18		pub struct NFA {
19		/// Whether this NFA can only match at the beginning of input or not.
20		///
21		/// When true, a match should only be reported if it begins at the 0th
22		/// index of the haystack.
23		anchored: bool,
24		/// The starting state of this NFA.
25		start: StateID,
26		/// The state list. This list is guaranteed to be indexable by the starting
27		/// state ID, and it is also guaranteed to contain exactly one `Match`
28		/// state.
29		states: Vec<State>,
30		/// A mapping from any byte value to its corresponding equivalence class
31		/// identifier. Two bytes in the same equivalence class cannot discriminate
32		/// between a match or a non-match. This map can be used to shrink the
33		/// total size of a DFA's transition table with a small match-time cost.
34		///
35		/// Note that the NFA's transitions are not defined in terms of these
36		/// equivalence classes. The NFA's transitions are defined on the original
37		/// byte values. For the most part, this is because they wouldn't really
38		/// help the NFA much since the NFA already uses a sparse representation
39		/// to represent transitions. Byte classes are most effective in a dense
40		/// representation.
41		byte_classes: ByteClasses,
42		}
43
44		impl NFA {
45		/// Returns an NFA that always matches at every position.
46	0	pub fn always_match() -> NFA {
47	0	NFA {
48	0	anchored: false,
49	0	start: 0,
50	0	states: vec![State::Match],
51	0	byte_classes: ByteClasses::empty(),
52	0	}
53	0	}
54
55		/// Returns an NFA that never matches at any position.
56	0	pub fn never_match() -> NFA {
57	0	NFA {
58	0	anchored: false,
59	0	start: 0,
60	0	states: vec![State::Fail],
61	0	byte_classes: ByteClasses::empty(),
62	0	}
63	0	}
64
65		/// Returns true if and only if this NFA is anchored.
66	0	pub fn is_anchored(&self) -> bool {
67	0	self.anchored
68	0	}
69
70		/// Return the number of states in this NFA.
71	0	pub fn len(&self) -> usize {
72	0	self.states.len()
73	0	}
74
75		/// Return the ID of the initial state of this NFA.
76	0	pub fn start(&self) -> StateID {
77	0	self.start
78	0	}
79
80		/// Return the NFA state corresponding to the given ID.
81	0	pub fn state(&self, id: StateID) -> &State {
82	0	&self.states[id]
83	0	}
84
85		/// Return the set of equivalence classes for this NFA. The slice returned
86		/// always has length 256 and maps each possible byte value to its
87		/// corresponding equivalence class ID (which is never more than 255).
88	0	pub fn byte_classes(&self) -> &ByteClasses {
89	0	&self.byte_classes
90	0	}
91		}
92
93		impl fmt::Debug for NFA {
94	0	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95	0	for (i, state) in self.states.iter().enumerate() {
96	0	let status = if i == self.start { '>' } else { ' ' };
97	0	writeln!(f, "{}{:06}: {:?}", status, i, state)?;
98		}
99	0	Ok(())
100	0	}
101		}
102
103		/// A state in a final compiled NFA.
104		#[derive(Clone, Eq, PartialEq)]
105		pub enum State {
106		/// A state that transitions to `next` if and only if the current input
107		/// byte is in the range `[start, end]` (inclusive).
108		///
109		/// This is a special case of Sparse in that it encodes only one transition
110		/// (and therefore avoids the allocation).
111		Range { range: Transition },
112		/// A state with possibly many transitions, represented in a sparse
113		/// fashion. Transitions are ordered lexicographically by input range.
114		/// As such, this may only be used when every transition has equal
115		/// priority. (In practice, this is only used for encoding large UTF-8
116		/// automata.)
117		Sparse { ranges: Box<[Transition]> },
118		/// An alternation such that there exists an epsilon transition to all
119		/// states in `alternates`, where matches found via earlier transitions
120		/// are preferred over later transitions.
121		Union { alternates: Box<[StateID]> },
122		/// A fail state. When encountered, the automaton is guaranteed to never
123		/// reach a match state.
124		Fail,
125		/// A match state. There is exactly one such occurrence of this state in
126		/// an NFA.
127		Match,
128		}
129
130		/// A transition to another state, only if the given byte falls in the
131		/// inclusive range specified.
132		#[derive(Clone, Copy, Eq, Hash, PartialEq)]
133		pub struct Transition {
134		pub start: u8,
135		pub end: u8,
136		pub next: StateID,
137		}
138
139		impl State {
140		/// Returns true if and only if this state contains one or more epsilon
141		/// transitions.
142	0	pub fn is_epsilon(&self) -> bool {
143	0	match *self {
144		State::Range { .. }
145		\| State::Sparse { .. }
146		\| State::Fail
147	0	\| State::Match => false,
148	0	State::Union { .. } => true,
149		}
150	0	}
151
152		/// Remap the transitions in this state using the given map. Namely, the
153		/// given map should be indexed according to the transitions currently
154		/// in this state.
155		///
156		/// This is used during the final phase of the NFA compiler, which turns
157		/// its intermediate NFA into the final NFA.
158	0	fn remap(&mut self, remap: &[StateID]) {
159	0	match *self {
160	0	State::Range { ref mut range } => range.next = remap[range.next],
161	0	State::Sparse { ref mut ranges } => {
162	0	for r in ranges.iter_mut() {
163	0	r.next = remap[r.next];
164	0	}
165		}
166	0	State::Union { ref mut alternates } => {
167	0	for alt in alternates.iter_mut() {
168	0	alt = remap[alt];
169	0	}
170		}
171	0	State::Fail => {}
172	0	State::Match => {}
173		}
174	0	}
175		}
176
177		impl fmt::Debug for State {
178	0	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
179	0	match *self {
180	0	State::Range { ref range } => range.fmt(f),
181	0	State::Sparse { ref ranges } => {
182	0	let rs = ranges
183	0	.iter()
184	0	.map(\|t\| format!("{:?}", t))
185	0	.collect::<Vec<String>>()
186	0	.join(", ");
187	0	write!(f, "sparse({})", rs)
188		}
189	0	State::Union { ref alternates } => {
190	0	let alts = alternates
191	0	.iter()
192	0	.map(\|id\| format!("{}", id))
193	0	.collect::<Vec<String>>()
194	0	.join(", ");
195	0	write!(f, "alt({})", alts)
196		}
197	0	State::Fail => write!(f, "FAIL"),
198	0	State::Match => write!(f, "MATCH"),
199		}
200	0	}
201		}
202
203		impl fmt::Debug for Transition {
204	0	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
205	0	let Transition { start, end, next } = *self;
206	0	if self.start == self.end {
207	0	write!(f, "{} => {}", escape(start), next)
208		} else {
209	0	write!(f, "{}-{} => {}", escape(start), escape(end), next)
210		}
211	0	}
212		}
213
214		/// Return the given byte as its escaped string form.
215	0	fn escape(b: u8) -> String {
216		use std::ascii;
217
218	0	String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
219	0	}
220
221		#[cfg(test)]
222		mod tests {
223		use super::*;
224		use dense;
225		use dfa::DFA;
226
227		#[test]
228		fn always_match() {
229		let nfa = NFA::always_match();
230		let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
231
232		assert_eq!(Some(0), dfa.find_at(b"", 0));
233		assert_eq!(Some(0), dfa.find_at(b"a", 0));
234		assert_eq!(Some(1), dfa.find_at(b"a", 1));
235		assert_eq!(Some(0), dfa.find_at(b"ab", 0));
236		assert_eq!(Some(1), dfa.find_at(b"ab", 1));
237		assert_eq!(Some(2), dfa.find_at(b"ab", 2));
238		}
239
240		#[test]
241		fn never_match() {
242		let nfa = NFA::never_match();
243		let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
244
245		assert_eq!(None, dfa.find_at(b"", 0));
246		assert_eq!(None, dfa.find_at(b"a", 0));
247		assert_eq!(None, dfa.find_at(b"a", 1));
248		assert_eq!(None, dfa.find_at(b"ab", 0));
249		assert_eq!(None, dfa.find_at(b"ab", 1));
250		assert_eq!(None, dfa.find_at(b"ab", 2));
251		}
252		}