Coverage Report

Created: 2024-12-17 06:15

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.1.10/src/nfa/mod.rs
Line
Count
Source (jump to first uncovered line)
1
use std::fmt;
2
3
use classes::ByteClasses;
4
pub use nfa::compiler::Builder;
5
6
mod compiler;
7
mod map;
8
mod range_trie;
9
10
/// The representation for an NFA state identifier.
11
pub type StateID = usize;
12
13
/// A final compiled NFA.
14
///
15
/// The states of the NFA are indexed by state IDs, which are how transitions
16
/// are expressed.
17
#[derive(Clone)]
18
pub struct NFA {
19
    /// Whether this NFA can only match at the beginning of input or not.
20
    ///
21
    /// When true, a match should only be reported if it begins at the 0th
22
    /// index of the haystack.
23
    anchored: bool,
24
    /// The starting state of this NFA.
25
    start: StateID,
26
    /// The state list. This list is guaranteed to be indexable by the starting
27
    /// state ID, and it is also guaranteed to contain exactly one `Match`
28
    /// state.
29
    states: Vec<State>,
30
    /// A mapping from any byte value to its corresponding equivalence class
31
    /// identifier. Two bytes in the same equivalence class cannot discriminate
32
    /// between a match or a non-match. This map can be used to shrink the
33
    /// total size of a DFA's transition table with a small match-time cost.
34
    ///
35
    /// Note that the NFA's transitions are *not* defined in terms of these
36
    /// equivalence classes. The NFA's transitions are defined on the original
37
    /// byte values. For the most part, this is because they wouldn't really
38
    /// help the NFA much since the NFA already uses a sparse representation
39
    /// to represent transitions. Byte classes are most effective in a dense
40
    /// representation.
41
    byte_classes: ByteClasses,
42
}
43
44
impl NFA {
45
    /// Returns an NFA that always matches at every position.
46
0
    pub fn always_match() -> NFA {
47
0
        NFA {
48
0
            anchored: false,
49
0
            start: 0,
50
0
            states: vec![State::Match],
51
0
            byte_classes: ByteClasses::empty(),
52
0
        }
53
0
    }
54
55
    /// Returns an NFA that never matches at any position.
56
0
    pub fn never_match() -> NFA {
57
0
        NFA {
58
0
            anchored: false,
59
0
            start: 0,
60
0
            states: vec![State::Fail],
61
0
            byte_classes: ByteClasses::empty(),
62
0
        }
63
0
    }
64
65
    /// Returns true if and only if this NFA is anchored.
66
0
    pub fn is_anchored(&self) -> bool {
67
0
        self.anchored
68
0
    }
69
70
    /// Return the number of states in this NFA.
71
0
    pub fn len(&self) -> usize {
72
0
        self.states.len()
73
0
    }
74
75
    /// Return the ID of the initial state of this NFA.
76
0
    pub fn start(&self) -> StateID {
77
0
        self.start
78
0
    }
79
80
    /// Return the NFA state corresponding to the given ID.
81
0
    pub fn state(&self, id: StateID) -> &State {
82
0
        &self.states[id]
83
0
    }
84
85
    /// Return the set of equivalence classes for this NFA. The slice returned
86
    /// always has length 256 and maps each possible byte value to its
87
    /// corresponding equivalence class ID (which is never more than 255).
88
0
    pub fn byte_classes(&self) -> &ByteClasses {
89
0
        &self.byte_classes
90
0
    }
91
}
92
93
impl fmt::Debug for NFA {
94
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95
0
        for (i, state) in self.states.iter().enumerate() {
96
0
            let status = if i == self.start { '>' } else { ' ' };
97
0
            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
98
        }
99
0
        Ok(())
100
0
    }
101
}
102
103
/// A state in a final compiled NFA.
104
#[derive(Clone, Eq, PartialEq)]
105
pub enum State {
106
    /// A state that transitions to `next` if and only if the current input
107
    /// byte is in the range `[start, end]` (inclusive).
108
    ///
109
    /// This is a special case of Sparse in that it encodes only one transition
110
    /// (and therefore avoids the allocation).
111
    Range { range: Transition },
112
    /// A state with possibly many transitions, represented in a sparse
113
    /// fashion. Transitions are ordered lexicographically by input range.
114
    /// As such, this may only be used when every transition has equal
115
    /// priority. (In practice, this is only used for encoding large UTF-8
116
    /// automata.)
117
    Sparse { ranges: Box<[Transition]> },
118
    /// An alternation such that there exists an epsilon transition to all
119
    /// states in `alternates`, where matches found via earlier transitions
120
    /// are preferred over later transitions.
121
    Union { alternates: Box<[StateID]> },
122
    /// A fail state. When encountered, the automaton is guaranteed to never
123
    /// reach a match state.
124
    Fail,
125
    /// A match state. There is exactly one such occurrence of this state in
126
    /// an NFA.
127
    Match,
128
}
129
130
/// A transition to another state, only if the given byte falls in the
131
/// inclusive range specified.
132
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
133
pub struct Transition {
134
    pub start: u8,
135
    pub end: u8,
136
    pub next: StateID,
137
}
138
139
impl State {
140
    /// Returns true if and only if this state contains one or more epsilon
141
    /// transitions.
142
0
    pub fn is_epsilon(&self) -> bool {
143
0
        match *self {
144
            State::Range { .. }
145
            | State::Sparse { .. }
146
            | State::Fail
147
0
            | State::Match => false,
148
0
            State::Union { .. } => true,
149
        }
150
0
    }
151
152
    /// Remap the transitions in this state using the given map. Namely, the
153
    /// given map should be indexed according to the transitions currently
154
    /// in this state.
155
    ///
156
    /// This is used during the final phase of the NFA compiler, which turns
157
    /// its intermediate NFA into the final NFA.
158
0
    fn remap(&mut self, remap: &[StateID]) {
159
0
        match *self {
160
0
            State::Range { ref mut range } => range.next = remap[range.next],
161
0
            State::Sparse { ref mut ranges } => {
162
0
                for r in ranges.iter_mut() {
163
0
                    r.next = remap[r.next];
164
0
                }
165
            }
166
0
            State::Union { ref mut alternates } => {
167
0
                for alt in alternates.iter_mut() {
168
0
                    *alt = remap[*alt];
169
0
                }
170
            }
171
0
            State::Fail => {}
172
0
            State::Match => {}
173
        }
174
0
    }
175
}
176
177
impl fmt::Debug for State {
178
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
179
0
        match *self {
180
0
            State::Range { ref range } => range.fmt(f),
181
0
            State::Sparse { ref ranges } => {
182
0
                let rs = ranges
183
0
                    .iter()
184
0
                    .map(|t| format!("{:?}", t))
185
0
                    .collect::<Vec<String>>()
186
0
                    .join(", ");
187
0
                write!(f, "sparse({})", rs)
188
            }
189
0
            State::Union { ref alternates } => {
190
0
                let alts = alternates
191
0
                    .iter()
192
0
                    .map(|id| format!("{}", id))
193
0
                    .collect::<Vec<String>>()
194
0
                    .join(", ");
195
0
                write!(f, "alt({})", alts)
196
            }
197
0
            State::Fail => write!(f, "FAIL"),
198
0
            State::Match => write!(f, "MATCH"),
199
        }
200
0
    }
201
}
202
203
impl fmt::Debug for Transition {
204
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
205
0
        let Transition { start, end, next } = *self;
206
0
        if self.start == self.end {
207
0
            write!(f, "{} => {}", escape(start), next)
208
        } else {
209
0
            write!(f, "{}-{} => {}", escape(start), escape(end), next)
210
        }
211
0
    }
212
}
213
214
/// Return the given byte as its escaped string form.
215
0
fn escape(b: u8) -> String {
216
    use std::ascii;
217
218
0
    String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
219
0
}
220
221
#[cfg(test)]
222
mod tests {
223
    use super::*;
224
    use dense;
225
    use dfa::DFA;
226
227
    #[test]
228
    fn always_match() {
229
        let nfa = NFA::always_match();
230
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
231
232
        assert_eq!(Some(0), dfa.find_at(b"", 0));
233
        assert_eq!(Some(0), dfa.find_at(b"a", 0));
234
        assert_eq!(Some(1), dfa.find_at(b"a", 1));
235
        assert_eq!(Some(0), dfa.find_at(b"ab", 0));
236
        assert_eq!(Some(1), dfa.find_at(b"ab", 1));
237
        assert_eq!(Some(2), dfa.find_at(b"ab", 2));
238
    }
239
240
    #[test]
241
    fn never_match() {
242
        let nfa = NFA::never_match();
243
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
244
245
        assert_eq!(None, dfa.find_at(b"", 0));
246
        assert_eq!(None, dfa.find_at(b"a", 0));
247
        assert_eq!(None, dfa.find_at(b"a", 1));
248
        assert_eq!(None, dfa.find_at(b"ab", 0));
249
        assert_eq!(None, dfa.find_at(b"ab", 1));
250
        assert_eq!(None, dfa.find_at(b"ab", 2));
251
    }
252
}