Coverage Report

Created: 2025-11-16 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.1.10/src/dfa.rs
Line
Count
Source
1
use state_id::StateID;
2
3
/// A trait describing the interface of a deterministic finite automaton (DFA).
4
///
5
/// Every DFA has exactly one start state and at least one dead state (which
6
/// may be the same, as in the case of an empty DFA). In all cases, a state
7
/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
8
/// always returns `true`.
9
///
10
/// Every DFA also has zero or more match states, such that
11
/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
12
/// a match state.
13
///
14
/// In general, users of this trait likely will only need to use the search
15
/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
16
/// methods are lower level and are used for walking the transitions of a DFA
17
/// manually. In particular, the aforementioned search routines are implemented
18
/// generically in terms of the lower level transition walking routines.
19
pub trait DFA {
20
    /// The representation used for state identifiers in this DFA.
21
    ///
22
    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
23
    type ID: StateID;
24
25
    /// Return the identifier of this DFA's start state.
26
    fn start_state(&self) -> Self::ID;
27
28
    /// Returns true if and only if the given identifier corresponds to a match
29
    /// state.
30
    fn is_match_state(&self, id: Self::ID) -> bool;
31
32
    /// Returns true if and only if the given identifier corresponds to a dead
33
    /// state. When a DFA enters a dead state, it is impossible to leave and
34
    /// thus can never lead to a match.
35
    fn is_dead_state(&self, id: Self::ID) -> bool;
36
37
    /// Returns true if and only if the given identifier corresponds to either
38
    /// a dead state or a match state, such that one of `is_match_state(id)`
39
    /// or `is_dead_state(id)` must return true.
40
    ///
41
    /// Depending on the implementation of the DFA, this routine can be used
42
    /// to save a branch in the core matching loop. Nevertheless,
43
    /// `is_match_state(id) || is_dead_state(id)` is always a valid
44
    /// implementation.
45
    fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
46
47
    /// Returns true if and only if this DFA is anchored.
48
    ///
49
    /// When a DFA is anchored, it is only allowed to report matches that
50
    /// start at index `0`.
51
    fn is_anchored(&self) -> bool;
52
53
    /// Given the current state that this DFA is in and the next input byte,
54
    /// this method returns the identifier of the next state. The identifier
55
    /// returned is always valid, but it may correspond to a dead state.
56
    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
57
58
    /// Like `next_state`, but its implementation may look up the next state
59
    /// without memory safety checks such as bounds checks. As such, callers
60
    /// must ensure that the given identifier corresponds to a valid DFA
61
    /// state. Implementors must, in turn, ensure that this routine is safe
62
    /// for all valid state identifiers and for all possible `u8` values.
63
    unsafe fn next_state_unchecked(
64
        &self,
65
        current: Self::ID,
66
        input: u8,
67
    ) -> Self::ID;
68
69
    /// Returns true if and only if the given bytes match this DFA.
70
    ///
71
    /// This routine may short circuit if it knows that scanning future input
72
    /// will never lead to a different result. In particular, if a DFA enters
73
    /// a match state or a dead state, then this routine will return `true` or
74
    /// `false`, respectively, without inspecting any future input.
75
    ///
76
    /// # Example
77
    ///
78
    /// This example shows how to use this method with a
79
    /// [`DenseDFA`](enum.DenseDFA.html).
80
    ///
81
    /// ```
82
    /// use regex_automata::{DFA, DenseDFA};
83
    ///
84
    /// # fn example() -> Result<(), regex_automata::Error> {
85
    /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
86
    /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
87
    /// assert_eq!(false, dfa.is_match(b"foobar"));
88
    /// # Ok(()) }; example().unwrap()
89
    /// ```
90
    #[inline]
91
0
    fn is_match(&self, bytes: &[u8]) -> bool {
92
0
        self.is_match_at(bytes, 0)
93
0
    }
94
95
    /// Returns the first position at which a match is found.
96
    ///
97
    /// This routine stops scanning input in precisely the same circumstances
98
    /// as `is_match`. The key difference is that this routine returns the
99
    /// position at which it stopped scanning input if and only if a match
100
    /// was found. If no match is found, then `None` is returned.
101
    ///
102
    /// # Example
103
    ///
104
    /// This example shows how to use this method with a
105
    /// [`DenseDFA`](enum.DenseDFA.html).
106
    ///
107
    /// ```
108
    /// use regex_automata::{DFA, DenseDFA};
109
    ///
110
    /// # fn example() -> Result<(), regex_automata::Error> {
111
    /// let dfa = DenseDFA::new("foo[0-9]+")?;
112
    /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
113
    ///
114
    /// // Normally, the end of the leftmost first match here would be 3,
115
    /// // but the shortest match semantics detect a match earlier.
116
    /// let dfa = DenseDFA::new("abc|a")?;
117
    /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
118
    /// # Ok(()) }; example().unwrap()
119
    /// ```
120
    #[inline]
121
0
    fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
122
0
        self.shortest_match_at(bytes, 0)
123
0
    }
124
125
    /// Returns the end offset of the longest match. If no match exists,
126
    /// then `None` is returned.
127
    ///
128
    /// Implementors of this trait are not required to implement any particular
129
    /// match semantics (such as leftmost-first), which are instead manifest in
130
    /// the DFA's topology itself.
131
    ///
132
    /// In particular, this method must continue searching even after it
133
    /// enters a match state. The search should only terminate once it has
134
    /// reached the end of the input or when it has entered a dead state. Upon
135
    /// termination, the position of the last byte seen while still in a match
136
    /// state is returned.
137
    ///
138
    /// # Example
139
    ///
140
    /// This example shows how to use this method with a
141
    /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
142
    /// "leftmost first" match semantics.
143
    ///
144
    /// Leftmost first match semantics corresponds to the match with the
145
    /// smallest starting offset, but where the end offset is determined by
146
    /// preferring earlier branches in the original regular expression. For
147
    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
148
    /// will match `Samwise` in `Samwise`.
149
    ///
150
    /// Generally speaking, the "leftmost first" match is how most backtracking
151
    /// regular expressions tend to work. This is in contrast to POSIX-style
152
    /// regular expressions that yield "leftmost longest" matches. Namely,
153
    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
154
    /// leftmost longest semantics.
155
    ///
156
    /// ```
157
    /// use regex_automata::{DFA, DenseDFA};
158
    ///
159
    /// # fn example() -> Result<(), regex_automata::Error> {
160
    /// let dfa = DenseDFA::new("foo[0-9]+")?;
161
    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
162
    ///
163
    /// // Even though a match is found after reading the first byte (`a`),
164
    /// // the leftmost first match semantics demand that we find the earliest
165
    /// // match that prefers earlier parts of the pattern over latter parts.
166
    /// let dfa = DenseDFA::new("abc|a")?;
167
    /// assert_eq!(Some(3), dfa.find(b"abc"));
168
    /// # Ok(()) }; example().unwrap()
169
    /// ```
170
    #[inline]
171
0
    fn find(&self, bytes: &[u8]) -> Option<usize> {
172
0
        self.find_at(bytes, 0)
173
0
    }
174
175
    /// Returns the start offset of the longest match in reverse, by searching
176
    /// from the end of the input towards the start of the input. If no match
177
    /// exists, then `None` is returned. In other words, this has the same
178
    /// match semantics as `find`, but in reverse.
179
    ///
180
    /// # Example
181
    ///
182
    /// This example shows how to use this method with a
183
    /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
184
    /// is principally useful when used in conjunction with the
185
    /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
186
    /// configuration knob. In general, it's unlikely to be correct to use both
187
    /// `find` and `rfind` with the same DFA since any particular DFA will only
188
    /// support searching in one direction.
189
    ///
190
    /// ```
191
    /// use regex_automata::{dense, DFA};
192
    ///
193
    /// # fn example() -> Result<(), regex_automata::Error> {
194
    /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
195
    /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
196
    /// # Ok(()) }; example().unwrap()
197
    /// ```
198
    #[inline]
199
0
    fn rfind(&self, bytes: &[u8]) -> Option<usize> {
200
0
        self.rfind_at(bytes, bytes.len())
201
0
    }
202
203
    /// Returns the same as `is_match`, but starts the search at the given
204
    /// offset.
205
    ///
206
    /// The significance of the starting point is that it takes the surrounding
207
    /// context into consideration. For example, if the DFA is anchored, then
208
    /// a match can only occur when `start == 0`.
209
    #[inline]
210
0
    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
211
0
        if self.is_anchored() && start > 0 {
212
0
            return false;
213
0
        }
214
215
0
        let mut state = self.start_state();
216
0
        if self.is_match_or_dead_state(state) {
217
0
            return self.is_match_state(state);
218
0
        }
219
0
        for &b in bytes[start..].iter() {
220
0
            state = unsafe { self.next_state_unchecked(state, b) };
221
0
            if self.is_match_or_dead_state(state) {
222
0
                return self.is_match_state(state);
223
0
            }
224
        }
225
0
        false
226
0
    }
227
228
    /// Returns the same as `shortest_match`, but starts the search at the
229
    /// given offset.
230
    ///
231
    /// The significance of the starting point is that it takes the surrounding
232
    /// context into consideration. For example, if the DFA is anchored, then
233
    /// a match can only occur when `start == 0`.
234
    #[inline]
235
0
    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
236
0
        if self.is_anchored() && start > 0 {
237
0
            return None;
238
0
        }
239
240
0
        let mut state = self.start_state();
241
0
        if self.is_match_or_dead_state(state) {
242
0
            return if self.is_dead_state(state) { None } else { Some(start) };
243
0
        }
244
0
        for (i, &b) in bytes[start..].iter().enumerate() {
245
0
            state = unsafe { self.next_state_unchecked(state, b) };
246
0
            if self.is_match_or_dead_state(state) {
247
0
                return if self.is_dead_state(state) {
248
0
                    None
249
                } else {
250
0
                    Some(start + i + 1)
251
                };
252
0
            }
253
        }
254
0
        None
255
0
    }
256
257
    /// Returns the same as `find`, but starts the search at the given
258
    /// offset.
259
    ///
260
    /// The significance of the starting point is that it takes the surrounding
261
    /// context into consideration. For example, if the DFA is anchored, then
262
    /// a match can only occur when `start == 0`.
263
    #[inline]
264
0
    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
265
0
        if self.is_anchored() && start > 0 {
266
0
            return None;
267
0
        }
268
269
0
        let mut state = self.start_state();
270
0
        let mut last_match = if self.is_dead_state(state) {
271
0
            return None;
272
0
        } else if self.is_match_state(state) {
273
0
            Some(start)
274
        } else {
275
0
            None
276
        };
277
0
        for (i, &b) in bytes[start..].iter().enumerate() {
278
0
            state = unsafe { self.next_state_unchecked(state, b) };
279
0
            if self.is_match_or_dead_state(state) {
280
0
                if self.is_dead_state(state) {
281
0
                    return last_match;
282
0
                }
283
0
                last_match = Some(start + i + 1);
284
0
            }
285
        }
286
0
        last_match
287
0
    }
288
289
    /// Returns the same as `rfind`, but starts the search at the given
290
    /// offset.
291
    ///
292
    /// The significance of the starting point is that it takes the surrounding
293
    /// context into consideration. For example, if the DFA is anchored, then
294
    /// a match can only occur when `start == bytes.len()`.
295
    #[inline(never)]
296
0
    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
297
0
        if self.is_anchored() && start < bytes.len() {
298
0
            return None;
299
0
        }
300
301
0
        let mut state = self.start_state();
302
0
        let mut last_match = if self.is_dead_state(state) {
303
0
            return None;
304
0
        } else if self.is_match_state(state) {
305
0
            Some(start)
306
        } else {
307
0
            None
308
        };
309
0
        for (i, &b) in bytes[..start].iter().enumerate().rev() {
310
0
            state = unsafe { self.next_state_unchecked(state, b) };
311
0
            if self.is_match_or_dead_state(state) {
312
0
                if self.is_dead_state(state) {
313
0
                    return last_match;
314
0
                }
315
0
                last_match = Some(i);
316
0
            }
317
        }
318
0
        last_match
319
0
    }
320
}
321
322
impl<'a, T: DFA> DFA for &'a T {
323
    type ID = T::ID;
324
325
    #[inline]
326
0
    fn start_state(&self) -> Self::ID {
327
0
        (**self).start_state()
328
0
    }
329
330
    #[inline]
331
0
    fn is_match_state(&self, id: Self::ID) -> bool {
332
0
        (**self).is_match_state(id)
333
0
    }
334
335
    #[inline]
336
0
    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
337
0
        (**self).is_match_or_dead_state(id)
338
0
    }
339
340
    #[inline]
341
0
    fn is_dead_state(&self, id: Self::ID) -> bool {
342
0
        (**self).is_dead_state(id)
343
0
    }
344
345
    #[inline]
346
0
    fn is_anchored(&self) -> bool {
347
0
        (**self).is_anchored()
348
0
    }
349
350
    #[inline]
351
0
    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
352
0
        (**self).next_state(current, input)
353
0
    }
354
355
    #[inline]
356
0
    unsafe fn next_state_unchecked(
357
0
        &self,
358
0
        current: Self::ID,
359
0
        input: u8,
360
0
    ) -> Self::ID {
361
0
        (**self).next_state_unchecked(current, input)
362
0
    }
363
}