/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.1.10/src/dfa.rs
Line | Count | Source |
1 | | use state_id::StateID; |
2 | | |
3 | | /// A trait describing the interface of a deterministic finite automaton (DFA). |
4 | | /// |
5 | | /// Every DFA has exactly one start state and at least one dead state (which |
6 | | /// may be the same, as in the case of an empty DFA). In all cases, a state |
7 | | /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` |
8 | | /// always returns `true`. |
9 | | /// |
10 | | /// Every DFA also has zero or more match states, such that |
11 | | /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to |
12 | | /// a match state. |
13 | | /// |
14 | | /// In general, users of this trait likely will only need to use the search |
15 | | /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other |
16 | | /// methods are lower level and are used for walking the transitions of a DFA |
17 | | /// manually. In particular, the aforementioned search routines are implemented |
18 | | /// generically in terms of the lower level transition walking routines. |
19 | | pub trait DFA { |
20 | | /// The representation used for state identifiers in this DFA. |
21 | | /// |
22 | | /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. |
23 | | type ID: StateID; |
24 | | |
25 | | /// Return the identifier of this DFA's start state. |
26 | | fn start_state(&self) -> Self::ID; |
27 | | |
28 | | /// Returns true if and only if the given identifier corresponds to a match |
29 | | /// state. |
30 | | fn is_match_state(&self, id: Self::ID) -> bool; |
31 | | |
32 | | /// Returns true if and only if the given identifier corresponds to a dead |
33 | | /// state. When a DFA enters a dead state, it is impossible to leave and |
34 | | /// thus can never lead to a match. |
35 | | fn is_dead_state(&self, id: Self::ID) -> bool; |
36 | | |
37 | | /// Returns true if and only if the given identifier corresponds to either |
38 | | /// a dead state or a match state, such that one of `is_match_state(id)` |
39 | | /// or `is_dead_state(id)` must return true. |
40 | | /// |
41 | | /// Depending on the implementation of the DFA, this routine can be used |
42 | | /// to save a branch in the core matching loop. Nevertheless, |
43 | | /// `is_match_state(id) || is_dead_state(id)` is always a valid |
44 | | /// implementation. |
45 | | fn is_match_or_dead_state(&self, id: Self::ID) -> bool; |
46 | | |
47 | | /// Returns true if and only if this DFA is anchored. |
48 | | /// |
49 | | /// When a DFA is anchored, it is only allowed to report matches that |
50 | | /// start at index `0`. |
51 | | fn is_anchored(&self) -> bool; |
52 | | |
53 | | /// Given the current state that this DFA is in and the next input byte, |
54 | | /// this method returns the identifier of the next state. The identifier |
55 | | /// returned is always valid, but it may correspond to a dead state. |
56 | | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; |
57 | | |
58 | | /// Like `next_state`, but its implementation may look up the next state |
59 | | /// without memory safety checks such as bounds checks. As such, callers |
60 | | /// must ensure that the given identifier corresponds to a valid DFA |
61 | | /// state. Implementors must, in turn, ensure that this routine is safe |
62 | | /// for all valid state identifiers and for all possible `u8` values. |
63 | | unsafe fn next_state_unchecked( |
64 | | &self, |
65 | | current: Self::ID, |
66 | | input: u8, |
67 | | ) -> Self::ID; |
68 | | |
69 | | /// Returns true if and only if the given bytes match this DFA. |
70 | | /// |
71 | | /// This routine may short circuit if it knows that scanning future input |
72 | | /// will never lead to a different result. In particular, if a DFA enters |
73 | | /// a match state or a dead state, then this routine will return `true` or |
74 | | /// `false`, respectively, without inspecting any future input. |
75 | | /// |
76 | | /// # Example |
77 | | /// |
78 | | /// This example shows how to use this method with a |
79 | | /// [`DenseDFA`](enum.DenseDFA.html). |
80 | | /// |
81 | | /// ``` |
82 | | /// use regex_automata::{DFA, DenseDFA}; |
83 | | /// |
84 | | /// # fn example() -> Result<(), regex_automata::Error> { |
85 | | /// let dfa = DenseDFA::new("foo[0-9]+bar")?; |
86 | | /// assert_eq!(true, dfa.is_match(b"foo12345bar")); |
87 | | /// assert_eq!(false, dfa.is_match(b"foobar")); |
88 | | /// # Ok(()) }; example().unwrap() |
89 | | /// ``` |
90 | | #[inline] |
91 | 0 | fn is_match(&self, bytes: &[u8]) -> bool { |
92 | 0 | self.is_match_at(bytes, 0) |
93 | 0 | } |
94 | | |
95 | | /// Returns the first position at which a match is found. |
96 | | /// |
97 | | /// This routine stops scanning input in precisely the same circumstances |
98 | | /// as `is_match`. The key difference is that this routine returns the |
99 | | /// position at which it stopped scanning input if and only if a match |
100 | | /// was found. If no match is found, then `None` is returned. |
101 | | /// |
102 | | /// # Example |
103 | | /// |
104 | | /// This example shows how to use this method with a |
105 | | /// [`DenseDFA`](enum.DenseDFA.html). |
106 | | /// |
107 | | /// ``` |
108 | | /// use regex_automata::{DFA, DenseDFA}; |
109 | | /// |
110 | | /// # fn example() -> Result<(), regex_automata::Error> { |
111 | | /// let dfa = DenseDFA::new("foo[0-9]+")?; |
112 | | /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); |
113 | | /// |
114 | | /// // Normally, the end of the leftmost first match here would be 3, |
115 | | /// // but the shortest match semantics detect a match earlier. |
116 | | /// let dfa = DenseDFA::new("abc|a")?; |
117 | | /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); |
118 | | /// # Ok(()) }; example().unwrap() |
119 | | /// ``` |
120 | | #[inline] |
121 | 0 | fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { |
122 | 0 | self.shortest_match_at(bytes, 0) |
123 | 0 | } |
124 | | |
125 | | /// Returns the end offset of the longest match. If no match exists, |
126 | | /// then `None` is returned. |
127 | | /// |
128 | | /// Implementors of this trait are not required to implement any particular |
129 | | /// match semantics (such as leftmost-first), which are instead manifest in |
130 | | /// the DFA's topology itself. |
131 | | /// |
132 | | /// In particular, this method must continue searching even after it |
133 | | /// enters a match state. The search should only terminate once it has |
134 | | /// reached the end of the input or when it has entered a dead state. Upon |
135 | | /// termination, the position of the last byte seen while still in a match |
136 | | /// state is returned. |
137 | | /// |
138 | | /// # Example |
139 | | /// |
140 | | /// This example shows how to use this method with a |
141 | | /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses |
142 | | /// "leftmost first" match semantics. |
143 | | /// |
144 | | /// Leftmost first match semantics corresponds to the match with the |
145 | | /// smallest starting offset, but where the end offset is determined by |
146 | | /// preferring earlier branches in the original regular expression. For |
147 | | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
148 | | /// will match `Samwise` in `Samwise`. |
149 | | /// |
150 | | /// Generally speaking, the "leftmost first" match is how most backtracking |
151 | | /// regular expressions tend to work. This is in contrast to POSIX-style |
152 | | /// regular expressions that yield "leftmost longest" matches. Namely, |
153 | | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
154 | | /// leftmost longest semantics. |
155 | | /// |
156 | | /// ``` |
157 | | /// use regex_automata::{DFA, DenseDFA}; |
158 | | /// |
159 | | /// # fn example() -> Result<(), regex_automata::Error> { |
160 | | /// let dfa = DenseDFA::new("foo[0-9]+")?; |
161 | | /// assert_eq!(Some(8), dfa.find(b"foo12345")); |
162 | | /// |
163 | | /// // Even though a match is found after reading the first byte (`a`), |
164 | | /// // the leftmost first match semantics demand that we find the earliest |
165 | | /// // match that prefers earlier parts of the pattern over latter parts. |
166 | | /// let dfa = DenseDFA::new("abc|a")?; |
167 | | /// assert_eq!(Some(3), dfa.find(b"abc")); |
168 | | /// # Ok(()) }; example().unwrap() |
169 | | /// ``` |
170 | | #[inline] |
171 | 0 | fn find(&self, bytes: &[u8]) -> Option<usize> { |
172 | 0 | self.find_at(bytes, 0) |
173 | 0 | } |
174 | | |
175 | | /// Returns the start offset of the longest match in reverse, by searching |
176 | | /// from the end of the input towards the start of the input. If no match |
177 | | /// exists, then `None` is returned. In other words, this has the same |
178 | | /// match semantics as `find`, but in reverse. |
179 | | /// |
180 | | /// # Example |
181 | | /// |
182 | | /// This example shows how to use this method with a |
183 | | /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine |
184 | | /// is principally useful when used in conjunction with the |
185 | | /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) |
186 | | /// configuration knob. In general, it's unlikely to be correct to use both |
187 | | /// `find` and `rfind` with the same DFA since any particular DFA will only |
188 | | /// support searching in one direction. |
189 | | /// |
190 | | /// ``` |
191 | | /// use regex_automata::{dense, DFA}; |
192 | | /// |
193 | | /// # fn example() -> Result<(), regex_automata::Error> { |
194 | | /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; |
195 | | /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); |
196 | | /// # Ok(()) }; example().unwrap() |
197 | | /// ``` |
198 | | #[inline] |
199 | 0 | fn rfind(&self, bytes: &[u8]) -> Option<usize> { |
200 | 0 | self.rfind_at(bytes, bytes.len()) |
201 | 0 | } |
202 | | |
203 | | /// Returns the same as `is_match`, but starts the search at the given |
204 | | /// offset. |
205 | | /// |
206 | | /// The significance of the starting point is that it takes the surrounding |
207 | | /// context into consideration. For example, if the DFA is anchored, then |
208 | | /// a match can only occur when `start == 0`. |
209 | | #[inline] |
210 | 0 | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { |
211 | 0 | if self.is_anchored() && start > 0 { |
212 | 0 | return false; |
213 | 0 | } |
214 | | |
215 | 0 | let mut state = self.start_state(); |
216 | 0 | if self.is_match_or_dead_state(state) { |
217 | 0 | return self.is_match_state(state); |
218 | 0 | } |
219 | 0 | for &b in bytes[start..].iter() { |
220 | 0 | state = unsafe { self.next_state_unchecked(state, b) }; |
221 | 0 | if self.is_match_or_dead_state(state) { |
222 | 0 | return self.is_match_state(state); |
223 | 0 | } |
224 | | } |
225 | 0 | false |
226 | 0 | } |
227 | | |
228 | | /// Returns the same as `shortest_match`, but starts the search at the |
229 | | /// given offset. |
230 | | /// |
231 | | /// The significance of the starting point is that it takes the surrounding |
232 | | /// context into consideration. For example, if the DFA is anchored, then |
233 | | /// a match can only occur when `start == 0`. |
234 | | #[inline] |
235 | 0 | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
236 | 0 | if self.is_anchored() && start > 0 { |
237 | 0 | return None; |
238 | 0 | } |
239 | | |
240 | 0 | let mut state = self.start_state(); |
241 | 0 | if self.is_match_or_dead_state(state) { |
242 | 0 | return if self.is_dead_state(state) { None } else { Some(start) }; |
243 | 0 | } |
244 | 0 | for (i, &b) in bytes[start..].iter().enumerate() { |
245 | 0 | state = unsafe { self.next_state_unchecked(state, b) }; |
246 | 0 | if self.is_match_or_dead_state(state) { |
247 | 0 | return if self.is_dead_state(state) { |
248 | 0 | None |
249 | | } else { |
250 | 0 | Some(start + i + 1) |
251 | | }; |
252 | 0 | } |
253 | | } |
254 | 0 | None |
255 | 0 | } |
256 | | |
257 | | /// Returns the same as `find`, but starts the search at the given |
258 | | /// offset. |
259 | | /// |
260 | | /// The significance of the starting point is that it takes the surrounding |
261 | | /// context into consideration. For example, if the DFA is anchored, then |
262 | | /// a match can only occur when `start == 0`. |
263 | | #[inline] |
264 | 0 | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
265 | 0 | if self.is_anchored() && start > 0 { |
266 | 0 | return None; |
267 | 0 | } |
268 | | |
269 | 0 | let mut state = self.start_state(); |
270 | 0 | let mut last_match = if self.is_dead_state(state) { |
271 | 0 | return None; |
272 | 0 | } else if self.is_match_state(state) { |
273 | 0 | Some(start) |
274 | | } else { |
275 | 0 | None |
276 | | }; |
277 | 0 | for (i, &b) in bytes[start..].iter().enumerate() { |
278 | 0 | state = unsafe { self.next_state_unchecked(state, b) }; |
279 | 0 | if self.is_match_or_dead_state(state) { |
280 | 0 | if self.is_dead_state(state) { |
281 | 0 | return last_match; |
282 | 0 | } |
283 | 0 | last_match = Some(start + i + 1); |
284 | 0 | } |
285 | | } |
286 | 0 | last_match |
287 | 0 | } |
288 | | |
289 | | /// Returns the same as `rfind`, but starts the search at the given |
290 | | /// offset. |
291 | | /// |
292 | | /// The significance of the starting point is that it takes the surrounding |
293 | | /// context into consideration. For example, if the DFA is anchored, then |
294 | | /// a match can only occur when `start == bytes.len()`. |
295 | | #[inline(never)] |
296 | 0 | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
297 | 0 | if self.is_anchored() && start < bytes.len() { |
298 | 0 | return None; |
299 | 0 | } |
300 | | |
301 | 0 | let mut state = self.start_state(); |
302 | 0 | let mut last_match = if self.is_dead_state(state) { |
303 | 0 | return None; |
304 | 0 | } else if self.is_match_state(state) { |
305 | 0 | Some(start) |
306 | | } else { |
307 | 0 | None |
308 | | }; |
309 | 0 | for (i, &b) in bytes[..start].iter().enumerate().rev() { |
310 | 0 | state = unsafe { self.next_state_unchecked(state, b) }; |
311 | 0 | if self.is_match_or_dead_state(state) { |
312 | 0 | if self.is_dead_state(state) { |
313 | 0 | return last_match; |
314 | 0 | } |
315 | 0 | last_match = Some(i); |
316 | 0 | } |
317 | | } |
318 | 0 | last_match |
319 | 0 | } |
320 | | } |
321 | | |
322 | | impl<'a, T: DFA> DFA for &'a T { |
323 | | type ID = T::ID; |
324 | | |
325 | | #[inline] |
326 | 0 | fn start_state(&self) -> Self::ID { |
327 | 0 | (**self).start_state() |
328 | 0 | } |
329 | | |
330 | | #[inline] |
331 | 0 | fn is_match_state(&self, id: Self::ID) -> bool { |
332 | 0 | (**self).is_match_state(id) |
333 | 0 | } |
334 | | |
335 | | #[inline] |
336 | 0 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool { |
337 | 0 | (**self).is_match_or_dead_state(id) |
338 | 0 | } |
339 | | |
340 | | #[inline] |
341 | 0 | fn is_dead_state(&self, id: Self::ID) -> bool { |
342 | 0 | (**self).is_dead_state(id) |
343 | 0 | } |
344 | | |
345 | | #[inline] |
346 | 0 | fn is_anchored(&self) -> bool { |
347 | 0 | (**self).is_anchored() |
348 | 0 | } |
349 | | |
350 | | #[inline] |
351 | 0 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { |
352 | 0 | (**self).next_state(current, input) |
353 | 0 | } |
354 | | |
355 | | #[inline] |
356 | 0 | unsafe fn next_state_unchecked( |
357 | 0 | &self, |
358 | 0 | current: Self::ID, |
359 | 0 | input: u8, |
360 | 0 | ) -> Self::ID { |
361 | 0 | (**self).next_state_unchecked(current, input) |
362 | 0 | } |
363 | | } |