/src/regex/regex-automata/src/nfa/thompson/nfa.rs
Line | Count | Source |
1 | | use core::{fmt, mem}; |
2 | | |
3 | | use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; |
4 | | |
5 | | #[cfg(feature = "syntax")] |
6 | | use crate::nfa::thompson::{ |
7 | | compiler::{Compiler, Config}, |
8 | | error::BuildError, |
9 | | }; |
10 | | use crate::{ |
11 | | nfa::thompson::builder::Builder, |
12 | | util::{ |
13 | | alphabet::{self, ByteClassSet, ByteClasses}, |
14 | | captures::{GroupInfo, GroupInfoError}, |
15 | | look::{Look, LookMatcher, LookSet}, |
16 | | primitives::{ |
17 | | IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID, |
18 | | }, |
19 | | sparse_set::SparseSet, |
20 | | }, |
21 | | }; |
22 | | |
23 | | /// A byte oriented Thompson non-deterministic finite automaton (NFA). |
24 | | /// |
25 | | /// A Thompson NFA is a finite state machine that permits unconditional epsilon |
26 | | /// transitions, but guarantees that there exists at most one non-epsilon |
27 | | /// transition for each element in the alphabet for each state. |
28 | | /// |
29 | | /// An NFA may be used directly for searching, for analysis or to build |
30 | | /// a deterministic finite automaton (DFA). |
31 | | /// |
32 | | /// # Cheap clones |
33 | | /// |
34 | | /// Since an NFA is a core data type in this crate that many other regex |
35 | | /// engines are based on top of, it is convenient to give ownership of an NFA |
36 | | /// to said regex engines. Because of this, an NFA uses reference counting |
37 | | /// internally. Therefore, it is cheap to clone and it is encouraged to do so. |
38 | | /// |
39 | | /// # Capabilities |
40 | | /// |
41 | | /// Using an NFA for searching via the |
42 | | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount |
43 | | /// of "power" of any regex engine in this crate. Namely, it supports the |
44 | | /// following in all cases: |
45 | | /// |
46 | | /// 1. Detection of a match. |
47 | | /// 2. Location of a match, including both the start and end offset, in a |
48 | | /// single pass of the haystack. |
49 | | /// 3. Location of matching capturing groups. |
50 | | /// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are |
51 | | /// present. |
52 | | /// |
53 | | /// # Capturing Groups |
54 | | /// |
55 | | /// Groups refer to parenthesized expressions inside a regex pattern. They look |
56 | | /// like this, where `exp` is an arbitrary regex: |
57 | | /// |
58 | | /// * `(exp)` - An unnamed capturing group. |
59 | | /// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group. |
60 | | /// * `(?:exp)` - A non-capturing group. |
61 | | /// * `(?i:exp)` - A non-capturing group that sets flags. |
62 | | /// |
63 | | /// Only the first two forms are said to be _capturing_. Capturing |
64 | | /// means that the last position at which they match is reportable. The |
65 | | /// [`Captures`](crate::util::captures::Captures) type provides convenient |
66 | | /// access to the match positions of capturing groups, which includes looking |
67 | | /// up capturing groups by their name. |
68 | | /// |
69 | | /// # Byte oriented |
70 | | /// |
71 | | /// This NFA is byte oriented, which means that all of its transitions are |
72 | | /// defined on bytes. In other words, the alphabet of an NFA consists of the |
73 | | /// 256 different byte values. |
74 | | /// |
75 | | /// While DFAs nearly demand that they be byte oriented for performance |
76 | | /// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed, |
77 | | /// a previous version of this NFA supported both byte and codepoint oriented |
78 | | /// modes. A codepoint oriented mode can work because an NFA fundamentally uses |
79 | | /// a sparse representation of transitions, which works well with the large |
80 | | /// sparse space of Unicode codepoints. |
81 | | /// |
82 | | /// Nevertheless, this NFA is only byte oriented. This choice is primarily |
83 | | /// driven by implementation simplicity, and also in part memory usage. In |
84 | | /// practice, performance between the two is roughly comparable. However, |
85 | | /// building a DFA (including a hybrid DFA) really wants a byte oriented NFA. |
86 | | /// So if we do have a codepoint oriented NFA, then we also need to generate |
87 | | /// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only |
88 | | /// generating byte oriented NFAs, we can produce one less NFA. In other words, |
89 | | /// if we made our NFA codepoint oriented, we'd need to *also* make it support |
90 | | /// a byte oriented mode, which is more complicated. But a byte oriented mode |
91 | | /// can support everything. |
92 | | /// |
93 | | /// # Differences with DFAs |
94 | | /// |
95 | | /// At the theoretical level, the precise difference between an NFA and a DFA |
96 | | /// is that, in a DFA, for every state, an input symbol unambiguously refers |
97 | | /// to a single transition _and_ that an input symbol is required for each |
98 | | /// transition. At a practical level, this permits DFA implementations to be |
99 | | /// implemented at their core with a small constant number of CPU instructions |
100 | | /// for each byte of input searched. In practice, this makes them quite a bit |
101 | | /// faster than NFAs _in general_. Namely, in order to execute a search for any |
102 | | /// Thompson NFA, one needs to keep track of a _set_ of states, and execute |
103 | | /// the possible transitions on all of those states for each input symbol. |
104 | | /// Overall, this results in much more overhead. To a first approximation, one |
105 | | /// can expect DFA searches to be about an order of magnitude faster. |
106 | | /// |
107 | | /// So why use an NFA at all? The main advantage of an NFA is that it takes |
108 | | /// linear time (in the size of the pattern string after repetitions have been |
109 | | /// expanded) to build and linear memory usage. A DFA, on the other hand, may |
110 | | /// take exponential time and/or space to build. Even in non-pathological |
111 | | /// cases, DFAs often take quite a bit more memory than their NFA counterparts, |
112 | | /// _especially_ if large Unicode character classes are involved. Of course, |
113 | | /// an NFA also provides additional capabilities. For example, it can match |
114 | | /// Unicode word boundaries on non-ASCII text and resolve the positions of |
115 | | /// capturing groups. |
116 | | /// |
117 | | /// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a |
118 | | /// good balance between an NFA and a DFA. It avoids the exponential build time |
119 | | /// of a DFA while maintaining its fast search time. The downside of a hybrid |
120 | | /// NFA/DFA is that in some cases it can be slower at search time than the NFA. |
121 | | /// (It also has less functionality than a pure NFA. It cannot handle Unicode |
122 | | /// word boundaries on non-ASCII text and cannot resolve capturing groups.) |
123 | | /// |
124 | | /// # Example |
125 | | /// |
126 | | /// This shows how to build an NFA with the default configuration and execute a |
127 | | /// search using the Pike VM. |
128 | | /// |
129 | | /// ``` |
130 | | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
131 | | /// |
132 | | /// let re = PikeVM::new(r"foo[0-9]+")?; |
133 | | /// let mut cache = re.create_cache(); |
134 | | /// let mut caps = re.create_captures(); |
135 | | /// |
136 | | /// let expected = Some(Match::must(0, 0..8)); |
137 | | /// re.captures(&mut cache, b"foo12345", &mut caps); |
138 | | /// assert_eq!(expected, caps.get_match()); |
139 | | /// |
140 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
141 | | /// ``` |
142 | | /// |
143 | | /// # Example: resolving capturing groups |
144 | | /// |
145 | | /// This example shows how to parse some simple dates and extract the |
146 | | /// components of each date via capturing groups. |
147 | | /// |
148 | | /// ``` |
149 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
150 | | /// use regex_automata::{ |
151 | | /// nfa::thompson::pikevm::PikeVM, |
152 | | /// util::captures::Captures, |
153 | | /// }; |
154 | | /// |
155 | | /// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?; |
156 | | /// let mut cache = vm.create_cache(); |
157 | | /// |
158 | | /// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05"; |
159 | | /// let all: Vec<Captures> = vm.captures_iter( |
160 | | /// &mut cache, haystack.as_bytes() |
161 | | /// ).collect(); |
162 | | /// // There should be a total of 3 matches. |
163 | | /// assert_eq!(3, all.len()); |
164 | | /// // The year from the second match is '2013'. |
165 | | /// let span = all[1].get_group_by_name("y").unwrap(); |
166 | | /// assert_eq!("2013", &haystack[span]); |
167 | | /// |
168 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
169 | | /// ``` |
170 | | /// |
171 | | /// This example shows that only the last match of a capturing group is |
172 | | /// reported, even if it had to match multiple times for an overall match |
173 | | /// to occur. |
174 | | /// |
175 | | /// ``` |
176 | | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
177 | | /// |
178 | | /// let re = PikeVM::new(r"([a-z]){4}")?; |
179 | | /// let mut cache = re.create_cache(); |
180 | | /// let mut caps = re.create_captures(); |
181 | | /// |
182 | | /// let haystack = b"quux"; |
183 | | /// re.captures(&mut cache, haystack, &mut caps); |
184 | | /// assert!(caps.is_match()); |
185 | | /// assert_eq!(Some(Span::from(3..4)), caps.get_group(1)); |
186 | | /// |
187 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
188 | | /// ``` |
189 | | #[derive(Clone)] |
190 | | pub struct NFA( |
191 | | // We make NFAs reference counted primarily for two reasons. First is that |
192 | | // the NFA type itself is quite large (at least 0.5KB), and so it makes |
193 | | // sense to put it on the heap by default anyway. Second is that, for Arc |
194 | | // specifically, this enables cheap clones. This tends to be useful because |
195 | | // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA) |
196 | | // all want to hang on to an NFA for use during search time. We could |
197 | | // provide the NFA at search time via a function argument, but this makes |
198 | | // for an unnecessarily annoying API. Instead, we just let each structure |
199 | | // share ownership of the NFA. Using a deep clone would not be smart, since |
200 | | // the NFA can use quite a bit of heap space. |
201 | | Arc<Inner>, |
202 | | ); |
203 | | |
204 | | impl NFA { |
205 | | /// Parse the given regular expression using a default configuration and |
206 | | /// build an NFA from it. |
207 | | /// |
208 | | /// If you want a non-default configuration, then use the NFA |
209 | | /// [`Compiler`] with a [`Config`]. |
210 | | /// |
211 | | /// # Example |
212 | | /// |
213 | | /// ``` |
214 | | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
215 | | /// |
216 | | /// let re = PikeVM::new(r"foo[0-9]+")?; |
217 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
218 | | /// |
219 | | /// let expected = Some(Match::must(0, 0..8)); |
220 | | /// re.captures(&mut cache, b"foo12345", &mut caps); |
221 | | /// assert_eq!(expected, caps.get_match()); |
222 | | /// |
223 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
224 | | /// ``` |
225 | | #[cfg(feature = "syntax")] |
226 | 0 | pub fn new(pattern: &str) -> Result<NFA, BuildError> { |
227 | 0 | NFA::compiler().build(pattern) |
228 | 0 | } |
229 | | |
230 | | /// Parse the given regular expressions using a default configuration and |
231 | | /// build a multi-NFA from them. |
232 | | /// |
233 | | /// If you want a non-default configuration, then use the NFA |
234 | | /// [`Compiler`] with a [`Config`]. |
235 | | /// |
236 | | /// # Example |
237 | | /// |
238 | | /// ``` |
239 | | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
240 | | /// |
241 | | /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?; |
242 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
243 | | /// |
244 | | /// let expected = Some(Match::must(1, 0..3)); |
245 | | /// re.captures(&mut cache, b"foo12345bar", &mut caps); |
246 | | /// assert_eq!(expected, caps.get_match()); |
247 | | /// |
248 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
249 | | /// ``` |
250 | | #[cfg(feature = "syntax")] |
251 | | pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> { |
252 | | NFA::compiler().build_many(patterns) |
253 | | } |
254 | | |
255 | | /// Returns an NFA with a single regex pattern that always matches at every |
256 | | /// position. |
257 | | /// |
258 | | /// # Example |
259 | | /// |
260 | | /// ``` |
261 | | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
262 | | /// |
263 | | /// let re = PikeVM::new_from_nfa(NFA::always_match())?; |
264 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
265 | | /// |
266 | | /// let expected = Some(Match::must(0, 0..0)); |
267 | | /// re.captures(&mut cache, b"", &mut caps); |
268 | | /// assert_eq!(expected, caps.get_match()); |
269 | | /// re.captures(&mut cache, b"foo", &mut caps); |
270 | | /// assert_eq!(expected, caps.get_match()); |
271 | | /// |
272 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
273 | | /// ``` |
274 | 0 | pub fn always_match() -> NFA { |
275 | | // We could use NFA::new("") here and we'd get the same semantics, but |
276 | | // hand-assembling the NFA (as below) does the same thing with a fewer |
277 | | // number of states. It also avoids needing the 'syntax' feature |
278 | | // enabled. |
279 | | // |
280 | | // Technically all we need is the "match" state, but we add the |
281 | | // "capture" states so that the PikeVM can use this NFA. |
282 | | // |
283 | | // The unwraps below are OK because we add so few states that they will |
284 | | // never exhaust any default limits in any environment. |
285 | 0 | let mut builder = Builder::new(); |
286 | 0 | let pid = builder.start_pattern().unwrap(); |
287 | 0 | assert_eq!(pid.as_usize(), 0); |
288 | 0 | let start_id = |
289 | 0 | builder.add_capture_start(StateID::ZERO, 0, None).unwrap(); |
290 | 0 | let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap(); |
291 | 0 | let match_id = builder.add_match().unwrap(); |
292 | 0 | builder.patch(start_id, end_id).unwrap(); |
293 | 0 | builder.patch(end_id, match_id).unwrap(); |
294 | 0 | let pid = builder.finish_pattern(start_id).unwrap(); |
295 | 0 | assert_eq!(pid.as_usize(), 0); |
296 | 0 | builder.build(start_id, start_id).unwrap() |
297 | 0 | } |
298 | | |
299 | | /// Returns an NFA that never matches at any position. |
300 | | /// |
301 | | /// This is a convenience routine for creating an NFA with zero patterns. |
302 | | /// |
303 | | /// # Example |
304 | | /// |
305 | | /// ``` |
306 | | /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; |
307 | | /// |
308 | | /// let re = PikeVM::new_from_nfa(NFA::never_match())?; |
309 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
310 | | /// |
311 | | /// re.captures(&mut cache, b"", &mut caps); |
312 | | /// assert!(!caps.is_match()); |
313 | | /// re.captures(&mut cache, b"foo", &mut caps); |
314 | | /// assert!(!caps.is_match()); |
315 | | /// |
316 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
317 | | /// ``` |
318 | 0 | pub fn never_match() -> NFA { |
319 | | // This always succeeds because it only requires one NFA state, which |
320 | | // will never exhaust any (default) limits. |
321 | 0 | let mut builder = Builder::new(); |
322 | 0 | let sid = builder.add_fail().unwrap(); |
323 | 0 | builder.build(sid, sid).unwrap() |
324 | 0 | } |
325 | | |
326 | | /// Return a default configuration for an `NFA`. |
327 | | /// |
328 | | /// This is a convenience routine to avoid needing to import the `Config` |
329 | | /// type when customizing the construction of an NFA. |
330 | | /// |
331 | | /// # Example |
332 | | /// |
333 | | /// This example shows how to build an NFA with a small size limit that |
334 | | /// results in a compilation error for any regex that tries to use more |
335 | | /// heap memory than the configured limit. |
336 | | /// |
337 | | /// ``` |
338 | | /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; |
339 | | /// |
340 | | /// let result = PikeVM::builder() |
341 | | /// .thompson(NFA::config().nfa_size_limit(Some(1_000))) |
342 | | /// // Remember, \w is Unicode-aware by default and thus huge. |
343 | | /// .build(r"\w+"); |
344 | | /// assert!(result.is_err()); |
345 | | /// ``` |
346 | | #[cfg(feature = "syntax")] |
347 | 0 | pub fn config() -> Config { |
348 | 0 | Config::new() |
349 | 0 | } |
350 | | |
351 | | /// Return a compiler for configuring the construction of an `NFA`. |
352 | | /// |
353 | | /// This is a convenience routine to avoid needing to import the |
354 | | /// [`Compiler`] type in common cases. |
355 | | /// |
356 | | /// # Example |
357 | | /// |
358 | | /// This example shows how to build an NFA that is permitted match invalid |
359 | | /// UTF-8. Without the additional syntax configuration here, compilation of |
360 | | /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8. |
361 | | /// |
362 | | /// ``` |
363 | | /// use regex_automata::{ |
364 | | /// nfa::thompson::pikevm::PikeVM, |
365 | | /// util::syntax, |
366 | | /// Match, |
367 | | /// }; |
368 | | /// |
369 | | /// let re = PikeVM::builder() |
370 | | /// .syntax(syntax::Config::new().utf8(false)) |
371 | | /// .build(r"[a-z]+(?-u:.)")?; |
372 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
373 | | /// |
374 | | /// let expected = Some(Match::must(0, 1..5)); |
375 | | /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps); |
376 | | /// assert_eq!(expected, caps.get_match()); |
377 | | /// |
378 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
379 | | /// ``` |
380 | | #[cfg(feature = "syntax")] |
381 | 0 | pub fn compiler() -> Compiler { |
382 | 0 | Compiler::new() |
383 | 0 | } |
384 | | |
385 | | /// Returns an iterator over all pattern identifiers in this NFA. |
386 | | /// |
387 | | /// Pattern IDs are allocated in sequential order starting from zero, |
388 | | /// where the order corresponds to the order of patterns provided to the |
389 | | /// [`NFA::new_many`] constructor. |
390 | | /// |
391 | | /// # Example |
392 | | /// |
393 | | /// ``` |
394 | | /// use regex_automata::{nfa::thompson::NFA, PatternID}; |
395 | | /// |
396 | | /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; |
397 | | /// let pids: Vec<PatternID> = nfa.patterns().collect(); |
398 | | /// assert_eq!(pids, vec![ |
399 | | /// PatternID::must(0), |
400 | | /// PatternID::must(1), |
401 | | /// PatternID::must(2), |
402 | | /// ]); |
403 | | /// |
404 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
405 | | /// ``` |
406 | 91.4k | pub fn patterns(&self) -> PatternIter<'_> { |
407 | 91.4k | PatternIter { |
408 | 91.4k | it: PatternID::iter(self.pattern_len()), |
409 | 91.4k | _marker: core::marker::PhantomData, |
410 | 91.4k | } |
411 | 91.4k | } |
412 | | |
413 | | /// Returns the total number of regex patterns in this NFA. |
414 | | /// |
415 | | /// This may return zero if the NFA was constructed with no patterns. In |
416 | | /// this case, the NFA can never produce a match for any input. |
417 | | /// |
418 | | /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because |
419 | | /// NFA construction will fail if too many patterns are added. |
420 | | /// |
421 | | /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`. |
422 | | /// |
423 | | /// # Example |
424 | | /// |
425 | | /// ``` |
426 | | /// use regex_automata::nfa::thompson::NFA; |
427 | | /// |
428 | | /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; |
429 | | /// assert_eq!(3, nfa.pattern_len()); |
430 | | /// |
431 | | /// let nfa = NFA::never_match(); |
432 | | /// assert_eq!(0, nfa.pattern_len()); |
433 | | /// |
434 | | /// let nfa = NFA::always_match(); |
435 | | /// assert_eq!(1, nfa.pattern_len()); |
436 | | /// |
437 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
438 | | /// ``` |
439 | | #[inline] |
440 | 1.49M | pub fn pattern_len(&self) -> usize { |
441 | 1.49M | self.0.start_pattern.len() |
442 | 1.49M | } |
443 | | |
444 | | /// Return the state identifier of the initial anchored state of this NFA. |
445 | | /// |
446 | | /// The returned identifier is guaranteed to be a valid index into the |
447 | | /// slice returned by [`NFA::states`], and is also a valid argument to |
448 | | /// [`NFA::state`]. |
449 | | /// |
450 | | /// # Example |
451 | | /// |
452 | | /// This example shows a somewhat contrived example where we can easily |
453 | | /// predict the anchored starting state. |
454 | | /// |
455 | | /// ``` |
456 | | /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; |
457 | | /// |
458 | | /// let nfa = NFA::compiler() |
459 | | /// .configure(NFA::config().which_captures(WhichCaptures::None)) |
460 | | /// .build("a")?; |
461 | | /// let state = nfa.state(nfa.start_anchored()); |
462 | | /// match *state { |
463 | | /// State::ByteRange { trans } => { |
464 | | /// assert_eq!(b'a', trans.start); |
465 | | /// assert_eq!(b'a', trans.end); |
466 | | /// } |
467 | | /// _ => unreachable!("unexpected state"), |
468 | | /// } |
469 | | /// |
470 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
471 | | /// ``` |
472 | | #[inline] |
473 | 295k | pub fn start_anchored(&self) -> StateID { |
474 | 295k | self.0.start_anchored |
475 | 295k | } |
476 | | |
477 | | /// Return the state identifier of the initial unanchored state of this |
478 | | /// NFA. |
479 | | /// |
480 | | /// This is equivalent to the identifier returned by |
481 | | /// [`NFA::start_anchored`] when the NFA has no unanchored starting state. |
482 | | /// |
483 | | /// The returned identifier is guaranteed to be a valid index into the |
484 | | /// slice returned by [`NFA::states`], and is also a valid argument to |
485 | | /// [`NFA::state`]. |
486 | | /// |
487 | | /// # Example |
488 | | /// |
489 | | /// This example shows that the anchored and unanchored starting states |
490 | | /// are equivalent when an anchored NFA is built. |
491 | | /// |
492 | | /// ``` |
493 | | /// use regex_automata::nfa::thompson::NFA; |
494 | | /// |
495 | | /// let nfa = NFA::new("^a")?; |
496 | | /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); |
497 | | /// |
498 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
499 | | /// ``` |
500 | | #[inline] |
501 | 201k | pub fn start_unanchored(&self) -> StateID { |
502 | 201k | self.0.start_unanchored |
503 | 201k | } |
504 | | |
505 | | /// Return the state identifier of the initial anchored state for the given |
506 | | /// pattern, or `None` if there is no pattern corresponding to the given |
507 | | /// identifier. |
508 | | /// |
509 | | /// If one uses the starting state for a particular pattern, then the only |
510 | | /// match that can be returned is for the corresponding pattern. |
511 | | /// |
512 | | /// The returned identifier is guaranteed to be a valid index into the |
513 | | /// slice returned by [`NFA::states`], and is also a valid argument to |
514 | | /// [`NFA::state`]. |
515 | | /// |
516 | | /// # Errors |
517 | | /// |
518 | | /// If the pattern doesn't exist in this NFA, then this returns an error. |
519 | | /// This occurs when `pid.as_usize() >= nfa.pattern_len()`. |
520 | | /// |
521 | | /// # Example |
522 | | /// |
523 | | /// This example shows that the anchored and unanchored starting states |
524 | | /// are equivalent when an anchored NFA is built. |
525 | | /// |
526 | | /// ``` |
527 | | /// use regex_automata::{nfa::thompson::NFA, PatternID}; |
528 | | /// |
529 | | /// let nfa = NFA::new_many(&["^a", "^b"])?; |
530 | | /// // The anchored and unanchored states for the entire NFA are the same, |
531 | | /// // since all of the patterns are anchored. |
532 | | /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); |
533 | | /// // But the anchored starting states for each pattern are distinct, |
534 | | /// // because these starting states can only lead to matches for the |
535 | | /// // corresponding pattern. |
536 | | /// let anchored = Some(nfa.start_anchored()); |
537 | | /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0))); |
538 | | /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1))); |
539 | | /// // Requesting a pattern not in the NFA will result in None: |
540 | | /// assert_eq!(None, nfa.start_pattern(PatternID::must(2))); |
541 | | /// |
542 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
543 | | /// ``` |
544 | | #[inline] |
545 | 92.4k | pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> { |
546 | 92.4k | self.0.start_pattern.get(pid.as_usize()).copied() |
547 | 92.4k | } |
548 | | |
549 | | /// Get the byte class set for this NFA. |
550 | | /// |
551 | | /// A byte class set is a partitioning of this NFA's alphabet into |
552 | | /// equivalence classes. Any two bytes in the same equivalence class are |
553 | | /// guaranteed to never discriminate between a match or a non-match. (The |
554 | | /// partitioning may not be minimal.) |
555 | | /// |
556 | | /// Byte classes are used internally by this crate when building DFAs. |
557 | | /// Namely, among other optimizations, they enable a space optimization |
558 | | /// where the DFA's internal alphabet is defined over the equivalence |
559 | | /// classes of bytes instead of all possible byte values. The former is |
560 | | /// often quite a bit smaller than the latter, which permits the DFA to use |
561 | | /// less space for its transition table. |
562 | | #[inline] |
563 | 131k | pub(crate) fn byte_class_set(&self) -> &ByteClassSet { |
564 | 131k | &self.0.byte_class_set |
565 | 131k | } |
566 | | |
567 | | /// Get the byte classes for this NFA. |
568 | | /// |
569 | | /// Byte classes represent a partitioning of this NFA's alphabet into |
570 | | /// equivalence classes. Any two bytes in the same equivalence class are |
571 | | /// guaranteed to never discriminate between a match or a non-match. (The |
572 | | /// partitioning may not be minimal.) |
573 | | /// |
574 | | /// Byte classes are used internally by this crate when building DFAs. |
575 | | /// Namely, among other optimizations, they enable a space optimization |
576 | | /// where the DFA's internal alphabet is defined over the equivalence |
577 | | /// classes of bytes instead of all possible byte values. The former is |
578 | | /// often quite a bit smaller than the latter, which permits the DFA to use |
579 | | /// less space for its transition table. |
580 | | /// |
581 | | /// # Example |
582 | | /// |
583 | | /// This example shows how to query the class of various bytes. |
584 | | /// |
585 | | /// ``` |
586 | | /// use regex_automata::nfa::thompson::NFA; |
587 | | /// |
588 | | /// let nfa = NFA::new("[a-z]+")?; |
589 | | /// let classes = nfa.byte_classes(); |
590 | | /// // 'a' and 'z' are in the same class for this regex. |
591 | | /// assert_eq!(classes.get(b'a'), classes.get(b'z')); |
592 | | /// // But 'a' and 'A' are not. |
593 | | /// assert_ne!(classes.get(b'a'), classes.get(b'A')); |
594 | | /// |
595 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
596 | | /// ``` |
597 | | #[inline] |
598 | 30.0k | pub fn byte_classes(&self) -> &ByteClasses { |
599 | 30.0k | &self.0.byte_classes |
600 | 30.0k | } |
601 | | |
602 | | /// Return a reference to the NFA state corresponding to the given ID. |
603 | | /// |
604 | | /// This is a convenience routine for `nfa.states()[id]`. |
605 | | /// |
606 | | /// # Panics |
607 | | /// |
608 | | /// This panics when the given identifier does not reference a valid state. |
609 | | /// That is, when `id.as_usize() >= nfa.states().len()`. |
610 | | /// |
611 | | /// # Example |
612 | | /// |
613 | | /// The anchored state for a pattern will typically correspond to a |
614 | | /// capturing state for that pattern. (Although, this is not an API |
615 | | /// guarantee!) |
616 | | /// |
617 | | /// ``` |
618 | | /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID}; |
619 | | /// |
620 | | /// let nfa = NFA::new("a")?; |
621 | | /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap()); |
622 | | /// match *state { |
623 | | /// State::Capture { slot, .. } => { |
624 | | /// assert_eq!(0, slot.as_usize()); |
625 | | /// } |
626 | | /// _ => unreachable!("unexpected state"), |
627 | | /// } |
628 | | /// |
629 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
630 | | /// ``` |
631 | | #[inline] |
632 | 9.53G | pub fn state(&self, id: StateID) -> &State { |
633 | 9.53G | &self.states()[id] |
634 | 9.53G | } |
635 | | |
636 | | /// Returns a slice of all states in this NFA. |
637 | | /// |
638 | | /// The slice returned is indexed by `StateID`. This provides a convenient |
639 | | /// way to access states while following transitions among those states. |
640 | | /// |
641 | | /// # Example |
642 | | /// |
643 | | /// This demonstrates that disabling UTF-8 mode can shrink the size of the |
644 | | /// NFA considerably in some cases, especially when using Unicode character |
645 | | /// classes. |
646 | | /// |
647 | | /// ``` |
648 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
649 | | /// use regex_automata::nfa::thompson::NFA; |
650 | | /// |
651 | | /// let nfa_unicode = NFA::new(r"\w")?; |
652 | | /// let nfa_ascii = NFA::new(r"(?-u)\w")?; |
653 | | /// // Yes, a factor of 45 difference. No lie. |
654 | | /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len()); |
655 | | /// |
656 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
657 | | /// ``` |
658 | | #[inline] |
659 | 9.53G | pub fn states(&self) -> &[State] { |
660 | 9.53G | &self.0.states |
661 | 9.53G | } |
662 | | |
663 | | /// Returns the capturing group info for this NFA. |
664 | | /// |
665 | | /// The [`GroupInfo`] provides a way to map to and from capture index |
666 | | /// and capture name for each pattern. It also provides a mapping from |
667 | | /// each of the capturing groups in every pattern to their corresponding |
668 | | /// slot offsets encoded in [`State::Capture`] states. |
669 | | /// |
670 | | /// Note that `GroupInfo` uses reference counting internally, such that |
671 | | /// cloning a `GroupInfo` is very cheap. |
672 | | /// |
673 | | /// # Example |
674 | | /// |
675 | | /// This example shows how to get a list of all capture group names for |
676 | | /// a particular pattern. |
677 | | /// |
678 | | /// ``` |
679 | | /// use regex_automata::{nfa::thompson::NFA, PatternID}; |
680 | | /// |
681 | | /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?; |
682 | | /// // The first is the implicit group that is always unnamed. The next |
683 | | /// // 5 groups are the explicit groups found in the concrete syntax above. |
684 | | /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")]; |
685 | | /// let got: Vec<Option<&str>> = |
686 | | /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); |
687 | | /// assert_eq!(expected, got); |
688 | | /// |
689 | | /// // Using an invalid pattern ID will result in nothing yielded. |
690 | | /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); |
691 | | /// assert_eq!(0, got); |
692 | | /// |
693 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
694 | | /// ``` |
695 | | #[inline] |
696 | 217k | pub fn group_info(&self) -> &GroupInfo { |
697 | 217k | &self.0.group_info() |
698 | 217k | } |
699 | | |
700 | | /// Returns true if and only if this NFA has at least one |
701 | | /// [`Capture`](State::Capture) in its sequence of states. |
702 | | /// |
703 | | /// This is useful as a way to perform a quick test before attempting |
704 | | /// something that does or does not require capture states. For example, |
705 | | /// some regex engines (like the PikeVM) require capture states in order to |
706 | | /// work at all. |
707 | | /// |
708 | | /// # Example |
709 | | /// |
710 | | /// This example shows a few different NFAs and whether they have captures |
711 | | /// or not. |
712 | | /// |
713 | | /// ``` |
714 | | /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; |
715 | | /// |
716 | | /// // Obviously has capture states. |
717 | | /// let nfa = NFA::new("(a)")?; |
718 | | /// assert!(nfa.has_capture()); |
719 | | /// |
720 | | /// // Less obviously has capture states, because every pattern has at |
721 | | /// // least one anonymous capture group corresponding to the match for the |
722 | | /// // entire pattern. |
723 | | /// let nfa = NFA::new("a")?; |
724 | | /// assert!(nfa.has_capture()); |
725 | | /// |
726 | | /// // Other than hand building your own NFA, this is the only way to build |
727 | | /// // an NFA without capturing groups. In general, you should only do this |
728 | | /// // if you don't intend to use any of the NFA-oriented regex engines. |
729 | | /// // Overall, capturing groups don't have many downsides. Although they |
730 | | /// // can add a bit of noise to simple NFAs, so it can be nice to disable |
731 | | /// // them for debugging purposes. |
732 | | /// // |
733 | | /// // Notice that 'has_capture' is false here even when we have an |
734 | | /// // explicit capture group in the pattern. |
735 | | /// let nfa = NFA::compiler() |
736 | | /// .configure(NFA::config().which_captures(WhichCaptures::None)) |
737 | | /// .build("(a)")?; |
738 | | /// assert!(!nfa.has_capture()); |
739 | | /// |
740 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
741 | | /// ``` |
742 | | #[inline] |
743 | | pub fn has_capture(&self) -> bool { |
744 | | self.0.has_capture |
745 | | } |
746 | | |
747 | | /// Returns true if and only if this NFA can match the empty string. |
748 | | /// When it returns false, all possible matches are guaranteed to have a |
749 | | /// non-zero length. |
750 | | /// |
751 | | /// This is useful as cheap way to know whether code needs to handle the |
752 | | /// case of a zero length match. This is particularly important when UTF-8 |
753 | | /// modes are enabled, as when UTF-8 mode is enabled, empty matches that |
754 | | /// split a codepoint must never be reported. This extra handling can |
755 | | /// sometimes be costly, and since regexes matching an empty string are |
756 | | /// somewhat rare, it can be beneficial to treat such regexes specially. |
757 | | /// |
758 | | /// # Example |
759 | | /// |
760 | | /// This example shows a few different NFAs and whether they match the |
761 | | /// empty string or not. Notice the empty string isn't merely a matter |
762 | | /// of a string of length literally `0`, but rather, whether a match can |
763 | | /// occur between specific pairs of bytes. |
764 | | /// |
765 | | /// ``` |
766 | | /// use regex_automata::{nfa::thompson::NFA, util::syntax}; |
767 | | /// |
768 | | /// // The empty regex matches the empty string. |
769 | | /// let nfa = NFA::new("")?; |
770 | | /// assert!(nfa.has_empty(), "empty matches empty"); |
771 | | /// // The '+' repetition operator requires at least one match, and so |
772 | | /// // does not match the empty string. |
773 | | /// let nfa = NFA::new("a+")?; |
774 | | /// assert!(!nfa.has_empty(), "+ does not match empty"); |
775 | | /// // But the '*' repetition operator does. |
776 | | /// let nfa = NFA::new("a*")?; |
777 | | /// assert!(nfa.has_empty(), "* does match empty"); |
778 | | /// // And wrapping '+' in an operator that can match an empty string also |
779 | | /// // causes it to match the empty string too. |
780 | | /// let nfa = NFA::new("(a+)*")?; |
781 | | /// assert!(nfa.has_empty(), "+ inside of * matches empty"); |
782 | | /// |
783 | | /// // If a regex is just made of a look-around assertion, even if the |
784 | | /// // assertion requires some kind of non-empty string around it (such as |
785 | | /// // \b), then it is still treated as if it matches the empty string. |
786 | | /// // Namely, if a match occurs of just a look-around assertion, then the |
787 | | /// // match returned is empty. |
788 | | /// let nfa = NFA::compiler() |
789 | | /// .syntax(syntax::Config::new().utf8(false)) |
790 | | /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; |
791 | | /// assert!(nfa.has_empty(), "assertions match empty"); |
792 | | /// // Even when an assertion is wrapped in a '+', it still matches the |
793 | | /// // empty string. |
794 | | /// let nfa = NFA::new(r"\b+")?; |
795 | | /// assert!(nfa.has_empty(), "+ of an assertion matches empty"); |
796 | | /// |
797 | | /// // An alternation with even one branch that can match the empty string |
798 | | /// // is also said to match the empty string overall. |
799 | | /// let nfa = NFA::new("foo|(bar)?|quux")?; |
800 | | /// assert!(nfa.has_empty(), "alternations can match empty"); |
801 | | /// |
802 | | /// // An NFA that matches nothing does not match the empty string. |
803 | | /// let nfa = NFA::new("[a&&b]")?; |
804 | | /// assert!(!nfa.has_empty(), "never matching means not matching empty"); |
805 | | /// // But if it's wrapped in something that doesn't require a match at |
806 | | /// // all, then it can match the empty string! |
807 | | /// let nfa = NFA::new("[a&&b]*")?; |
808 | | /// assert!(nfa.has_empty(), "* on never-match still matches empty"); |
809 | | /// // Since a '+' requires a match, using it on something that can never |
810 | | /// // match will itself produce a regex that can never match anything, |
811 | | /// // and thus does not match the empty string. |
812 | | /// let nfa = NFA::new("[a&&b]+")?; |
813 | | /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing"); |
814 | | /// |
815 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
816 | | /// ``` |
817 | | #[inline] |
818 | 197k | pub fn has_empty(&self) -> bool { |
819 | 197k | self.0.has_empty |
820 | 197k | } |
821 | | |
822 | | /// Whether UTF-8 mode is enabled for this NFA or not. |
823 | | /// |
824 | | /// When UTF-8 mode is enabled, all matches reported by a regex engine |
825 | | /// derived from this NFA are guaranteed to correspond to spans of valid |
826 | | /// UTF-8. This includes zero-width matches. For example, the regex engine |
827 | | /// must guarantee that the empty regex will not match at the positions |
828 | | /// between code units in the UTF-8 encoding of a single codepoint. |
829 | | /// |
830 | | /// See [`Config::utf8`] for more information. |
831 | | /// |
832 | | /// This is enabled by default. |
833 | | /// |
834 | | /// # Example |
835 | | /// |
836 | | /// This example shows how UTF-8 mode can impact the match spans that may |
837 | | /// be reported in certain cases. |
838 | | /// |
839 | | /// ``` |
840 | | /// use regex_automata::{ |
841 | | /// nfa::thompson::{self, pikevm::PikeVM}, |
842 | | /// Match, Input, |
843 | | /// }; |
844 | | /// |
845 | | /// let re = PikeVM::new("")?; |
846 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
847 | | /// |
848 | | /// // UTF-8 mode is enabled by default. |
849 | | /// let mut input = Input::new("☃"); |
850 | | /// re.search(&mut cache, &input, &mut caps); |
851 | | /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); |
852 | | /// |
853 | | /// // Even though an empty regex matches at 1..1, our next match is |
854 | | /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is |
855 | | /// // three bytes long). |
856 | | /// input.set_start(1); |
857 | | /// re.search(&mut cache, &input, &mut caps); |
858 | | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
859 | | /// |
860 | | /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: |
861 | | /// let re = PikeVM::builder() |
862 | | /// .thompson(thompson::Config::new().utf8(false)) |
863 | | /// .build("")?; |
864 | | /// re.search(&mut cache, &input, &mut caps); |
865 | | /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); |
866 | | /// |
867 | | /// input.set_start(2); |
868 | | /// re.search(&mut cache, &input, &mut caps); |
869 | | /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); |
870 | | /// |
871 | | /// input.set_start(3); |
872 | | /// re.search(&mut cache, &input, &mut caps); |
873 | | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
874 | | /// |
875 | | /// input.set_start(4); |
876 | | /// re.search(&mut cache, &input, &mut caps); |
877 | | /// assert_eq!(None, caps.get_match()); |
878 | | /// |
879 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
880 | | /// ``` |
881 | | #[inline] |
882 | 116k | pub fn is_utf8(&self) -> bool { |
883 | 116k | self.0.utf8 |
884 | 116k | } |
885 | | |
886 | | /// Returns true when this NFA is meant to be matched in reverse. |
887 | | /// |
888 | | /// Generally speaking, when this is true, it means the NFA is supposed to |
889 | | /// be used in conjunction with moving backwards through the haystack. That |
890 | | /// is, from a higher memory address to a lower memory address. |
891 | | /// |
892 | | /// It is often the case that lower level routines dealing with an NFA |
893 | | /// don't need to care about whether it is "meant" to be matched in reverse |
894 | | /// or not. However, there are some specific cases where it matters. For |
895 | | /// example, the implementation of CRLF-aware `^` and `$` line anchors |
896 | | /// needs to know whether the search is in the forward or reverse |
897 | | /// direction. In the forward direction, neither `^` nor `$` should match |
898 | | /// when a `\r` has been seen previously and a `\n` is next. However, in |
899 | | /// the reverse direction, neither `^` nor `$` should match when a `\n` |
900 | | /// has been seen previously and a `\r` is next. This fundamentally changes |
901 | | /// how the state machine is constructed, and thus needs to be altered |
902 | | /// based on the direction of the search. |
903 | | /// |
904 | | /// This is automatically set when using a [`Compiler`] with a configuration |
905 | | /// where [`Config::reverse`] is enabled. If you're building your own NFA |
906 | | /// by hand via a [`Builder`] |
907 | | #[inline] |
908 | 15.5M | pub fn is_reverse(&self) -> bool { |
909 | 15.5M | self.0.reverse |
910 | 15.5M | } |
911 | | |
912 | | /// Returns true if and only if all starting states for this NFA correspond |
913 | | /// to the beginning of an anchored search. |
914 | | /// |
915 | | /// Typically, an NFA will have both an anchored and an unanchored starting |
916 | | /// state. Namely, because it tends to be useful to have both and the cost |
917 | | /// of having an unanchored starting state is almost zero (for an NFA). |
918 | | /// However, if all patterns in the NFA are themselves anchored, then even |
919 | | /// the unanchored starting state will correspond to an anchored search |
920 | | /// since the pattern doesn't permit anything else. |
921 | | /// |
922 | | /// # Example |
923 | | /// |
924 | | /// This example shows a few different scenarios where this method's |
925 | | /// return value varies. |
926 | | /// |
927 | | /// ``` |
928 | | /// use regex_automata::nfa::thompson::NFA; |
929 | | /// |
930 | | /// // The unanchored starting state permits matching this pattern anywhere |
931 | | /// // in a haystack, instead of just at the beginning. |
932 | | /// let nfa = NFA::new("a")?; |
933 | | /// assert!(!nfa.is_always_start_anchored()); |
934 | | /// |
935 | | /// // In this case, the pattern is itself anchored, so there is no way |
936 | | /// // to run an unanchored search. |
937 | | /// let nfa = NFA::new("^a")?; |
938 | | /// assert!(nfa.is_always_start_anchored()); |
939 | | /// |
940 | | /// // When multiline mode is enabled, '^' can match at the start of a line |
941 | | /// // in addition to the start of a haystack, so an unanchored search is |
942 | | /// // actually possible. |
943 | | /// let nfa = NFA::new("(?m)^a")?; |
944 | | /// assert!(!nfa.is_always_start_anchored()); |
945 | | /// |
946 | | /// // Weird cases also work. A pattern is only considered anchored if all |
947 | | /// // matches may only occur at the start of a haystack. |
948 | | /// let nfa = NFA::new("(^a)|a")?; |
949 | | /// assert!(!nfa.is_always_start_anchored()); |
950 | | /// |
951 | | /// // When multiple patterns are present, if they are all anchored, then |
952 | | /// // the NFA is always anchored too. |
953 | | /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?; |
954 | | /// assert!(nfa.is_always_start_anchored()); |
955 | | /// |
956 | | /// // But if one pattern is unanchored, then the NFA must permit an |
957 | | /// // unanchored search. |
958 | | /// let nfa = NFA::new_many(&["^a", "b", "^c"])?; |
959 | | /// assert!(!nfa.is_always_start_anchored()); |
960 | | /// |
961 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
962 | | /// ``` |
963 | | #[inline] |
964 | 147k | pub fn is_always_start_anchored(&self) -> bool { |
965 | 147k | self.start_anchored() == self.start_unanchored() |
966 | 147k | } |
967 | | |
968 | | /// Returns the look-around matcher associated with this NFA. |
969 | | /// |
970 | | /// A look-around matcher determines how to match look-around assertions. |
971 | | /// In particular, some assertions are configurable. For example, the |
972 | | /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed |
973 | | /// from the default of `\n` to any other byte. |
974 | | /// |
975 | | /// If the NFA was built using a [`Compiler`], then this matcher |
976 | | /// can be set via the [`Config::look_matcher`] configuration |
977 | | /// knob. Otherwise, if you've built an NFA by hand, it is set via |
978 | | /// [`Builder::set_look_matcher`]. |
979 | | /// |
980 | | /// # Example |
981 | | /// |
982 | | /// This shows how to change the line terminator for multi-line assertions. |
983 | | /// |
984 | | /// ``` |
985 | | /// use regex_automata::{ |
986 | | /// nfa::thompson::{self, pikevm::PikeVM}, |
987 | | /// util::look::LookMatcher, |
988 | | /// Match, Input, |
989 | | /// }; |
990 | | /// |
991 | | /// let mut lookm = LookMatcher::new(); |
992 | | /// lookm.set_line_terminator(b'\x00'); |
993 | | /// |
994 | | /// let re = PikeVM::builder() |
995 | | /// .thompson(thompson::Config::new().look_matcher(lookm)) |
996 | | /// .build(r"(?m)^[a-z]+$")?; |
997 | | /// let mut cache = re.create_cache(); |
998 | | /// |
999 | | /// // Multi-line assertions now use NUL as a terminator. |
1000 | | /// assert_eq!( |
1001 | | /// Some(Match::must(0, 1..4)), |
1002 | | /// re.find(&mut cache, b"\x00abc\x00"), |
1003 | | /// ); |
1004 | | /// // ... and \n is no longer recognized as a terminator. |
1005 | | /// assert_eq!( |
1006 | | /// None, |
1007 | | /// re.find(&mut cache, b"\nabc\n"), |
1008 | | /// ); |
1009 | | /// |
1010 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1011 | | /// ``` |
1012 | | #[inline] |
1013 | 489M | pub fn look_matcher(&self) -> &LookMatcher { |
1014 | 489M | &self.0.look_matcher |
1015 | 489M | } |
1016 | | |
1017 | | /// Returns the union of all look-around assertions used throughout this |
1018 | | /// NFA. When the returned set is empty, it implies that the NFA has no |
1019 | | /// look-around assertions and thus zero conditional epsilon transitions. |
1020 | | /// |
1021 | | /// This is useful in some cases enabling optimizations. It is not |
1022 | | /// unusual, for example, for optimizations to be of the form, "for any |
1023 | | /// regex with zero conditional epsilon transitions, do ..." where "..." |
1024 | | /// is some kind of optimization. |
1025 | | /// |
1026 | | /// This isn't only helpful for optimizations either. Sometimes look-around |
1027 | | /// assertions are difficult to support. For example, many of the DFAs in |
1028 | | /// this crate don't support Unicode word boundaries or handle them using |
1029 | | /// heuristics. Handling that correctly typically requires some kind of |
1030 | | /// cheap check of whether the NFA has a Unicode word boundary in the first |
1031 | | /// place. |
1032 | | /// |
1033 | | /// # Example |
1034 | | /// |
1035 | | /// This example shows how this routine varies based on the regex pattern: |
1036 | | /// |
1037 | | /// ``` |
1038 | | /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; |
1039 | | /// |
1040 | | /// // No look-around at all. |
1041 | | /// let nfa = NFA::new("a")?; |
1042 | | /// assert!(nfa.look_set_any().is_empty()); |
1043 | | /// |
1044 | | /// // When multiple patterns are present, since this returns the union, |
1045 | | /// // it will include look-around assertions that only appear in one |
1046 | | /// // pattern. |
1047 | | /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?; |
1048 | | /// assert!(nfa.look_set_any().contains(Look::Start)); |
1049 | | /// |
1050 | | /// // Some groups of assertions have various shortcuts. For example: |
1051 | | /// let nfa = NFA::new(r"(?-u:\b)")?; |
1052 | | /// assert!(nfa.look_set_any().contains_word()); |
1053 | | /// assert!(!nfa.look_set_any().contains_word_unicode()); |
1054 | | /// assert!(nfa.look_set_any().contains_word_ascii()); |
1055 | | /// |
1056 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1057 | | /// ``` |
1058 | | #[inline] |
1059 | 59.5M | pub fn look_set_any(&self) -> LookSet { |
1060 | 59.5M | self.0.look_set_any |
1061 | 59.5M | } |
1062 | | |
1063 | | /// Returns the union of all prefix look-around assertions for every |
1064 | | /// pattern in this NFA. When the returned set is empty, it implies none of |
1065 | | /// the patterns require moving through a conditional epsilon transition |
1066 | | /// before inspecting the first byte in the haystack. |
1067 | | /// |
1068 | | /// This can be useful for determining what kinds of assertions need to be |
1069 | | /// satisfied at the beginning of a search. For example, typically DFAs |
1070 | | /// in this crate will build a distinct starting state for each possible |
1071 | | /// starting configuration that might result in look-around assertions |
1072 | | /// being satisfied differently. However, if the set returned here is |
1073 | | /// empty, then you know that the start state is invariant because there |
1074 | | /// are no conditional epsilon transitions to consider. |
1075 | | /// |
1076 | | /// # Example |
1077 | | /// |
1078 | | /// This example shows how this routine varies based on the regex pattern: |
1079 | | /// |
1080 | | /// ``` |
1081 | | /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; |
1082 | | /// |
1083 | | /// // No look-around at all. |
1084 | | /// let nfa = NFA::new("a")?; |
1085 | | /// assert!(nfa.look_set_prefix_any().is_empty()); |
1086 | | /// |
1087 | | /// // When multiple patterns are present, since this returns the union, |
1088 | | /// // it will include look-around assertions that only appear in one |
1089 | | /// // pattern. But it will only include assertions that are in the prefix |
1090 | | /// // of a pattern. For example, this includes '^' but not '$' even though |
1091 | | /// // '$' does appear. |
1092 | | /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?; |
1093 | | /// assert!(nfa.look_set_prefix_any().contains(Look::Start)); |
1094 | | /// assert!(!nfa.look_set_prefix_any().contains(Look::End)); |
1095 | | /// |
1096 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1097 | | /// ``` |
1098 | | #[inline] |
1099 | 432k | pub fn look_set_prefix_any(&self) -> LookSet { |
1100 | 432k | self.0.look_set_prefix_any |
1101 | 432k | } |
1102 | | |
1103 | | // FIXME: The `look_set_prefix_all` computation was not correct, and it |
1104 | | // seemed a little tricky to fix it. Since I wasn't actually using it for |
1105 | | // anything, I just decided to remove it in the run up to the regex 1.9 |
1106 | | // release. If you need this, please file an issue. |
1107 | | /* |
1108 | | /// Returns the intersection of all prefix look-around assertions for every |
1109 | | /// pattern in this NFA. When the returned set is empty, it implies at |
1110 | | /// least one of the patterns does not require moving through a conditional |
1111 | | /// epsilon transition before inspecting the first byte in the haystack. |
1112 | | /// Conversely, when the set contains an assertion, it implies that every |
1113 | | /// pattern in the NFA also contains that assertion in its prefix. |
1114 | | /// |
1115 | | /// This can be useful for determining what kinds of assertions need to be |
1116 | | /// satisfied at the beginning of a search. For example, if you know that |
1117 | | /// [`Look::Start`] is in the prefix intersection set returned here, then |
1118 | | /// you know that all searches, regardless of input configuration, will be |
1119 | | /// anchored. |
1120 | | /// |
1121 | | /// # Example |
1122 | | /// |
1123 | | /// This example shows how this routine varies based on the regex pattern: |
1124 | | /// |
1125 | | /// ``` |
1126 | | /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; |
1127 | | /// |
1128 | | /// // No look-around at all. |
1129 | | /// let nfa = NFA::new("a")?; |
1130 | | /// assert!(nfa.look_set_prefix_all().is_empty()); |
1131 | | /// |
1132 | | /// // When multiple patterns are present, since this returns the |
1133 | | /// // intersection, it will only include assertions present in every |
1134 | | /// // prefix, and only the prefix. |
1135 | | /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; |
1136 | | /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); |
1137 | | /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); |
1138 | | /// |
1139 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1140 | | /// ``` |
1141 | | #[inline] |
1142 | | pub fn look_set_prefix_all(&self) -> LookSet { |
1143 | | self.0.look_set_prefix_all |
1144 | | } |
1145 | | */ |
1146 | | |
1147 | | /// Returns the memory usage, in bytes, of this NFA. |
1148 | | /// |
1149 | | /// This does **not** include the stack size used up by this NFA. To |
1150 | | /// compute that, use `std::mem::size_of::<NFA>()`. |
1151 | | /// |
1152 | | /// # Example |
1153 | | /// |
1154 | | /// This example shows that large Unicode character classes can use quite |
1155 | | /// a bit of memory. |
1156 | | /// |
1157 | | /// ``` |
1158 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1159 | | /// use regex_automata::nfa::thompson::NFA; |
1160 | | /// |
1161 | | /// let nfa_unicode = NFA::new(r"\w")?; |
1162 | | /// let nfa_ascii = NFA::new(r"(?-u:\w)")?; |
1163 | | /// |
1164 | | /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage()); |
1165 | | /// |
1166 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1167 | | /// ``` |
1168 | | #[inline] |
1169 | 0 | pub fn memory_usage(&self) -> usize { |
1170 | | use core::mem::size_of; |
1171 | | |
1172 | 0 | size_of::<Inner>() // allocated on the heap via Arc |
1173 | 0 | + self.0.states.len() * size_of::<State>() |
1174 | 0 | + self.0.start_pattern.len() * size_of::<StateID>() |
1175 | 0 | + self.0.group_info.memory_usage() |
1176 | 0 | + self.0.memory_extra |
1177 | 0 | } |
1178 | | } |
1179 | | |
1180 | | impl fmt::Debug for NFA { |
1181 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1182 | 0 | self.0.fmt(f) |
1183 | 0 | } |
1184 | | } |
1185 | | |
1186 | | /// The "inner" part of the NFA. We split this part out so that we can easily |
1187 | | /// wrap it in an `Arc` above in the definition of `NFA`. |
1188 | | /// |
1189 | | /// See builder.rs for the code that actually builds this type. This module |
1190 | | /// does provide (internal) mutable methods for adding things to this |
1191 | | /// NFA before finalizing it, but the high level construction process is |
1192 | | /// controlled by the builder abstraction. (Which is complicated enough to |
1193 | | /// get its own module.) |
1194 | | #[derive(Default)] |
1195 | | pub(super) struct Inner { |
1196 | | /// The state sequence. This sequence is guaranteed to be indexable by all |
1197 | | /// starting state IDs, and it is also guaranteed to contain at most one |
1198 | | /// `Match` state for each pattern compiled into this NFA. (A pattern may |
1199 | | /// not have a corresponding `Match` state if a `Match` state is impossible |
1200 | | /// to reach.) |
1201 | | states: Vec<State>, |
1202 | | /// The anchored starting state of this NFA. |
1203 | | start_anchored: StateID, |
1204 | | /// The unanchored starting state of this NFA. |
1205 | | start_unanchored: StateID, |
1206 | | /// The starting states for each individual pattern. Starting at any |
1207 | | /// of these states will result in only an anchored search for the |
1208 | | /// corresponding pattern. The vec is indexed by pattern ID. When the NFA |
1209 | | /// contains a single regex, then `start_pattern[0]` and `start_anchored` |
1210 | | /// are always equivalent. |
1211 | | start_pattern: Vec<StateID>, |
1212 | | /// Info about the capturing groups in this NFA. This is responsible for |
1213 | | /// mapping groups to slots, mapping groups to names and names to groups. |
1214 | | group_info: GroupInfo, |
1215 | | /// A representation of equivalence classes over the transitions in this |
1216 | | /// NFA. Two bytes in the same equivalence class must not discriminate |
1217 | | /// between a match or a non-match. This map can be used to shrink the |
1218 | | /// total size of a DFA's transition table with a small match-time cost. |
1219 | | /// |
1220 | | /// Note that the NFA's transitions are *not* defined in terms of these |
1221 | | /// equivalence classes. The NFA's transitions are defined on the original |
1222 | | /// byte values. For the most part, this is because they wouldn't really |
1223 | | /// help the NFA much since the NFA already uses a sparse representation |
1224 | | /// to represent transitions. Byte classes are most effective in a dense |
1225 | | /// representation. |
1226 | | byte_class_set: ByteClassSet, |
1227 | | /// This is generated from `byte_class_set`, and essentially represents the |
1228 | | /// same thing but supports different access patterns. Namely, this permits |
1229 | | /// looking up the equivalence class of a byte very cheaply. |
1230 | | /// |
1231 | | /// Ideally we would just store this, but because of annoying code |
1232 | | /// structure reasons, we keep both this and `byte_class_set` around for |
1233 | | /// now. I think I would prefer that `byte_class_set` were computed in the |
1234 | | /// `Builder`, but right now, we compute it as states are added to the |
1235 | | /// `NFA`. |
1236 | | byte_classes: ByteClasses, |
1237 | | /// Whether this NFA has a `Capture` state anywhere. |
1238 | | has_capture: bool, |
1239 | | /// When the empty string is in the language matched by this NFA. |
1240 | | has_empty: bool, |
1241 | | /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that |
1242 | | /// all non-empty matches produced by this NFA correspond to spans of valid |
1243 | | /// UTF-8, and any empty matches produced by this NFA that split a UTF-8 |
1244 | | /// encoded codepoint should be filtered out by the corresponding regex |
1245 | | /// engine. |
1246 | | utf8: bool, |
1247 | | /// Whether this NFA is meant to be matched in reverse or not. |
1248 | | reverse: bool, |
1249 | | /// The matcher to be used for look-around assertions. |
1250 | | look_matcher: LookMatcher, |
1251 | | /// The union of all look-around assertions that occur anywhere within |
1252 | | /// this NFA. If this set is empty, then it means there are precisely zero |
1253 | | /// conditional epsilon transitions in the NFA. |
1254 | | look_set_any: LookSet, |
1255 | | /// The union of all look-around assertions that occur as a zero-length |
1256 | | /// prefix for any of the patterns in this NFA. |
1257 | | look_set_prefix_any: LookSet, |
1258 | | /* |
1259 | | /// The intersection of all look-around assertions that occur as a |
1260 | | /// zero-length prefix for any of the patterns in this NFA. |
1261 | | look_set_prefix_all: LookSet, |
1262 | | */ |
1263 | | /// Heap memory used indirectly by NFA states and other things (like the |
1264 | | /// various capturing group representations above). Since each state |
1265 | | /// might use a different amount of heap, we need to keep track of this |
1266 | | /// incrementally. |
1267 | | memory_extra: usize, |
1268 | | } |
1269 | | |
1270 | | impl Inner { |
1271 | | /// Runs any last finalization bits and turns this into a full NFA. |
1272 | 129k | pub(super) fn into_nfa(mut self) -> NFA { |
1273 | 129k | self.byte_classes = self.byte_class_set.byte_classes(); |
1274 | | // Do epsilon closure from the start state of every pattern in order |
1275 | | // to compute various properties such as look-around assertions and |
1276 | | // whether the empty string can be matched. |
1277 | 129k | let mut stack = vec![]; |
1278 | 129k | let mut seen = SparseSet::new(self.states.len()); |
1279 | 129k | for &start_id in self.start_pattern.iter() { |
1280 | 129k | stack.push(start_id); |
1281 | 129k | seen.clear(); |
1282 | | // let mut prefix_all = LookSet::full(); |
1283 | 129k | let mut prefix_any = LookSet::empty(); |
1284 | 27.4M | while let Some(sid) = stack.pop() { |
1285 | 27.2M | if !seen.insert(sid) { |
1286 | 2.55M | continue; |
1287 | 24.7M | } |
1288 | 24.7M | match self.states[sid] { |
1289 | | State::ByteRange { .. } |
1290 | | | State::Dense { .. } |
1291 | 12.5M | | State::Fail => continue, |
1292 | | State::Sparse(_) => { |
1293 | | // This snippet below will rewrite this sparse state |
1294 | | // as a dense state. By doing it here, we apply this |
1295 | | // optimization to all hot "sparse" states since these |
1296 | | // are the states that are reachable from the start |
1297 | | // state via an epsilon closure. |
1298 | | // |
1299 | | // Unfortunately, this optimization did not seem to |
1300 | | // help much in some very limited ad hoc benchmarking. |
1301 | | // |
1302 | | // I left the 'Dense' state type in place in case we |
1303 | | // want to revisit this, but I suspect the real way |
1304 | | // to make forward progress is a more fundamental |
1305 | | // re-architecting of how data in the NFA is laid out. |
1306 | | // I think we should consider a single contiguous |
1307 | | // allocation instead of all this indirection and |
1308 | | // potential heap allocations for every state. But this |
1309 | | // is a large re-design and will require API breaking |
1310 | | // changes. |
1311 | | // self.memory_extra -= self.states[sid].memory_usage(); |
1312 | | // let trans = DenseTransitions::from_sparse(sparse); |
1313 | | // self.states[sid] = State::Dense(trans); |
1314 | | // self.memory_extra += self.states[sid].memory_usage(); |
1315 | 2.05M | continue; |
1316 | | } |
1317 | 30.2k | State::Match { .. } => self.has_empty = true, |
1318 | 4.14M | State::Look { look, next } => { |
1319 | 4.14M | prefix_any = prefix_any.insert(look); |
1320 | 4.14M | stack.push(next); |
1321 | 4.14M | } |
1322 | 1.66M | State::Union { ref alternates } => { |
1323 | 1.66M | // Order doesn't matter here, since we're just dealing |
1324 | 1.66M | // with look-around sets. But if we do richer analysis |
1325 | 1.66M | // here that needs to care about preference order, then |
1326 | 1.66M | // this should be done in reverse. |
1327 | 1.66M | stack.extend(alternates.iter()); |
1328 | 1.66M | } |
1329 | 1.77M | State::BinaryUnion { alt1, alt2 } => { |
1330 | 1.77M | stack.push(alt2); |
1331 | 1.77M | stack.push(alt1); |
1332 | 1.77M | } |
1333 | 2.46M | State::Capture { next, .. } => { |
1334 | 2.46M | stack.push(next); |
1335 | 2.46M | } |
1336 | | } |
1337 | | } |
1338 | 129k | self.look_set_prefix_any = |
1339 | 129k | self.look_set_prefix_any.union(prefix_any); |
1340 | | } |
1341 | 129k | self.states.shrink_to_fit(); |
1342 | 129k | self.start_pattern.shrink_to_fit(); |
1343 | 129k | NFA(Arc::new(self)) |
1344 | 129k | } |
1345 | | |
1346 | | /// Returns the capturing group info for this NFA. |
1347 | 4.03M | pub(super) fn group_info(&self) -> &GroupInfo { |
1348 | 4.03M | &self.group_info |
1349 | 4.03M | } |
1350 | | |
1351 | | /// Add the given state to this NFA after allocating a fresh identifier for |
1352 | | /// it. |
1353 | | /// |
1354 | | /// This panics if too many states are added such that a fresh identifier |
1355 | | /// could not be created. (Currently, the only caller of this routine is |
1356 | | /// a `Builder`, and it upholds this invariant.) |
1357 | 112M | pub(super) fn add(&mut self, state: State) -> StateID { |
1358 | 112M | match state { |
1359 | 88.4M | State::ByteRange { ref trans } => { |
1360 | 88.4M | self.byte_class_set.set_range(trans.start, trans.end); |
1361 | 88.4M | } |
1362 | 7.47M | State::Sparse(ref sparse) => { |
1363 | 33.0M | for trans in sparse.transitions.iter() { |
1364 | 33.0M | self.byte_class_set.set_range(trans.start, trans.end); |
1365 | 33.0M | } |
1366 | | } |
1367 | 0 | State::Dense { .. } => unreachable!(), |
1368 | 5.34M | State::Look { look, .. } => { |
1369 | 5.34M | self.look_matcher |
1370 | 5.34M | .add_to_byteset(look, &mut self.byte_class_set); |
1371 | 5.34M | self.look_set_any = self.look_set_any.insert(look); |
1372 | 5.34M | } |
1373 | 3.82M | State::Capture { .. } => { |
1374 | 3.82M | self.has_capture = true; |
1375 | 3.82M | } |
1376 | | State::Union { .. } |
1377 | | | State::BinaryUnion { .. } |
1378 | | | State::Fail |
1379 | 7.86M | | State::Match { .. } => {} |
1380 | | } |
1381 | | |
1382 | 112M | let id = StateID::new(self.states.len()).unwrap(); |
1383 | 112M | self.memory_extra += state.memory_usage(); |
1384 | 112M | self.states.push(state); |
1385 | 112M | id |
1386 | 112M | } |
1387 | | |
1388 | | /// Set the starting state identifiers for this NFA. |
1389 | | /// |
1390 | | /// `start_anchored` and `start_unanchored` may be equivalent. When they |
1391 | | /// are, then the NFA can only execute anchored searches. This might |
1392 | | /// occur, for example, for patterns that are unconditionally anchored. |
1393 | | /// e.g., `^foo`. |
1394 | 129k | pub(super) fn set_starts( |
1395 | 129k | &mut self, |
1396 | 129k | start_anchored: StateID, |
1397 | 129k | start_unanchored: StateID, |
1398 | 129k | start_pattern: &[StateID], |
1399 | 129k | ) { |
1400 | 129k | self.start_anchored = start_anchored; |
1401 | 129k | self.start_unanchored = start_unanchored; |
1402 | 129k | self.start_pattern = start_pattern.to_vec(); |
1403 | 129k | } |
1404 | | |
1405 | | /// Sets the UTF-8 mode of this NFA. |
1406 | 129k | pub(super) fn set_utf8(&mut self, yes: bool) { |
1407 | 129k | self.utf8 = yes; |
1408 | 129k | } |
1409 | | |
1410 | | /// Sets the reverse mode of this NFA. |
1411 | 129k | pub(super) fn set_reverse(&mut self, yes: bool) { |
1412 | 129k | self.reverse = yes; |
1413 | 129k | } |
1414 | | |
1415 | | /// Sets the look-around assertion matcher for this NFA. |
1416 | 129k | pub(super) fn set_look_matcher(&mut self, m: LookMatcher) { |
1417 | 129k | self.look_matcher = m; |
1418 | 129k | } |
1419 | | |
1420 | | /// Set the capturing groups for this NFA. |
1421 | | /// |
1422 | | /// The given slice should contain the capturing groups for each pattern, |
1423 | | /// The capturing groups in turn should correspond to the total number of |
1424 | | /// capturing groups in the pattern, including the anonymous first capture |
1425 | | /// group for each pattern. If a capturing group does have a name, then it |
1426 | | /// should be provided as a Arc<str>. |
1427 | | /// |
1428 | | /// This returns an error if a corresponding `GroupInfo` could not be |
1429 | | /// built. |
1430 | 129k | pub(super) fn set_captures( |
1431 | 129k | &mut self, |
1432 | 129k | captures: &[Vec<Option<Arc<str>>>], |
1433 | 129k | ) -> Result<(), GroupInfoError> { |
1434 | 129k | self.group_info = GroupInfo::new( |
1435 | 162k | captures.iter().map(|x| x.iter().map(|y| y.as_ref())), |
1436 | 0 | )?; |
1437 | 129k | Ok(()) |
1438 | 129k | } |
1439 | | |
1440 | | /// Remap the transitions in every state of this NFA using the given map. |
1441 | | /// The given map should be indexed according to state ID namespace used by |
1442 | | /// the transitions of the states currently in this NFA. |
1443 | | /// |
1444 | | /// This is particularly useful to the NFA builder, since it is convenient |
1445 | | /// to add NFA states in order to produce their final IDs. Then, after all |
1446 | | /// of the intermediate "empty" states (unconditional epsilon transitions) |
1447 | | /// have been removed from the builder's representation, we can re-map all |
1448 | | /// of the transitions in the states already added to their final IDs. |
1449 | 129k | pub(super) fn remap(&mut self, old_to_new: &[StateID]) { |
1450 | 113M | for state in &mut self.states { |
1451 | 112M | state.remap(old_to_new); |
1452 | 112M | } |
1453 | 129k | self.start_anchored = old_to_new[self.start_anchored]; |
1454 | 129k | self.start_unanchored = old_to_new[self.start_unanchored]; |
1455 | 129k | for id in self.start_pattern.iter_mut() { |
1456 | 129k | *id = old_to_new[*id]; |
1457 | 129k | } |
1458 | 129k | } |
1459 | | } |
1460 | | |
1461 | | impl fmt::Debug for Inner { |
1462 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1463 | 0 | writeln!(f, "thompson::NFA(")?; |
1464 | 0 | for (sid, state) in self.states.iter().with_state_ids() { |
1465 | 0 | let status = if sid == self.start_anchored { |
1466 | 0 | '^' |
1467 | 0 | } else if sid == self.start_unanchored { |
1468 | 0 | '>' |
1469 | | } else { |
1470 | 0 | ' ' |
1471 | | }; |
1472 | 0 | writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; |
1473 | | } |
1474 | 0 | let pattern_len = self.start_pattern.len(); |
1475 | 0 | if pattern_len > 1 { |
1476 | 0 | writeln!(f)?; |
1477 | 0 | for pid in 0..pattern_len { |
1478 | 0 | let sid = self.start_pattern[pid]; |
1479 | 0 | writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?; |
1480 | | } |
1481 | 0 | } |
1482 | 0 | writeln!(f)?; |
1483 | 0 | writeln!( |
1484 | 0 | f, |
1485 | 0 | "transition equivalence classes: {:?}", |
1486 | | self.byte_classes, |
1487 | 0 | )?; |
1488 | 0 | writeln!(f, ")")?; |
1489 | 0 | Ok(()) |
1490 | 0 | } |
1491 | | } |
1492 | | |
1493 | | /// A state in an NFA. |
1494 | | /// |
1495 | | /// In theory, it can help to conceptualize an `NFA` as a graph consisting of |
1496 | | /// `State`s. Each `State` contains its complete set of outgoing transitions. |
1497 | | /// |
1498 | | /// In practice, it can help to conceptualize an `NFA` as a sequence of |
1499 | | /// instructions for a virtual machine. Each `State` says what to do and where |
1500 | | /// to go next. |
1501 | | /// |
1502 | | /// Strictly speaking, the practical interpretation is the most correct one, |
1503 | | /// because of the [`Capture`](State::Capture) state. Namely, a `Capture` |
1504 | | /// state always forwards execution to another state unconditionally. Its only |
1505 | | /// purpose is to cause a side effect: the recording of the current input |
1506 | | /// position at a particular location in memory. In this sense, an `NFA` |
1507 | | /// has more power than a theoretical non-deterministic finite automaton. |
1508 | | /// |
1509 | | /// For most uses of this crate, it is likely that one may never even need to |
1510 | | /// be aware of this type at all. The main use cases for looking at `State`s |
1511 | | /// directly are if you need to write your own search implementation or if you |
1512 | | /// need to do some kind of analysis on the NFA. |
1513 | | #[derive(Clone, Eq, PartialEq)] |
1514 | | pub enum State { |
1515 | | /// A state with a single transition that can only be taken if the current |
1516 | | /// input symbol is in a particular range of bytes. |
1517 | | ByteRange { |
1518 | | /// The transition from this state to the next. |
1519 | | trans: Transition, |
1520 | | }, |
1521 | | /// A state with possibly many transitions represented in a sparse fashion. |
1522 | | /// Transitions are non-overlapping and ordered lexicographically by input |
1523 | | /// range. |
1524 | | /// |
1525 | | /// In practice, this is used for encoding UTF-8 automata. Its presence is |
1526 | | /// primarily an optimization that avoids many additional unconditional |
1527 | | /// epsilon transitions (via [`Union`](State::Union) states), and thus |
1528 | | /// decreases the overhead of traversing the NFA. This can improve both |
1529 | | /// matching time and DFA construction time. |
1530 | | Sparse(SparseTransitions), |
1531 | | /// A dense representation of a state with multiple transitions. |
1532 | | Dense(DenseTransitions), |
1533 | | /// A conditional epsilon transition satisfied via some sort of |
1534 | | /// look-around. Look-around is limited to anchor and word boundary |
1535 | | /// assertions. |
1536 | | /// |
1537 | | /// Look-around states are meant to be evaluated while performing epsilon |
1538 | | /// closure (computing the set of states reachable from a particular state |
1539 | | /// via only epsilon transitions). If the current position in the haystack |
1540 | | /// satisfies the look-around assertion, then you're permitted to follow |
1541 | | /// that epsilon transition. |
1542 | | Look { |
1543 | | /// The look-around assertion that must be satisfied before moving |
1544 | | /// to `next`. |
1545 | | look: Look, |
1546 | | /// The state to transition to if the look-around assertion is |
1547 | | /// satisfied. |
1548 | | next: StateID, |
1549 | | }, |
1550 | | /// An alternation such that there exists an epsilon transition to all |
1551 | | /// states in `alternates`, where matches found via earlier transitions |
1552 | | /// are preferred over later transitions. |
1553 | | Union { |
1554 | | /// An ordered sequence of unconditional epsilon transitions to other |
1555 | | /// states. Transitions earlier in the sequence are preferred over |
1556 | | /// transitions later in the sequence. |
1557 | | alternates: Box<[StateID]>, |
1558 | | }, |
1559 | | /// An alternation such that there exists precisely two unconditional |
1560 | | /// epsilon transitions, where matches found via `alt1` are preferred over |
1561 | | /// matches found via `alt2`. |
1562 | | /// |
1563 | | /// This state exists as a common special case of Union where there are |
1564 | | /// only two alternates. In this case, we don't need any allocations to |
1565 | | /// represent the state. This saves a bit of memory and also saves an |
1566 | | /// additional memory access when traversing the NFA. |
1567 | | BinaryUnion { |
1568 | | /// An unconditional epsilon transition to another NFA state. This |
1569 | | /// is preferred over `alt2`. |
1570 | | alt1: StateID, |
1571 | | /// An unconditional epsilon transition to another NFA state. Matches |
1572 | | /// reported via this transition should only be reported if no matches |
1573 | | /// were found by following `alt1`. |
1574 | | alt2: StateID, |
1575 | | }, |
1576 | | /// An empty state that records a capture location. |
1577 | | /// |
1578 | | /// From the perspective of finite automata, this is precisely equivalent |
1579 | | /// to an unconditional epsilon transition, but serves the purpose of |
1580 | | /// instructing NFA simulations to record additional state when the finite |
1581 | | /// state machine passes through this epsilon transition. |
1582 | | /// |
1583 | | /// `slot` in this context refers to the specific capture group slot |
1584 | | /// offset that is being recorded. Each capturing group has two slots |
1585 | | /// corresponding to the start and end of the matching portion of that |
1586 | | /// group. |
1587 | | /// |
1588 | | /// The pattern ID and capture group index are also included in this state |
1589 | | /// in case they are useful. But mostly, all you'll need is `next` and |
1590 | | /// `slot`. |
1591 | | Capture { |
1592 | | /// The state to transition to, unconditionally. |
1593 | | next: StateID, |
1594 | | /// The pattern ID that this capture belongs to. |
1595 | | pattern_id: PatternID, |
1596 | | /// The capture group index that this capture belongs to. Capture group |
1597 | | /// indices are local to each pattern. For example, when capturing |
1598 | | /// groups are enabled, every pattern has a capture group at index |
1599 | | /// `0`. |
1600 | | group_index: SmallIndex, |
1601 | | /// The slot index for this capture. Every capturing group has two |
1602 | | /// slots: one for the start haystack offset and one for the end |
1603 | | /// haystack offset. Unlike capture group indices, slot indices are |
1604 | | /// global across all patterns in this NFA. That is, each slot belongs |
1605 | | /// to a single pattern, but there is only one slot at index `i`. |
1606 | | slot: SmallIndex, |
1607 | | }, |
1608 | | /// A state that cannot be transitioned out of. This is useful for cases |
1609 | | /// where you want to prevent matching from occurring. For example, if your |
1610 | | /// regex parser permits empty character classes, then one could choose |
1611 | | /// a `Fail` state to represent them. (An empty character class can be |
1612 | | /// thought of as an empty set. Since nothing is in an empty set, they can |
1613 | | /// never match anything.) |
1614 | | Fail, |
1615 | | /// A match state. There is at least one such occurrence of this state for |
1616 | | /// each regex that can match that is in this NFA. |
1617 | | Match { |
1618 | | /// The matching pattern ID. |
1619 | | pattern_id: PatternID, |
1620 | | }, |
1621 | | } |
1622 | | |
1623 | | impl State { |
1624 | | /// Returns true if and only if this state contains one or more epsilon |
1625 | | /// transitions. |
1626 | | /// |
1627 | | /// In practice, a state has no outgoing transitions (like `Match`), has |
1628 | | /// only non-epsilon transitions (like `ByteRange`) or has only epsilon |
1629 | | /// transitions (like `Union`). |
1630 | | /// |
1631 | | /// # Example |
1632 | | /// |
1633 | | /// ``` |
1634 | | /// use regex_automata::{ |
1635 | | /// nfa::thompson::{State, Transition}, |
1636 | | /// util::primitives::{PatternID, StateID, SmallIndex}, |
1637 | | /// }; |
1638 | | /// |
1639 | | /// // Capture states are epsilon transitions. |
1640 | | /// let state = State::Capture { |
1641 | | /// next: StateID::ZERO, |
1642 | | /// pattern_id: PatternID::ZERO, |
1643 | | /// group_index: SmallIndex::ZERO, |
1644 | | /// slot: SmallIndex::ZERO, |
1645 | | /// }; |
1646 | | /// assert!(state.is_epsilon()); |
1647 | | /// |
1648 | | /// // ByteRange states are not. |
1649 | | /// let state = State::ByteRange { |
1650 | | /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO }, |
1651 | | /// }; |
1652 | | /// assert!(!state.is_epsilon()); |
1653 | | /// |
1654 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1655 | | /// ``` |
1656 | | #[inline] |
1657 | 977M | pub fn is_epsilon(&self) -> bool { |
1658 | 977M | match *self { |
1659 | | State::ByteRange { .. } |
1660 | | | State::Sparse { .. } |
1661 | | | State::Dense { .. } |
1662 | | | State::Fail |
1663 | 487M | | State::Match { .. } => false, |
1664 | | State::Look { .. } |
1665 | | | State::Union { .. } |
1666 | | | State::BinaryUnion { .. } |
1667 | 490M | | State::Capture { .. } => true, |
1668 | | } |
1669 | 977M | } |
1670 | | |
1671 | | /// Returns the heap memory usage of this NFA state in bytes. |
1672 | 112M | fn memory_usage(&self) -> usize { |
1673 | 112M | match *self { |
1674 | | State::ByteRange { .. } |
1675 | | | State::Look { .. } |
1676 | | | State::BinaryUnion { .. } |
1677 | | | State::Capture { .. } |
1678 | | | State::Match { .. } |
1679 | 103M | | State::Fail => 0, |
1680 | 7.47M | State::Sparse(SparseTransitions { ref transitions }) => { |
1681 | 7.47M | transitions.len() * mem::size_of::<Transition>() |
1682 | | } |
1683 | 0 | State::Dense { .. } => 256 * mem::size_of::<StateID>(), |
1684 | 2.34M | State::Union { ref alternates } => { |
1685 | 2.34M | alternates.len() * mem::size_of::<StateID>() |
1686 | | } |
1687 | | } |
1688 | 112M | } |
1689 | | |
1690 | | /// Remap the transitions in this state using the given map. Namely, the |
1691 | | /// given map should be indexed according to the transitions currently |
1692 | | /// in this state. |
1693 | | /// |
1694 | | /// This is used during the final phase of the NFA compiler, which turns |
1695 | | /// its intermediate NFA into the final NFA. |
1696 | 112M | fn remap(&mut self, remap: &[StateID]) { |
1697 | 112M | match *self { |
1698 | 88.4M | State::ByteRange { ref mut trans } => { |
1699 | 88.4M | trans.next = remap[trans.next] |
1700 | | } |
1701 | 7.47M | State::Sparse(SparseTransitions { ref mut transitions }) => { |
1702 | 33.0M | for t in transitions.iter_mut() { |
1703 | 33.0M | t.next = remap[t.next]; |
1704 | 33.0M | } |
1705 | | } |
1706 | 0 | State::Dense(DenseTransitions { ref mut transitions }) => { |
1707 | 0 | for sid in transitions.iter_mut() { |
1708 | 0 | *sid = remap[*sid]; |
1709 | 0 | } |
1710 | | } |
1711 | 5.34M | State::Look { ref mut next, .. } => *next = remap[*next], |
1712 | 2.34M | State::Union { ref mut alternates } => { |
1713 | 27.5M | for alt in alternates.iter_mut() { |
1714 | 27.5M | *alt = remap[*alt]; |
1715 | 27.5M | } |
1716 | | } |
1717 | 5.04M | State::BinaryUnion { ref mut alt1, ref mut alt2 } => { |
1718 | 5.04M | *alt1 = remap[*alt1]; |
1719 | 5.04M | *alt2 = remap[*alt2]; |
1720 | 5.04M | } |
1721 | 3.82M | State::Capture { ref mut next, .. } => *next = remap[*next], |
1722 | 356k | State::Fail => {} |
1723 | 129k | State::Match { .. } => {} |
1724 | | } |
1725 | 112M | } |
1726 | | } |
1727 | | |
1728 | | impl fmt::Debug for State { |
1729 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1730 | 0 | match *self { |
1731 | 0 | State::ByteRange { ref trans } => trans.fmt(f), |
1732 | 0 | State::Sparse(SparseTransitions { ref transitions }) => { |
1733 | 0 | let rs = transitions |
1734 | 0 | .iter() |
1735 | 0 | .map(|t| format!("{t:?}")) |
1736 | 0 | .collect::<Vec<String>>() |
1737 | 0 | .join(", "); |
1738 | 0 | write!(f, "sparse({rs})") |
1739 | | } |
1740 | 0 | State::Dense(ref dense) => { |
1741 | 0 | write!(f, "dense(")?; |
1742 | 0 | for (i, t) in dense.iter().enumerate() { |
1743 | 0 | if i > 0 { |
1744 | 0 | write!(f, ", ")?; |
1745 | 0 | } |
1746 | 0 | write!(f, "{t:?}")?; |
1747 | | } |
1748 | 0 | write!(f, ")") |
1749 | | } |
1750 | 0 | State::Look { ref look, next } => { |
1751 | 0 | write!(f, "{:?} => {:?}", look, next.as_usize()) |
1752 | | } |
1753 | 0 | State::Union { ref alternates } => { |
1754 | 0 | let alts = alternates |
1755 | 0 | .iter() |
1756 | 0 | .map(|id| format!("{:?}", id.as_usize())) |
1757 | 0 | .collect::<Vec<String>>() |
1758 | 0 | .join(", "); |
1759 | 0 | write!(f, "union({alts})") |
1760 | | } |
1761 | 0 | State::BinaryUnion { alt1, alt2 } => { |
1762 | 0 | write!( |
1763 | 0 | f, |
1764 | 0 | "binary-union({}, {})", |
1765 | 0 | alt1.as_usize(), |
1766 | 0 | alt2.as_usize() |
1767 | | ) |
1768 | | } |
1769 | 0 | State::Capture { next, pattern_id, group_index, slot } => { |
1770 | 0 | write!( |
1771 | 0 | f, |
1772 | 0 | "capture(pid={:?}, group={:?}, slot={:?}) => {:?}", |
1773 | 0 | pattern_id.as_usize(), |
1774 | 0 | group_index.as_usize(), |
1775 | 0 | slot.as_usize(), |
1776 | 0 | next.as_usize(), |
1777 | | ) |
1778 | | } |
1779 | 0 | State::Fail => write!(f, "FAIL"), |
1780 | 0 | State::Match { pattern_id } => { |
1781 | 0 | write!(f, "MATCH({:?})", pattern_id.as_usize()) |
1782 | | } |
1783 | | } |
1784 | 0 | } |
1785 | | } |
1786 | | |
1787 | | /// A sequence of transitions used to represent a sparse state. |
1788 | | /// |
1789 | | /// This is the primary representation of a [`Sparse`](State::Sparse) state. |
1790 | | /// It corresponds to a sorted sequence of transitions with non-overlapping |
1791 | | /// byte ranges. If the byte at the current position in the haystack matches |
1792 | | /// one of the byte ranges, then the finite state machine should take the |
1793 | | /// corresponding transition. |
1794 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1795 | | pub struct SparseTransitions { |
1796 | | /// The sorted sequence of non-overlapping transitions. |
1797 | | pub transitions: Box<[Transition]>, |
1798 | | } |
1799 | | |
1800 | | impl SparseTransitions { |
1801 | | /// This follows the matching transition for a particular byte. |
1802 | | /// |
1803 | | /// The matching transition is found by looking for a matching byte |
1804 | | /// range (there is at most one) corresponding to the position `at` in |
1805 | | /// `haystack`. |
1806 | | /// |
1807 | | /// If `at >= haystack.len()`, then this returns `None`. |
1808 | | #[inline] |
1809 | 178M | pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { |
1810 | 178M | haystack.get(at).and_then(|&b| self.matches_byte(b)) |
1811 | 178M | } |
1812 | | |
1813 | | /// This follows the matching transition for any member of the alphabet. |
1814 | | /// |
1815 | | /// The matching transition is found by looking for a matching byte |
1816 | | /// range (there is at most one) corresponding to the position `at` in |
1817 | | /// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), |
1818 | | /// then this always returns `None`. |
1819 | | #[inline] |
1820 | 309M | pub(crate) fn matches_unit( |
1821 | 309M | &self, |
1822 | 309M | unit: alphabet::Unit, |
1823 | 309M | ) -> Option<StateID> { |
1824 | 309M | unit.as_u8().and_then(|byte| self.matches_byte(byte)) |
1825 | 309M | } |
1826 | | |
1827 | | /// This follows the matching transition for a particular byte. |
1828 | | /// |
1829 | | /// The matching transition is found by looking for a matching byte range |
1830 | | /// (there is at most one) corresponding to the byte given. |
1831 | | #[inline] |
1832 | 485M | pub fn matches_byte(&self, byte: u8) -> Option<StateID> { |
1833 | 971M | for t in self.transitions.iter() { |
1834 | 971M | if t.start > byte { |
1835 | 12.5M | break; |
1836 | 959M | } else if t.matches_byte(byte) { |
1837 | 461M | return Some(t.next); |
1838 | 497M | } |
1839 | | } |
1840 | 24.1M | None |
1841 | | |
1842 | | /* |
1843 | | // This is an alternative implementation that uses binary search. In |
1844 | | // some ad hoc experiments, like |
1845 | | // |
1846 | | // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file |
1847 | | // |
1848 | | // I could not observe any improvement, and in fact, things seemed to |
1849 | | // be a bit slower. I can see an improvement in at least one benchmark: |
1850 | | // |
1851 | | // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 |
1852 | | // |
1853 | | // Where total search time goes from 3.2s to 2.4s when using binary |
1854 | | // search. |
1855 | | self.transitions |
1856 | | .binary_search_by(|t| { |
1857 | | if t.end < byte { |
1858 | | core::cmp::Ordering::Less |
1859 | | } else if t.start > byte { |
1860 | | core::cmp::Ordering::Greater |
1861 | | } else { |
1862 | | core::cmp::Ordering::Equal |
1863 | | } |
1864 | | }) |
1865 | | .ok() |
1866 | | .map(|i| self.transitions[i].next) |
1867 | | */ |
1868 | 485M | } |
1869 | | } |
1870 | | |
1871 | | /// A sequence of transitions used to represent a dense state. |
1872 | | /// |
1873 | | /// This is the primary representation of a [`Dense`](State::Dense) state. It |
1874 | | /// provides constant time matching. That is, given a byte in a haystack and |
1875 | | /// a `DenseTransitions`, one can determine if the state matches in constant |
1876 | | /// time. |
1877 | | /// |
1878 | | /// This is in contrast to `SparseTransitions`, whose time complexity is |
1879 | | /// necessarily bigger than constant time. Also in contrast, `DenseTransitions` |
1880 | | /// usually requires (much) more heap memory. |
1881 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1882 | | pub struct DenseTransitions { |
1883 | | /// A dense representation of this state's transitions on the heap. This |
1884 | | /// always has length 256. |
1885 | | pub transitions: Box<[StateID]>, |
1886 | | } |
1887 | | |
1888 | | impl DenseTransitions { |
1889 | | /// This follows the matching transition for a particular byte. |
1890 | | /// |
1891 | | /// The matching transition is found by looking for a transition that |
1892 | | /// doesn't correspond to `StateID::ZERO` for the byte `at` the given |
1893 | | /// position in `haystack`. |
1894 | | /// |
1895 | | /// If `at >= haystack.len()`, then this returns `None`. |
1896 | | #[inline] |
1897 | 0 | pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { |
1898 | 0 | haystack.get(at).and_then(|&b| self.matches_byte(b)) |
1899 | 0 | } |
1900 | | |
1901 | | /// This follows the matching transition for any member of the alphabet. |
1902 | | /// |
1903 | | /// The matching transition is found by looking for a transition that |
1904 | | /// doesn't correspond to `StateID::ZERO` for the given alphabet `unit`. |
1905 | | /// |
1906 | | /// If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), then |
1907 | | /// this returns `None`. |
1908 | | #[inline] |
1909 | 0 | pub(crate) fn matches_unit( |
1910 | 0 | &self, |
1911 | 0 | unit: alphabet::Unit, |
1912 | 0 | ) -> Option<StateID> { |
1913 | 0 | unit.as_u8().and_then(|byte| self.matches_byte(byte)) |
1914 | 0 | } |
1915 | | |
1916 | | /// This follows the matching transition for a particular byte. |
1917 | | /// |
1918 | | /// The matching transition is found by looking for a transition that |
1919 | | /// doesn't correspond to `StateID::ZERO` for the given `byte`. |
1920 | | #[inline] |
1921 | 0 | pub fn matches_byte(&self, byte: u8) -> Option<StateID> { |
1922 | 0 | let next = self.transitions[usize::from(byte)]; |
1923 | 0 | if next == StateID::ZERO { |
1924 | 0 | None |
1925 | | } else { |
1926 | 0 | Some(next) |
1927 | | } |
1928 | 0 | } |
1929 | | |
1930 | | /* |
1931 | | /// The dense state optimization isn't currently enabled, so permit a |
1932 | | /// little bit of dead code. |
1933 | | pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions { |
1934 | | let mut dense = vec![StateID::ZERO; 256]; |
1935 | | for t in sparse.transitions.iter() { |
1936 | | for b in t.start..=t.end { |
1937 | | dense[usize::from(b)] = t.next; |
1938 | | } |
1939 | | } |
1940 | | DenseTransitions { transitions: dense.into_boxed_slice() } |
1941 | | } |
1942 | | */ |
1943 | | |
1944 | | /// Returns an iterator over all transitions that don't point to |
1945 | | /// `StateID::ZERO`. |
1946 | 0 | pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ { |
1947 | | use crate::util::int::Usize; |
1948 | 0 | self.transitions |
1949 | 0 | .iter() |
1950 | 0 | .enumerate() |
1951 | 0 | .filter(|&(_, &sid)| sid != StateID::ZERO) |
1952 | 0 | .map(|(byte, &next)| Transition { |
1953 | 0 | start: byte.as_u8(), |
1954 | 0 | end: byte.as_u8(), |
1955 | 0 | next, |
1956 | 0 | }) |
1957 | 0 | } |
1958 | | } |
1959 | | |
1960 | | /// A single transition to another state. |
1961 | | /// |
1962 | | /// This transition may only be followed if the current byte in the haystack |
1963 | | /// falls in the inclusive range of bytes specified. |
1964 | | #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
1965 | | pub struct Transition { |
1966 | | /// The inclusive start of the byte range. |
1967 | | pub start: u8, |
1968 | | /// The inclusive end of the byte range. |
1969 | | pub end: u8, |
1970 | | /// The identifier of the state to transition to. |
1971 | | pub next: StateID, |
1972 | | } |
1973 | | |
1974 | | impl Transition { |
1975 | | /// Returns true if the position `at` in `haystack` falls in this |
1976 | | /// transition's range of bytes. |
1977 | | /// |
1978 | | /// If `at >= haystack.len()`, then this returns `false`. |
1979 | 100M | pub fn matches(&self, haystack: &[u8], at: usize) -> bool { |
1980 | 100M | haystack.get(at).map_or(false, |&b| self.matches_byte(b)) |
1981 | 100M | } |
1982 | | |
1983 | | /// Returns true if the given alphabet unit falls in this transition's |
1984 | | /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then |
1985 | | /// this returns `false`. |
1986 | 1.06G | pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { |
1987 | 1.06G | unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) |
1988 | 1.06G | } |
1989 | | |
1990 | | /// Returns true if the given byte falls in this transition's range of |
1991 | | /// bytes. |
1992 | 2.11G | pub fn matches_byte(&self, byte: u8) -> bool { |
1993 | 2.11G | self.start <= byte && byte <= self.end |
1994 | 2.11G | } |
1995 | | } |
1996 | | |
1997 | | impl fmt::Debug for Transition { |
1998 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1999 | | use crate::util::escape::DebugByte; |
2000 | | |
2001 | 0 | let Transition { start, end, next } = *self; |
2002 | 0 | if self.start == self.end { |
2003 | 0 | write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) |
2004 | | } else { |
2005 | 0 | write!( |
2006 | 0 | f, |
2007 | 0 | "{:?}-{:?} => {:?}", |
2008 | 0 | DebugByte(start), |
2009 | 0 | DebugByte(end), |
2010 | 0 | next.as_usize(), |
2011 | | ) |
2012 | | } |
2013 | 0 | } |
2014 | | } |
2015 | | |
2016 | | /// An iterator over all pattern IDs in an NFA. |
2017 | | /// |
2018 | | /// This iterator is created by [`NFA::patterns`]. |
2019 | | /// |
2020 | | /// The lifetime parameter `'a` refers to the lifetime of the NFA from which |
2021 | | /// this pattern iterator was created. |
2022 | | #[derive(Debug)] |
2023 | | pub struct PatternIter<'a> { |
2024 | | it: PatternIDIter, |
2025 | | /// We explicitly associate a lifetime with this iterator even though we |
2026 | | /// don't actually borrow anything from the NFA. We do this for backward |
2027 | | /// compatibility purposes. If we ever do need to borrow something from |
2028 | | /// the NFA, then we can and just get rid of this marker without breaking |
2029 | | /// the public API. |
2030 | | _marker: core::marker::PhantomData<&'a ()>, |
2031 | | } |
2032 | | |
2033 | | impl<'a> Iterator for PatternIter<'a> { |
2034 | | type Item = PatternID; |
2035 | | |
2036 | 182k | fn next(&mut self) -> Option<PatternID> { |
2037 | 182k | self.it.next() |
2038 | 182k | } |
2039 | | } |
2040 | | |
2041 | | #[cfg(all(test, feature = "nfa-pikevm"))] |
2042 | | mod tests { |
2043 | | use super::*; |
2044 | | use crate::{nfa::thompson::pikevm::PikeVM, Input}; |
2045 | | |
2046 | | // This asserts that an NFA state doesn't have its size changed. It is |
2047 | | // *really* easy to accidentally increase the size, and thus potentially |
2048 | | // dramatically increase the memory usage of every NFA. |
2049 | | // |
2050 | | // This assert doesn't mean we absolutely cannot increase the size of an |
2051 | | // NFA state. We can. It's just here to make sure we do it knowingly and |
2052 | | // intentionally. |
2053 | | #[test] |
2054 | | fn state_has_small_size() { |
2055 | | #[cfg(target_pointer_width = "64")] |
2056 | | assert_eq!(24, core::mem::size_of::<State>()); |
2057 | | #[cfg(target_pointer_width = "32")] |
2058 | | assert_eq!(20, core::mem::size_of::<State>()); |
2059 | | } |
2060 | | |
2061 | | #[test] |
2062 | | fn always_match() { |
2063 | | let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap(); |
2064 | | let mut cache = re.create_cache(); |
2065 | | let mut caps = re.create_captures(); |
2066 | | let mut find = |haystack, start, end| { |
2067 | | let input = Input::new(haystack).range(start..end); |
2068 | | re.search(&mut cache, &input, &mut caps); |
2069 | | caps.get_match().map(|m| m.end()) |
2070 | | }; |
2071 | | |
2072 | | assert_eq!(Some(0), find("", 0, 0)); |
2073 | | assert_eq!(Some(0), find("a", 0, 1)); |
2074 | | assert_eq!(Some(1), find("a", 1, 1)); |
2075 | | assert_eq!(Some(0), find("ab", 0, 2)); |
2076 | | assert_eq!(Some(1), find("ab", 1, 2)); |
2077 | | assert_eq!(Some(2), find("ab", 2, 2)); |
2078 | | } |
2079 | | |
2080 | | #[test] |
2081 | | fn never_match() { |
2082 | | let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap(); |
2083 | | let mut cache = re.create_cache(); |
2084 | | let mut caps = re.create_captures(); |
2085 | | let mut find = |haystack, start, end| { |
2086 | | let input = Input::new(haystack).range(start..end); |
2087 | | re.search(&mut cache, &input, &mut caps); |
2088 | | caps.get_match().map(|m| m.end()) |
2089 | | }; |
2090 | | |
2091 | | assert_eq!(None, find("", 0, 0)); |
2092 | | assert_eq!(None, find("a", 0, 1)); |
2093 | | assert_eq!(None, find("a", 1, 1)); |
2094 | | assert_eq!(None, find("ab", 0, 2)); |
2095 | | assert_eq!(None, find("ab", 1, 2)); |
2096 | | assert_eq!(None, find("ab", 2, 2)); |
2097 | | } |
2098 | | } |