/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/dfa/onepass.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | A DFA that can return spans for matching capturing groups. |
3 | | |
4 | | This module is the home of a [one-pass DFA](DFA). |
5 | | |
6 | | This module also contains a [`Builder`] and a [`Config`] for building and |
7 | | configuring a one-pass DFA. |
8 | | */ |
9 | | |
10 | | // A note on naming and credit: |
11 | | // |
12 | | // As far as I know, Russ Cox came up with the practical vision and |
13 | | // implementation of a "one-pass regex engine." He mentions and describes it |
14 | | // briefly in the third article of his regexp article series: |
15 | | // https://swtch.com/~rsc/regexp/regexp3.html |
16 | | // |
17 | | // Cox's implementation is in RE2, and the implementation below is most |
18 | | // heavily inspired by RE2's. The key thing they have in common is that |
19 | | // their transitions are defined over an alphabet of bytes. In contrast, |
20 | | // Go's regex engine also has a one-pass engine, but its transitions are |
21 | | // more firmly rooted on Unicode codepoints. The ideas are the same, but the |
22 | | // implementations are different. |
23 | | // |
24 | | // RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA." |
25 | | // They're both true in their own ways: |
26 | | // |
27 | | // * The "one-pass" criterion is generally a property of the NFA itself. In |
28 | | // particular, it is said that an NFA is one-pass if, after each byte of input |
29 | | // during a search, there is at most one "VM thread" remaining to take for the |
30 | | // next byte of input. That is, there is never any ambiguity as to the path to |
31 | | // take through the NFA during a search. |
32 | | // |
33 | | // * On the other hand, once a one-pass NFA has its representation converted |
34 | | // to something where a constant number of instructions is used for each byte |
35 | | // of input, the implementation looks a lot more like a DFA. It's technically |
36 | | // more powerful than a DFA since it has side effects (storing offsets inside |
37 | | // of slots activated by a transition), but it is far closer to a DFA than an |
38 | | // NFA simulation. |
39 | | // |
40 | | // Thus, in this crate, we call it a one-pass DFA. |
41 | | |
42 | | use alloc::{vec, vec::Vec}; |
43 | | |
44 | | use crate::{ |
45 | | dfa::{remapper::Remapper, DEAD}, |
46 | | nfa::thompson::{self, NFA}, |
47 | | util::{ |
48 | | alphabet::ByteClasses, |
49 | | captures::Captures, |
50 | | escape::DebugByte, |
51 | | int::{Usize, U32, U64, U8}, |
52 | | look::{Look, LookSet, UnicodeWordBoundaryError}, |
53 | | primitives::{NonMaxUsize, PatternID, StateID}, |
54 | | search::{Anchored, Input, Match, MatchError, MatchKind, Span}, |
55 | | sparse_set::SparseSet, |
56 | | }, |
57 | | }; |
58 | | |
59 | | /// The configuration used for building a [one-pass DFA](DFA). |
60 | | /// |
61 | | /// A one-pass DFA configuration is a simple data object that is typically used |
62 | | /// with [`Builder::configure`]. It can be cheaply cloned. |
63 | | /// |
64 | | /// A default configuration can be created either with `Config::new`, or |
65 | | /// perhaps more conveniently, with [`DFA::config`]. |
66 | | #[derive(Clone, Debug, Default)] |
67 | | pub struct Config { |
68 | | match_kind: Option<MatchKind>, |
69 | | starts_for_each_pattern: Option<bool>, |
70 | | byte_classes: Option<bool>, |
71 | | size_limit: Option<Option<usize>>, |
72 | | } |
73 | | |
74 | | impl Config { |
75 | | /// Return a new default one-pass DFA configuration. |
76 | 0 | pub fn new() -> Config { |
77 | 0 | Config::default() |
78 | 0 | } |
79 | | |
80 | | /// Set the desired match semantics. |
81 | | /// |
82 | | /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
83 | | /// match semantics of Perl-like regex engines. That is, when multiple |
84 | | /// patterns would match at the same leftmost position, the pattern that |
85 | | /// appears first in the concrete syntax is chosen. |
86 | | /// |
87 | | /// Currently, the only other kind of match semantics supported is |
88 | | /// [`MatchKind::All`]. This corresponds to "classical DFA" construction |
89 | | /// where all possible matches are visited. |
90 | | /// |
91 | | /// When it comes to the one-pass DFA, it is rarer for preference order and |
92 | | /// "longest match" to actually disagree. Since if they did disagree, then |
93 | | /// the regex typically isn't one-pass. For example, searching `Samwise` |
94 | | /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and |
95 | | /// `Samwise` for "longest match" or "all" matching. However, this regex is |
96 | | /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)` |
97 | | /// is one-pass and `Sam|Samwise` may be optimized to it. |
98 | | /// |
99 | | /// The other main difference is that "all" match semantics don't support |
100 | | /// non-greedy matches. "All" match semantics always try to match as much |
101 | | /// as possible. |
102 | 0 | pub fn match_kind(mut self, kind: MatchKind) -> Config { |
103 | 0 | self.match_kind = Some(kind); |
104 | 0 | self |
105 | 0 | } |
106 | | |
107 | | /// Whether to compile a separate start state for each pattern in the |
108 | | /// one-pass DFA. |
109 | | /// |
110 | | /// When enabled, a separate **anchored** start state is added for each |
111 | | /// pattern in the DFA. When this start state is used, then the DFA will |
112 | | /// only search for matches for the pattern specified, even if there are |
113 | | /// other patterns in the DFA. |
114 | | /// |
115 | | /// The main downside of this option is that it can potentially increase |
116 | | /// the size of the DFA and/or increase the time it takes to build the DFA. |
117 | | /// |
118 | | /// You might want to enable this option when you want to both search for |
119 | | /// anchored matches of any pattern or to search for anchored matches of |
120 | | /// one particular pattern while using the same DFA. (Otherwise, you would |
121 | | /// need to compile a new DFA for each pattern.) |
122 | | /// |
123 | | /// By default this is disabled. |
124 | | /// |
125 | | /// # Example |
126 | | /// |
127 | | /// This example shows how to build a multi-regex and then search for |
128 | | /// matches for a any of the patterns or matches for a specific pattern. |
129 | | /// |
130 | | /// ``` |
131 | | /// use regex_automata::{ |
132 | | /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, |
133 | | /// }; |
134 | | /// |
135 | | /// let re = DFA::builder() |
136 | | /// .configure(DFA::config().starts_for_each_pattern(true)) |
137 | | /// .build_many(&["[a-z]+", "[0-9]+"])?; |
138 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
139 | | /// let haystack = "123abc"; |
140 | | /// let input = Input::new(haystack).anchored(Anchored::Yes); |
141 | | /// |
142 | | /// // A normal multi-pattern search will show pattern 1 matches. |
143 | | /// re.try_search(&mut cache, &input, &mut caps)?; |
144 | | /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); |
145 | | /// |
146 | | /// // If we only want to report pattern 0 matches, then we'll get no |
147 | | /// // match here. |
148 | | /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); |
149 | | /// re.try_search(&mut cache, &input, &mut caps)?; |
150 | | /// assert_eq!(None, caps.get_match()); |
151 | | /// |
152 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
153 | | /// ``` |
154 | 0 | pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { |
155 | 0 | self.starts_for_each_pattern = Some(yes); |
156 | 0 | self |
157 | 0 | } |
158 | | |
159 | | /// Whether to attempt to shrink the size of the DFA's alphabet or not. |
160 | | /// |
161 | | /// This option is enabled by default and should never be disabled unless |
162 | | /// one is debugging a one-pass DFA. |
163 | | /// |
164 | | /// When enabled, the DFA will use a map from all possible bytes to their |
165 | | /// corresponding equivalence class. Each equivalence class represents a |
166 | | /// set of bytes that does not discriminate between a match and a non-match |
167 | | /// in the DFA. For example, the pattern `[ab]+` has at least two |
168 | | /// equivalence classes: a set containing `a` and `b` and a set containing |
169 | | /// every byte except for `a` and `b`. `a` and `b` are in the same |
170 | | /// equivalence class because they never discriminate between a match and a |
171 | | /// non-match. |
172 | | /// |
173 | | /// The advantage of this map is that the size of the transition table |
174 | | /// can be reduced drastically from (approximately) `#states * 256 * |
175 | | /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the |
176 | | /// number of equivalence classes (rounded up to the nearest power of 2). |
177 | | /// As a result, total space usage can decrease substantially. Moreover, |
178 | | /// since a smaller alphabet is used, DFA compilation becomes faster as |
179 | | /// well. |
180 | | /// |
181 | | /// **WARNING:** This is only useful for debugging DFAs. Disabling this |
182 | | /// does not yield any speed advantages. Namely, even when this is |
183 | | /// disabled, a byte class map is still used while searching. The only |
184 | | /// difference is that every byte will be forced into its own distinct |
185 | | /// equivalence class. This is useful for debugging the actual generated |
186 | | /// transitions because it lets one see the transitions defined on actual |
187 | | /// bytes instead of the equivalence classes. |
188 | 0 | pub fn byte_classes(mut self, yes: bool) -> Config { |
189 | 0 | self.byte_classes = Some(yes); |
190 | 0 | self |
191 | 0 | } |
192 | | |
193 | | /// Set a size limit on the total heap used by a one-pass DFA. |
194 | | /// |
195 | | /// This size limit is expressed in bytes and is applied during |
196 | | /// construction of a one-pass DFA. If the DFA's heap usage exceeds |
197 | | /// this configured limit, then construction is stopped and an error is |
198 | | /// returned. |
199 | | /// |
200 | | /// The default is no limit. |
201 | | /// |
202 | | /// # Example |
203 | | /// |
204 | | /// This example shows a one-pass DFA that fails to build because of |
205 | | /// a configured size limit. This particular example also serves as a |
206 | | /// cautionary tale demonstrating just how big DFAs with large Unicode |
207 | | /// character classes can get. |
208 | | /// |
209 | | /// ``` |
210 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
211 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
212 | | /// |
213 | | /// // 6MB isn't enough! |
214 | | /// DFA::builder() |
215 | | /// .configure(DFA::config().size_limit(Some(6_000_000))) |
216 | | /// .build(r"\w{20}") |
217 | | /// .unwrap_err(); |
218 | | /// |
219 | | /// // ... but 7MB probably is! |
220 | | /// // (Note that DFA sizes aren't necessarily stable between releases.) |
221 | | /// let re = DFA::builder() |
222 | | /// .configure(DFA::config().size_limit(Some(7_000_000))) |
223 | | /// .build(r"\w{20}")?; |
224 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
225 | | /// let haystack = "A".repeat(20); |
226 | | /// re.captures(&mut cache, &haystack, &mut caps); |
227 | | /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match()); |
228 | | /// |
229 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
230 | | /// ``` |
231 | | /// |
232 | | /// While one needs a little more than 3MB to represent `\w{20}`, it |
233 | | /// turns out that you only need a little more than 4KB to represent |
234 | | /// `(?-u:\w{20})`. So only use Unicode if you need it! |
235 | 0 | pub fn size_limit(mut self, limit: Option<usize>) -> Config { |
236 | 0 | self.size_limit = Some(limit); |
237 | 0 | self |
238 | 0 | } |
239 | | |
240 | | /// Returns the match semantics set in this configuration. |
241 | 0 | pub fn get_match_kind(&self) -> MatchKind { |
242 | 0 | self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
243 | 0 | } |
244 | | |
245 | | /// Returns whether this configuration has enabled anchored starting states |
246 | | /// for every pattern in the DFA. |
247 | 0 | pub fn get_starts_for_each_pattern(&self) -> bool { |
248 | 0 | self.starts_for_each_pattern.unwrap_or(false) |
249 | 0 | } |
250 | | |
251 | | /// Returns whether this configuration has enabled byte classes or not. |
252 | | /// This is typically a debugging oriented option, as disabling it confers |
253 | | /// no speed benefit. |
254 | 0 | pub fn get_byte_classes(&self) -> bool { |
255 | 0 | self.byte_classes.unwrap_or(true) |
256 | 0 | } |
257 | | |
258 | | /// Returns the DFA size limit of this configuration if one was set. |
259 | | /// The size limit is total number of bytes on the heap that a DFA is |
260 | | /// permitted to use. If the DFA exceeds this limit during construction, |
261 | | /// then construction is stopped and an error is returned. |
262 | 0 | pub fn get_size_limit(&self) -> Option<usize> { |
263 | 0 | self.size_limit.unwrap_or(None) |
264 | 0 | } |
265 | | |
266 | | /// Overwrite the default configuration such that the options in `o` are |
267 | | /// always used. If an option in `o` is not set, then the corresponding |
268 | | /// option in `self` is used. If it's not set in `self` either, then it |
269 | | /// remains not set. |
270 | 0 | pub(crate) fn overwrite(&self, o: Config) -> Config { |
271 | 0 | Config { |
272 | 0 | match_kind: o.match_kind.or(self.match_kind), |
273 | 0 | starts_for_each_pattern: o |
274 | 0 | .starts_for_each_pattern |
275 | 0 | .or(self.starts_for_each_pattern), |
276 | 0 | byte_classes: o.byte_classes.or(self.byte_classes), |
277 | 0 | size_limit: o.size_limit.or(self.size_limit), |
278 | 0 | } |
279 | 0 | } |
280 | | } |
281 | | |
282 | | /// A builder for a [one-pass DFA](DFA). |
283 | | /// |
284 | | /// This builder permits configuring options for the syntax of a pattern, the |
285 | | /// NFA construction and the DFA construction. This builder is different from a |
286 | | /// general purpose regex builder in that it permits fine grain configuration |
287 | | /// of the construction process. The trade off for this is complexity, and |
288 | | /// the possibility of setting a configuration that might not make sense. For |
289 | | /// example, there are two different UTF-8 modes: |
290 | | /// |
291 | | /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls |
292 | | /// whether the pattern itself can contain sub-expressions that match invalid |
293 | | /// UTF-8. |
294 | | /// * [`thompson::Config::utf8`] controls whether empty matches that split a |
295 | | /// Unicode codepoint are reported or not. |
296 | | /// |
297 | | /// Generally speaking, callers will want to either enable all of these or |
298 | | /// disable all of these. |
299 | | /// |
300 | | /// # Example |
301 | | /// |
302 | | /// This example shows how to disable UTF-8 mode in the syntax and the NFA. |
303 | | /// This is generally what you want for matching on arbitrary bytes. |
304 | | /// |
305 | | /// ``` |
306 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
307 | | /// use regex_automata::{ |
308 | | /// dfa::onepass::DFA, |
309 | | /// nfa::thompson, |
310 | | /// util::syntax, |
311 | | /// Match, |
312 | | /// }; |
313 | | /// |
314 | | /// let re = DFA::builder() |
315 | | /// .syntax(syntax::Config::new().utf8(false)) |
316 | | /// .thompson(thompson::Config::new().utf8(false)) |
317 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
318 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
319 | | /// |
320 | | /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; |
321 | | /// re.captures(&mut cache, haystack, &mut caps); |
322 | | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
323 | | /// // but the subsequent `.*` does not! Disabling UTF-8 |
324 | | /// // on the syntax permits this. |
325 | | /// // |
326 | | /// // N.B. This example does not show the impact of |
327 | | /// // disabling UTF-8 mode on a one-pass DFA Config, |
328 | | /// // since that only impacts regexes that can |
329 | | /// // produce matches of length 0. |
330 | | /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); |
331 | | /// |
332 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
333 | | /// ``` |
334 | | #[derive(Clone, Debug)] |
335 | | pub struct Builder { |
336 | | config: Config, |
337 | | #[cfg(feature = "syntax")] |
338 | | thompson: thompson::Compiler, |
339 | | } |
340 | | |
341 | | impl Builder { |
342 | | /// Create a new one-pass DFA builder with the default configuration. |
343 | 0 | pub fn new() -> Builder { |
344 | 0 | Builder { |
345 | 0 | config: Config::default(), |
346 | 0 | #[cfg(feature = "syntax")] |
347 | 0 | thompson: thompson::Compiler::new(), |
348 | 0 | } |
349 | 0 | } |
350 | | |
351 | | /// Build a one-pass DFA from the given pattern. |
352 | | /// |
353 | | /// If there was a problem parsing or compiling the pattern, then an error |
354 | | /// is returned. |
355 | | #[cfg(feature = "syntax")] |
356 | 0 | pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { |
357 | 0 | self.build_many(&[pattern]) |
358 | 0 | } |
359 | | |
360 | | /// Build a one-pass DFA from the given patterns. |
361 | | /// |
362 | | /// When matches are returned, the pattern ID corresponds to the index of |
363 | | /// the pattern in the slice given. |
364 | | #[cfg(feature = "syntax")] |
365 | 0 | pub fn build_many<P: AsRef<str>>( |
366 | 0 | &self, |
367 | 0 | patterns: &[P], |
368 | 0 | ) -> Result<DFA, BuildError> { |
369 | 0 | let nfa = |
370 | 0 | self.thompson.build_many(patterns).map_err(BuildError::nfa)?; |
371 | 0 | self.build_from_nfa(nfa) |
372 | 0 | } |
373 | | |
374 | | /// Build a DFA from the given NFA. |
375 | | /// |
376 | | /// # Example |
377 | | /// |
378 | | /// This example shows how to build a DFA if you already have an NFA in |
379 | | /// hand. |
380 | | /// |
381 | | /// ``` |
382 | | /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match}; |
383 | | /// |
384 | | /// // This shows how to set non-default options for building an NFA. |
385 | | /// let nfa = NFA::compiler() |
386 | | /// .configure(NFA::config().shrink(true)) |
387 | | /// .build(r"[a-z0-9]+")?; |
388 | | /// let re = DFA::builder().build_from_nfa(nfa)?; |
389 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
390 | | /// re.captures(&mut cache, "foo123bar", &mut caps); |
391 | | /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match()); |
392 | | /// |
393 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
394 | | /// ``` |
395 | 0 | pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> { |
396 | 0 | // Why take ownership if we're just going to pass a reference to the |
397 | 0 | // NFA to our internal builder? Well, the first thing to note is that |
398 | 0 | // an NFA uses reference counting internally, so either choice is going |
399 | 0 | // to be cheap. So there isn't much cost either way. |
400 | 0 | // |
401 | 0 | // The real reason is that a one-pass DFA, semantically, shares |
402 | 0 | // ownership of an NFA. This is unlike other DFAs that don't share |
403 | 0 | // ownership of an NFA at all, primarily because they want to be |
404 | 0 | // self-contained in order to support cheap (de)serialization. |
405 | 0 | // |
406 | 0 | // But then why pass a '&nfa' below if we want to share ownership? |
407 | 0 | // Well, it turns out that using a '&NFA' in our internal builder |
408 | 0 | // separates its lifetime from the DFA we're building, and this turns |
409 | 0 | // out to make code a bit more composable. e.g., We can iterate over |
410 | 0 | // things inside the NFA while borrowing the builder as mutable because |
411 | 0 | // we know the NFA cannot be mutated. So TL;DR --- this weirdness is |
412 | 0 | // "because borrow checker." |
413 | 0 | InternalBuilder::new(self.config.clone(), &nfa).build() |
414 | 0 | } |
415 | | |
416 | | /// Apply the given one-pass DFA configuration options to this builder. |
417 | 0 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
418 | 0 | self.config = self.config.overwrite(config); |
419 | 0 | self |
420 | 0 | } |
421 | | |
422 | | /// Set the syntax configuration for this builder using |
423 | | /// [`syntax::Config`](crate::util::syntax::Config). |
424 | | /// |
425 | | /// This permits setting things like case insensitivity, Unicode and multi |
426 | | /// line mode. |
427 | | /// |
428 | | /// These settings only apply when constructing a one-pass DFA directly |
429 | | /// from a pattern. |
430 | | #[cfg(feature = "syntax")] |
431 | 0 | pub fn syntax( |
432 | 0 | &mut self, |
433 | 0 | config: crate::util::syntax::Config, |
434 | 0 | ) -> &mut Builder { |
435 | 0 | self.thompson.syntax(config); |
436 | 0 | self |
437 | 0 | } |
438 | | |
439 | | /// Set the Thompson NFA configuration for this builder using |
440 | | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
441 | | /// |
442 | | /// This permits setting things like whether additional time should be |
443 | | /// spent shrinking the size of the NFA. |
444 | | /// |
445 | | /// These settings only apply when constructing a DFA directly from a |
446 | | /// pattern. |
447 | | #[cfg(feature = "syntax")] |
448 | 0 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
449 | 0 | self.thompson.configure(config); |
450 | 0 | self |
451 | 0 | } |
452 | | } |
453 | | |
454 | | /// An internal builder for encapsulating the state necessary to build a |
455 | | /// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`. |
456 | | /// |
457 | | /// There is no separate pass for determining whether the NFA is one-pass or |
458 | | /// not. We just try to build the DFA. If during construction we discover that |
459 | | /// it is not one-pass, we bail out. This is likely to lead to some undesirable |
460 | | /// expense in some cases, so it might make sense to try an identify common |
461 | | /// patterns in the NFA that make it definitively not one-pass. That way, we |
462 | | /// can avoid ever trying to build a one-pass DFA in the first place. For |
463 | | /// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by |
464 | | /// default, it's probably not a trivial cost to try and build a one-pass DFA |
465 | | /// for it and then fail. |
466 | | /// |
467 | | /// Note that some (immutable) fields are duplicated here. For example, the |
468 | | /// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing, |
469 | | /// but we duplicate them because it makes composition easier below. Otherwise, |
470 | | /// since the borrow checker can't see through method calls, the mutable borrow |
471 | | /// we use to mutate the DFA winds up preventing borrowing from any other part |
472 | | /// of the DFA, even though we aren't mutating those parts. We only do this |
473 | | /// because the duplication is cheap. |
474 | | #[derive(Debug)] |
475 | | struct InternalBuilder<'a> { |
476 | | /// The DFA we're building. |
477 | | dfa: DFA, |
478 | | /// An unordered collection of NFA state IDs that we haven't yet tried to |
479 | | /// build into a DFA state yet. |
480 | | /// |
481 | | /// This collection does not ultimately wind up including every NFA state |
482 | | /// ID. Instead, each ID represents a "start" state for a sub-graph of the |
483 | | /// NFA. The set of NFA states we then use to build a DFA state consists |
484 | | /// of that "start" state and all states reachable from it via epsilon |
485 | | /// transitions. |
486 | | uncompiled_nfa_ids: Vec<StateID>, |
487 | | /// A map from NFA state ID to DFA state ID. This is useful for easily |
488 | | /// determining whether an NFA state has been used as a "starting" point |
489 | | /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD, |
490 | | /// and since DEAD is specially added and never corresponds to any NFA |
491 | | /// state, it follows that a mapping to DEAD implies the NFA state has |
492 | | /// no corresponding DFA state yet. |
493 | | nfa_to_dfa_id: Vec<StateID>, |
494 | | /// A stack used to traverse the NFA states that make up a single DFA |
495 | | /// state. Traversal occurs until the stack is empty, and we only push to |
496 | | /// the stack when the state ID isn't in 'seen'. Actually, even more than |
497 | | /// that, if we try to push something on to this stack that is already in |
498 | | /// 'seen', then we bail out on construction completely, since it implies |
499 | | /// that the NFA is not one-pass. |
500 | | stack: Vec<(StateID, Epsilons)>, |
501 | | /// The set of NFA states that we've visited via 'stack'. |
502 | | seen: SparseSet, |
503 | | /// Whether a match NFA state has been observed while constructing a |
504 | | /// one-pass DFA state. Once a match state is seen, assuming we are using |
505 | | /// leftmost-first match semantics, then we don't add any more transitions |
506 | | /// to the DFA state we're building. |
507 | | matched: bool, |
508 | | /// The config passed to the builder. |
509 | | /// |
510 | | /// This is duplicated in dfa.config. |
511 | | config: Config, |
512 | | /// The NFA we're building a one-pass DFA from. |
513 | | /// |
514 | | /// This is duplicated in dfa.nfa. |
515 | | nfa: &'a NFA, |
516 | | /// The equivalence classes that make up the alphabet for this DFA> |
517 | | /// |
518 | | /// This is duplicated in dfa.classes. |
519 | | classes: ByteClasses, |
520 | | } |
521 | | |
522 | | impl<'a> InternalBuilder<'a> { |
523 | | /// Create a new builder with an initial empty DFA. |
524 | 0 | fn new(config: Config, nfa: &'a NFA) -> InternalBuilder<'a> { |
525 | 0 | let classes = if !config.get_byte_classes() { |
526 | | // A one-pass DFA will always use the equivalence class map, but |
527 | | // enabling this option is useful for debugging. Namely, this will |
528 | | // cause all transitions to be defined over their actual bytes |
529 | | // instead of an opaque equivalence class identifier. The former is |
530 | | // much easier to grok as a human. |
531 | 0 | ByteClasses::singletons() |
532 | | } else { |
533 | 0 | nfa.byte_classes().clone() |
534 | | }; |
535 | | // Normally a DFA alphabet includes the EOI symbol, but we don't need |
536 | | // that in the one-pass DFA since we handle look-around explicitly |
537 | | // without encoding it into the DFA. Thus, we don't need to delay |
538 | | // matches by 1 byte. However, we reuse the space that *would* be used |
539 | | // by the EOI transition by putting match information there (like which |
540 | | // pattern matches and which look-around assertions need to hold). So |
541 | | // this means our real alphabet length is 1 fewer than what the byte |
542 | | // classes report, since we don't use EOI. |
543 | 0 | let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap(); |
544 | 0 | let stride2 = classes.stride2(); |
545 | 0 | let dfa = DFA { |
546 | 0 | config: config.clone(), |
547 | 0 | nfa: nfa.clone(), |
548 | 0 | table: vec![], |
549 | 0 | starts: vec![], |
550 | 0 | // Since one-pass DFAs have a smaller state ID max than |
551 | 0 | // StateID::MAX, it follows that StateID::MAX is a valid initial |
552 | 0 | // value for min_match_id since no state ID can ever be greater |
553 | 0 | // than it. In the case of a one-pass DFA with no match states, the |
554 | 0 | // min_match_id will keep this sentinel value. |
555 | 0 | min_match_id: StateID::MAX, |
556 | 0 | classes: classes.clone(), |
557 | 0 | alphabet_len, |
558 | 0 | stride2, |
559 | 0 | pateps_offset: alphabet_len, |
560 | 0 | // OK because PatternID::MAX*2 is guaranteed not to overflow. |
561 | 0 | explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(), |
562 | 0 | }; |
563 | 0 | InternalBuilder { |
564 | 0 | dfa, |
565 | 0 | uncompiled_nfa_ids: vec![], |
566 | 0 | nfa_to_dfa_id: vec![DEAD; nfa.states().len()], |
567 | 0 | stack: vec![], |
568 | 0 | seen: SparseSet::new(nfa.states().len()), |
569 | 0 | matched: false, |
570 | 0 | config, |
571 | 0 | nfa, |
572 | 0 | classes, |
573 | 0 | } |
574 | 0 | } |
575 | | |
576 | | /// Build the DFA from the NFA given to this builder. If the NFA is not |
577 | | /// one-pass, then return an error. An error may also be returned if a |
578 | | /// particular limit is exceeded. (Some limits, like the total heap memory |
579 | | /// used, are configurable. Others, like the total patterns or slots, are |
580 | | /// hard-coded based on representational limitations.) |
581 | 0 | fn build(mut self) -> Result<DFA, BuildError> { |
582 | 0 | self.nfa.look_set_any().available().map_err(BuildError::word)?; |
583 | 0 | for look in self.nfa.look_set_any().iter() { |
584 | | // This is a future incompatibility check where if we add any |
585 | | // more look-around assertions, then the one-pass DFA either |
586 | | // needs to reject them (what we do here) or it needs to have its |
587 | | // Transition representation modified to be capable of storing the |
588 | | // new assertions. |
589 | 0 | if look.as_repr() > Look::WordUnicodeNegate.as_repr() { |
590 | 0 | return Err(BuildError::unsupported_look(look)); |
591 | 0 | } |
592 | | } |
593 | 0 | if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT |
594 | | { |
595 | 0 | return Err(BuildError::too_many_patterns( |
596 | 0 | PatternEpsilons::PATTERN_ID_LIMIT, |
597 | 0 | )); |
598 | 0 | } |
599 | 0 | if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT { |
600 | 0 | return Err(BuildError::not_one_pass( |
601 | 0 | "too many explicit capturing groups (max is 16)", |
602 | 0 | )); |
603 | 0 | } |
604 | 0 | assert_eq!(DEAD, self.add_empty_state()?); |
605 | | |
606 | | // This is where the explicit slots start. We care about this because |
607 | | // we only need to track explicit slots. The implicit slots---two for |
608 | | // each pattern---are tracked as part of the search routine itself. |
609 | 0 | let explicit_slot_start = self.nfa.pattern_len() * 2; |
610 | 0 | self.add_start_state(None, self.nfa.start_anchored())?; |
611 | 0 | if self.config.get_starts_for_each_pattern() { |
612 | 0 | for pid in self.nfa.patterns() { |
613 | 0 | self.add_start_state( |
614 | 0 | Some(pid), |
615 | 0 | self.nfa.start_pattern(pid).unwrap(), |
616 | 0 | )?; |
617 | | } |
618 | 0 | } |
619 | | // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids' |
620 | | // as a stack are. It is really an unordered *set* of NFA state IDs. |
621 | | // If it, for example, in practice led to discovering whether a regex |
622 | | // was or wasn't one-pass later than if we processed NFA state IDs in |
623 | | // ascending order, then that would make this routine more costly in |
624 | | // the somewhat common case of a regex that isn't one-pass. |
625 | 0 | while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() { |
626 | 0 | let dfa_id = self.nfa_to_dfa_id[nfa_id]; |
627 | 0 | // Once we see a match, we keep going, but don't add any new |
628 | 0 | // transitions. Normally we'd just stop, but we have to keep |
629 | 0 | // going in order to verify that our regex is actually one-pass. |
630 | 0 | self.matched = false; |
631 | 0 | // The NFA states we've already explored for this DFA state. |
632 | 0 | self.seen.clear(); |
633 | 0 | // The NFA states to explore via epsilon transitions. If we ever |
634 | 0 | // try to push an NFA state that we've already seen, then the NFA |
635 | 0 | // is not one-pass because it implies there are multiple epsilon |
636 | 0 | // transition paths that lead to the same NFA state. In other |
637 | 0 | // words, there is ambiguity. |
638 | 0 | self.stack_push(nfa_id, Epsilons::empty())?; |
639 | 0 | while let Some((id, epsilons)) = self.stack.pop() { |
640 | 0 | match *self.nfa.state(id) { |
641 | 0 | thompson::State::ByteRange { ref trans } => { |
642 | 0 | self.compile_transition(dfa_id, trans, epsilons)?; |
643 | | } |
644 | 0 | thompson::State::Sparse(ref sparse) => { |
645 | 0 | for trans in sparse.transitions.iter() { |
646 | 0 | self.compile_transition(dfa_id, trans, epsilons)?; |
647 | | } |
648 | | } |
649 | 0 | thompson::State::Dense(ref dense) => { |
650 | 0 | for trans in dense.iter() { |
651 | 0 | self.compile_transition(dfa_id, &trans, epsilons)?; |
652 | | } |
653 | | } |
654 | 0 | thompson::State::Look { look, next } => { |
655 | 0 | let looks = epsilons.looks().insert(look); |
656 | 0 | self.stack_push(next, epsilons.set_looks(looks))?; |
657 | | } |
658 | 0 | thompson::State::Union { ref alternates } => { |
659 | 0 | for &sid in alternates.iter().rev() { |
660 | 0 | self.stack_push(sid, epsilons)?; |
661 | | } |
662 | | } |
663 | 0 | thompson::State::BinaryUnion { alt1, alt2 } => { |
664 | 0 | self.stack_push(alt2, epsilons)?; |
665 | 0 | self.stack_push(alt1, epsilons)?; |
666 | | } |
667 | 0 | thompson::State::Capture { next, slot, .. } => { |
668 | 0 | let slot = slot.as_usize(); |
669 | 0 | let epsilons = if slot < explicit_slot_start { |
670 | | // If this is an implicit slot, we don't care |
671 | | // about it, since we handle implicit slots in |
672 | | // the search routine. We can get away with that |
673 | | // because there are 2 implicit slots for every |
674 | | // pattern. |
675 | 0 | epsilons |
676 | | } else { |
677 | | // Offset our explicit slots so that they start |
678 | | // at index 0. |
679 | 0 | let offset = slot - explicit_slot_start; |
680 | 0 | epsilons.set_slots(epsilons.slots().insert(offset)) |
681 | | }; |
682 | 0 | self.stack_push(next, epsilons)?; |
683 | | } |
684 | | thompson::State::Fail => { |
685 | 0 | continue; |
686 | | } |
687 | 0 | thompson::State::Match { pattern_id } => { |
688 | 0 | // If we found two different paths to a match state |
689 | 0 | // for the same DFA state, then we have ambiguity. |
690 | 0 | // Thus, it's not one-pass. |
691 | 0 | if self.matched { |
692 | 0 | return Err(BuildError::not_one_pass( |
693 | 0 | "multiple epsilon transitions to match state", |
694 | 0 | )); |
695 | 0 | } |
696 | 0 | self.matched = true; |
697 | 0 | // Shove the matching pattern ID and the 'epsilons' |
698 | 0 | // into the current DFA state's pattern epsilons. The |
699 | 0 | // 'epsilons' includes the slots we need to capture |
700 | 0 | // before reporting the match and also the conditional |
701 | 0 | // epsilon transitions we need to check before we can |
702 | 0 | // report a match. |
703 | 0 | self.dfa.set_pattern_epsilons( |
704 | 0 | dfa_id, |
705 | 0 | PatternEpsilons::empty() |
706 | 0 | .set_pattern_id(pattern_id) |
707 | 0 | .set_epsilons(epsilons), |
708 | 0 | ); |
709 | | // N.B. It is tempting to just bail out here when |
710 | | // compiling a leftmost-first DFA, since we will never |
711 | | // compile any more transitions in that case. But we |
712 | | // actually need to keep going in order to verify that |
713 | | // we actually have a one-pass regex. e.g., We might |
714 | | // see more Match states (e.g., for other patterns) |
715 | | // that imply that we don't have a one-pass regex. |
716 | | // So instead, we mark that we've found a match and |
717 | | // continue on. When we go to compile a new DFA state, |
718 | | // we just skip that part. But otherwise check that the |
719 | | // one-pass property is upheld. |
720 | | } |
721 | | } |
722 | | } |
723 | | } |
724 | 0 | self.shuffle_states(); |
725 | 0 | Ok(self.dfa) |
726 | 0 | } |
727 | | |
728 | | /// Shuffle all match states to the end of the transition table and set |
729 | | /// 'min_match_id' to the ID of the first such match state. |
730 | | /// |
731 | | /// The point of this is to make it extremely cheap to determine whether |
732 | | /// a state is a match state or not. We need to check on this on every |
733 | | /// transition during a search, so it being cheap is important. This |
734 | | /// permits us to check it by simply comparing two state identifiers, as |
735 | | /// opposed to looking for the pattern ID in the state's `PatternEpsilons`. |
736 | | /// (Which requires a memory load and some light arithmetic.) |
737 | 0 | fn shuffle_states(&mut self) { |
738 | 0 | let mut remapper = Remapper::new(&self.dfa); |
739 | 0 | let mut next_dest = self.dfa.last_state_id(); |
740 | 0 | for i in (0..self.dfa.state_len()).rev() { |
741 | 0 | let id = StateID::must(i); |
742 | 0 | let is_match = |
743 | 0 | self.dfa.pattern_epsilons(id).pattern_id().is_some(); |
744 | 0 | if !is_match { |
745 | 0 | continue; |
746 | 0 | } |
747 | 0 | remapper.swap(&mut self.dfa, next_dest, id); |
748 | 0 | self.dfa.min_match_id = next_dest; |
749 | 0 | next_dest = self.dfa.prev_state_id(next_dest).expect( |
750 | 0 | "match states should be a proper subset of all states", |
751 | 0 | ); |
752 | | } |
753 | 0 | remapper.remap(&mut self.dfa); |
754 | 0 | } |
755 | | |
756 | | /// Compile the given NFA transition into the DFA state given. |
757 | | /// |
758 | | /// 'Epsilons' corresponds to any conditional epsilon transitions that need |
759 | | /// to be satisfied to follow this transition, and any slots that need to |
760 | | /// be saved if the transition is followed. |
761 | | /// |
762 | | /// If this transition indicates that the NFA is not one-pass, then |
763 | | /// this returns an error. (This occurs, for example, if the DFA state |
764 | | /// already has a transition defined for the same input symbols as the |
765 | | /// given transition, *and* the result of the old and new transitions is |
766 | | /// different.) |
767 | 0 | fn compile_transition( |
768 | 0 | &mut self, |
769 | 0 | dfa_id: StateID, |
770 | 0 | trans: &thompson::Transition, |
771 | 0 | epsilons: Epsilons, |
772 | 0 | ) -> Result<(), BuildError> { |
773 | 0 | let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?; |
774 | 0 | for byte in self |
775 | 0 | .classes |
776 | 0 | .representatives(trans.start..=trans.end) |
777 | 0 | .filter_map(|r| r.as_u8()) |
778 | | { |
779 | 0 | let oldtrans = self.dfa.transition(dfa_id, byte); |
780 | 0 | let newtrans = |
781 | 0 | Transition::new(self.matched, next_dfa_id, epsilons); |
782 | 0 | // If the old transition points to the DEAD state, then we know |
783 | 0 | // 'byte' has not been mapped to any transition for this DFA state |
784 | 0 | // yet. So set it unconditionally. Otherwise, we require that the |
785 | 0 | // old and new transitions are equivalent. Otherwise, there is |
786 | 0 | // ambiguity and thus the regex is not one-pass. |
787 | 0 | if oldtrans.state_id() == DEAD { |
788 | 0 | self.dfa.set_transition(dfa_id, byte, newtrans); |
789 | 0 | } else if oldtrans != newtrans { |
790 | 0 | return Err(BuildError::not_one_pass( |
791 | 0 | "conflicting transition", |
792 | 0 | )); |
793 | 0 | } |
794 | | } |
795 | 0 | Ok(()) |
796 | 0 | } |
797 | | |
798 | | /// Add a start state to the DFA corresponding to the given NFA starting |
799 | | /// state ID. |
800 | | /// |
801 | | /// If adding a state would blow any limits (configured or hard-coded), |
802 | | /// then an error is returned. |
803 | | /// |
804 | | /// If the starting state is an anchored state for a particular pattern, |
805 | | /// then callers must provide the pattern ID for that starting state. |
806 | | /// Callers must also ensure that the first starting state added is the |
807 | | /// start state for all patterns, and then each anchored starting state for |
808 | | /// each pattern (if necessary) added in order. Otherwise, this panics. |
809 | 0 | fn add_start_state( |
810 | 0 | &mut self, |
811 | 0 | pid: Option<PatternID>, |
812 | 0 | nfa_id: StateID, |
813 | 0 | ) -> Result<StateID, BuildError> { |
814 | 0 | match pid { |
815 | | // With no pid, this should be the start state for all patterns |
816 | | // and thus be the first one. |
817 | 0 | None => assert!(self.dfa.starts.is_empty()), |
818 | | // With a pid, we want it to be at self.dfa.starts[pid+1]. |
819 | 0 | Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()), |
820 | | } |
821 | 0 | let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?; |
822 | 0 | self.dfa.starts.push(dfa_id); |
823 | 0 | Ok(dfa_id) |
824 | 0 | } |
825 | | |
826 | | /// Add a new DFA state corresponding to the given NFA state. If adding a |
827 | | /// state would blow any limits (configured or hard-coded), then an error |
828 | | /// is returned. If a DFA state already exists for the given NFA state, |
829 | | /// then that DFA state's ID is returned and no new states are added. |
830 | | /// |
831 | | /// It is not expected that this routine is called for every NFA state. |
832 | | /// Instead, an NFA state ID will usually correspond to the "start" state |
833 | | /// for a sub-graph of the NFA, where all states in the sub-graph are |
834 | | /// reachable via epsilon transitions (conditional or unconditional). That |
835 | | /// sub-graph of NFA states is ultimately what produces a single DFA state. |
836 | 0 | fn add_dfa_state_for_nfa_state( |
837 | 0 | &mut self, |
838 | 0 | nfa_id: StateID, |
839 | 0 | ) -> Result<StateID, BuildError> { |
840 | 0 | // If we've already built a DFA state for the given NFA state, then |
841 | 0 | // just return that. We definitely do not want to have more than one |
842 | 0 | // DFA state in existence for the same NFA state, since all but one of |
843 | 0 | // them will likely become unreachable. And at least some of them are |
844 | 0 | // likely to wind up being incomplete. |
845 | 0 | let existing_dfa_id = self.nfa_to_dfa_id[nfa_id]; |
846 | 0 | if existing_dfa_id != DEAD { |
847 | 0 | return Ok(existing_dfa_id); |
848 | 0 | } |
849 | | // If we don't have any DFA state yet, add it and then add the given |
850 | | // NFA state to the list of states to explore. |
851 | 0 | let dfa_id = self.add_empty_state()?; |
852 | 0 | self.nfa_to_dfa_id[nfa_id] = dfa_id; |
853 | 0 | self.uncompiled_nfa_ids.push(nfa_id); |
854 | 0 | Ok(dfa_id) |
855 | 0 | } |
856 | | |
857 | | /// Unconditionally add a new empty DFA state. If adding it would exceed |
858 | | /// any limits (configured or hard-coded), then an error is returned. The |
859 | | /// ID of the new state is returned on success. |
860 | | /// |
861 | | /// The added state is *not* a match state. |
862 | 0 | fn add_empty_state(&mut self) -> Result<StateID, BuildError> { |
863 | 0 | let state_limit = Transition::STATE_ID_LIMIT; |
864 | 0 | // Note that unlike dense and lazy DFAs, we specifically do NOT |
865 | 0 | // premultiply our state IDs here. The reason is that we want to pack |
866 | 0 | // our state IDs into 64-bit transitions with other info, so the fewer |
867 | 0 | // the bits we use for state IDs the better. If we premultiply, then |
868 | 0 | // our state ID space shrinks. We justify this by the assumption that |
869 | 0 | // a one-pass DFA is just already doing a fair bit more work than a |
870 | 0 | // normal DFA anyway, so an extra multiplication to compute a state |
871 | 0 | // transition doesn't seem like a huge deal. |
872 | 0 | let next_id = self.dfa.table.len() >> self.dfa.stride2(); |
873 | 0 | let id = StateID::new(next_id) |
874 | 0 | .map_err(|_| BuildError::too_many_states(state_limit))?; |
875 | 0 | if id.as_u64() > Transition::STATE_ID_LIMIT { |
876 | 0 | return Err(BuildError::too_many_states(state_limit)); |
877 | 0 | } |
878 | 0 | self.dfa |
879 | 0 | .table |
880 | 0 | .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride())); |
881 | 0 | // The default empty value for 'PatternEpsilons' is sadly not all |
882 | 0 | // zeroes. Instead, a special sentinel is used to indicate that there |
883 | 0 | // is no pattern. So we need to explicitly set the pattern epsilons to |
884 | 0 | // the correct "empty" PatternEpsilons. |
885 | 0 | self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty()); |
886 | 0 | if let Some(size_limit) = self.config.get_size_limit() { |
887 | 0 | if self.dfa.memory_usage() > size_limit { |
888 | 0 | return Err(BuildError::exceeded_size_limit(size_limit)); |
889 | 0 | } |
890 | 0 | } |
891 | 0 | Ok(id) |
892 | 0 | } |
893 | | |
894 | | /// Push the given NFA state ID and its corresponding epsilons (slots and |
895 | | /// conditional epsilon transitions) on to a stack for use in a depth first |
896 | | /// traversal of a sub-graph of the NFA. |
897 | | /// |
898 | | /// If the given NFA state ID has already been pushed on to the stack, then |
899 | | /// it indicates the regex is not one-pass and this correspondingly returns |
900 | | /// an error. |
901 | 0 | fn stack_push( |
902 | 0 | &mut self, |
903 | 0 | nfa_id: StateID, |
904 | 0 | epsilons: Epsilons, |
905 | 0 | ) -> Result<(), BuildError> { |
906 | 0 | // If we already have seen a match and we are compiling a leftmost |
907 | 0 | // first DFA, then we shouldn't add any more states to look at. This is |
908 | 0 | // effectively how preference order and non-greediness is implemented. |
909 | 0 | // if !self.config.get_match_kind().continue_past_first_match() |
910 | 0 | // && self.matched |
911 | 0 | // { |
912 | 0 | // return Ok(()); |
913 | 0 | // } |
914 | 0 | if !self.seen.insert(nfa_id) { |
915 | 0 | return Err(BuildError::not_one_pass( |
916 | 0 | "multiple epsilon transitions to same state", |
917 | 0 | )); |
918 | 0 | } |
919 | 0 | self.stack.push((nfa_id, epsilons)); |
920 | 0 | Ok(()) |
921 | 0 | } |
922 | | } |
923 | | |
924 | | /// A one-pass DFA for executing a subset of anchored regex searches while |
925 | | /// resolving capturing groups. |
926 | | /// |
927 | | /// A one-pass DFA can be built from an NFA that is one-pass. An NFA is |
928 | | /// one-pass when there is never any ambiguity about how to continue a search. |
929 | | /// For example, `a*a` is not one-pass becuase during a search, it's not |
930 | | /// possible to know whether to continue matching the `a*` or to move on to |
931 | | /// the single `a`. However, `a*b` is one-pass, because for every byte in the |
932 | | /// input, it's always clear when to move on from `a*` to `b`. |
933 | | /// |
934 | | /// # Only anchored searches are supported |
935 | | /// |
936 | | /// In this crate, especially for DFAs, unanchored searches are implemented by |
937 | | /// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix |
938 | | /// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will |
939 | | /// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches |
940 | | /// any byte, and there is therefore ambiguity as to when the prefix should |
941 | | /// stop matching and something else should start matching. |
942 | | /// |
943 | | /// Therefore, one-pass DFAs do not support unanchored searches. In addition |
944 | | /// to many regexes simply not being one-pass, it implies that one-pass DFAs |
945 | | /// have limited utility. With that said, when a one-pass DFA can be used, it |
946 | | /// can potentially provide a dramatic speed up over alternatives like the |
947 | | /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) |
948 | | /// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular, |
949 | | /// a one-pass DFA is the only DFA capable of reporting the spans of matching |
950 | | /// capturing groups. |
951 | | /// |
952 | | /// To clarify, when we say that unanchored searches are not supported, what |
953 | | /// that actually means is: |
954 | | /// |
955 | | /// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always |
956 | | /// do anchored searches. |
957 | | /// * Since iterators are most useful in the context of unanchored searches, |
958 | | /// there is no `DFA::captures_iter` method. |
959 | | /// * For lower level routines like [`DFA::try_search`], an error will be |
960 | | /// returned if the given [`Input`] is configured to do an unanchored search or |
961 | | /// search for an invalid pattern ID. (Note that an [`Input`] is configured to |
962 | | /// do an unanchored search by default, so just giving a `Input::new` is |
963 | | /// guaranteed to return an error.) |
964 | | /// |
965 | | /// # Other limitations |
966 | | /// |
967 | | /// In addition to the [configurable heap limit](Config::size_limit) and |
968 | | /// the requirement that a regex pattern be one-pass, there are some other |
969 | | /// limitations: |
970 | | /// |
971 | | /// * There is an internal limit on the total number of explicit capturing |
972 | | /// groups that appear across all patterns. It is somewhat small and there is |
973 | | /// no way to configure it. If your pattern(s) exceed this limit, then building |
974 | | /// a one-pass DFA will fail. |
975 | | /// * If the number of patterns exceeds an internal unconfigurable limit, then |
976 | | /// building a one-pass DFA will fail. This limit is quite large and you're |
977 | | /// unlikely to hit it. |
978 | | /// * If the total number of states exceeds an internal unconfigurable limit, |
979 | | /// then building a one-pass DFA will fail. This limit is quite large and |
980 | | /// you're unlikely to hit it. |
981 | | /// |
982 | | /// # Other examples of regexes that aren't one-pass |
983 | | /// |
984 | | /// One particularly unfortunate example is that enabling Unicode can cause |
985 | | /// regexes that were one-pass to no longer be one-pass. Consider the regex |
986 | | /// `(?-u)\w*\s` for example. It is one-pass because there is exactly no |
987 | | /// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s` |
988 | | /// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get |
989 | | /// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s` |
990 | | /// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the |
991 | | /// overlap between UTF-8 automata, the use of Unicode character classes will |
992 | | /// tend to vastly increase the likelihood of a regex not being one-pass. |
993 | | /// |
994 | | /// # How does one know if a regex is one-pass or not? |
995 | | /// |
996 | | /// At the time of writing, the only way to know is to try and build a one-pass |
997 | | /// DFA. The one-pass property is checked while constructing the DFA. |
998 | | /// |
999 | | /// This does mean that you might potentially waste some CPU cycles and memory |
1000 | | /// by optimistically trying to build a one-pass DFA. But this is currently the |
1001 | | /// only way. In the future, building a one-pass DFA might be able to use some |
1002 | | /// heuristics to detect common violations of the one-pass property and bail |
1003 | | /// more quickly. |
1004 | | /// |
1005 | | /// # Resource usage |
1006 | | /// |
1007 | | /// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource |
1008 | | /// usage. Namely, construction of a one-pass DFA has a time and space |
1009 | | /// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time |
1010 | | /// and space complexity is `O(2^n)`.) This smaller time bound is achieved |
1011 | | /// because there is at most one DFA state created for each NFA state. If |
1012 | | /// additional DFA states would be required, then the pattern is not one-pass |
1013 | | /// and construction will fail. |
1014 | | /// |
1015 | | /// Note though that currently, this DFA uses a fully dense representation. |
1016 | | /// This means that while its space complexity is no worse than an NFA, it may |
1017 | | /// in practice use more memory because of higher constant factors. The reason |
1018 | | /// for this trade off is two-fold. Firstly, a dense representation makes the |
1019 | | /// search faster. Secondly, the bigger an NFA, the more unlikely it is to be |
1020 | | /// one-pass. Therefore, most one-pass DFAs are usually pretty small. |
1021 | | /// |
1022 | | /// # Example |
1023 | | /// |
1024 | | /// This example shows that the one-pass DFA implements Unicode word boundaries |
1025 | | /// correctly while simultaneously reporting spans for capturing groups that |
1026 | | /// participate in a match. (This is the only DFA that implements full support |
1027 | | /// for Unicode word boundaries.) |
1028 | | /// |
1029 | | /// ``` |
1030 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1031 | | /// use regex_automata::{dfa::onepass::DFA, Match, Span}; |
1032 | | /// |
1033 | | /// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?; |
1034 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1035 | | /// |
1036 | | /// re.captures(&mut cache, "Шерлок Холмс", &mut caps); |
1037 | | /// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match()); |
1038 | | /// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first")); |
1039 | | /// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last")); |
1040 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1041 | | /// ``` |
1042 | | /// |
1043 | | /// # Example: iteration |
1044 | | /// |
1045 | | /// Unlike other regex engines in this crate, this one does not provide |
1046 | | /// iterator search functions. This is because a one-pass DFA only supports |
1047 | | /// anchored searches, and so iterator functions are generally not applicable. |
1048 | | /// |
1049 | | /// However, if you know that all of your matches are |
1050 | | /// directly adjacent, then an iterator can be used. The |
1051 | | /// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for |
1052 | | /// this purpose: |
1053 | | /// |
1054 | | /// ``` |
1055 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1056 | | /// use regex_automata::{ |
1057 | | /// dfa::onepass::DFA, |
1058 | | /// util::iter::Searcher, |
1059 | | /// Anchored, Input, Span, |
1060 | | /// }; |
1061 | | /// |
1062 | | /// let re = DFA::new(r"\w(\d)\w")?; |
1063 | | /// let (mut cache, caps) = (re.create_cache(), re.create_captures()); |
1064 | | /// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes); |
1065 | | /// |
1066 | | /// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| { |
1067 | | /// Ok(re.try_search(&mut cache, input, caps)?) |
1068 | | /// }).infallible(); |
1069 | | /// let caps0 = it.next().unwrap(); |
1070 | | /// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1)); |
1071 | | /// |
1072 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1073 | | /// ``` |
1074 | | #[derive(Clone)] |
1075 | | pub struct DFA { |
1076 | | /// The configuration provided by the caller. |
1077 | | config: Config, |
1078 | | /// The NFA used to build this DFA. |
1079 | | /// |
1080 | | /// NOTE: We probably don't need to store the NFA here, but we use enough |
1081 | | /// bits from it that it's convenient to do so. And there really isn't much |
1082 | | /// cost to doing so either, since an NFA is reference counted internally. |
1083 | | nfa: NFA, |
1084 | | /// The transition table. Given a state ID 's' and a byte of haystack 'b', |
1085 | | /// the next state is `table[sid + classes[byte]]`. |
1086 | | /// |
1087 | | /// The stride of this table (i.e., the number of columns) is always |
1088 | | /// a power of 2, even if the alphabet length is smaller. This makes |
1089 | | /// converting between state IDs and state indices very cheap. |
1090 | | /// |
1091 | | /// Note that the stride always includes room for one extra "transition" |
1092 | | /// that isn't actually a transition. It is a 'PatternEpsilons' that is |
1093 | | /// used for match states only. Because of this, the maximum number of |
1094 | | /// active columns in the transition table is 257, which means the maximum |
1095 | | /// stride is 512 (the next power of 2 greater than or equal to 257). |
1096 | | table: Vec<Transition>, |
1097 | | /// The DFA state IDs of the starting states. |
1098 | | /// |
1099 | | /// `starts[0]` is always present and corresponds to the starting state |
1100 | | /// when searching for matches of any pattern in the DFA. |
1101 | | /// |
1102 | | /// `starts[i]` where i>0 corresponds to the starting state for the pattern |
1103 | | /// ID 'i-1'. These starting states are optional. |
1104 | | starts: Vec<StateID>, |
1105 | | /// Every state ID >= this value corresponds to a match state. |
1106 | | /// |
1107 | | /// This is what a search uses to detect whether a state is a match state |
1108 | | /// or not. It requires only a simple comparison instead of bit-unpacking |
1109 | | /// the PatternEpsilons from every state. |
1110 | | min_match_id: StateID, |
1111 | | /// The alphabet of this DFA, split into equivalence classes. Bytes in the |
1112 | | /// same equivalence class can never discriminate between a match and a |
1113 | | /// non-match. |
1114 | | classes: ByteClasses, |
1115 | | /// The number of elements in each state in the transition table. This may |
1116 | | /// be less than the stride, since the stride is always a power of 2 and |
1117 | | /// the alphabet length can be anything up to and including 256. |
1118 | | alphabet_len: usize, |
1119 | | /// The number of columns in the transition table, expressed as a power of |
1120 | | /// 2. |
1121 | | stride2: usize, |
1122 | | /// The offset at which the PatternEpsilons for a match state is stored in |
1123 | | /// the transition table. |
1124 | | /// |
1125 | | /// PERF: One wonders whether it would be better to put this in a separate |
1126 | | /// allocation, since only match states have a non-empty PatternEpsilons |
1127 | | /// and the number of match states tends be dwarfed by the number of |
1128 | | /// non-match states. So this would save '8*len(non_match_states)' for each |
1129 | | /// DFA. The question is whether moving this to a different allocation will |
1130 | | /// lead to a perf hit during searches. You might think dealing with match |
1131 | | /// states is rare, but some regexes spend a lot of time in match states |
1132 | | /// gobbling up input. But... match state handling is already somewhat |
1133 | | /// expensive, so maybe this wouldn't do much? Either way, it's worth |
1134 | | /// experimenting. |
1135 | | pateps_offset: usize, |
1136 | | /// The first explicit slot index. This refers to the first slot appearing |
1137 | | /// immediately after the last implicit slot. It is always 'patterns.len() |
1138 | | /// * 2'. |
1139 | | /// |
1140 | | /// We record this because we only store the explicit slots in our DFA |
1141 | | /// transition table that need to be saved. Implicit slots are handled |
1142 | | /// automatically as part of the search. |
1143 | | explicit_slot_start: usize, |
1144 | | } |
1145 | | |
1146 | | impl DFA { |
1147 | | /// Parse the given regular expression using the default configuration and |
1148 | | /// return the corresponding one-pass DFA. |
1149 | | /// |
1150 | | /// If you want a non-default configuration, then use the [`Builder`] to |
1151 | | /// set your own configuration. |
1152 | | /// |
1153 | | /// # Example |
1154 | | /// |
1155 | | /// ``` |
1156 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
1157 | | /// |
1158 | | /// let re = DFA::new("foo[0-9]+bar")?; |
1159 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1160 | | /// |
1161 | | /// re.captures(&mut cache, "foo12345barzzz", &mut caps); |
1162 | | /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match()); |
1163 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1164 | | /// ``` |
1165 | | #[cfg(feature = "syntax")] |
1166 | | #[inline] |
1167 | 0 | pub fn new(pattern: &str) -> Result<DFA, BuildError> { |
1168 | 0 | DFA::builder().build(pattern) |
1169 | 0 | } |
1170 | | |
1171 | | /// Like `new`, but parses multiple patterns into a single "multi regex." |
1172 | | /// This similarly uses the default regex configuration. |
1173 | | /// |
1174 | | /// # Example |
1175 | | /// |
1176 | | /// ``` |
1177 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
1178 | | /// |
1179 | | /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?; |
1180 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1181 | | /// |
1182 | | /// re.captures(&mut cache, "abc123", &mut caps); |
1183 | | /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match()); |
1184 | | /// |
1185 | | /// re.captures(&mut cache, "123abc", &mut caps); |
1186 | | /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); |
1187 | | /// |
1188 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1189 | | /// ``` |
1190 | | #[cfg(feature = "syntax")] |
1191 | | #[inline] |
1192 | 0 | pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { |
1193 | 0 | DFA::builder().build_many(patterns) |
1194 | 0 | } |
1195 | | |
1196 | | /// Like `new`, but builds a one-pass DFA directly from an NFA. This is |
1197 | | /// useful if you already have an NFA, or even if you hand-assembled the |
1198 | | /// NFA. |
1199 | | /// |
1200 | | /// # Example |
1201 | | /// |
1202 | | /// This shows how to hand assemble a regular expression via its HIR, |
1203 | | /// compile an NFA from it and build a one-pass DFA from the NFA. |
1204 | | /// |
1205 | | /// ``` |
1206 | | /// use regex_automata::{ |
1207 | | /// dfa::onepass::DFA, |
1208 | | /// nfa::thompson::NFA, |
1209 | | /// Match, |
1210 | | /// }; |
1211 | | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
1212 | | /// |
1213 | | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
1214 | | /// ClassBytesRange::new(b'0', b'9'), |
1215 | | /// ClassBytesRange::new(b'A', b'Z'), |
1216 | | /// ClassBytesRange::new(b'_', b'_'), |
1217 | | /// ClassBytesRange::new(b'a', b'z'), |
1218 | | /// ]))); |
1219 | | /// |
1220 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
1221 | | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
1222 | | /// |
1223 | | /// let re = DFA::new_from_nfa(nfa)?; |
1224 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1225 | | /// let expected = Some(Match::must(0, 0..1)); |
1226 | | /// re.captures(&mut cache, "A", &mut caps); |
1227 | | /// assert_eq!(expected, caps.get_match()); |
1228 | | /// |
1229 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1230 | | /// ``` |
1231 | 0 | pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> { |
1232 | 0 | DFA::builder().build_from_nfa(nfa) |
1233 | 0 | } |
1234 | | |
1235 | | /// Create a new one-pass DFA that matches every input. |
1236 | | /// |
1237 | | /// # Example |
1238 | | /// |
1239 | | /// ``` |
1240 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
1241 | | /// |
1242 | | /// let dfa = DFA::always_match()?; |
1243 | | /// let mut cache = dfa.create_cache(); |
1244 | | /// let mut caps = dfa.create_captures(); |
1245 | | /// |
1246 | | /// let expected = Match::must(0, 0..0); |
1247 | | /// dfa.captures(&mut cache, "", &mut caps); |
1248 | | /// assert_eq!(Some(expected), caps.get_match()); |
1249 | | /// dfa.captures(&mut cache, "foo", &mut caps); |
1250 | | /// assert_eq!(Some(expected), caps.get_match()); |
1251 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1252 | | /// ``` |
1253 | 0 | pub fn always_match() -> Result<DFA, BuildError> { |
1254 | 0 | let nfa = thompson::NFA::always_match(); |
1255 | 0 | Builder::new().build_from_nfa(nfa) |
1256 | 0 | } |
1257 | | |
1258 | | /// Create a new one-pass DFA that never matches any input. |
1259 | | /// |
1260 | | /// # Example |
1261 | | /// |
1262 | | /// ``` |
1263 | | /// use regex_automata::dfa::onepass::DFA; |
1264 | | /// |
1265 | | /// let dfa = DFA::never_match()?; |
1266 | | /// let mut cache = dfa.create_cache(); |
1267 | | /// let mut caps = dfa.create_captures(); |
1268 | | /// |
1269 | | /// dfa.captures(&mut cache, "", &mut caps); |
1270 | | /// assert_eq!(None, caps.get_match()); |
1271 | | /// dfa.captures(&mut cache, "foo", &mut caps); |
1272 | | /// assert_eq!(None, caps.get_match()); |
1273 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1274 | | /// ``` |
1275 | 0 | pub fn never_match() -> Result<DFA, BuildError> { |
1276 | 0 | let nfa = thompson::NFA::never_match(); |
1277 | 0 | Builder::new().build_from_nfa(nfa) |
1278 | 0 | } |
1279 | | |
1280 | | /// Return a default configuration for a DFA. |
1281 | | /// |
1282 | | /// This is a convenience routine to avoid needing to import the `Config` |
1283 | | /// type when customizing the construction of a DFA. |
1284 | | /// |
1285 | | /// # Example |
1286 | | /// |
1287 | | /// This example shows how to change the match semantics of this DFA from |
1288 | | /// its default "leftmost first" to "all." When using "all," non-greediness |
1289 | | /// doesn't apply and neither does preference order matching. Instead, the |
1290 | | /// longest match possible is always returned. (Although, by construction, |
1291 | | /// it's impossible for a one-pass DFA to have a different answer for |
1292 | | /// "preference order" vs "longest match.") |
1293 | | /// |
1294 | | /// ``` |
1295 | | /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind}; |
1296 | | /// |
1297 | | /// let re = DFA::builder() |
1298 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1299 | | /// .build(r"(abc)+?")?; |
1300 | | /// let mut cache = re.create_cache(); |
1301 | | /// let mut caps = re.create_captures(); |
1302 | | /// |
1303 | | /// re.captures(&mut cache, "abcabc", &mut caps); |
1304 | | /// // Normally, the non-greedy repetition would give us a 0..3 match. |
1305 | | /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); |
1306 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1307 | | /// ``` |
1308 | | #[inline] |
1309 | 0 | pub fn config() -> Config { |
1310 | 0 | Config::new() |
1311 | 0 | } |
1312 | | |
1313 | | /// Return a builder for configuring the construction of a DFA. |
1314 | | /// |
1315 | | /// This is a convenience routine to avoid needing to import the |
1316 | | /// [`Builder`] type in common cases. |
1317 | | /// |
1318 | | /// # Example |
1319 | | /// |
1320 | | /// This example shows how to use the builder to disable UTF-8 mode. |
1321 | | /// |
1322 | | /// ``` |
1323 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1324 | | /// use regex_automata::{ |
1325 | | /// dfa::onepass::DFA, |
1326 | | /// nfa::thompson, |
1327 | | /// util::syntax, |
1328 | | /// Match, |
1329 | | /// }; |
1330 | | /// |
1331 | | /// let re = DFA::builder() |
1332 | | /// .syntax(syntax::Config::new().utf8(false)) |
1333 | | /// .thompson(thompson::Config::new().utf8(false)) |
1334 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
1335 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1336 | | /// |
1337 | | /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; |
1338 | | /// let expected = Some(Match::must(0, 0..8)); |
1339 | | /// re.captures(&mut cache, haystack, &mut caps); |
1340 | | /// assert_eq!(expected, caps.get_match()); |
1341 | | /// |
1342 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1343 | | /// ``` |
1344 | | #[inline] |
1345 | 0 | pub fn builder() -> Builder { |
1346 | 0 | Builder::new() |
1347 | 0 | } |
1348 | | |
1349 | | /// Create a new empty set of capturing groups that is guaranteed to be |
1350 | | /// valid for the search APIs on this DFA. |
1351 | | /// |
1352 | | /// A `Captures` value created for a specific DFA cannot be used with any |
1353 | | /// other DFA. |
1354 | | /// |
1355 | | /// This is a convenience function for [`Captures::all`]. See the |
1356 | | /// [`Captures`] documentation for an explanation of its alternative |
1357 | | /// constructors that permit the DFA to do less work during a search, and |
1358 | | /// thus might make it faster. |
1359 | | #[inline] |
1360 | 0 | pub fn create_captures(&self) -> Captures { |
1361 | 0 | Captures::all(self.nfa.group_info().clone()) |
1362 | 0 | } |
1363 | | |
1364 | | /// Create a new cache for this DFA. |
1365 | | /// |
1366 | | /// The cache returned should only be used for searches for this |
1367 | | /// DFA. If you want to reuse the cache for another DFA, then you |
1368 | | /// must call [`Cache::reset`] with that DFA (or, equivalently, |
1369 | | /// [`DFA::reset_cache`]). |
1370 | | #[inline] |
1371 | 0 | pub fn create_cache(&self) -> Cache { |
1372 | 0 | Cache::new(self) |
1373 | 0 | } |
1374 | | |
1375 | | /// Reset the given cache such that it can be used for searching with the |
1376 | | /// this DFA (and only this DFA). |
1377 | | /// |
1378 | | /// A cache reset permits reusing memory already allocated in this cache |
1379 | | /// with a different DFA. |
1380 | | /// |
1381 | | /// # Example |
1382 | | /// |
1383 | | /// This shows how to re-purpose a cache for use with a different DFA. |
1384 | | /// |
1385 | | /// ``` |
1386 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1387 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
1388 | | /// |
1389 | | /// let re1 = DFA::new(r"\w")?; |
1390 | | /// let re2 = DFA::new(r"\W")?; |
1391 | | /// let mut caps1 = re1.create_captures(); |
1392 | | /// let mut caps2 = re2.create_captures(); |
1393 | | /// |
1394 | | /// let mut cache = re1.create_cache(); |
1395 | | /// assert_eq!( |
1396 | | /// Some(Match::must(0, 0..2)), |
1397 | | /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() }, |
1398 | | /// ); |
1399 | | /// |
1400 | | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
1401 | | /// // incorrect results. In order to re-purpose the cache, we must reset |
1402 | | /// // it with the one-pass DFA we'd like to use it with. |
1403 | | /// // |
1404 | | /// // Similarly, after this reset, using the cache with 're1' is also not |
1405 | | /// // allowed. |
1406 | | /// re2.reset_cache(&mut cache); |
1407 | | /// assert_eq!( |
1408 | | /// Some(Match::must(0, 0..3)), |
1409 | | /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() }, |
1410 | | /// ); |
1411 | | /// |
1412 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1413 | | /// ``` |
1414 | | #[inline] |
1415 | 0 | pub fn reset_cache(&self, cache: &mut Cache) { |
1416 | 0 | cache.reset(self); |
1417 | 0 | } |
1418 | | |
1419 | | /// Return the config for this one-pass DFA. |
1420 | | #[inline] |
1421 | 0 | pub fn get_config(&self) -> &Config { |
1422 | 0 | &self.config |
1423 | 0 | } |
1424 | | |
1425 | | /// Returns a reference to the underlying NFA. |
1426 | | #[inline] |
1427 | 0 | pub fn get_nfa(&self) -> &NFA { |
1428 | 0 | &self.nfa |
1429 | 0 | } |
1430 | | |
1431 | | /// Returns the total number of patterns compiled into this DFA. |
1432 | | /// |
1433 | | /// In the case of a DFA that contains no patterns, this returns `0`. |
1434 | | #[inline] |
1435 | 0 | pub fn pattern_len(&self) -> usize { |
1436 | 0 | self.get_nfa().pattern_len() |
1437 | 0 | } |
1438 | | |
1439 | | /// Returns the total number of states in this one-pass DFA. |
1440 | | /// |
1441 | | /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose |
1442 | | /// a low level DFA API. Therefore, this routine has little use other than |
1443 | | /// being informational. |
1444 | | #[inline] |
1445 | 0 | pub fn state_len(&self) -> usize { |
1446 | 0 | self.table.len() >> self.stride2() |
1447 | 0 | } |
1448 | | |
1449 | | /// Returns the total number of elements in the alphabet for this DFA. |
1450 | | /// |
1451 | | /// That is, this returns the total number of transitions that each |
1452 | | /// state in this DFA must have. The maximum alphabet size is 256, which |
1453 | | /// corresponds to each possible byte value. |
1454 | | /// |
1455 | | /// The alphabet size may be less than 256 though, and unless |
1456 | | /// [`Config::byte_classes`] is disabled, it is typically must less than |
1457 | | /// 256. Namely, bytes are grouped into equivalence classes such that no |
1458 | | /// two bytes in the same class can distinguish a match from a non-match. |
1459 | | /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could |
1460 | | /// all be in the same equivalence class. This leads to a massive space |
1461 | | /// savings. |
1462 | | /// |
1463 | | /// Note though that the alphabet length does _not_ necessarily equal the |
1464 | | /// total stride space taken up by a single DFA state in the transition |
1465 | | /// table. Namely, for performance reasons, the stride is always the |
1466 | | /// smallest power of two that is greater than or equal to the alphabet |
1467 | | /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are |
1468 | | /// often more useful. The alphabet length is typically useful only for |
1469 | | /// informational purposes. |
1470 | | /// |
1471 | | /// Note also that unlike dense or sparse DFAs, a one-pass DFA does |
1472 | | /// not have a special end-of-input (EOI) transition. This is because |
1473 | | /// a one-pass DFA handles look-around assertions explicitly (like the |
1474 | | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build |
1475 | | /// them into the transitions of the DFA. |
1476 | | #[inline] |
1477 | 0 | pub fn alphabet_len(&self) -> usize { |
1478 | 0 | self.alphabet_len |
1479 | 0 | } |
1480 | | |
1481 | | /// Returns the total stride for every state in this DFA, expressed as the |
1482 | | /// exponent of a power of 2. The stride is the amount of space each state |
1483 | | /// takes up in the transition table, expressed as a number of transitions. |
1484 | | /// (Unused transitions map to dead states.) |
1485 | | /// |
1486 | | /// The stride of a DFA is always equivalent to the smallest power of |
1487 | | /// 2 that is greater than or equal to the DFA's alphabet length. This |
1488 | | /// definition uses extra space, but possibly permits faster translation |
1489 | | /// between state identifiers and their corresponding offsets in this DFA's |
1490 | | /// transition table. |
1491 | | /// |
1492 | | /// For example, if the DFA's stride is 16 transitions, then its `stride2` |
1493 | | /// is `4` since `2^4 = 16`. |
1494 | | /// |
1495 | | /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) |
1496 | | /// while the maximum `stride2` value is `9` (corresponding to a stride |
1497 | | /// of `512`). The maximum in theory should be `8`, but because of some |
1498 | | /// implementation quirks that may be relaxed in the future, it is one more |
1499 | | /// than `8`. (Do note that a maximal stride is incredibly rare, as it |
1500 | | /// would imply that there is almost no redundant in the regex pattern.) |
1501 | | /// |
1502 | | /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose |
1503 | | /// a low level DFA API. Therefore, this routine has little use other than |
1504 | | /// being informational. |
1505 | | #[inline] |
1506 | 0 | pub fn stride2(&self) -> usize { |
1507 | 0 | self.stride2 |
1508 | 0 | } |
1509 | | |
1510 | | /// Returns the total stride for every state in this DFA. This corresponds |
1511 | | /// to the total number of transitions used by each state in this DFA's |
1512 | | /// transition table. |
1513 | | /// |
1514 | | /// Please see [`DFA::stride2`] for more information. In particular, this |
1515 | | /// returns the stride as the number of transitions, where as `stride2` |
1516 | | /// returns it as the exponent of a power of 2. |
1517 | | /// |
1518 | | /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose |
1519 | | /// a low level DFA API. Therefore, this routine has little use other than |
1520 | | /// being informational. |
1521 | | #[inline] |
1522 | 0 | pub fn stride(&self) -> usize { |
1523 | 0 | 1 << self.stride2() |
1524 | 0 | } |
1525 | | |
1526 | | /// Returns the memory usage, in bytes, of this DFA. |
1527 | | /// |
1528 | | /// The memory usage is computed based on the number of bytes used to |
1529 | | /// represent this DFA. |
1530 | | /// |
1531 | | /// This does **not** include the stack size used up by this DFA. To |
1532 | | /// compute that, use `std::mem::size_of::<onepass::DFA>()`. |
1533 | | #[inline] |
1534 | 0 | pub fn memory_usage(&self) -> usize { |
1535 | | use core::mem::size_of; |
1536 | | |
1537 | 0 | self.table.len() * size_of::<Transition>() |
1538 | 0 | + self.starts.len() * size_of::<StateID>() |
1539 | 0 | } |
1540 | | } |
1541 | | |
1542 | | impl DFA { |
1543 | | /// Executes an anchored leftmost forward search, and returns true if and |
1544 | | /// only if this one-pass DFA matches the given haystack. |
1545 | | /// |
1546 | | /// This routine may short circuit if it knows that scanning future |
1547 | | /// input will never lead to a different result. In particular, if the |
1548 | | /// underlying DFA enters a match state, then this routine will return |
1549 | | /// `true` immediately without inspecting any future input. (Consider how |
1550 | | /// this might make a difference given the regex `a+` on the haystack |
1551 | | /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, |
1552 | | /// but routines like `find` need to continue searching because `+` is |
1553 | | /// greedy by default.) |
1554 | | /// |
1555 | | /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the |
1556 | | /// given configuration was [`Anchored::No`] (which is the default). |
1557 | | /// |
1558 | | /// # Panics |
1559 | | /// |
1560 | | /// This routine panics if the search could not complete. This can occur |
1561 | | /// in the following circumstances: |
1562 | | /// |
1563 | | /// * When the provided `Input` configuration is not supported. For |
1564 | | /// example, by providing an unsupported anchor mode. Concretely, |
1565 | | /// this occurs when using [`Anchored::Pattern`] without enabling |
1566 | | /// [`Config::starts_for_each_pattern`]. |
1567 | | /// |
1568 | | /// When a search panics, callers cannot know whether a match exists or |
1569 | | /// not. |
1570 | | /// |
1571 | | /// Use [`DFA::try_search`] if you want to handle these panics as error |
1572 | | /// values instead. |
1573 | | /// |
1574 | | /// # Example |
1575 | | /// |
1576 | | /// This shows basic usage: |
1577 | | /// |
1578 | | /// ``` |
1579 | | /// use regex_automata::dfa::onepass::DFA; |
1580 | | /// |
1581 | | /// let re = DFA::new("foo[0-9]+bar")?; |
1582 | | /// let mut cache = re.create_cache(); |
1583 | | /// |
1584 | | /// assert!(re.is_match(&mut cache, "foo12345bar")); |
1585 | | /// assert!(!re.is_match(&mut cache, "foobar")); |
1586 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1587 | | /// ``` |
1588 | | /// |
1589 | | /// # Example: consistency with search APIs |
1590 | | /// |
1591 | | /// `is_match` is guaranteed to return `true` whenever `captures` returns |
1592 | | /// a match. This includes searches that are executed entirely within a |
1593 | | /// codepoint: |
1594 | | /// |
1595 | | /// ``` |
1596 | | /// use regex_automata::{dfa::onepass::DFA, Input}; |
1597 | | /// |
1598 | | /// let re = DFA::new("a*")?; |
1599 | | /// let mut cache = re.create_cache(); |
1600 | | /// |
1601 | | /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); |
1602 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1603 | | /// ``` |
1604 | | /// |
1605 | | /// Notice that when UTF-8 mode is disabled, then the above reports a |
1606 | | /// match because the restriction against zero-width matches that split a |
1607 | | /// codepoint has been lifted: |
1608 | | /// |
1609 | | /// ``` |
1610 | | /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input}; |
1611 | | /// |
1612 | | /// let re = DFA::builder() |
1613 | | /// .thompson(NFA::config().utf8(false)) |
1614 | | /// .build("a*")?; |
1615 | | /// let mut cache = re.create_cache(); |
1616 | | /// |
1617 | | /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); |
1618 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1619 | | /// ``` |
1620 | | #[inline] |
1621 | 0 | pub fn is_match<'h, I: Into<Input<'h>>>( |
1622 | 0 | &self, |
1623 | 0 | cache: &mut Cache, |
1624 | 0 | input: I, |
1625 | 0 | ) -> bool { |
1626 | 0 | let mut input = input.into().earliest(true); |
1627 | 0 | if matches!(input.get_anchored(), Anchored::No) { |
1628 | 0 | input.set_anchored(Anchored::Yes); |
1629 | 0 | } |
1630 | 0 | self.try_search_slots(cache, &input, &mut []).unwrap().is_some() |
1631 | 0 | } |
1632 | | |
1633 | | /// Executes an anchored leftmost forward search, and returns a `Match` if |
1634 | | /// and only if this one-pass DFA matches the given haystack. |
1635 | | /// |
1636 | | /// This routine only includes the overall match span. To get access to the |
1637 | | /// individual spans of each capturing group, use [`DFA::captures`]. |
1638 | | /// |
1639 | | /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the |
1640 | | /// given configuration was [`Anchored::No`] (which is the default). |
1641 | | /// |
1642 | | /// # Panics |
1643 | | /// |
1644 | | /// This routine panics if the search could not complete. This can occur |
1645 | | /// in the following circumstances: |
1646 | | /// |
1647 | | /// * When the provided `Input` configuration is not supported. For |
1648 | | /// example, by providing an unsupported anchor mode. Concretely, |
1649 | | /// this occurs when using [`Anchored::Pattern`] without enabling |
1650 | | /// [`Config::starts_for_each_pattern`]. |
1651 | | /// |
1652 | | /// When a search panics, callers cannot know whether a match exists or |
1653 | | /// not. |
1654 | | /// |
1655 | | /// Use [`DFA::try_search`] if you want to handle these panics as error |
1656 | | /// values instead. |
1657 | | /// |
1658 | | /// # Example |
1659 | | /// |
1660 | | /// Leftmost first match semantics corresponds to the match with the |
1661 | | /// smallest starting offset, but where the end offset is determined by |
1662 | | /// preferring earlier branches in the original regular expression. For |
1663 | | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
1664 | | /// will match `Samwise` in `Samwise`. |
1665 | | /// |
1666 | | /// Generally speaking, the "leftmost first" match is how most backtracking |
1667 | | /// regular expressions tend to work. This is in contrast to POSIX-style |
1668 | | /// regular expressions that yield "leftmost longest" matches. Namely, |
1669 | | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
1670 | | /// leftmost longest semantics. (This crate does not currently support |
1671 | | /// leftmost longest semantics.) |
1672 | | /// |
1673 | | /// ``` |
1674 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
1675 | | /// |
1676 | | /// let re = DFA::new("foo[0-9]+")?; |
1677 | | /// let mut cache = re.create_cache(); |
1678 | | /// let expected = Match::must(0, 0..8); |
1679 | | /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); |
1680 | | /// |
1681 | | /// // Even though a match is found after reading the first byte (`a`), |
1682 | | /// // the leftmost first match semantics demand that we find the earliest |
1683 | | /// // match that prefers earlier parts of the pattern over later parts. |
1684 | | /// let re = DFA::new("abc|a")?; |
1685 | | /// let mut cache = re.create_cache(); |
1686 | | /// let expected = Match::must(0, 0..3); |
1687 | | /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); |
1688 | | /// |
1689 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1690 | | /// ``` |
1691 | | #[inline] |
1692 | 0 | pub fn find<'h, I: Into<Input<'h>>>( |
1693 | 0 | &self, |
1694 | 0 | cache: &mut Cache, |
1695 | 0 | input: I, |
1696 | 0 | ) -> Option<Match> { |
1697 | 0 | let mut input = input.into(); |
1698 | 0 | if matches!(input.get_anchored(), Anchored::No) { |
1699 | 0 | input.set_anchored(Anchored::Yes); |
1700 | 0 | } |
1701 | 0 | if self.get_nfa().pattern_len() == 1 { |
1702 | 0 | let mut slots = [None, None]; |
1703 | 0 | let pid = |
1704 | 0 | self.try_search_slots(cache, &input, &mut slots).unwrap()?; |
1705 | 0 | let start = slots[0].unwrap().get(); |
1706 | 0 | let end = slots[1].unwrap().get(); |
1707 | 0 | return Some(Match::new(pid, Span { start, end })); |
1708 | 0 | } |
1709 | 0 | let ginfo = self.get_nfa().group_info(); |
1710 | 0 | let slots_len = ginfo.implicit_slot_len(); |
1711 | 0 | let mut slots = vec![None; slots_len]; |
1712 | 0 | let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?; |
1713 | 0 | let start = slots[pid.as_usize() * 2].unwrap().get(); |
1714 | 0 | let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); |
1715 | 0 | Some(Match::new(pid, Span { start, end })) |
1716 | 0 | } |
1717 | | |
1718 | | /// Executes an anchored leftmost forward search and writes the spans |
1719 | | /// of capturing groups that participated in a match into the provided |
1720 | | /// [`Captures`] value. If no match was found, then [`Captures::is_match`] |
1721 | | /// is guaranteed to return `false`. |
1722 | | /// |
1723 | | /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the |
1724 | | /// given configuration was [`Anchored::No`] (which is the default). |
1725 | | /// |
1726 | | /// # Panics |
1727 | | /// |
1728 | | /// This routine panics if the search could not complete. This can occur |
1729 | | /// in the following circumstances: |
1730 | | /// |
1731 | | /// * When the provided `Input` configuration is not supported. For |
1732 | | /// example, by providing an unsupported anchor mode. Concretely, |
1733 | | /// this occurs when using [`Anchored::Pattern`] without enabling |
1734 | | /// [`Config::starts_for_each_pattern`]. |
1735 | | /// |
1736 | | /// When a search panics, callers cannot know whether a match exists or |
1737 | | /// not. |
1738 | | /// |
1739 | | /// Use [`DFA::try_search`] if you want to handle these panics as error |
1740 | | /// values instead. |
1741 | | /// |
1742 | | /// # Example |
1743 | | /// |
1744 | | /// This shows a simple example of a one-pass regex that extracts |
1745 | | /// capturing group spans. |
1746 | | /// |
1747 | | /// ``` |
1748 | | /// use regex_automata::{dfa::onepass::DFA, Match, Span}; |
1749 | | /// |
1750 | | /// let re = DFA::new( |
1751 | | /// // Notice that we use ASCII here. The corresponding Unicode regex |
1752 | | /// // is sadly not one-pass. |
1753 | | /// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)", |
1754 | | /// )?; |
1755 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1756 | | /// |
1757 | | /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); |
1758 | | /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); |
1759 | | /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); |
1760 | | /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); |
1761 | | /// |
1762 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1763 | | /// ``` |
1764 | | #[inline] |
1765 | 0 | pub fn captures<'h, I: Into<Input<'h>>>( |
1766 | 0 | &self, |
1767 | 0 | cache: &mut Cache, |
1768 | 0 | input: I, |
1769 | 0 | caps: &mut Captures, |
1770 | 0 | ) { |
1771 | 0 | let mut input = input.into(); |
1772 | 0 | if matches!(input.get_anchored(), Anchored::No) { |
1773 | 0 | input.set_anchored(Anchored::Yes); |
1774 | 0 | } |
1775 | 0 | self.try_search(cache, &input, caps).unwrap(); |
1776 | 0 | } |
1777 | | |
1778 | | /// Executes an anchored leftmost forward search and writes the spans |
1779 | | /// of capturing groups that participated in a match into the provided |
1780 | | /// [`Captures`] value. If no match was found, then [`Captures::is_match`] |
1781 | | /// is guaranteed to return `false`. |
1782 | | /// |
1783 | | /// The differences with [`DFA::captures`] are: |
1784 | | /// |
1785 | | /// 1. This returns an error instead of panicking if the search fails. |
1786 | | /// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing |
1787 | | /// the same input for multiple searches, which _may_ be important for |
1788 | | /// latency. |
1789 | | /// 3. This does not automatically change the [`Anchored`] mode from `No` |
1790 | | /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an |
1791 | | /// error is returned. |
1792 | | /// |
1793 | | /// # Errors |
1794 | | /// |
1795 | | /// This routine errors if the search could not complete. This can occur |
1796 | | /// in the following circumstances: |
1797 | | /// |
1798 | | /// * When the provided `Input` configuration is not supported. For |
1799 | | /// example, by providing an unsupported anchor mode. Concretely, |
1800 | | /// this occurs when using [`Anchored::Pattern`] without enabling |
1801 | | /// [`Config::starts_for_each_pattern`]. |
1802 | | /// |
1803 | | /// When a search returns an error, callers cannot know whether a match |
1804 | | /// exists or not. |
1805 | | /// |
1806 | | /// # Example: specific pattern search |
1807 | | /// |
1808 | | /// This example shows how to build a multi-regex that permits searching |
1809 | | /// for specific patterns. Note that this is somewhat less useful than |
1810 | | /// in other regex engines, since a one-pass DFA by definition has no |
1811 | | /// ambiguity about which pattern can match at a position. That is, if it |
1812 | | /// were possible for two different patterns to match at the same starting |
1813 | | /// position, then the multi-regex would not be one-pass and construction |
1814 | | /// would have failed. |
1815 | | /// |
1816 | | /// Nevertheless, this can still be useful if you only care about matches |
1817 | | /// for a specific pattern, and want the DFA to report "no match" even if |
1818 | | /// some other pattern would have matched. |
1819 | | /// |
1820 | | /// Note that in order to make use of this functionality, |
1821 | | /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled |
1822 | | /// by default since it may result in higher memory usage. |
1823 | | /// |
1824 | | /// ``` |
1825 | | /// use regex_automata::{ |
1826 | | /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, |
1827 | | /// }; |
1828 | | /// |
1829 | | /// let re = DFA::builder() |
1830 | | /// .configure(DFA::config().starts_for_each_pattern(true)) |
1831 | | /// .build_many(&["[a-z]+", "[0-9]+"])?; |
1832 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1833 | | /// let haystack = "123abc"; |
1834 | | /// let input = Input::new(haystack).anchored(Anchored::Yes); |
1835 | | /// |
1836 | | /// // A normal multi-pattern search will show pattern 1 matches. |
1837 | | /// re.try_search(&mut cache, &input, &mut caps)?; |
1838 | | /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); |
1839 | | /// |
1840 | | /// // If we only want to report pattern 0 matches, then we'll get no |
1841 | | /// // match here. |
1842 | | /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); |
1843 | | /// re.try_search(&mut cache, &input, &mut caps)?; |
1844 | | /// assert_eq!(None, caps.get_match()); |
1845 | | /// |
1846 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1847 | | /// ``` |
1848 | | /// |
1849 | | /// # Example: specifying the bounds of a search |
1850 | | /// |
1851 | | /// This example shows how providing the bounds of a search can produce |
1852 | | /// different results than simply sub-slicing the haystack. |
1853 | | /// |
1854 | | /// ``` |
1855 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1856 | | /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match}; |
1857 | | /// |
1858 | | /// // one-pass DFAs fully support Unicode word boundaries! |
1859 | | /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass. |
1860 | | /// // :-( |
1861 | | /// let re = DFA::new(r"\b[0-9]{3}\b")?; |
1862 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
1863 | | /// let haystack = "foo123bar"; |
1864 | | /// |
1865 | | /// // Since we sub-slice the haystack, the search doesn't know about |
1866 | | /// // the larger context and assumes that `123` is surrounded by word |
1867 | | /// // boundaries. And of course, the match position is reported relative |
1868 | | /// // to the sub-slice as well, which means we get `0..3` instead of |
1869 | | /// // `3..6`. |
1870 | | /// let expected = Some(Match::must(0, 0..3)); |
1871 | | /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes); |
1872 | | /// re.try_search(&mut cache, &input, &mut caps)?; |
1873 | | /// assert_eq!(expected, caps.get_match()); |
1874 | | /// |
1875 | | /// // But if we provide the bounds of the search within the context of the |
1876 | | /// // entire haystack, then the search can take the surrounding context |
1877 | | /// // into account. (And if we did find a match, it would be reported |
1878 | | /// // as a valid offset into `haystack` instead of its sub-slice.) |
1879 | | /// let expected = None; |
1880 | | /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes); |
1881 | | /// re.try_search(&mut cache, &input, &mut caps)?; |
1882 | | /// assert_eq!(expected, caps.get_match()); |
1883 | | /// |
1884 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1885 | | /// ``` |
1886 | | #[inline] |
1887 | 0 | pub fn try_search( |
1888 | 0 | &self, |
1889 | 0 | cache: &mut Cache, |
1890 | 0 | input: &Input<'_>, |
1891 | 0 | caps: &mut Captures, |
1892 | 0 | ) -> Result<(), MatchError> { |
1893 | 0 | let pid = self.try_search_slots(cache, input, caps.slots_mut())?; |
1894 | 0 | caps.set_pattern(pid); |
1895 | 0 | Ok(()) |
1896 | 0 | } |
1897 | | |
1898 | | /// Executes an anchored leftmost forward search and writes the spans |
1899 | | /// of capturing groups that participated in a match into the provided |
1900 | | /// `slots`, and returns the matching pattern ID. The contents of the |
1901 | | /// slots for patterns other than the matching pattern are unspecified. If |
1902 | | /// no match was found, then `None` is returned and the contents of all |
1903 | | /// `slots` is unspecified. |
1904 | | /// |
1905 | | /// This is like [`DFA::try_search`], but it accepts a raw slots slice |
1906 | | /// instead of a `Captures` value. This is useful in contexts where you |
1907 | | /// don't want or need to allocate a `Captures`. |
1908 | | /// |
1909 | | /// It is legal to pass _any_ number of slots to this routine. If the regex |
1910 | | /// engine would otherwise write a slot offset that doesn't fit in the |
1911 | | /// provided slice, then it is simply skipped. In general though, there are |
1912 | | /// usually three slice lengths you might want to use: |
1913 | | /// |
1914 | | /// * An empty slice, if you only care about which pattern matched. |
1915 | | /// * A slice with |
1916 | | /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len) |
1917 | | /// slots, if you only care about the overall match spans for each matching |
1918 | | /// pattern. |
1919 | | /// * A slice with |
1920 | | /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which |
1921 | | /// permits recording match offsets for every capturing group in every |
1922 | | /// pattern. |
1923 | | /// |
1924 | | /// # Errors |
1925 | | /// |
1926 | | /// This routine errors if the search could not complete. This can occur |
1927 | | /// in the following circumstances: |
1928 | | /// |
1929 | | /// * When the provided `Input` configuration is not supported. For |
1930 | | /// example, by providing an unsupported anchor mode. Concretely, |
1931 | | /// this occurs when using [`Anchored::Pattern`] without enabling |
1932 | | /// [`Config::starts_for_each_pattern`]. |
1933 | | /// |
1934 | | /// When a search returns an error, callers cannot know whether a match |
1935 | | /// exists or not. |
1936 | | /// |
1937 | | /// # Example |
1938 | | /// |
1939 | | /// This example shows how to find the overall match offsets in a |
1940 | | /// multi-pattern search without allocating a `Captures` value. Indeed, we |
1941 | | /// can put our slots right on the stack. |
1942 | | /// |
1943 | | /// ``` |
1944 | | /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID}; |
1945 | | /// |
1946 | | /// let re = DFA::new_many(&[ |
1947 | | /// r"[a-zA-Z]+", |
1948 | | /// r"[0-9]+", |
1949 | | /// ])?; |
1950 | | /// let mut cache = re.create_cache(); |
1951 | | /// let input = Input::new("123").anchored(Anchored::Yes); |
1952 | | /// |
1953 | | /// // We only care about the overall match offsets here, so we just |
1954 | | /// // allocate two slots for each pattern. Each slot records the start |
1955 | | /// // and end of the match. |
1956 | | /// let mut slots = [None; 4]; |
1957 | | /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; |
1958 | | /// assert_eq!(Some(PatternID::must(1)), pid); |
1959 | | /// |
1960 | | /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. |
1961 | | /// // See 'GroupInfo' for more details on the mapping between groups and |
1962 | | /// // slot indices. |
1963 | | /// let slot_start = pid.unwrap().as_usize() * 2; |
1964 | | /// let slot_end = slot_start + 1; |
1965 | | /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get())); |
1966 | | /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get())); |
1967 | | /// |
1968 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1969 | | /// ``` |
1970 | | #[inline] |
1971 | 0 | pub fn try_search_slots( |
1972 | 0 | &self, |
1973 | 0 | cache: &mut Cache, |
1974 | 0 | input: &Input<'_>, |
1975 | 0 | slots: &mut [Option<NonMaxUsize>], |
1976 | 0 | ) -> Result<Option<PatternID>, MatchError> { |
1977 | 0 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
1978 | 0 | if !utf8empty { |
1979 | 0 | return self.try_search_slots_imp(cache, input, slots); |
1980 | 0 | } |
1981 | 0 | // See PikeVM::try_search_slots for why we do this. |
1982 | 0 | let min = self.get_nfa().group_info().implicit_slot_len(); |
1983 | 0 | if slots.len() >= min { |
1984 | 0 | return self.try_search_slots_imp(cache, input, slots); |
1985 | 0 | } |
1986 | 0 | if self.get_nfa().pattern_len() == 1 { |
1987 | 0 | let mut enough = [None, None]; |
1988 | 0 | let got = self.try_search_slots_imp(cache, input, &mut enough)?; |
1989 | | // This is OK because we know `enough_slots` is strictly bigger |
1990 | | // than `slots`, otherwise this special case isn't reached. |
1991 | 0 | slots.copy_from_slice(&enough[..slots.len()]); |
1992 | 0 | return Ok(got); |
1993 | 0 | } |
1994 | 0 | let mut enough = vec![None; min]; |
1995 | 0 | let got = self.try_search_slots_imp(cache, input, &mut enough)?; |
1996 | | // This is OK because we know `enough_slots` is strictly bigger than |
1997 | | // `slots`, otherwise this special case isn't reached. |
1998 | 0 | slots.copy_from_slice(&enough[..slots.len()]); |
1999 | 0 | Ok(got) |
2000 | 0 | } |
2001 | | |
2002 | | #[inline(never)] |
2003 | 0 | fn try_search_slots_imp( |
2004 | 0 | &self, |
2005 | 0 | cache: &mut Cache, |
2006 | 0 | input: &Input<'_>, |
2007 | 0 | slots: &mut [Option<NonMaxUsize>], |
2008 | 0 | ) -> Result<Option<PatternID>, MatchError> { |
2009 | 0 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
2010 | 0 | match self.search_imp(cache, input, slots)? { |
2011 | 0 | None => return Ok(None), |
2012 | 0 | Some(pid) if !utf8empty => return Ok(Some(pid)), |
2013 | 0 | Some(pid) => { |
2014 | 0 | // These slot indices are always correct because we know our |
2015 | 0 | // 'pid' is valid and thus we know that the slot indices for it |
2016 | 0 | // are valid. |
2017 | 0 | let slot_start = pid.as_usize().wrapping_mul(2); |
2018 | 0 | let slot_end = slot_start.wrapping_add(1); |
2019 | 0 | // OK because we know we have a match and we know our caller |
2020 | 0 | // provided slots are big enough (which we make true above if |
2021 | 0 | // the caller didn't). Namely, we're only here when 'utf8empty' |
2022 | 0 | // is true, and when that's true, we require slots for every |
2023 | 0 | // pattern. |
2024 | 0 | let start = slots[slot_start].unwrap().get(); |
2025 | 0 | let end = slots[slot_end].unwrap().get(); |
2026 | 0 | // If our match splits a codepoint, then we cannot report is |
2027 | 0 | // as a match. And since one-pass DFAs only support anchored |
2028 | 0 | // searches, we don't try to skip ahead to find the next match. |
2029 | 0 | // We can just quit with nothing. |
2030 | 0 | if start == end && !input.is_char_boundary(start) { |
2031 | 0 | return Ok(None); |
2032 | 0 | } |
2033 | 0 | Ok(Some(pid)) |
2034 | | } |
2035 | | } |
2036 | 0 | } |
2037 | | } |
2038 | | |
2039 | | impl DFA { |
2040 | 0 | fn search_imp( |
2041 | 0 | &self, |
2042 | 0 | cache: &mut Cache, |
2043 | 0 | input: &Input<'_>, |
2044 | 0 | slots: &mut [Option<NonMaxUsize>], |
2045 | 0 | ) -> Result<Option<PatternID>, MatchError> { |
2046 | 0 | // PERF: Some ideas. I ran out of steam after my initial impl to try |
2047 | 0 | // many of these. |
2048 | 0 | // |
2049 | 0 | // 1) Try doing more state shuffling. Right now, all we do is push |
2050 | 0 | // match states to the end of the transition table so that we can do |
2051 | 0 | // 'if sid >= self.min_match_id' to know whether we're in a match |
2052 | 0 | // state or not. But what about doing something like dense DFAs and |
2053 | 0 | // pushing dead, match and states with captures/looks all toward the |
2054 | 0 | // beginning of the transition table. Then we could do 'if sid <= |
2055 | 0 | // self.max_special_id', in which case, we need to do some special |
2056 | 0 | // handling of some sort. Otherwise, we get the happy path, just |
2057 | 0 | // like in a DFA search. The main argument against this is that the |
2058 | 0 | // one-pass DFA is likely to be used most often with capturing groups |
2059 | 0 | // and if capturing groups are common, then this might wind up being a |
2060 | 0 | // pessimization. |
2061 | 0 | // |
2062 | 0 | // 2) Consider moving 'PatternEpsilons' out of the transition table. |
2063 | 0 | // It is only needed for match states and usually a small minority of |
2064 | 0 | // states are match states. Therefore, we're using an extra 'u64' for |
2065 | 0 | // most states. |
2066 | 0 | // |
2067 | 0 | // 3) I played around with the match state handling and it seems like |
2068 | 0 | // there is probably a lot left on the table for improvement. The |
2069 | 0 | // key tension is that the 'find_match' routine is a giant mess, but |
2070 | 0 | // splitting it out into a non-inlineable function is a non-starter |
2071 | 0 | // because the match state might consume input, so 'find_match' COULD |
2072 | 0 | // be called quite a lot, and a function call at that point would trash |
2073 | 0 | // perf. In theory, we could detect whether a match state consumes |
2074 | 0 | // input and then specialize our search routine based on that. In that |
2075 | 0 | // case, maybe an extra function call is OK, but even then, it might be |
2076 | 0 | // too much of a latency hit. Another idea is to just try and figure |
2077 | 0 | // out how to reduce the code size of 'find_match'. RE2 has a trick |
2078 | 0 | // here where the match handling isn't done if we know the next byte of |
2079 | 0 | // input yields a match too. Maybe we adopt that? |
2080 | 0 | // |
2081 | 0 | // This just might be a tricky DFA to optimize. |
2082 | 0 |
|
2083 | 0 | if input.is_done() { |
2084 | 0 | return Ok(None); |
2085 | 0 | } |
2086 | 0 | // We unfortunately have a bit of book-keeping to do to set things |
2087 | 0 | // up. We do have to setup our cache and clear all of our slots. In |
2088 | 0 | // particular, clearing the slots is necessary for the case where we |
2089 | 0 | // report a match, but one of the capturing groups didn't participate |
2090 | 0 | // in the match but had a span set from a previous search. That would |
2091 | 0 | // be bad. In theory, we could avoid all this slot clearing if we knew |
2092 | 0 | // that every slot was always activated for every match. Then we would |
2093 | 0 | // know they would always be overwritten when a match is found. |
2094 | 0 | let explicit_slots_len = core::cmp::min( |
2095 | 0 | Slots::LIMIT, |
2096 | 0 | slots.len().saturating_sub(self.explicit_slot_start), |
2097 | 0 | ); |
2098 | 0 | cache.setup_search(explicit_slots_len); |
2099 | 0 | for slot in cache.explicit_slots() { |
2100 | 0 | *slot = None; |
2101 | 0 | } |
2102 | 0 | for slot in slots.iter_mut() { |
2103 | 0 | *slot = None; |
2104 | 0 | } |
2105 | | // We set the starting slots for every pattern up front. This does |
2106 | | // increase our latency somewhat, but it avoids having to do it every |
2107 | | // time we see a match state (which could be many times in a single |
2108 | | // search if the match state consumes input). |
2109 | 0 | for pid in self.nfa.patterns() { |
2110 | 0 | let i = pid.as_usize() * 2; |
2111 | 0 | if i >= slots.len() { |
2112 | 0 | break; |
2113 | 0 | } |
2114 | 0 | slots[i] = NonMaxUsize::new(input.start()); |
2115 | | } |
2116 | 0 | let mut pid = None; |
2117 | 0 | let mut next_sid = match input.get_anchored() { |
2118 | 0 | Anchored::Yes => self.start(), |
2119 | 0 | Anchored::Pattern(pid) => self.start_pattern(pid)?, |
2120 | | Anchored::No => { |
2121 | | // If the regex is itself always anchored, then we're fine, |
2122 | | // even if the search is configured to be unanchored. |
2123 | 0 | if !self.nfa.is_always_start_anchored() { |
2124 | 0 | return Err(MatchError::unsupported_anchored( |
2125 | 0 | Anchored::No, |
2126 | 0 | )); |
2127 | 0 | } |
2128 | 0 | self.start() |
2129 | | } |
2130 | | }; |
2131 | 0 | let leftmost_first = |
2132 | 0 | matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst); |
2133 | 0 | for at in input.start()..input.end() { |
2134 | 0 | let sid = next_sid; |
2135 | 0 | let trans = self.transition(sid, input.haystack()[at]); |
2136 | 0 | next_sid = trans.state_id(); |
2137 | 0 | let epsilons = trans.epsilons(); |
2138 | 0 | if sid >= self.min_match_id { |
2139 | 0 | if self.find_match(cache, input, at, sid, slots, &mut pid) { |
2140 | 0 | if input.get_earliest() |
2141 | 0 | || (leftmost_first && trans.match_wins()) |
2142 | | { |
2143 | 0 | return Ok(pid); |
2144 | 0 | } |
2145 | 0 | } |
2146 | 0 | } |
2147 | 0 | if sid == DEAD |
2148 | 0 | || (!epsilons.looks().is_empty() |
2149 | 0 | && !self.nfa.look_matcher().matches_set_inline( |
2150 | 0 | epsilons.looks(), |
2151 | 0 | input.haystack(), |
2152 | 0 | at, |
2153 | 0 | )) |
2154 | | { |
2155 | 0 | return Ok(pid); |
2156 | 0 | } |
2157 | 0 | epsilons.slots().apply(at, cache.explicit_slots()); |
2158 | | } |
2159 | 0 | if next_sid >= self.min_match_id { |
2160 | 0 | self.find_match( |
2161 | 0 | cache, |
2162 | 0 | input, |
2163 | 0 | input.end(), |
2164 | 0 | next_sid, |
2165 | 0 | slots, |
2166 | 0 | &mut pid, |
2167 | 0 | ); |
2168 | 0 | } |
2169 | 0 | Ok(pid) |
2170 | 0 | } |
2171 | | |
2172 | | /// Assumes 'sid' is a match state and looks for whether a match can |
2173 | | /// be reported. If so, appropriate offsets are written to 'slots' and |
2174 | | /// 'matched_pid' is set to the matching pattern ID. |
2175 | | /// |
2176 | | /// Even when 'sid' is a match state, it's possible that a match won't |
2177 | | /// be reported. For example, when the conditional epsilon transitions |
2178 | | /// leading to the match state aren't satisfied at the given position in |
2179 | | /// the haystack. |
2180 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
2181 | 0 | fn find_match( |
2182 | 0 | &self, |
2183 | 0 | cache: &mut Cache, |
2184 | 0 | input: &Input<'_>, |
2185 | 0 | at: usize, |
2186 | 0 | sid: StateID, |
2187 | 0 | slots: &mut [Option<NonMaxUsize>], |
2188 | 0 | matched_pid: &mut Option<PatternID>, |
2189 | 0 | ) -> bool { |
2190 | 0 | debug_assert!(sid >= self.min_match_id); |
2191 | 0 | let pateps = self.pattern_epsilons(sid); |
2192 | 0 | let epsilons = pateps.epsilons(); |
2193 | 0 | if !epsilons.looks().is_empty() |
2194 | 0 | && !self.nfa.look_matcher().matches_set_inline( |
2195 | 0 | epsilons.looks(), |
2196 | 0 | input.haystack(), |
2197 | 0 | at, |
2198 | 0 | ) |
2199 | | { |
2200 | 0 | return false; |
2201 | 0 | } |
2202 | 0 | let pid = pateps.pattern_id_unchecked(); |
2203 | 0 | // This calculation is always correct because we know our 'pid' is |
2204 | 0 | // valid and thus we know that the slot indices for it are valid. |
2205 | 0 | let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1); |
2206 | 0 | // Set the implicit 'end' slot for the matching pattern. (The 'start' |
2207 | 0 | // slot was set at the beginning of the search.) |
2208 | 0 | if slot_end < slots.len() { |
2209 | 0 | slots[slot_end] = NonMaxUsize::new(at); |
2210 | 0 | } |
2211 | | // If the caller provided enough room, copy the previously recorded |
2212 | | // explicit slots from our scratch space to the caller provided slots. |
2213 | | // We *also* need to set any explicit slots that are active as part of |
2214 | | // the path to the match state. |
2215 | 0 | if self.explicit_slot_start < slots.len() { |
2216 | 0 | // NOTE: The 'cache.explicit_slots()' slice is setup at the |
2217 | 0 | // beginning of every search such that it is guaranteed to return a |
2218 | 0 | // slice of length equivalent to 'slots[explicit_slot_start..]'. |
2219 | 0 | slots[self.explicit_slot_start..] |
2220 | 0 | .copy_from_slice(cache.explicit_slots()); |
2221 | 0 | epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]); |
2222 | 0 | } |
2223 | 0 | *matched_pid = Some(pid); |
2224 | 0 | true |
2225 | 0 | } |
2226 | | } |
2227 | | |
2228 | | impl DFA { |
2229 | | /// Returns the anchored start state for matching any pattern in this DFA. |
2230 | 0 | fn start(&self) -> StateID { |
2231 | 0 | self.starts[0] |
2232 | 0 | } |
2233 | | |
2234 | | /// Returns the anchored start state for matching the given pattern. If |
2235 | | /// 'starts_for_each_pattern' |
2236 | | /// was not enabled, then this returns an error. If the given pattern is |
2237 | | /// not in this DFA, then `Ok(None)` is returned. |
2238 | 0 | fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> { |
2239 | 0 | if !self.config.get_starts_for_each_pattern() { |
2240 | 0 | return Err(MatchError::unsupported_anchored(Anchored::Pattern( |
2241 | 0 | pid, |
2242 | 0 | ))); |
2243 | 0 | } |
2244 | 0 | // 'starts' always has non-zero length. The first entry is always the |
2245 | 0 | // anchored starting state for all patterns, and the following entries |
2246 | 0 | // are optional and correspond to the anchored starting states for |
2247 | 0 | // patterns at pid+1. Thus, starts.len()-1 corresponds to the total |
2248 | 0 | // number of patterns that one can explicitly search for. (And it may |
2249 | 0 | // be zero.) |
2250 | 0 | Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD)) |
2251 | 0 | } |
2252 | | |
2253 | | /// Returns the transition from the given state ID and byte of input. The |
2254 | | /// transition includes the next state ID, the slots that should be saved |
2255 | | /// and any conditional epsilon transitions that must be satisfied in order |
2256 | | /// to take this transition. |
2257 | 0 | fn transition(&self, sid: StateID, byte: u8) -> Transition { |
2258 | 0 | let offset = sid.as_usize() << self.stride2(); |
2259 | 0 | let class = self.classes.get(byte).as_usize(); |
2260 | 0 | self.table[offset + class] |
2261 | 0 | } |
2262 | | |
2263 | | /// Set the transition from the given state ID and byte of input to the |
2264 | | /// transition given. |
2265 | 0 | fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) { |
2266 | 0 | let offset = sid.as_usize() << self.stride2(); |
2267 | 0 | let class = self.classes.get(byte).as_usize(); |
2268 | 0 | self.table[offset + class] = to; |
2269 | 0 | } |
2270 | | |
2271 | | /// Return an iterator of "sparse" transitions for the given state ID. |
2272 | | /// "sparse" in this context means that consecutive transitions that are |
2273 | | /// equivalent are returned as one group, and transitions to the DEAD state |
2274 | | /// are ignored. |
2275 | | /// |
2276 | | /// This winds up being useful for debug printing, since it's much terser |
2277 | | /// to display runs of equivalent transitions than the transition for every |
2278 | | /// possible byte value. Indeed, in practice, it's very common for runs |
2279 | | /// of equivalent transitions to appear. |
2280 | 0 | fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> { |
2281 | 0 | let start = sid.as_usize() << self.stride2(); |
2282 | 0 | let end = start + self.alphabet_len(); |
2283 | 0 | SparseTransitionIter { |
2284 | 0 | it: self.table[start..end].iter().enumerate(), |
2285 | 0 | cur: None, |
2286 | 0 | } |
2287 | 0 | } |
2288 | | |
2289 | | /// Return the pattern epsilons for the given state ID. |
2290 | | /// |
2291 | | /// If the given state ID does not correspond to a match state ID, then the |
2292 | | /// pattern epsilons returned is empty. |
2293 | 0 | fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons { |
2294 | 0 | let offset = sid.as_usize() << self.stride2(); |
2295 | 0 | PatternEpsilons(self.table[offset + self.pateps_offset].0) |
2296 | 0 | } |
2297 | | |
2298 | | /// Set the pattern epsilons for the given state ID. |
2299 | 0 | fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) { |
2300 | 0 | let offset = sid.as_usize() << self.stride2(); |
2301 | 0 | self.table[offset + self.pateps_offset] = Transition(pateps.0); |
2302 | 0 | } |
2303 | | |
2304 | | /// Returns the state ID prior to the one given. This returns None if the |
2305 | | /// given ID is the first DFA state. |
2306 | 0 | fn prev_state_id(&self, id: StateID) -> Option<StateID> { |
2307 | 0 | if id == DEAD { |
2308 | 0 | None |
2309 | | } else { |
2310 | | // CORRECTNESS: Since 'id' is not the first state, subtracting 1 |
2311 | | // is always valid. |
2312 | 0 | Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap())) |
2313 | | } |
2314 | 0 | } |
2315 | | |
2316 | | /// Returns the state ID of the last state in this DFA's transition table. |
2317 | | /// "last" in this context means the last state to appear in memory, i.e., |
2318 | | /// the one with the greatest ID. |
2319 | 0 | fn last_state_id(&self) -> StateID { |
2320 | 0 | // CORRECTNESS: A DFA table is always non-empty since it always at |
2321 | 0 | // least contains a DEAD state. Since every state has the same stride, |
2322 | 0 | // we can just compute what the "next" state ID would have been and |
2323 | 0 | // then subtract 1 from it. |
2324 | 0 | StateID::new_unchecked( |
2325 | 0 | (self.table.len() >> self.stride2()).checked_sub(1).unwrap(), |
2326 | 0 | ) |
2327 | 0 | } |
2328 | | |
2329 | | /// Move the transitions from 'id1' to 'id2' and vice versa. |
2330 | | /// |
2331 | | /// WARNING: This does not update the rest of the transition table to have |
2332 | | /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves |
2333 | | /// the states in memory. |
2334 | 0 | pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) { |
2335 | 0 | let o1 = id1.as_usize() << self.stride2(); |
2336 | 0 | let o2 = id2.as_usize() << self.stride2(); |
2337 | 0 | for b in 0..self.stride() { |
2338 | 0 | self.table.swap(o1 + b, o2 + b); |
2339 | 0 | } |
2340 | 0 | } |
2341 | | |
2342 | | /// Map all state IDs in this DFA (transition table + start states) |
2343 | | /// according to the closure given. |
2344 | 0 | pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { |
2345 | 0 | for i in 0..self.state_len() { |
2346 | 0 | let offset = i << self.stride2(); |
2347 | 0 | for b in 0..self.alphabet_len() { |
2348 | 0 | let next = self.table[offset + b].state_id(); |
2349 | 0 | self.table[offset + b].set_state_id(map(next)); |
2350 | 0 | } |
2351 | | } |
2352 | 0 | for i in 0..self.starts.len() { |
2353 | 0 | self.starts[i] = map(self.starts[i]); |
2354 | 0 | } |
2355 | 0 | } |
2356 | | } |
2357 | | |
2358 | | impl core::fmt::Debug for DFA { |
2359 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2360 | 0 | fn debug_state_transitions( |
2361 | 0 | f: &mut core::fmt::Formatter, |
2362 | 0 | dfa: &DFA, |
2363 | 0 | sid: StateID, |
2364 | 0 | ) -> core::fmt::Result { |
2365 | 0 | for (i, (start, end, trans)) in |
2366 | 0 | dfa.sparse_transitions(sid).enumerate() |
2367 | | { |
2368 | 0 | let next = trans.state_id(); |
2369 | 0 | if i > 0 { |
2370 | 0 | write!(f, ", ")?; |
2371 | 0 | } |
2372 | 0 | if start == end { |
2373 | 0 | write!( |
2374 | 0 | f, |
2375 | 0 | "{:?} => {:?}", |
2376 | 0 | DebugByte(start), |
2377 | 0 | next.as_usize(), |
2378 | 0 | )?; |
2379 | | } else { |
2380 | 0 | write!( |
2381 | 0 | f, |
2382 | 0 | "{:?}-{:?} => {:?}", |
2383 | 0 | DebugByte(start), |
2384 | 0 | DebugByte(end), |
2385 | 0 | next.as_usize(), |
2386 | 0 | )?; |
2387 | | } |
2388 | 0 | if trans.match_wins() { |
2389 | 0 | write!(f, " (MW)")?; |
2390 | 0 | } |
2391 | 0 | if !trans.epsilons().is_empty() { |
2392 | 0 | write!(f, " ({:?})", trans.epsilons())?; |
2393 | 0 | } |
2394 | | } |
2395 | 0 | Ok(()) |
2396 | 0 | } |
2397 | | |
2398 | 0 | writeln!(f, "onepass::DFA(")?; |
2399 | 0 | for index in 0..self.state_len() { |
2400 | 0 | let sid = StateID::must(index); |
2401 | 0 | let pateps = self.pattern_epsilons(sid); |
2402 | 0 | if sid == DEAD { |
2403 | 0 | write!(f, "D ")?; |
2404 | 0 | } else if pateps.pattern_id().is_some() { |
2405 | 0 | write!(f, "* ")?; |
2406 | | } else { |
2407 | 0 | write!(f, " ")?; |
2408 | | } |
2409 | 0 | write!(f, "{:06?}", sid.as_usize())?; |
2410 | 0 | if !pateps.is_empty() { |
2411 | 0 | write!(f, " ({:?})", pateps)?; |
2412 | 0 | } |
2413 | 0 | write!(f, ": ")?; |
2414 | 0 | debug_state_transitions(f, self, sid)?; |
2415 | 0 | write!(f, "\n")?; |
2416 | | } |
2417 | 0 | writeln!(f, "")?; |
2418 | 0 | for (i, &sid) in self.starts.iter().enumerate() { |
2419 | 0 | if i == 0 { |
2420 | 0 | writeln!(f, "START(ALL): {:?}", sid.as_usize())?; |
2421 | | } else { |
2422 | 0 | writeln!( |
2423 | 0 | f, |
2424 | 0 | "START(pattern: {:?}): {:?}", |
2425 | 0 | i - 1, |
2426 | 0 | sid.as_usize(), |
2427 | 0 | )?; |
2428 | | } |
2429 | | } |
2430 | 0 | writeln!(f, "state length: {:?}", self.state_len())?; |
2431 | 0 | writeln!(f, "pattern length: {:?}", self.pattern_len())?; |
2432 | 0 | writeln!(f, ")")?; |
2433 | 0 | Ok(()) |
2434 | 0 | } |
2435 | | } |
2436 | | |
2437 | | /// An iterator over groups of consecutive equivalent transitions in a single |
2438 | | /// state. |
2439 | | #[derive(Debug)] |
2440 | | struct SparseTransitionIter<'a> { |
2441 | | it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>, |
2442 | | cur: Option<(u8, u8, Transition)>, |
2443 | | } |
2444 | | |
2445 | | impl<'a> Iterator for SparseTransitionIter<'a> { |
2446 | | type Item = (u8, u8, Transition); |
2447 | | |
2448 | 0 | fn next(&mut self) -> Option<(u8, u8, Transition)> { |
2449 | 0 | while let Some((b, &trans)) = self.it.next() { |
2450 | | // Fine because we'll never have more than u8::MAX transitions in |
2451 | | // one state. |
2452 | 0 | let b = b.as_u8(); |
2453 | 0 | let (prev_start, prev_end, prev_trans) = match self.cur { |
2454 | 0 | Some(t) => t, |
2455 | | None => { |
2456 | 0 | self.cur = Some((b, b, trans)); |
2457 | 0 | continue; |
2458 | | } |
2459 | | }; |
2460 | 0 | if prev_trans == trans { |
2461 | 0 | self.cur = Some((prev_start, b, prev_trans)); |
2462 | 0 | } else { |
2463 | 0 | self.cur = Some((b, b, trans)); |
2464 | 0 | if prev_trans.state_id() != DEAD { |
2465 | 0 | return Some((prev_start, prev_end, prev_trans)); |
2466 | 0 | } |
2467 | | } |
2468 | | } |
2469 | 0 | if let Some((start, end, trans)) = self.cur.take() { |
2470 | 0 | if trans.state_id() != DEAD { |
2471 | 0 | return Some((start, end, trans)); |
2472 | 0 | } |
2473 | 0 | } |
2474 | 0 | None |
2475 | 0 | } |
2476 | | } |
2477 | | |
2478 | | /// A cache represents mutable state that a one-pass [`DFA`] requires during a |
2479 | | /// search. |
2480 | | /// |
2481 | | /// For a given one-pass DFA, its corresponding cache may be created either via |
2482 | | /// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every |
2483 | | /// way, except the former does not require explicitly importing `Cache`. |
2484 | | /// |
2485 | | /// A particular `Cache` is coupled with the one-pass DFA from which it was |
2486 | | /// created. It may only be used with that one-pass DFA. A cache and its |
2487 | | /// allocations may be re-purposed via [`Cache::reset`], in which case, it can |
2488 | | /// only be used with the new one-pass DFA (and not the old one). |
2489 | | #[derive(Clone, Debug)] |
2490 | | pub struct Cache { |
2491 | | /// Scratch space used to store slots during a search. Basically, we use |
2492 | | /// the caller provided slots to store slots known when a match occurs. |
2493 | | /// But after a match occurs, we might continue a search but ultimately |
2494 | | /// fail to extend the match. When continuing the search, we need some |
2495 | | /// place to store candidate capture offsets without overwriting the slot |
2496 | | /// offsets recorded for the most recently seen match. |
2497 | | explicit_slots: Vec<Option<NonMaxUsize>>, |
2498 | | /// The number of slots in the caller-provided 'Captures' value for the |
2499 | | /// current search. This is always at most 'explicit_slots.len()', but |
2500 | | /// might be less than it, if the caller provided fewer slots to fill. |
2501 | | explicit_slot_len: usize, |
2502 | | } |
2503 | | |
2504 | | impl Cache { |
2505 | | /// Create a new [`onepass::DFA`](DFA) cache. |
2506 | | /// |
2507 | | /// A potentially more convenient routine to create a cache is |
2508 | | /// [`DFA::create_cache`], as it does not require also importing the |
2509 | | /// `Cache` type. |
2510 | | /// |
2511 | | /// If you want to reuse the returned `Cache` with some other one-pass DFA, |
2512 | | /// then you must call [`Cache::reset`] with the desired one-pass DFA. |
2513 | 0 | pub fn new(re: &DFA) -> Cache { |
2514 | 0 | let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 }; |
2515 | 0 | cache.reset(re); |
2516 | 0 | cache |
2517 | 0 | } |
2518 | | |
2519 | | /// Reset this cache such that it can be used for searching with a |
2520 | | /// different [`onepass::DFA`](DFA). |
2521 | | /// |
2522 | | /// A cache reset permits reusing memory already allocated in this cache |
2523 | | /// with a different one-pass DFA. |
2524 | | /// |
2525 | | /// # Example |
2526 | | /// |
2527 | | /// This shows how to re-purpose a cache for use with a different one-pass |
2528 | | /// DFA. |
2529 | | /// |
2530 | | /// ``` |
2531 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
2532 | | /// use regex_automata::{dfa::onepass::DFA, Match}; |
2533 | | /// |
2534 | | /// let re1 = DFA::new(r"\w")?; |
2535 | | /// let re2 = DFA::new(r"\W")?; |
2536 | | /// let mut caps1 = re1.create_captures(); |
2537 | | /// let mut caps2 = re2.create_captures(); |
2538 | | /// |
2539 | | /// let mut cache = re1.create_cache(); |
2540 | | /// assert_eq!( |
2541 | | /// Some(Match::must(0, 0..2)), |
2542 | | /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() }, |
2543 | | /// ); |
2544 | | /// |
2545 | | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
2546 | | /// // incorrect results. In order to re-purpose the cache, we must reset |
2547 | | /// // it with the one-pass DFA we'd like to use it with. |
2548 | | /// // |
2549 | | /// // Similarly, after this reset, using the cache with 're1' is also not |
2550 | | /// // allowed. |
2551 | | /// re2.reset_cache(&mut cache); |
2552 | | /// assert_eq!( |
2553 | | /// Some(Match::must(0, 0..3)), |
2554 | | /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() }, |
2555 | | /// ); |
2556 | | /// |
2557 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2558 | | /// ``` |
2559 | 0 | pub fn reset(&mut self, re: &DFA) { |
2560 | 0 | let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len(); |
2561 | 0 | self.explicit_slots.resize(explicit_slot_len, None); |
2562 | 0 | self.explicit_slot_len = explicit_slot_len; |
2563 | 0 | } |
2564 | | |
2565 | | /// Returns the heap memory usage, in bytes, of this cache. |
2566 | | /// |
2567 | | /// This does **not** include the stack size used up by this cache. To |
2568 | | /// compute that, use `std::mem::size_of::<Cache>()`. |
2569 | 0 | pub fn memory_usage(&self) -> usize { |
2570 | 0 | self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>() |
2571 | 0 | } |
2572 | | |
2573 | 0 | fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] { |
2574 | 0 | &mut self.explicit_slots[..self.explicit_slot_len] |
2575 | 0 | } |
2576 | | |
2577 | 0 | fn setup_search(&mut self, explicit_slot_len: usize) { |
2578 | 0 | self.explicit_slot_len = explicit_slot_len; |
2579 | 0 | } |
2580 | | } |
2581 | | |
2582 | | /// Represents a single transition in a one-pass DFA. |
2583 | | /// |
2584 | | /// The high 21 bits corresponds to the state ID. The bit following corresponds |
2585 | | /// to the special "match wins" flag. The remaining low 42 bits corresponds to |
2586 | | /// the transition epsilons, which contains the slots that should be saved when |
2587 | | /// this transition is followed and the conditional epsilon transitions that |
2588 | | /// must be satisfied in order to follow this transition. |
2589 | | #[derive(Clone, Copy, Eq, PartialEq)] |
2590 | | struct Transition(u64); |
2591 | | |
2592 | | impl Transition { |
2593 | | const STATE_ID_BITS: u64 = 21; |
2594 | | const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS; |
2595 | | const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS; |
2596 | | const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1); |
2597 | | const INFO_MASK: u64 = 0x000003FF_FFFFFFFF; |
2598 | | |
2599 | | /// Return a new transition to the given state ID with the given epsilons. |
2600 | 0 | fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition { |
2601 | 0 | let match_wins = |
2602 | 0 | if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 }; |
2603 | 0 | let sid = sid.as_u64() << Transition::STATE_ID_SHIFT; |
2604 | 0 | Transition(sid | match_wins | epsilons.0) |
2605 | 0 | } |
2606 | | |
2607 | | /// Returns true if and only if this transition points to the DEAD state. |
2608 | 0 | fn is_dead(self) -> bool { |
2609 | 0 | self.state_id() == DEAD |
2610 | 0 | } |
2611 | | |
2612 | | /// Return whether this transition has a "match wins" property. |
2613 | | /// |
2614 | | /// When a transition has this property, it means that if a match has been |
2615 | | /// found and the search uses leftmost-first semantics, then that match |
2616 | | /// should be returned immediately instead of continuing on. |
2617 | | /// |
2618 | | /// The "match wins" name comes from RE2, which uses a pretty much |
2619 | | /// identical mechanism for implementing leftmost-first semantics. |
2620 | 0 | fn match_wins(&self) -> bool { |
2621 | 0 | (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1 |
2622 | 0 | } |
2623 | | |
2624 | | /// Return the "next" state ID that this transition points to. |
2625 | 0 | fn state_id(&self) -> StateID { |
2626 | 0 | // OK because a Transition has a valid StateID in its upper bits by |
2627 | 0 | // construction. The cast to usize is also correct, even on 16-bit |
2628 | 0 | // targets because, again, we know the upper bits is a valid StateID, |
2629 | 0 | // which can never overflow usize on any supported target. |
2630 | 0 | StateID::new_unchecked( |
2631 | 0 | (self.0 >> Transition::STATE_ID_SHIFT).as_usize(), |
2632 | 0 | ) |
2633 | 0 | } |
2634 | | |
2635 | | /// Set the "next" state ID in this transition. |
2636 | 0 | fn set_state_id(&mut self, sid: StateID) { |
2637 | 0 | *self = Transition::new(self.match_wins(), sid, self.epsilons()); |
2638 | 0 | } |
2639 | | |
2640 | | /// Return the epsilons embedded in this transition. |
2641 | 0 | fn epsilons(&self) -> Epsilons { |
2642 | 0 | Epsilons(self.0 & Transition::INFO_MASK) |
2643 | 0 | } |
2644 | | } |
2645 | | |
2646 | | impl core::fmt::Debug for Transition { |
2647 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2648 | 0 | if self.is_dead() { |
2649 | 0 | return write!(f, "0"); |
2650 | 0 | } |
2651 | 0 | write!(f, "{}", self.state_id().as_usize())?; |
2652 | 0 | if self.match_wins() { |
2653 | 0 | write!(f, "-MW")?; |
2654 | 0 | } |
2655 | 0 | if !self.epsilons().is_empty() { |
2656 | 0 | write!(f, "-{:?}", self.epsilons())?; |
2657 | 0 | } |
2658 | 0 | Ok(()) |
2659 | 0 | } |
2660 | | } |
2661 | | |
2662 | | /// A representation of a match state's pattern ID along with the epsilons for |
2663 | | /// when a match occurs. |
2664 | | /// |
2665 | | /// A match state in a one-pass DFA, unlike in a more general DFA, has exactly |
2666 | | /// one pattern ID. If it had more, then the original NFA would not have been |
2667 | | /// one-pass. |
2668 | | /// |
2669 | | /// The "epsilons" part of this corresponds to what was found in the epsilon |
2670 | | /// transitions between the transition taken in the last byte of input and the |
2671 | | /// ultimate match state. This might include saving slots and/or conditional |
2672 | | /// epsilon transitions that must be satisfied before one can report the match. |
2673 | | /// |
2674 | | /// Technically, every state has room for a 'PatternEpsilons', but it is only |
2675 | | /// ever non-empty for match states. |
2676 | | #[derive(Clone, Copy)] |
2677 | | struct PatternEpsilons(u64); |
2678 | | |
2679 | | impl PatternEpsilons { |
2680 | | const PATTERN_ID_BITS: u64 = 22; |
2681 | | const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS; |
2682 | | // A sentinel value indicating that this is not a match state. We don't |
2683 | | // use 0 since 0 is a valid pattern ID. |
2684 | | const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF; |
2685 | | const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE; |
2686 | | const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000; |
2687 | | const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF; |
2688 | | |
2689 | | /// Return a new empty pattern epsilons that has no pattern ID and has no |
2690 | | /// epsilons. This is suitable for non-match states. |
2691 | 0 | fn empty() -> PatternEpsilons { |
2692 | 0 | PatternEpsilons( |
2693 | 0 | PatternEpsilons::PATTERN_ID_NONE |
2694 | 0 | << PatternEpsilons::PATTERN_ID_SHIFT, |
2695 | 0 | ) |
2696 | 0 | } |
2697 | | |
2698 | | /// Whether this pattern epsilons is empty or not. It's empty when it has |
2699 | | /// no pattern ID and an empty epsilons. |
2700 | 0 | fn is_empty(self) -> bool { |
2701 | 0 | self.pattern_id().is_none() && self.epsilons().is_empty() |
2702 | 0 | } |
2703 | | |
2704 | | /// Return the pattern ID in this pattern epsilons if one exists. |
2705 | 0 | fn pattern_id(self) -> Option<PatternID> { |
2706 | 0 | let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT; |
2707 | 0 | if pid == PatternEpsilons::PATTERN_ID_LIMIT { |
2708 | 0 | None |
2709 | | } else { |
2710 | 0 | Some(PatternID::new_unchecked(pid.as_usize())) |
2711 | | } |
2712 | 0 | } |
2713 | | |
2714 | | /// Returns the pattern ID without checking whether it's valid. If this is |
2715 | | /// called and there is no pattern ID in this `PatternEpsilons`, then this |
2716 | | /// will likely produce an incorrect result or possibly even a panic or |
2717 | | /// an overflow. But safety will not be violated. |
2718 | | /// |
2719 | | /// This is useful when you know a particular state is a match state. If |
2720 | | /// it's a match state, then it must have a pattern ID. |
2721 | 0 | fn pattern_id_unchecked(self) -> PatternID { |
2722 | 0 | let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT; |
2723 | 0 | PatternID::new_unchecked(pid.as_usize()) |
2724 | 0 | } |
2725 | | |
2726 | | /// Return a new pattern epsilons with the given pattern ID, but the same |
2727 | | /// epsilons. |
2728 | 0 | fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons { |
2729 | 0 | PatternEpsilons( |
2730 | 0 | (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT) |
2731 | 0 | | (self.0 & PatternEpsilons::EPSILONS_MASK), |
2732 | 0 | ) |
2733 | 0 | } |
2734 | | |
2735 | | /// Return the epsilons part of this pattern epsilons. |
2736 | 0 | fn epsilons(self) -> Epsilons { |
2737 | 0 | Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK) |
2738 | 0 | } |
2739 | | |
2740 | | /// Return a new pattern epsilons with the given epsilons, but the same |
2741 | | /// pattern ID. |
2742 | 0 | fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { |
2743 | 0 | PatternEpsilons( |
2744 | 0 | (self.0 & PatternEpsilons::PATTERN_ID_MASK) |
2745 | 0 | | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK), |
2746 | 0 | ) |
2747 | 0 | } |
2748 | | } |
2749 | | |
2750 | | impl core::fmt::Debug for PatternEpsilons { |
2751 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2752 | 0 | if self.is_empty() { |
2753 | 0 | return write!(f, "N/A"); |
2754 | 0 | } |
2755 | 0 | if let Some(pid) = self.pattern_id() { |
2756 | 0 | write!(f, "{}", pid.as_usize())?; |
2757 | 0 | } |
2758 | 0 | if !self.epsilons().is_empty() { |
2759 | 0 | if self.pattern_id().is_some() { |
2760 | 0 | write!(f, "/")?; |
2761 | 0 | } |
2762 | 0 | write!(f, "{:?}", self.epsilons())?; |
2763 | 0 | } |
2764 | 0 | Ok(()) |
2765 | 0 | } |
2766 | | } |
2767 | | |
2768 | | /// Epsilons represents all of the NFA epsilons transitions that went into a |
2769 | | /// single transition in a single DFA state. In this case, it only represents |
2770 | | /// the epsilon transitions that have some kind of non-consuming side effect: |
2771 | | /// either the transition requires storing the current position of the search |
2772 | | /// into a slot, or the transition is conditional and requires the current |
2773 | | /// position in the input to satisfy an assertion before the transition may be |
2774 | | /// taken. |
2775 | | /// |
2776 | | /// This folds the cumulative effect of a group of NFA states (all connected |
2777 | | /// by epsilon transitions) down into a single set of bits. While these bits |
2778 | | /// can represent all possible conditional epsilon transitions, it only permits |
2779 | | /// storing up to a somewhat small number of slots. |
2780 | | /// |
2781 | | /// Epsilons is represented as a 42-bit integer. For example, it is packed into |
2782 | | /// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a |
2783 | | /// `StateID` and a special "match wins" property.) |
2784 | | #[derive(Clone, Copy)] |
2785 | | struct Epsilons(u64); |
2786 | | |
2787 | | impl Epsilons { |
2788 | | const SLOT_MASK: u64 = 0x000003FF_FFFFFC00; |
2789 | | const SLOT_SHIFT: u64 = 10; |
2790 | | const LOOK_MASK: u64 = 0x00000000_000003FF; |
2791 | | |
2792 | | /// Create a new empty epsilons. It has no slots and no assertions that |
2793 | | /// need to be satisfied. |
2794 | 0 | fn empty() -> Epsilons { |
2795 | 0 | Epsilons(0) |
2796 | 0 | } |
2797 | | |
2798 | | /// Returns true if this epsilons contains no slots and no assertions. |
2799 | 0 | fn is_empty(self) -> bool { |
2800 | 0 | self.0 == 0 |
2801 | 0 | } |
2802 | | |
2803 | | /// Returns the slot epsilon transitions. |
2804 | 0 | fn slots(self) -> Slots { |
2805 | 0 | Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32()) |
2806 | 0 | } |
2807 | | |
2808 | | /// Set the slot epsilon transitions. |
2809 | 0 | fn set_slots(self, slots: Slots) -> Epsilons { |
2810 | 0 | Epsilons( |
2811 | 0 | (u64::from(slots.0) << Epsilons::SLOT_SHIFT) |
2812 | 0 | | (self.0 & Epsilons::LOOK_MASK), |
2813 | 0 | ) |
2814 | 0 | } |
2815 | | |
2816 | | /// Return the set of look-around assertions in these epsilon transitions. |
2817 | 0 | fn looks(self) -> LookSet { |
2818 | 0 | LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } |
2819 | 0 | } |
2820 | | |
2821 | | /// Set the look-around assertions on these epsilon transitions. |
2822 | 0 | fn set_looks(self, look_set: LookSet) -> Epsilons { |
2823 | 0 | Epsilons( |
2824 | 0 | (self.0 & Epsilons::SLOT_MASK) |
2825 | 0 | | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), |
2826 | 0 | ) |
2827 | 0 | } |
2828 | | } |
2829 | | |
2830 | | impl core::fmt::Debug for Epsilons { |
2831 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2832 | 0 | let mut wrote = false; |
2833 | 0 | if !self.slots().is_empty() { |
2834 | 0 | write!(f, "{:?}", self.slots())?; |
2835 | 0 | wrote = true; |
2836 | 0 | } |
2837 | 0 | if !self.looks().is_empty() { |
2838 | 0 | if wrote { |
2839 | 0 | write!(f, "/")?; |
2840 | 0 | } |
2841 | 0 | write!(f, "{:?}", self.looks())?; |
2842 | 0 | wrote = true; |
2843 | 0 | } |
2844 | 0 | if !wrote { |
2845 | 0 | write!(f, "N/A")?; |
2846 | 0 | } |
2847 | 0 | Ok(()) |
2848 | 0 | } |
2849 | | } |
2850 | | |
2851 | | /// The set of epsilon transitions indicating that the current position in a |
2852 | | /// search should be saved to a slot. |
2853 | | /// |
2854 | | /// This *only* represents explicit slots. So for example, the pattern |
2855 | | /// `[a-z]+([0-9]+)([a-z]+)` has: |
2856 | | /// |
2857 | | /// * 3 capturing groups, thus 6 slots. |
2858 | | /// * 1 implicit capturing group, thus 2 implicit slots. |
2859 | | /// * 2 explicit capturing groups, thus 4 explicit slots. |
2860 | | /// |
2861 | | /// While implicit slots are represented by epsilon transitions in an NFA, we |
2862 | | /// do not explicitly represent them here. Instead, implicit slots are assumed |
2863 | | /// to be present and handled automatically in the search code. Therefore, |
2864 | | /// that means we only need to represent explicit slots in our epsilon |
2865 | | /// transitions. |
2866 | | /// |
2867 | | /// Its representation is a bit set. The bit 'i' is set if and only if there |
2868 | | /// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That |
2869 | | /// is, the bit 'i' corresponds to the first explicit slot and the first |
2870 | | /// explicit slot appears immediately following the last implicit slot. (If |
2871 | | /// this is confusing, see `GroupInfo` for more details on how slots works.) |
2872 | | /// |
2873 | | /// A single `Slots` represents all the active slots in a sub-graph of an NFA, |
2874 | | /// where all the states are connected by epsilon transitions. In effect, when |
2875 | | /// traversing the one-pass DFA during a search, all slots set in a particular |
2876 | | /// transition must be captured by recording the current search position. |
2877 | | /// |
2878 | | /// The API of `Slots` requires the caller to handle the explicit slot offset. |
2879 | | /// That is, a `Slots` doesn't know where the explicit slots start for a |
2880 | | /// particular NFA. Thus, if the callers see's the bit 'i' is set, then they |
2881 | | /// need to do the arithmetic above to find 'c', which is the real actual slot |
2882 | | /// index in the corresponding NFA. |
2883 | | #[derive(Clone, Copy)] |
2884 | | struct Slots(u32); |
2885 | | |
2886 | | impl Slots { |
2887 | | const LIMIT: usize = 32; |
2888 | | |
2889 | | /// Insert the slot at the given bit index. |
2890 | 0 | fn insert(self, slot: usize) -> Slots { |
2891 | 0 | debug_assert!(slot < Slots::LIMIT); |
2892 | 0 | Slots(self.0 | (1 << slot.as_u32())) |
2893 | 0 | } |
2894 | | |
2895 | | /// Remove the slot at the given bit index. |
2896 | 0 | fn remove(self, slot: usize) -> Slots { |
2897 | 0 | debug_assert!(slot < Slots::LIMIT); |
2898 | 0 | Slots(self.0 & !(1 << slot.as_u32())) |
2899 | 0 | } |
2900 | | |
2901 | | /// Returns true if and only if this set contains no slots. |
2902 | 0 | fn is_empty(self) -> bool { |
2903 | 0 | self.0 == 0 |
2904 | 0 | } |
2905 | | |
2906 | | /// Returns an iterator over all of the set bits in this set. |
2907 | 0 | fn iter(self) -> SlotsIter { |
2908 | 0 | SlotsIter { slots: self } |
2909 | 0 | } |
2910 | | |
2911 | | /// For the position `at` in the current haystack, copy it to |
2912 | | /// `caller_explicit_slots` for all slots that are in this set. |
2913 | | /// |
2914 | | /// Callers may pass a slice of any length. Slots in this set bigger than |
2915 | | /// the length of the given explicit slots are simply skipped. |
2916 | | /// |
2917 | | /// The slice *must* correspond only to the explicit slots and the first |
2918 | | /// element of the slice must always correspond to the first explicit slot |
2919 | | /// in the corresponding NFA. |
2920 | 0 | fn apply( |
2921 | 0 | self, |
2922 | 0 | at: usize, |
2923 | 0 | caller_explicit_slots: &mut [Option<NonMaxUsize>], |
2924 | 0 | ) { |
2925 | 0 | if self.is_empty() { |
2926 | 0 | return; |
2927 | 0 | } |
2928 | 0 | let at = NonMaxUsize::new(at); |
2929 | 0 | for slot in self.iter() { |
2930 | 0 | if slot >= caller_explicit_slots.len() { |
2931 | 0 | break; |
2932 | 0 | } |
2933 | 0 | caller_explicit_slots[slot] = at; |
2934 | | } |
2935 | 0 | } |
2936 | | } |
2937 | | |
2938 | | impl core::fmt::Debug for Slots { |
2939 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2940 | 0 | write!(f, "S")?; |
2941 | 0 | for slot in self.iter() { |
2942 | 0 | write!(f, "-{:?}", slot)?; |
2943 | | } |
2944 | 0 | Ok(()) |
2945 | 0 | } |
2946 | | } |
2947 | | |
2948 | | /// An iterator over all of the bits set in a slot set. |
2949 | | /// |
2950 | | /// This returns the bit index that is set, so callers may need to offset it |
2951 | | /// to get the actual NFA slot index. |
2952 | | #[derive(Debug)] |
2953 | | struct SlotsIter { |
2954 | | slots: Slots, |
2955 | | } |
2956 | | |
2957 | | impl Iterator for SlotsIter { |
2958 | | type Item = usize; |
2959 | | |
2960 | 0 | fn next(&mut self) -> Option<usize> { |
2961 | 0 | // Number of zeroes here is always <= u8::MAX, and so fits in a usize. |
2962 | 0 | let slot = self.slots.0.trailing_zeros().as_usize(); |
2963 | 0 | if slot >= Slots::LIMIT { |
2964 | 0 | return None; |
2965 | 0 | } |
2966 | 0 | self.slots = self.slots.remove(slot); |
2967 | 0 | Some(slot) |
2968 | 0 | } |
2969 | | } |
2970 | | |
2971 | | /// An error that occurred during the construction of a one-pass DFA. |
2972 | | /// |
2973 | | /// This error does not provide many introspection capabilities. There are |
2974 | | /// generally only two things you can do with it: |
2975 | | /// |
2976 | | /// * Obtain a human readable message via its `std::fmt::Display` impl. |
2977 | | /// * Access an underlying [`thompson::BuildError`] type from its `source` |
2978 | | /// method via the `std::error::Error` trait. This error only occurs when using |
2979 | | /// convenience routines for building a one-pass DFA directly from a pattern |
2980 | | /// string. |
2981 | | /// |
2982 | | /// When the `std` feature is enabled, this implements the `std::error::Error` |
2983 | | /// trait. |
2984 | | #[derive(Clone, Debug)] |
2985 | | pub struct BuildError { |
2986 | | kind: BuildErrorKind, |
2987 | | } |
2988 | | |
2989 | | /// The kind of error that occurred during the construction of a one-pass DFA. |
2990 | | #[derive(Clone, Debug)] |
2991 | | enum BuildErrorKind { |
2992 | | NFA(crate::nfa::thompson::BuildError), |
2993 | | Word(UnicodeWordBoundaryError), |
2994 | | TooManyStates { limit: u64 }, |
2995 | | TooManyPatterns { limit: u64 }, |
2996 | | UnsupportedLook { look: Look }, |
2997 | | ExceededSizeLimit { limit: usize }, |
2998 | | NotOnePass { msg: &'static str }, |
2999 | | } |
3000 | | |
3001 | | impl BuildError { |
3002 | 0 | fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError { |
3003 | 0 | BuildError { kind: BuildErrorKind::NFA(err) } |
3004 | 0 | } |
3005 | | |
3006 | 0 | fn word(err: UnicodeWordBoundaryError) -> BuildError { |
3007 | 0 | BuildError { kind: BuildErrorKind::Word(err) } |
3008 | 0 | } |
3009 | | |
3010 | 0 | fn too_many_states(limit: u64) -> BuildError { |
3011 | 0 | BuildError { kind: BuildErrorKind::TooManyStates { limit } } |
3012 | 0 | } |
3013 | | |
3014 | 0 | fn too_many_patterns(limit: u64) -> BuildError { |
3015 | 0 | BuildError { kind: BuildErrorKind::TooManyPatterns { limit } } |
3016 | 0 | } |
3017 | | |
3018 | 0 | fn unsupported_look(look: Look) -> BuildError { |
3019 | 0 | BuildError { kind: BuildErrorKind::UnsupportedLook { look } } |
3020 | 0 | } |
3021 | | |
3022 | 0 | fn exceeded_size_limit(limit: usize) -> BuildError { |
3023 | 0 | BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } |
3024 | 0 | } |
3025 | | |
3026 | 0 | fn not_one_pass(msg: &'static str) -> BuildError { |
3027 | 0 | BuildError { kind: BuildErrorKind::NotOnePass { msg } } |
3028 | 0 | } |
3029 | | } |
3030 | | |
3031 | | #[cfg(feature = "std")] |
3032 | | impl std::error::Error for BuildError { |
3033 | 0 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { |
3034 | | use self::BuildErrorKind::*; |
3035 | | |
3036 | 0 | match self.kind { |
3037 | 0 | NFA(ref err) => Some(err), |
3038 | 0 | Word(ref err) => Some(err), |
3039 | 0 | _ => None, |
3040 | | } |
3041 | 0 | } |
3042 | | } |
3043 | | |
3044 | | impl core::fmt::Display for BuildError { |
3045 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
3046 | | use self::BuildErrorKind::*; |
3047 | | |
3048 | 0 | match self.kind { |
3049 | 0 | NFA(_) => write!(f, "error building NFA"), |
3050 | 0 | Word(_) => write!(f, "NFA contains Unicode word boundary"), |
3051 | 0 | TooManyStates { limit } => write!( |
3052 | 0 | f, |
3053 | 0 | "one-pass DFA exceeded a limit of {:?} for number of states", |
3054 | 0 | limit, |
3055 | 0 | ), |
3056 | 0 | TooManyPatterns { limit } => write!( |
3057 | 0 | f, |
3058 | 0 | "one-pass DFA exceeded a limit of {:?} for number of patterns", |
3059 | 0 | limit, |
3060 | 0 | ), |
3061 | 0 | UnsupportedLook { look } => write!( |
3062 | 0 | f, |
3063 | 0 | "one-pass DFA does not support the {:?} assertion", |
3064 | 0 | look, |
3065 | 0 | ), |
3066 | 0 | ExceededSizeLimit { limit } => write!( |
3067 | 0 | f, |
3068 | 0 | "one-pass DFA exceeded size limit of {:?} during building", |
3069 | 0 | limit, |
3070 | 0 | ), |
3071 | 0 | NotOnePass { msg } => write!( |
3072 | 0 | f, |
3073 | 0 | "one-pass DFA could not be built because \ |
3074 | 0 | pattern is not one-pass: {}", |
3075 | 0 | msg, |
3076 | 0 | ), |
3077 | | } |
3078 | 0 | } |
3079 | | } |
3080 | | |
3081 | | #[cfg(all(test, feature = "syntax"))] |
3082 | | mod tests { |
3083 | | use alloc::string::ToString; |
3084 | | |
3085 | | use super::*; |
3086 | | |
3087 | | #[test] |
3088 | | fn fail_conflicting_transition() { |
3089 | | let predicate = |err: &str| err.contains("conflicting transition"); |
3090 | | |
3091 | | let err = DFA::new(r"a*[ab]").unwrap_err().to_string(); |
3092 | | assert!(predicate(&err), "{}", err); |
3093 | | } |
3094 | | |
3095 | | #[test] |
3096 | | fn fail_multiple_epsilon() { |
3097 | | let predicate = |err: &str| { |
3098 | | err.contains("multiple epsilon transitions to same state") |
3099 | | }; |
3100 | | |
3101 | | let err = DFA::new(r"(^|$)a").unwrap_err().to_string(); |
3102 | | assert!(predicate(&err), "{}", err); |
3103 | | } |
3104 | | |
3105 | | #[test] |
3106 | | fn fail_multiple_match() { |
3107 | | let predicate = |err: &str| { |
3108 | | err.contains("multiple epsilon transitions to match state") |
3109 | | }; |
3110 | | |
3111 | | let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string(); |
3112 | | assert!(predicate(&err), "{}", err); |
3113 | | } |
3114 | | |
3115 | | // This test is meant to build a one-pass regex with the maximum number of |
3116 | | // possible slots. |
3117 | | // |
3118 | | // NOTE: Remember that the slot limit only applies to explicit capturing |
3119 | | // groups. Any number of implicit capturing groups is supported (up to the |
3120 | | // maximum number of supported patterns), since implicit groups are handled |
3121 | | // by the search loop itself. |
3122 | | #[test] |
3123 | | fn max_slots() { |
3124 | | // One too many... |
3125 | | let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)"; |
3126 | | assert!(DFA::new(pat).is_err()); |
3127 | | // Just right. |
3128 | | let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)"; |
3129 | | assert!(DFA::new(pat).is_ok()); |
3130 | | } |
3131 | | |
3132 | | // This test ensures that the one-pass DFA works with all look-around |
3133 | | // assertions that we expect it to work with. |
3134 | | // |
3135 | | // The utility of this test is that each one-pass transition has a small |
3136 | | // amount of space to store look-around assertions. Currently, there is |
3137 | | // logic in the one-pass constructor to ensure there aren't more than ten |
3138 | | // possible assertions. And indeed, there are only ten possible assertions |
3139 | | // (at time of writing), so this is okay. But conceivably, more assertions |
3140 | | // could be added. So we check that things at least work with what we |
3141 | | // expect them to work with. |
3142 | | #[test] |
3143 | | fn assertions() { |
3144 | | // haystack anchors |
3145 | | assert!(DFA::new(r"^").is_ok()); |
3146 | | assert!(DFA::new(r"$").is_ok()); |
3147 | | |
3148 | | // line anchors |
3149 | | assert!(DFA::new(r"(?m)^").is_ok()); |
3150 | | assert!(DFA::new(r"(?m)$").is_ok()); |
3151 | | assert!(DFA::new(r"(?Rm)^").is_ok()); |
3152 | | assert!(DFA::new(r"(?Rm)$").is_ok()); |
3153 | | |
3154 | | // word boundaries |
3155 | | if cfg!(feature = "unicode-word-boundary") { |
3156 | | assert!(DFA::new(r"\b").is_ok()); |
3157 | | assert!(DFA::new(r"\B").is_ok()); |
3158 | | } |
3159 | | assert!(DFA::new(r"(?-u)\b").is_ok()); |
3160 | | assert!(DFA::new(r"(?-u)\B").is_ok()); |
3161 | | } |
3162 | | |
3163 | | #[cfg(not(miri))] // takes too long on miri |
3164 | | #[test] |
3165 | | fn is_one_pass() { |
3166 | | use crate::util::syntax; |
3167 | | |
3168 | | assert!(DFA::new(r"a*b").is_ok()); |
3169 | | if cfg!(feature = "unicode-perl") { |
3170 | | assert!(DFA::new(r"\w").is_ok()); |
3171 | | } |
3172 | | assert!(DFA::new(r"(?-u)\w*\s").is_ok()); |
3173 | | assert!(DFA::new(r"(?s:.)*?").is_ok()); |
3174 | | assert!(DFA::builder() |
3175 | | .syntax(syntax::Config::new().utf8(false)) |
3176 | | .build(r"(?s-u:.)*?") |
3177 | | .is_ok()); |
3178 | | } |
3179 | | |
3180 | | #[test] |
3181 | | fn is_not_one_pass() { |
3182 | | assert!(DFA::new(r"a*a").is_err()); |
3183 | | assert!(DFA::new(r"(?s-u:.)*?").is_err()); |
3184 | | assert!(DFA::new(r"(?s:.)*?a").is_err()); |
3185 | | } |
3186 | | |
3187 | | #[cfg(not(miri))] |
3188 | | #[test] |
3189 | | fn is_not_one_pass_bigger() { |
3190 | | assert!(DFA::new(r"\w*\s").is_err()); |
3191 | | } |
3192 | | } |