/src/regex/regex-automata/src/nfa/thompson/compiler.rs
Line | Count | Source |
1 | | use core::{borrow::Borrow, cell::RefCell}; |
2 | | |
3 | | use alloc::{sync::Arc, vec, vec::Vec}; |
4 | | |
5 | | use regex_syntax::{ |
6 | | hir::{self, Hir}, |
7 | | utf8::{Utf8Range, Utf8Sequences}, |
8 | | ParserBuilder, |
9 | | }; |
10 | | |
11 | | use crate::{ |
12 | | nfa::thompson::{ |
13 | | builder::Builder, |
14 | | error::BuildError, |
15 | | literal_trie::LiteralTrie, |
16 | | map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}, |
17 | | nfa::{Transition, NFA}, |
18 | | range_trie::RangeTrie, |
19 | | }, |
20 | | util::{ |
21 | | look::{Look, LookMatcher}, |
22 | | primitives::{PatternID, StateID}, |
23 | | }, |
24 | | }; |
25 | | |
26 | | /// The configuration used for a Thompson NFA compiler. |
27 | | #[derive(Clone, Debug, Default)] |
28 | | pub struct Config { |
29 | | utf8: Option<bool>, |
30 | | reverse: Option<bool>, |
31 | | nfa_size_limit: Option<Option<usize>>, |
32 | | shrink: Option<bool>, |
33 | | which_captures: Option<WhichCaptures>, |
34 | | look_matcher: Option<LookMatcher>, |
35 | | #[cfg(test)] |
36 | | unanchored_prefix: Option<bool>, |
37 | | } |
38 | | |
39 | | impl Config { |
40 | | /// Return a new default Thompson NFA compiler configuration. |
41 | 72.7k | pub fn new() -> Config { |
42 | 72.7k | Config::default() |
43 | 72.7k | } |
44 | | |
45 | | /// Whether to enable UTF-8 mode during search or not. |
46 | | /// |
47 | | /// A regex engine is said to be in UTF-8 mode when it guarantees that |
48 | | /// all matches returned by it have spans consisting of only valid UTF-8. |
49 | | /// That is, it is impossible for a match span to be returned that |
50 | | /// contains any invalid UTF-8. |
51 | | /// |
52 | | /// UTF-8 mode generally consists of two things: |
53 | | /// |
54 | | /// 1. Whether the NFA's states are constructed such that all paths to a |
55 | | /// match state that consume at least one byte always correspond to valid |
56 | | /// UTF-8. |
57 | | /// 2. Whether all paths to a match state that do _not_ consume any bytes |
58 | | /// should always correspond to valid UTF-8 boundaries. |
59 | | /// |
60 | | /// (1) is a guarantee made by whoever constructs the NFA. |
61 | | /// If you're parsing a regex from its concrete syntax, then |
62 | | /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make |
63 | | /// this guarantee for you. It does it by returning an error if the regex |
64 | | /// pattern could every report a non-empty match span that contains invalid |
65 | | /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex |
66 | | /// successfully parses, then you're guaranteed that the corresponding NFA |
67 | | /// will only ever report non-empty match spans containing valid UTF-8. |
68 | | /// |
69 | | /// (2) is a trickier guarantee because it cannot be enforced by the NFA |
70 | | /// state graph itself. Consider, for example, the regex `a*`. It matches |
71 | | /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
72 | | /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint, |
73 | | /// and thus correspond to invalid UTF-8 boundaries. Therefore, this |
74 | | /// guarantee must be made at a higher level than the NFA state graph |
75 | | /// itself. This crate deals with this case in each regex engine. Namely, |
76 | | /// when a zero-width match that splits a codepoint is found and UTF-8 |
77 | | /// mode enabled, then it is ignored and the engine moves on looking for |
78 | | /// the next match. |
79 | | /// |
80 | | /// Thus, UTF-8 mode is both a promise that the NFA built only reports |
81 | | /// non-empty matches that are valid UTF-8, and an *instruction* to regex |
82 | | /// engines that empty matches that split codepoints should be banned. |
83 | | /// |
84 | | /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans, |
85 | | /// it only makes sense to enable this option when you *know* your haystack |
86 | | /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and |
87 | | /// searching a haystack that contains invalid UTF-8 leads to **unspecified |
88 | | /// behavior**. |
89 | | /// |
90 | | /// Therefore, it may make sense to enable `syntax::Config::utf8` while |
91 | | /// simultaneously *disabling* this option. That would ensure all non-empty |
92 | | /// match spans are valid UTF-8, but that empty match spans may still split |
93 | | /// a codepoint or match at other places that aren't valid UTF-8. |
94 | | /// |
95 | | /// In general, this mode is only relevant if your regex can match the |
96 | | /// empty string. Most regexes don't. |
97 | | /// |
98 | | /// This is enabled by default. |
99 | | /// |
100 | | /// # Example |
101 | | /// |
102 | | /// This example shows how UTF-8 mode can impact the match spans that may |
103 | | /// be reported in certain cases. |
104 | | /// |
105 | | /// ``` |
106 | | /// use regex_automata::{ |
107 | | /// nfa::thompson::{self, pikevm::PikeVM}, |
108 | | /// Match, Input, |
109 | | /// }; |
110 | | /// |
111 | | /// let re = PikeVM::new("")?; |
112 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
113 | | /// |
114 | | /// // UTF-8 mode is enabled by default. |
115 | | /// let mut input = Input::new("☃");
116 | | /// re.search(&mut cache, &input, &mut caps); |
117 | | /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); |
118 | | /// |
119 | | /// // Even though an empty regex matches at 1..1, our next match is |
120 | | /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is |
121 | | /// // three bytes long). |
122 | | /// input.set_start(1); |
123 | | /// re.search(&mut cache, &input, &mut caps); |
124 | | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
125 | | /// |
126 | | /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: |
127 | | /// let re = PikeVM::builder() |
128 | | /// .thompson(thompson::Config::new().utf8(false)) |
129 | | /// .build("")?; |
130 | | /// re.search(&mut cache, &input, &mut caps); |
131 | | /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); |
132 | | /// |
133 | | /// input.set_start(2); |
134 | | /// re.search(&mut cache, &input, &mut caps); |
135 | | /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); |
136 | | /// |
137 | | /// input.set_start(3); |
138 | | /// re.search(&mut cache, &input, &mut caps); |
139 | | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
140 | | /// |
141 | | /// input.set_start(4); |
142 | | /// re.search(&mut cache, &input, &mut caps); |
143 | | /// assert_eq!(None, caps.get_match()); |
144 | | /// |
145 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
146 | | /// ``` |
147 | 72.7k | pub fn utf8(mut self, yes: bool) -> Config { |
148 | 72.7k | self.utf8 = Some(yes); |
149 | 72.7k | self |
150 | 72.7k | } |
151 | | |
152 | | /// Reverse the NFA. |
153 | | /// |
154 | | /// A NFA reversal is performed by reversing all of the concatenated |
155 | | /// sub-expressions in the original pattern, recursively. (Look around |
156 | | /// operators are also inverted.) The resulting NFA can be used to match |
157 | | /// the pattern starting from the end of a string instead of the beginning |
158 | | /// of a string. |
159 | | /// |
160 | | /// Reversing the NFA is useful for building a reverse DFA, which is most |
161 | | /// useful for finding the start of a match after its ending position has |
162 | | /// been found. NFA execution engines typically do not work on reverse |
163 | | /// NFAs. For example, currently, the Pike VM reports the starting location |
164 | | /// of matches without a reverse NFA. |
165 | | /// |
166 | | /// Currently, enabling this setting requires disabling the |
167 | | /// [`captures`](Config::captures) setting. If both are enabled, then the |
168 | | /// compiler will return an error. It is expected that this limitation will |
169 | | /// be lifted in the future. |
170 | | /// |
171 | | /// This is disabled by default. |
172 | | /// |
173 | | /// # Example |
174 | | /// |
175 | | /// This example shows how to build a DFA from a reverse NFA, and then use |
176 | | /// the DFA to search backwards. |
177 | | /// |
178 | | /// ``` |
179 | | /// use regex_automata::{ |
180 | | /// dfa::{self, Automaton}, |
181 | | /// nfa::thompson::{NFA, WhichCaptures}, |
182 | | /// HalfMatch, Input, |
183 | | /// }; |
184 | | /// |
185 | | /// let dfa = dfa::dense::Builder::new() |
186 | | /// .thompson(NFA::config() |
187 | | /// .which_captures(WhichCaptures::None) |
188 | | /// .reverse(true) |
189 | | /// ) |
190 | | /// .build("baz[0-9]+")?; |
191 | | /// let expected = Some(HalfMatch::must(0, 3)); |
192 | | /// assert_eq!( |
193 | | /// expected, |
194 | | /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?, |
195 | | /// ); |
196 | | /// |
197 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
198 | | /// ``` |
199 | 70.7k | pub fn reverse(mut self, yes: bool) -> Config { |
200 | 70.7k | self.reverse = Some(yes); |
201 | 70.7k | self |
202 | 70.7k | } |
203 | | |
204 | | /// Sets an approximate size limit on the total heap used by the NFA being |
205 | | /// compiled. |
206 | | /// |
207 | | /// This permits imposing constraints on the size of a compiled NFA. This |
208 | | /// may be useful in contexts where the regex pattern is untrusted and one |
209 | | /// wants to avoid using too much memory. |
210 | | /// |
211 | | /// This size limit does not apply to auxiliary heap used during |
212 | | /// compilation that is not part of the built NFA. |
213 | | /// |
214 | | /// Note that this size limit is applied during compilation in order for |
215 | | /// the limit to prevent too much heap from being used. However, the |
216 | | /// implementation may use an intermediate NFA representation that is |
217 | | /// otherwise slightly bigger than the final public form. Since the size |
218 | | /// limit may be applied to an intermediate representation, there is not |
219 | | /// necessarily a precise correspondence between the configured size limit |
220 | | /// and the heap usage of the final NFA. |
221 | | /// |
222 | | /// There is no size limit by default. |
223 | | /// |
224 | | /// # Example |
225 | | /// |
226 | | /// This example demonstrates how Unicode mode can greatly increase the |
227 | | /// size of the NFA. |
228 | | /// |
229 | | /// ``` |
230 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
231 | | /// use regex_automata::nfa::thompson::NFA; |
232 | | /// |
233 | | /// // 300KB isn't enough! |
234 | | /// NFA::compiler() |
235 | | /// .configure(NFA::config().nfa_size_limit(Some(300_000))) |
236 | | /// .build(r"\w{20}") |
237 | | /// .unwrap_err(); |
238 | | /// |
239 | | /// // ... but 500KB probably is. |
240 | | /// let nfa = NFA::compiler() |
241 | | /// .configure(NFA::config().nfa_size_limit(Some(500_000))) |
242 | | /// .build(r"\w{20}")?; |
243 | | /// |
244 | | /// assert_eq!(nfa.pattern_len(), 1); |
245 | | /// |
246 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
247 | | /// ``` |
248 | 72.7k | pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config { |
249 | 72.7k | self.nfa_size_limit = Some(bytes); |
250 | 72.7k | self |
251 | 72.7k | } |
252 | | |
253 | | /// Apply best effort heuristics to shrink the NFA at the expense of more |
254 | | /// time/memory. |
255 | | /// |
256 | | /// Generally speaking, if one is using an NFA to compile a DFA, then the |
257 | | /// extra time used to shrink the NFA will be more than made up for during |
258 | | /// DFA construction (potentially by a lot). In other words, enabling this |
259 | | /// can substantially decrease the overall amount of time it takes to build |
260 | | /// a DFA. |
261 | | /// |
262 | | /// A reason to keep this disabled is if you want to compile an NFA and |
263 | | /// start using it as quickly as possible without needing to build a DFA, |
264 | | /// and you don't mind using a bit of extra memory for the NFA. e.g., for |
265 | | /// an NFA simulation or for a lazy DFA. |
266 | | /// |
267 | | /// NFA shrinking is currently most useful when compiling a reverse |
268 | | /// NFA with large Unicode character classes. In particular, it trades |
269 | | /// additional CPU time during NFA compilation in favor of generating fewer |
270 | | /// NFA states. |
271 | | /// |
272 | | /// This is disabled by default because it can increase compile times |
273 | | /// quite a bit if you aren't building a full DFA. |
274 | | /// |
275 | | /// # Example |
276 | | /// |
277 | | /// This example shows that NFA shrinking can lead to substantial space |
278 | | /// savings in some cases. Notice that, as noted above, we build a reverse |
279 | | /// DFA and use a pattern with a large Unicode character class. |
280 | | /// |
281 | | /// ``` |
282 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
283 | | /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; |
284 | | /// |
285 | | /// // Currently we have to disable captures when enabling reverse NFA. |
286 | | /// let config = NFA::config() |
287 | | /// .which_captures(WhichCaptures::None) |
288 | | /// .reverse(true); |
289 | | /// let not_shrunk = NFA::compiler() |
290 | | /// .configure(config.clone().shrink(false)) |
291 | | /// .build(r"\w")?; |
292 | | /// let shrunk = NFA::compiler() |
293 | | /// .configure(config.clone().shrink(true)) |
294 | | /// .build(r"\w")?; |
295 | | /// |
296 | | /// // While a specific shrink factor is not guaranteed, the savings can be |
297 | | /// // considerable in some cases. |
298 | | /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len()); |
299 | | /// |
300 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
301 | | /// ``` |
302 | 72.7k | pub fn shrink(mut self, yes: bool) -> Config { |
303 | 72.7k | self.shrink = Some(yes); |
304 | 72.7k | self |
305 | 72.7k | } |
306 | | |
307 | | /// Whether to include 'Capture' states in the NFA. |
308 | | /// |
309 | | /// Currently, enabling this setting requires disabling the |
310 | | /// [`reverse`](Config::reverse) setting. If both are enabled, then the |
311 | | /// compiler will return an error. It is expected that this limitation will |
312 | | /// be lifted in the future. |
313 | | /// |
314 | | /// This is enabled by default. |
315 | | /// |
316 | | /// # Example |
317 | | /// |
318 | | /// This example demonstrates that some regex engines, like the Pike VM, |
319 | | /// require capturing states to be present in the NFA to report match |
320 | | /// offsets. |
321 | | /// |
322 | | /// (Note that since this method is deprecated, the example below uses |
323 | | /// [`Config::which_captures`] to disable capture states.) |
324 | | /// |
325 | | /// ``` |
326 | | /// use regex_automata::nfa::thompson::{ |
327 | | /// pikevm::PikeVM, |
328 | | /// NFA, |
329 | | /// WhichCaptures, |
330 | | /// }; |
331 | | /// |
332 | | /// let re = PikeVM::builder() |
333 | | /// .thompson(NFA::config().which_captures(WhichCaptures::None)) |
334 | | /// .build(r"[a-z]+")?; |
335 | | /// let mut cache = re.create_cache(); |
336 | | /// |
337 | | /// assert!(re.is_match(&mut cache, "abc")); |
338 | | /// assert_eq!(None, re.find(&mut cache, "abc")); |
339 | | /// |
340 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
341 | | /// ``` |
342 | | #[deprecated(since = "0.3.5", note = "use which_captures instead")] |
343 | 0 | pub fn captures(self, yes: bool) -> Config { |
344 | 0 | self.which_captures(if yes { |
345 | 0 | WhichCaptures::All |
346 | | } else { |
347 | 0 | WhichCaptures::None |
348 | | }) |
349 | 0 | } |
350 | | |
351 | | /// Configures what kinds of capture groups are compiled into |
352 | | /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a |
353 | | /// Thompson NFA. |
354 | | /// |
355 | | /// Currently, using any option except for [`WhichCaptures::None`] requires |
356 | | /// disabling the [`reverse`](Config::reverse) setting. If both are |
357 | | /// enabled, then the compiler will return an error. It is expected that |
358 | | /// this limitation will be lifted in the future. |
359 | | /// |
360 | | /// This is set to [`WhichCaptures::All`] by default. Callers may wish to |
361 | | /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
362 | | /// overhead of capture states for explicit groups. Usually this occurs |
363 | | /// when one wants to use the `PikeVM` only for determining the overall |
364 | | /// match. Otherwise, the `PikeVM` could use much more memory than is |
365 | | /// necessary. |
366 | | /// |
367 | | /// # Example |
368 | | /// |
369 | | /// This example demonstrates that some regex engines, like the Pike VM, |
370 | | /// require capturing states to be present in the NFA to report match |
371 | | /// offsets. |
372 | | /// |
373 | | /// ``` |
374 | | /// use regex_automata::nfa::thompson::{ |
375 | | /// pikevm::PikeVM, |
376 | | /// NFA, |
377 | | /// WhichCaptures, |
378 | | /// }; |
379 | | /// |
380 | | /// let re = PikeVM::builder() |
381 | | /// .thompson(NFA::config().which_captures(WhichCaptures::None)) |
382 | | /// .build(r"[a-z]+")?; |
383 | | /// let mut cache = re.create_cache(); |
384 | | /// |
385 | | /// assert!(re.is_match(&mut cache, "abc")); |
386 | | /// assert_eq!(None, re.find(&mut cache, "abc")); |
387 | | /// |
388 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
389 | | /// ``` |
390 | | /// |
391 | | /// The same applies to the bounded backtracker: |
392 | | /// |
393 | | /// ``` |
394 | | /// use regex_automata::nfa::thompson::{ |
395 | | /// backtrack::BoundedBacktracker, |
396 | | /// NFA, |
397 | | /// WhichCaptures, |
398 | | /// }; |
399 | | /// |
400 | | /// let re = BoundedBacktracker::builder() |
401 | | /// .thompson(NFA::config().which_captures(WhichCaptures::None)) |
402 | | /// .build(r"[a-z]+")?; |
403 | | /// let mut cache = re.create_cache(); |
404 | | /// |
405 | | /// assert!(re.try_is_match(&mut cache, "abc")?); |
406 | | /// assert_eq!(None, re.try_find(&mut cache, "abc")?); |
407 | | /// |
408 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
409 | | /// ``` |
410 | 138k | pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { |
411 | 138k | self.which_captures = Some(which_captures); |
412 | 138k | self |
413 | 138k | } |
414 | | |
415 | | /// Sets the look-around matcher that should be used with this NFA. |
416 | | /// |
417 | | /// A look-around matcher determines how to match look-around assertions. |
418 | | /// In particular, some assertions are configurable. For example, the |
419 | | /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed |
420 | | /// from the default of `\n` to any other byte. |
421 | | /// |
422 | | /// # Example |
423 | | /// |
424 | | /// This shows how to change the line terminator for multi-line assertions. |
425 | | /// |
426 | | /// ``` |
427 | | /// use regex_automata::{ |
428 | | /// nfa::thompson::{self, pikevm::PikeVM}, |
429 | | /// util::look::LookMatcher, |
430 | | /// Match, Input, |
431 | | /// }; |
432 | | /// |
433 | | /// let mut lookm = LookMatcher::new(); |
434 | | /// lookm.set_line_terminator(b'\x00'); |
435 | | /// |
436 | | /// let re = PikeVM::builder() |
437 | | /// .thompson(thompson::Config::new().look_matcher(lookm)) |
438 | | /// .build(r"(?m)^[a-z]+$")?; |
439 | | /// let mut cache = re.create_cache(); |
440 | | /// |
441 | | /// // Multi-line assertions now use NUL as a terminator. |
442 | | /// assert_eq!( |
443 | | /// Some(Match::must(0, 1..4)), |
444 | | /// re.find(&mut cache, b"\x00abc\x00"), |
445 | | /// ); |
446 | | /// // ... and \n is no longer recognized as a terminator. |
447 | | /// assert_eq!( |
448 | | /// None, |
449 | | /// re.find(&mut cache, b"\nabc\n"), |
450 | | /// ); |
451 | | /// |
452 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
453 | | /// ``` |
454 | 72.7k | pub fn look_matcher(mut self, m: LookMatcher) -> Config { |
455 | 72.7k | self.look_matcher = Some(m); |
456 | 72.7k | self |
457 | 72.7k | } |
458 | | |
459 | | /// Whether to compile an unanchored prefix into this NFA. |
460 | | /// |
461 | | /// This is enabled by default. It is made available for tests only to make |
462 | | /// it easier to unit test the output of the compiler. |
463 | | #[cfg(test)] |
464 | | fn unanchored_prefix(mut self, yes: bool) -> Config { |
465 | | self.unanchored_prefix = Some(yes); |
466 | | self |
467 | | } |
468 | | |
469 | | /// Returns whether this configuration has enabled UTF-8 mode. |
470 | 138k | pub fn get_utf8(&self) -> bool { |
471 | 138k | self.utf8.unwrap_or(true) |
472 | 138k | } |
473 | | |
474 | | /// Returns whether this configuration has enabled reverse NFA compilation. |
475 | 189M | pub fn get_reverse(&self) -> bool { |
476 | 189M | self.reverse.unwrap_or(false) |
477 | 189M | } |
478 | | |
479 | | /// Return the configured NFA size limit, if it exists, in the number of |
480 | | /// bytes of heap used. |
481 | 138k | pub fn get_nfa_size_limit(&self) -> Option<usize> { |
482 | 138k | self.nfa_size_limit.unwrap_or(None) |
483 | 138k | } |
484 | | |
485 | | /// Return whether NFA shrinking is enabled. |
486 | 1.13M | pub fn get_shrink(&self) -> bool { |
487 | 1.13M | self.shrink.unwrap_or(false) |
488 | 1.13M | } |
489 | | |
490 | | /// Return whether NFA compilation is configured to produce capture states. |
491 | | #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] |
492 | 0 | pub fn get_captures(&self) -> bool { |
493 | 0 | self.get_which_captures().is_any() |
494 | 0 | } |
495 | | |
496 | | /// Return what kinds of capture states will be compiled into an NFA. |
497 | 4.99M | pub fn get_which_captures(&self) -> WhichCaptures { |
498 | 4.99M | self.which_captures.unwrap_or(WhichCaptures::All) |
499 | 4.99M | } |
500 | | |
501 | | /// Return the look-around matcher for this NFA. |
502 | 138k | pub fn get_look_matcher(&self) -> LookMatcher { |
503 | 138k | self.look_matcher.clone().unwrap_or(LookMatcher::default()) |
504 | 138k | } |
505 | | |
506 | | /// Return whether NFA compilation is configured to include an unanchored |
507 | | /// prefix. |
508 | | /// |
509 | | /// This is always false when not in test mode. |
510 | 138k | fn get_unanchored_prefix(&self) -> bool { |
511 | | #[cfg(test)] |
512 | | { |
513 | | self.unanchored_prefix.unwrap_or(true) |
514 | | } |
515 | | #[cfg(not(test))] |
516 | | { |
517 | 138k | true |
518 | | } |
519 | 138k | } |
520 | | |
521 | | /// Overwrite the default configuration such that the options in `o` are |
522 | | /// always used. If an option in `o` is not set, then the corresponding |
523 | | /// option in `self` is used. If it's not set in `self` either, then it |
524 | | /// remains not set. |
525 | 138k | pub(crate) fn overwrite(&self, o: Config) -> Config { |
526 | | Config { |
527 | 138k | utf8: o.utf8.or(self.utf8), |
528 | 138k | reverse: o.reverse.or(self.reverse), |
529 | 138k | nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), |
530 | 138k | shrink: o.shrink.or(self.shrink), |
531 | 138k | which_captures: o.which_captures.or(self.which_captures), |
532 | 138k | look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), |
533 | | #[cfg(test)] |
534 | | unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), |
535 | | } |
536 | 138k | } |
537 | | } |
538 | | |
539 | | /// A configuration indicating which kinds of |
540 | | /// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. |
541 | | /// |
542 | | /// This configuration can be used with [`Config::which_captures`] to control |
543 | | /// which capture states are compiled into a Thompson NFA. |
544 | | /// |
545 | | /// The default configuration is [`WhichCaptures::All`]. |
546 | | #[derive(Clone, Copy, Debug)] |
547 | | pub enum WhichCaptures { |
548 | | /// All capture states, including those corresponding to both implicit and |
549 | | /// explicit capture groups, are included in the Thompson NFA. |
550 | | All, |
551 | | /// Only capture states corresponding to implicit capture groups are |
552 | | /// included. Implicit capture groups appear in every pattern implicitly |
553 | | /// and correspond to the overall match of a pattern. |
554 | | /// |
555 | | /// This is useful when one only cares about the overall match of a |
556 | | /// pattern. By excluding capture states from explicit capture groups, |
557 | | /// one might be able to reduce the memory usage of a multi-pattern regex |
558 | | /// substantially if it was otherwise written to have many explicit capture |
559 | | /// groups. |
560 | | Implicit, |
561 | | /// No capture states are compiled into the Thompson NFA. |
562 | | /// |
563 | | /// This is useful when capture states are either not needed (for example, |
564 | | /// if one is only trying to build a DFA) or if they aren't supported (for |
565 | | /// example, a reverse NFA). |
566 | | /// |
567 | | /// # Warning |
568 | | /// |
569 | | /// Callers must be exceedingly careful when using this |
570 | | /// option. In particular, not all regex engines support |
571 | | /// reporting match spans when using this option (for example, |
572 | | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) or |
573 | | /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)). |
574 | | /// |
575 | | /// Perhaps more confusingly, using this option with such an |
576 | | /// engine means that an `is_match` routine could report `true` |
577 | | /// when `find` reports `None`. This is generally not something |
578 | | /// that _should_ happen, but the low level control provided by |
579 | | /// this crate makes it possible. |
580 | | /// |
581 | | /// Similarly, any regex engines (like [`meta::Regex`](crate::meta::Regex)) |
582 | | /// should always return `None` from `find` routines when this option is |
583 | | /// used, even if _some_ of its internal engines could find the match |
584 | | /// boundaries. This is because inputs from user data could influence |
585 | | /// engine selection, and thus influence whether a match is found or not. |
586 | | /// Indeed, `meta::Regex::find` will always return `None` when configured |
587 | | /// with this option. |
588 | | None, |
589 | | } |
590 | | |
591 | | impl Default for WhichCaptures { |
592 | 0 | fn default() -> WhichCaptures { |
593 | 0 | WhichCaptures::All |
594 | 0 | } |
595 | | } |
596 | | |
597 | | impl WhichCaptures { |
598 | | /// Returns true if this configuration indicates that no capture states |
599 | | /// should be produced in an NFA. |
600 | 70.7k | pub fn is_none(&self) -> bool { |
601 | 70.7k | matches!(*self, WhichCaptures::None) |
602 | 70.7k | } |
603 | | |
604 | | /// Returns true if this configuration indicates that some capture states |
605 | | /// should be added to an NFA. Note that this might only include capture |
606 | | /// states for implicit capture groups. |
607 | 70.7k | pub fn is_any(&self) -> bool { |
608 | 70.7k | !self.is_none() |
609 | 70.7k | } |
610 | | } |
611 | | |
612 | | /* |
613 | | This compiler below uses Thompson's construction algorithm. The compiler takes |
614 | | a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph |
615 | | is structured in a way that permits it to be executed by a virtual machine and |
616 | | also used to efficiently build a DFA. |
617 | | |
618 | | The compiler deals with a slightly expanded set of NFA states than what is |
619 | | in a final NFA (as exhibited by builder::State and nfa::State). Notably a |
620 | | compiler state includes an empty node that has exactly one unconditional |
621 | | epsilon transition to the next state. In other words, it's a "goto" instruction |
622 | | if one views Thompson's NFA as a set of bytecode instructions. These goto |
623 | | instructions are removed in a subsequent phase before returning the NFA to the |
624 | | caller. The purpose of these empty nodes is that they make the construction |
625 | | algorithm substantially simpler to implement. We remove them before returning |
626 | | to the caller because they can represent substantial overhead when traversing |
627 | | the NFA graph (either while searching using the NFA directly or while building |
628 | | a DFA). |
629 | | |
630 | | In the future, it would be nice to provide a Glushkov compiler as well, as it |
631 | | would work well as a bit-parallel NFA for smaller regexes. But the Thompson |
632 | | construction is one I'm more familiar with and seems more straight-forward to |
633 | | deal with when it comes to large Unicode character classes. |
634 | | |
635 | | Internally, the compiler uses interior mutability to improve composition in the |
636 | | face of the borrow checker. In particular, we'd really like to be able to write |
637 | | things like this: |
638 | | |
639 | | self.c_concat(exprs.iter().map(|e| self.c(e))) |
640 | | |
641 | | Which elegantly uses iterators to build up a sequence of compiled regex |
642 | | sub-expressions and then hands it off to the concatenating compiler routine. |
643 | | Without interior mutability, the borrow checker won't let us borrow `self` |
644 | | mutably both inside and outside the closure at the same time. |
645 | | */ |
646 | | |
647 | | /// A builder for compiling an NFA from a regex's high-level intermediate |
648 | | /// representation (HIR). |
649 | | /// |
650 | | /// This compiler provides a way to translate a parsed regex pattern into an |
651 | | /// NFA state graph. The NFA state graph can either be used directly to execute |
652 | | /// a search (e.g., with a Pike VM), or it can be further used to build a DFA. |
653 | | /// |
654 | | /// This compiler provides APIs both for compiling regex patterns directly from |
655 | | /// their concrete syntax, or via a [`regex_syntax::hir::Hir`]. |
656 | | /// |
657 | | /// This compiler has various options that may be configured via |
658 | | /// [`thompson::Config`](Config). |
659 | | /// |
660 | | /// Note that a compiler is not the same as a [`thompson::Builder`](Builder). |
661 | | /// A `Builder` provides a lower level API that is uncoupled from a regex |
662 | | /// pattern's concrete syntax or even its HIR. Instead, it permits stitching |
663 | | /// together an NFA by hand. See its docs for examples. |
664 | | /// |
665 | | /// # Example: compilation from concrete syntax |
666 | | /// |
667 | | /// This shows how to compile an NFA from a pattern string while setting a size |
668 | | /// limit on how big the NFA is allowed to be (in terms of bytes of heap used). |
669 | | /// |
670 | | /// ``` |
671 | | /// use regex_automata::{ |
672 | | /// nfa::thompson::{NFA, pikevm::PikeVM}, |
673 | | /// Match, |
674 | | /// }; |
675 | | /// |
676 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
677 | | /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; |
678 | | /// |
679 | | /// let re = PikeVM::new_from_nfa(nfa)?; |
680 | | /// let mut cache = re.create_cache(); |
681 | | /// let mut caps = re.create_captures(); |
682 | | /// let expected = Some(Match::must(0, 3..4)); |
683 | | /// re.captures(&mut cache, "!@#A#@!", &mut caps); |
684 | | /// assert_eq!(expected, caps.get_match()); |
685 | | /// |
686 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
687 | | /// ``` |
688 | | /// |
689 | | /// # Example: compilation from HIR |
690 | | /// |
691 | | /// This shows how to hand assemble a regular expression via its HIR, and then |
692 | | /// compile an NFA directly from it. |
693 | | /// |
694 | | /// ``` |
695 | | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
696 | | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
697 | | /// |
698 | | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
699 | | /// ClassBytesRange::new(b'0', b'9'), |
700 | | /// ClassBytesRange::new(b'A', b'Z'), |
701 | | /// ClassBytesRange::new(b'_', b'_'), |
702 | | /// ClassBytesRange::new(b'a', b'z'), |
703 | | /// ]))); |
704 | | /// |
705 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
706 | | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
707 | | /// |
708 | | /// let re = PikeVM::new_from_nfa(nfa)?; |
709 | | /// let mut cache = re.create_cache(); |
710 | | /// let mut caps = re.create_captures(); |
711 | | /// let expected = Some(Match::must(0, 3..4)); |
712 | | /// re.captures(&mut cache, "!@#A#@!", &mut caps); |
713 | | /// assert_eq!(expected, caps.get_match()); |
714 | | /// |
715 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
716 | | /// ``` |
717 | | #[derive(Clone, Debug)] |
718 | | pub struct Compiler { |
719 | | /// A regex parser, used when compiling an NFA directly from a pattern |
720 | | /// string. |
721 | | parser: ParserBuilder, |
722 | | /// The compiler configuration. |
723 | | config: Config, |
724 | | /// The builder for actually constructing an NFA. This provides a |
725 | | /// convenient abstraction for writing a compiler. |
726 | | builder: RefCell<Builder>, |
727 | | /// State used for compiling character classes to UTF-8 byte automata. |
728 | | /// State is not retained between character class compilations. This just |
729 | | /// serves to amortize allocation to the extent possible. |
730 | | utf8_state: RefCell<Utf8State>, |
731 | | /// State used for arranging character classes in reverse into a trie. |
732 | | trie_state: RefCell<RangeTrie>, |
733 | | /// State used for caching common suffixes when compiling reverse UTF-8 |
734 | | /// automata (for Unicode character classes). |
735 | | utf8_suffix: RefCell<Utf8SuffixMap>, |
736 | | } |
737 | | |
738 | | impl Compiler { |
739 | | /// Create a new NFA builder with its default configuration. |
740 | 506k | pub fn new() -> Compiler { |
741 | 506k | Compiler { |
742 | 506k | parser: ParserBuilder::new(), |
743 | 506k | config: Config::default(), |
744 | 506k | builder: RefCell::new(Builder::new()), |
745 | 506k | utf8_state: RefCell::new(Utf8State::new()), |
746 | 506k | trie_state: RefCell::new(RangeTrie::new()), |
747 | 506k | utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), |
748 | 506k | } |
749 | 506k | } |
750 | | |
751 | | /// Compile the given regular expression pattern into an NFA. |
752 | | /// |
753 | | /// If there was a problem parsing the regex, then that error is returned. |
754 | | /// |
755 | | /// Otherwise, if there was a problem building the NFA, then an error is |
756 | | /// returned. The only error that can occur is if the compiled regex would |
757 | | /// exceed the size limits configured on this builder, or if any part of |
758 | | /// the NFA would exceed the integer representations used. (For example, |
759 | | /// too many states might plausibly occur on a 16-bit target.) |
760 | | /// |
761 | | /// # Example |
762 | | /// |
763 | | /// ``` |
764 | | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
765 | | /// |
766 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
767 | | /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; |
768 | | /// |
769 | | /// let re = PikeVM::new_from_nfa(nfa)?; |
770 | | /// let mut cache = re.create_cache(); |
771 | | /// let mut caps = re.create_captures(); |
772 | | /// let expected = Some(Match::must(0, 3..4)); |
773 | | /// re.captures(&mut cache, "!@#A#@!", &mut caps); |
774 | | /// assert_eq!(expected, caps.get_match()); |
775 | | /// |
776 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
777 | | /// ``` |
778 | 0 | pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> { |
779 | 0 | self.build_many(&[pattern]) |
780 | 0 | } |
781 | | |
782 | | /// Compile the given regular expression patterns into a single NFA. |
783 | | /// |
784 | | /// When matches are returned, the pattern ID corresponds to the index of |
785 | | /// the pattern in the slice given. |
786 | | /// |
787 | | /// # Example |
788 | | /// |
789 | | /// ``` |
790 | | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
791 | | /// |
792 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
793 | | /// let nfa = NFA::compiler().configure(config).build_many(&[ |
794 | | /// r"(?-u)\s", |
795 | | /// r"(?-u)\w", |
796 | | /// ])?; |
797 | | /// |
798 | | /// let re = PikeVM::new_from_nfa(nfa)?; |
799 | | /// let mut cache = re.create_cache(); |
800 | | /// let mut caps = re.create_captures(); |
801 | | /// let expected = Some(Match::must(1, 1..2)); |
802 | | /// re.captures(&mut cache, "!A! !A!", &mut caps); |
803 | | /// assert_eq!(expected, caps.get_match()); |
804 | | /// |
805 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
806 | | /// ``` |
807 | 0 | pub fn build_many<P: AsRef<str>>( |
808 | 0 | &self, |
809 | 0 | patterns: &[P], |
810 | 0 | ) -> Result<NFA, BuildError> { |
811 | 0 | let mut hirs = vec![]; |
812 | 0 | for p in patterns { |
813 | 0 | hirs.push( |
814 | 0 | self.parser |
815 | 0 | .build() |
816 | 0 | .parse(p.as_ref()) |
817 | 0 | .map_err(BuildError::syntax)?, |
818 | | ); |
819 | 0 | debug!("parsed: {:?}", p.as_ref()); |
820 | | } |
821 | 0 | self.build_many_from_hir(&hirs) |
822 | 0 | } |
823 | | |
824 | | /// Compile the given high level intermediate representation of a regular |
825 | | /// expression into an NFA. |
826 | | /// |
827 | | /// If there was a problem building the NFA, then an error is returned. The |
828 | | /// only error that can occur is if the compiled regex would exceed the |
829 | | /// size limits configured on this builder, or if any part of the NFA would |
830 | | /// exceed the integer representations used. (For example, too many states |
831 | | /// might plausibly occur on a 16-bit target.) |
832 | | /// |
833 | | /// # Example |
834 | | /// |
835 | | /// ``` |
836 | | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
837 | | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
838 | | /// |
839 | | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
840 | | /// ClassBytesRange::new(b'0', b'9'), |
841 | | /// ClassBytesRange::new(b'A', b'Z'), |
842 | | /// ClassBytesRange::new(b'_', b'_'), |
843 | | /// ClassBytesRange::new(b'a', b'z'), |
844 | | /// ]))); |
845 | | /// |
846 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
847 | | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
848 | | /// |
849 | | /// let re = PikeVM::new_from_nfa(nfa)?; |
850 | | /// let mut cache = re.create_cache(); |
851 | | /// let mut caps = re.create_captures(); |
852 | | /// let expected = Some(Match::must(0, 3..4)); |
853 | | /// re.captures(&mut cache, "!@#A#@!", &mut caps); |
854 | | /// assert_eq!(expected, caps.get_match()); |
855 | | /// |
856 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
857 | | /// ``` |
858 | 4.80k | pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> { |
859 | 4.80k | self.build_many_from_hir(&[expr]) |
860 | 4.80k | } |
861 | | |
862 | | /// Compile the given high level intermediate representations of regular |
863 | | /// expressions into a single NFA. |
864 | | /// |
865 | | /// When matches are returned, the pattern ID corresponds to the index of |
866 | | /// the pattern in the slice given. |
867 | | /// |
868 | | /// # Example |
869 | | /// |
870 | | /// ``` |
871 | | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
872 | | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
873 | | /// |
874 | | /// let hirs = &[ |
875 | | /// Hir::class(Class::Bytes(ClassBytes::new(vec![ |
876 | | /// ClassBytesRange::new(b'\t', b'\r'), |
877 | | /// ClassBytesRange::new(b' ', b' '), |
878 | | /// ]))), |
879 | | /// Hir::class(Class::Bytes(ClassBytes::new(vec![ |
880 | | /// ClassBytesRange::new(b'0', b'9'), |
881 | | /// ClassBytesRange::new(b'A', b'Z'), |
882 | | /// ClassBytesRange::new(b'_', b'_'), |
883 | | /// ClassBytesRange::new(b'a', b'z'), |
884 | | /// ]))), |
885 | | /// ]; |
886 | | /// |
887 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
888 | | /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?; |
889 | | /// |
890 | | /// let re = PikeVM::new_from_nfa(nfa)?; |
891 | | /// let mut cache = re.create_cache(); |
892 | | /// let mut caps = re.create_captures(); |
893 | | /// let expected = Some(Match::must(1, 1..2)); |
894 | | /// re.captures(&mut cache, "!A! !A!", &mut caps); |
895 | | /// assert_eq!(expected, caps.get_match()); |
896 | | /// |
897 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
898 | | /// ``` |
899 | 138k | pub fn build_many_from_hir<H: Borrow<Hir>>( |
900 | 138k | &self, |
901 | 138k | exprs: &[H], |
902 | 138k | ) -> Result<NFA, BuildError> { |
903 | 138k | self.compile(exprs) |
904 | 138k | } Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::build_many_from_hir::<regex_syntax::hir::Hir> <regex_automata::nfa::thompson::compiler::Compiler>::build_many_from_hir::<®ex_syntax::hir::Hir> Line | Count | Source | 899 | 138k | pub fn build_many_from_hir<H: Borrow<Hir>>( | 900 | 138k | &self, | 901 | 138k | exprs: &[H], | 902 | 138k | ) -> Result<NFA, BuildError> { | 903 | 138k | self.compile(exprs) | 904 | 138k | } |
|
905 | | |
906 | | /// Apply the given NFA configuration options to this builder. |
907 | | /// |
908 | | /// # Example |
909 | | /// |
910 | | /// ``` |
911 | | /// use regex_automata::nfa::thompson::NFA; |
912 | | /// |
913 | | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
914 | | /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; |
915 | | /// assert_eq!(nfa.pattern_len(), 1); |
916 | | /// |
917 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
918 | | /// ``` |
919 | 138k | pub fn configure(&mut self, config: Config) -> &mut Compiler { |
920 | 138k | self.config = self.config.overwrite(config); |
921 | 138k | self |
922 | 138k | } |
923 | | |
924 | | /// Set the syntax configuration for this builder using |
925 | | /// [`syntax::Config`](crate::util::syntax::Config). |
926 | | /// |
927 | | /// This permits setting things like case insensitivity, Unicode and multi |
928 | | /// line mode. |
929 | | /// |
930 | | /// This syntax configuration only applies when an NFA is built directly |
931 | | /// from a pattern string. If an NFA is built from an HIR, then all syntax |
932 | | /// settings are ignored. |
933 | | /// |
934 | | /// # Example |
935 | | /// |
936 | | /// ``` |
937 | | /// use regex_automata::{nfa::thompson::NFA, util::syntax}; |
938 | | /// |
939 | | /// let syntax_config = syntax::Config::new().unicode(false); |
940 | | /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?; |
941 | | /// // If Unicode were enabled, the number of states would be much bigger. |
942 | | /// assert!(nfa.states().len() < 15); |
943 | | /// |
944 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
945 | | /// ``` |
946 | 0 | pub fn syntax( |
947 | 0 | &mut self, |
948 | 0 | config: crate::util::syntax::Config, |
949 | 0 | ) -> &mut Compiler { |
950 | 0 | config.apply(&mut self.parser); |
951 | 0 | self |
952 | 0 | } |
953 | | } |
954 | | |
955 | | impl Compiler { |
956 | | /// Compile the sequence of HIR expressions given. Pattern IDs are |
957 | | /// allocated starting from 0, in correspondence with the slice given. |
958 | | /// |
959 | | /// It is legal to provide an empty slice. In that case, the NFA returned |
960 | | /// has no patterns and will never match anything. |
961 | 138k | fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> { |
962 | 138k | if exprs.len() > PatternID::LIMIT { |
963 | 0 | return Err(BuildError::too_many_patterns(exprs.len())); |
964 | 138k | } |
965 | 138k | if self.config.get_reverse() |
966 | 70.7k | && self.config.get_which_captures().is_any() |
967 | | { |
968 | 0 | return Err(BuildError::unsupported_captures()); |
969 | 138k | } |
970 | | |
971 | 138k | self.builder.borrow_mut().clear(); |
972 | 138k | self.builder.borrow_mut().set_utf8(self.config.get_utf8()); |
973 | 138k | self.builder.borrow_mut().set_reverse(self.config.get_reverse()); |
974 | 138k | self.builder |
975 | 138k | .borrow_mut() |
976 | 138k | .set_look_matcher(self.config.get_look_matcher()); |
977 | 138k | self.builder |
978 | 138k | .borrow_mut() |
979 | 138k | .set_size_limit(self.config.get_nfa_size_limit())?; |
980 | | |
981 | | // We always add an unanchored prefix unless we were specifically told |
982 | | // not to (for tests only), or if we know that the regex is anchored |
983 | | // for all matches. When an unanchored prefix is not added, then the |
984 | | // NFA's anchored and unanchored start states are equivalent. |
985 | 138k | let all_anchored = exprs.iter().all(|e| { |
986 | 138k | let props = e.borrow().properties(); |
987 | 138k | if self.config.get_reverse() { |
988 | 70.7k | props.look_set_suffix().contains(hir::Look::End) |
989 | | } else { |
990 | 67.9k | props.look_set_prefix().contains(hir::Look::Start) |
991 | | } |
992 | 138k | }); Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>::{closure#0}<regex_automata::nfa::thompson::compiler::Compiler>::compile::<®ex_syntax::hir::Hir>::{closure#0}Line | Count | Source | 985 | 138k | let all_anchored = exprs.iter().all(|e| { | 986 | 138k | let props = e.borrow().properties(); | 987 | 138k | if self.config.get_reverse() { | 988 | 70.7k | props.look_set_suffix().contains(hir::Look::End) | 989 | | } else { | 990 | 67.9k | props.look_set_prefix().contains(hir::Look::Start) | 991 | | } | 992 | 138k | }); |
|
993 | 138k | let anchored = !self.config.get_unanchored_prefix() || all_anchored; |
994 | 138k | let unanchored_prefix = if anchored { |
995 | 3.36k | self.c_empty()? |
996 | | } else { |
997 | 135k | self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)? |
998 | | }; |
999 | | |
1000 | 138k | let compiled = self.c_alt_iter(exprs.iter().map(|e| { |
1001 | 138k | let _ = self.start_pattern()?; |
1002 | 138k | let one = self.c_cap(0, None, e.borrow())?; |
1003 | 136k | let match_state_id = self.add_match()?; |
1004 | 136k | self.patch(one.end, match_state_id)?; |
1005 | 136k | let _ = self.finish_pattern(one.start)?; |
1006 | 136k | Ok(ThompsonRef { start: one.start, end: match_state_id }) |
1007 | 138k | }))?; Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>::{closure#1}<regex_automata::nfa::thompson::compiler::Compiler>::compile::<®ex_syntax::hir::Hir>::{closure#1}Line | Count | Source | 1000 | 138k | let compiled = self.c_alt_iter(exprs.iter().map(|e| { | 1001 | 138k | let _ = self.start_pattern()?; | 1002 | 138k | let one = self.c_cap(0, None, e.borrow())?; | 1003 | 136k | let match_state_id = self.add_match()?; | 1004 | 136k | self.patch(one.end, match_state_id)?; | 1005 | 136k | let _ = self.finish_pattern(one.start)?; | 1006 | 136k | Ok(ThompsonRef { start: one.start, end: match_state_id }) | 1007 | 138k | }))?; |
|
1008 | 136k | self.patch(unanchored_prefix.end, compiled.start)?; |
1009 | 136k | let nfa = self |
1010 | 136k | .builder |
1011 | 136k | .borrow_mut() |
1012 | 136k | .build(compiled.start, unanchored_prefix.start)?; |
1013 | | |
1014 | 136k | debug!("HIR-to-NFA compilation complete, config: {:?}", self.config); |
1015 | 136k | Ok(nfa) |
1016 | 138k | } Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir> <regex_automata::nfa::thompson::compiler::Compiler>::compile::<®ex_syntax::hir::Hir> Line | Count | Source | 961 | 138k | fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> { | 962 | 138k | if exprs.len() > PatternID::LIMIT { | 963 | 0 | return Err(BuildError::too_many_patterns(exprs.len())); | 964 | 138k | } | 965 | 138k | if self.config.get_reverse() | 966 | 70.7k | && self.config.get_which_captures().is_any() | 967 | | { | 968 | 0 | return Err(BuildError::unsupported_captures()); | 969 | 138k | } | 970 | | | 971 | 138k | self.builder.borrow_mut().clear(); | 972 | 138k | self.builder.borrow_mut().set_utf8(self.config.get_utf8()); | 973 | 138k | self.builder.borrow_mut().set_reverse(self.config.get_reverse()); | 974 | 138k | self.builder | 975 | 138k | .borrow_mut() | 976 | 138k | .set_look_matcher(self.config.get_look_matcher()); | 977 | 138k | self.builder | 978 | 138k | .borrow_mut() | 979 | 138k | .set_size_limit(self.config.get_nfa_size_limit())?; | 980 | | | 981 | | // We always add an unanchored prefix unless we were specifically told | 982 | | // not to (for tests only), or if we know that the regex is anchored | 983 | | // for all matches. When an unanchored prefix is not added, then the | 984 | | // NFA's anchored and unanchored start states are equivalent. | 985 | 138k | let all_anchored = exprs.iter().all(|e| { | 986 | | let props = e.borrow().properties(); | 987 | | if self.config.get_reverse() { | 988 | | props.look_set_suffix().contains(hir::Look::End) | 989 | | } else { | 990 | | props.look_set_prefix().contains(hir::Look::Start) | 991 | | } | 992 | | }); | 993 | 138k | let anchored = !self.config.get_unanchored_prefix() || all_anchored; | 994 | 138k | let unanchored_prefix = if anchored { | 995 | 3.36k | self.c_empty()? 
| 996 | | } else { | 997 | 135k | self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)? | 998 | | }; | 999 | | | 1000 | 138k | let compiled = self.c_alt_iter(exprs.iter().map(|e| { | 1001 | | let _ = self.start_pattern()?; | 1002 | | let one = self.c_cap(0, None, e.borrow())?; | 1003 | | let match_state_id = self.add_match()?; | 1004 | | self.patch(one.end, match_state_id)?; | 1005 | | let _ = self.finish_pattern(one.start)?; | 1006 | | Ok(ThompsonRef { start: one.start, end: match_state_id }) | 1007 | 2.13k | }))?; | 1008 | 136k | self.patch(unanchored_prefix.end, compiled.start)?; | 1009 | 136k | let nfa = self | 1010 | 136k | .builder | 1011 | 136k | .borrow_mut() | 1012 | 136k | .build(compiled.start, unanchored_prefix.start)?; | 1013 | | | 1014 | 136k | debug!("HIR-to-NFA compilation complete, config: {:?}", self.config); | 1015 | 136k | Ok(nfa) | 1016 | 138k | } |
|
1017 | | |
1018 | | /// Compile an arbitrary HIR expression. |
1019 | 82.4M | fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> { |
1020 | | use regex_syntax::hir::{Class, HirKind::*}; |
1021 | | |
1022 | 82.4M | match *expr.kind() { |
1023 | 1.09M | Empty => self.c_empty(), |
1024 | 53.8M | Literal(hir::Literal(ref bytes)) => self.c_literal(bytes), |
1025 | 3.90M | Class(Class::Bytes(ref c)) => self.c_byte_class(c), |
1026 | 5.46M | Class(Class::Unicode(ref c)) => self.c_unicode_class(c), |
1027 | 5.76M | Look(ref look) => self.c_look(look), |
1028 | 3.49M | Repetition(ref rep) => self.c_repetition(rep), |
1029 | 4.78M | Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), |
1030 | 4.30M | Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), |
1031 | 3.10M | Alternation(ref es) => self.c_alt_slice(es), |
1032 | | } |
1033 | 82.4M | } |
1034 | | |
1035 | | /// Compile a concatenation of the sub-expressions yielded by the given |
1036 | | /// iterator. If the iterator yields no elements, then this compiles down |
1037 | | /// to an "empty" state that always matches. |
1038 | | /// |
1039 | | /// If the compiler is in reverse mode, then the expressions given are |
1040 | | /// automatically compiled in reverse. |
1041 | 55.0M | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> |
1042 | 55.0M | where |
1043 | 55.0M | I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, |
1044 | | { |
1045 | 55.0M | let first = if self.is_reverse() { it.next_back() } else { it.next() }; |
1046 | 55.0M | let ThompsonRef { start, mut end } = match first { |
1047 | 55.0M | Some(result) => result?, |
1048 | 9.94k | None => return self.c_empty(), |
1049 | | }; |
1050 | | loop { |
1051 | 124M | let next = |
1052 | 124M | if self.is_reverse() { it.next_back() } else { it.next() }; |
1053 | 124M | let compiled = match next { |
1054 | 69.5M | Some(result) => result?, |
1055 | 55.0M | None => break, |
1056 | | }; |
1057 | 69.5M | self.patch(end, compiled.start)?; |
1058 | 69.5M | end = compiled.end; |
1059 | | } |
1060 | 55.0M | Ok(ThompsonRef { start, end }) |
1061 | 55.0M | } <regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<u8>>, <regex_automata::nfa::thompson::compiler::Compiler>::c_literal::{closure#0}>>Line | Count | Source | 1041 | 53.8M | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> | 1042 | 53.8M | where | 1043 | 53.8M | I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, | 1044 | | { | 1045 | 53.8M | let first = if self.is_reverse() { it.next_back() } else { it.next() }; | 1046 | 53.8M | let ThompsonRef { start, mut end } = match first { | 1047 | 53.8M | Some(result) => result?, | 1048 | 0 | None => return self.c_empty(), | 1049 | | }; | 1050 | | loop { | 1051 | 65.0M | let next = | 1052 | 65.0M | if self.is_reverse() { it.next_back() } else { it.next() }; | 1053 | 65.0M | let compiled = match next { | 1054 | 11.1M | Some(result) => result?, | 1055 | 53.8M | None => break, | 1056 | | }; | 1057 | 11.1M | self.patch(end, compiled.start)?; | 1058 | 11.1M | end = compiled.end; | 1059 | | } | 1060 | 53.8M | Ok(ThompsonRef { start, end }) | 1061 | 53.8M | } |
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_automata::nfa::thompson::compiler::Compiler>::c_exactly::{closure#0}>>Line | Count | Source | 1041 | 287k | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> | 1042 | 287k | where | 1043 | 287k | I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, | 1044 | | { | 1045 | 287k | let first = if self.is_reverse() { it.next_back() } else { it.next() }; | 1046 | 287k | let ThompsonRef { start, mut end } = match first { | 1047 | 277k | Some(result) => result?, | 1048 | 9.94k | None => return self.c_empty(), | 1049 | | }; | 1050 | | loop { | 1051 | 55.3M | let next = | 1052 | 55.3M | if self.is_reverse() { it.next_back() } else { it.next() }; | 1053 | 55.3M | let compiled = match next { | 1054 | 55.0M | Some(result) => result?, | 1055 | 270k | None => break, | 1056 | | }; | 1057 | 55.0M | self.patch(end, compiled.start)?; | 1058 | 55.0M | end = compiled.end; | 1059 | | } | 1060 | 270k | Ok(ThompsonRef { start, end }) | 1061 | 287k | } |
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::c::{closure#0}>>Line | Count | Source | 1041 | 928k | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> | 1042 | 928k | where | 1043 | 928k | I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, | 1044 | | { | 1045 | 928k | let first = if self.is_reverse() { it.next_back() } else { it.next() }; | 1046 | 928k | let ThompsonRef { start, mut end } = match first { | 1047 | 928k | Some(result) => result?, | 1048 | 0 | None => return self.c_empty(), | 1049 | | }; | 1050 | | loop { | 1051 | 4.30M | let next = | 1052 | 4.30M | if self.is_reverse() { it.next_back() } else { it.next() }; | 1053 | 4.30M | let compiled = match next { | 1054 | 3.37M | Some(result) => result?, | 1055 | 927k | None => break, | 1056 | | }; | 1057 | 3.37M | self.patch(end, compiled.start)?; | 1058 | 3.37M | end = compiled.end; | 1059 | | } | 1060 | 927k | Ok(ThompsonRef { start, end }) | 1061 | 928k | } |
|
1062 | | |
1063 | | /// Compile an alternation of the given HIR values. |
1064 | | /// |
1065 | | /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead |
1066 | | /// of an iterator of compiled NFA sub-graphs. The point of accepting a |
1067 | | /// slice here is that it opens up some optimization opportunities. For |
1068 | | /// example, if all of the HIR values are literals, then this routine might |
1069 | | /// re-shuffle them to make NFA epsilon closures substantially faster. |
1070 | 3.10M | fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> { |
1071 | | // self.c_alt_iter(exprs.iter().map(|e| self.c(e))) |
1072 | 3.10M | let literal_count = exprs |
1073 | 3.10M | .iter() |
1074 | 10.9M | .filter(|e| { |
1075 | 10.9M | matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_))) |
1076 | 10.9M | }) |
1077 | 3.10M | .count(); |
1078 | 3.10M | if literal_count <= 1 || literal_count < exprs.len() { |
1079 | 10.1M | return self.c_alt_iter(exprs.iter().map(|e| self.c(e))); |
1080 | 252k | } |
1081 | | |
1082 | 252k | let mut trie = if self.is_reverse() { |
1083 | 66.4k | LiteralTrie::reverse() |
1084 | | } else { |
1085 | 186k | LiteralTrie::forward() |
1086 | | }; |
1087 | 750k | for expr in exprs.iter() { |
1088 | 750k | let literal = match *expr.kind() { |
1089 | 750k | hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes, |
1090 | 0 | _ => unreachable!(), |
1091 | | }; |
1092 | 750k | trie.add(literal)?; |
1093 | | } |
1094 | 252k | trie.compile(&mut self.builder.borrow_mut()) |
1095 | 3.10M | } |
1096 | | |
1097 | | /// Compile an alternation, where each element yielded by the given |
1098 | | /// iterator represents an item in the alternation. If the iterator yields |
1099 | | /// no elements, then this compiles down to a "fail" state. |
1100 | | /// |
1101 | | /// In an alternation, expressions appearing earlier are "preferred" at |
1102 | | /// match time over expressions appearing later. At least, this is true |
1103 | | /// when using "leftmost first" match semantics. (If "leftmost longest" are |
1104 | | /// ever added in the future, then this preference order of priority would |
1105 | | /// not apply in that mode.) |
1106 | 2.99M | fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> |
1107 | 2.99M | where |
1108 | 2.99M | I: Iterator<Item = Result<ThompsonRef, BuildError>>, |
1109 | | { |
1110 | 2.99M | let first = match it.next() { |
1111 | 0 | None => return self.c_fail(), |
1112 | 2.99M | Some(result) => result?, |
1113 | | }; |
1114 | 2.98M | let second = match it.next() { |
1115 | 136k | None => return Ok(first), |
1116 | 2.85M | Some(result) => result?, |
1117 | | }; |
1118 | | |
1119 | 2.85M | let union = self.add_union()?; |
1120 | 2.85M | let end = self.add_empty()?; |
1121 | 2.85M | self.patch(union, first.start)?; |
1122 | 2.85M | self.patch(first.end, end)?; |
1123 | 2.85M | self.patch(union, second.start)?; |
1124 | 2.85M | self.patch(second.end, end)?; |
1125 | 7.33M | for result in it { |
1126 | 4.48M | let compiled = result?; |
1127 | 4.48M | self.patch(union, compiled.start)?; |
1128 | 4.48M | self.patch(compiled.end, end)?; |
1129 | | } |
1130 | 2.85M | Ok(ThompsonRef { start: union, end }) |
1131 | 2.99M | } Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::compile<regex_syntax::hir::Hir>::{closure#1}>><regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::c_alt_slice::{closure#1}>>Line | Count | Source | 1106 | 2.85M | fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> | 1107 | 2.85M | where | 1108 | 2.85M | I: Iterator<Item = Result<ThompsonRef, BuildError>>, | 1109 | | { | 1110 | 2.85M | let first = match it.next() { | 1111 | 0 | None => return self.c_fail(), | 1112 | 2.85M | Some(result) => result?, | 1113 | | }; | 1114 | 2.85M | let second = match it.next() { | 1115 | 0 | None => return Ok(first), | 1116 | 2.85M | Some(result) => result?, | 1117 | | }; | 1118 | | | 1119 | 2.85M | let union = self.add_union()?; | 1120 | 2.85M | let end = self.add_empty()?; | 1121 | 2.85M | self.patch(union, first.start)?; | 1122 | 2.85M | self.patch(first.end, end)?; | 1123 | 2.85M | self.patch(union, second.start)?; | 1124 | 2.85M | self.patch(second.end, end)?; | 1125 | 7.33M | for result in it { | 1126 | 4.48M | let compiled = result?; | 1127 | 4.48M | self.patch(union, compiled.start)?; | 1128 | 4.48M | self.patch(compiled.end, end)?; | 1129 | | } | 1130 | 2.85M | Ok(ThompsonRef { start: union, end }) | 1131 | 2.85M | } |
<regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<®ex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::compile<®ex_syntax::hir::Hir>::{closure#1}>>Line | Count | Source | 1106 | 138k | fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> | 1107 | 138k | where | 1108 | 138k | I: Iterator<Item = Result<ThompsonRef, BuildError>>, | 1109 | | { | 1110 | 138k | let first = match it.next() { | 1111 | 0 | None => return self.c_fail(), | 1112 | 138k | Some(result) => result?, | 1113 | | }; | 1114 | 136k | let second = match it.next() { | 1115 | 136k | None => return Ok(first), | 1116 | 0 | Some(result) => result?, | 1117 | | }; | 1118 | | | 1119 | 0 | let union = self.add_union()?; | 1120 | 0 | let end = self.add_empty()?; | 1121 | 0 | self.patch(union, first.start)?; | 1122 | 0 | self.patch(first.end, end)?; | 1123 | 0 | self.patch(union, second.start)?; | 1124 | 0 | self.patch(second.end, end)?; | 1125 | 0 | for result in it { | 1126 | 0 | let compiled = result?; | 1127 | 0 | self.patch(union, compiled.start)?; | 1128 | 0 | self.patch(compiled.end, end)?; | 1129 | | } | 1130 | 0 | Ok(ThompsonRef { start: union, end }) | 1131 | 138k | } |
|
1132 | | |
1133 | | /// Compile the given capture sub-expression. `expr` should be the |
1134 | | /// sub-expression contained inside the capture. If "capture" states are |
1135 | | /// enabled, then they are added as appropriate. |
1136 | | /// |
1137 | | /// This accepts the pieces of a capture instead of a `hir::Capture` so |
1138 | | /// that it's easy to manufacture a "fake" group when necessary, e.g., for |
1139 | | /// adding the entire pattern as if it were a group in order to create |
1140 | | /// appropriate "capture" states in the NFA. |
1141 | 4.92M | fn c_cap( |
1142 | 4.92M | &self, |
1143 | 4.92M | index: u32, |
1144 | 4.92M | name: Option<&str>, |
1145 | 4.92M | expr: &Hir, |
1146 | 4.92M | ) -> Result<ThompsonRef, BuildError> { |
1147 | 4.92M | match self.config.get_which_captures() { |
1148 | | // No capture states means we always skip them. |
1149 | 2.05M | WhichCaptures::None => return self.c(expr), |
1150 | | // Implicit captures states means we only add when index==0 since |
1151 | | // index==0 implies the group is implicit. |
1152 | 0 | WhichCaptures::Implicit if index > 0 => return self.c(expr), |
1153 | 2.86M | _ => {} |
1154 | | } |
1155 | | |
1156 | 2.86M | let start = self.add_capture_start(index, name)?; |
1157 | 2.86M | let inner = self.c(expr)?; |
1158 | 2.86M | let end = self.add_capture_end(index)?; |
1159 | 2.86M | self.patch(start, inner.start)?; |
1160 | 2.86M | self.patch(inner.end, end)?; |
1161 | 2.86M | Ok(ThompsonRef { start, end }) |
1162 | 4.92M | } |
1163 | | |
1164 | | /// Compile the given repetition expression. This handles all types of |
1165 | | /// repetitions and greediness. |
1166 | 3.49M | fn c_repetition( |
1167 | 3.49M | &self, |
1168 | 3.49M | rep: &hir::Repetition, |
1169 | 3.49M | ) -> Result<ThompsonRef, BuildError> { |
1170 | 3.49M | match (rep.min, rep.max) { |
1171 | 1.53M | (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), |
1172 | 1.72M | (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), |
1173 | 229k | (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), |
1174 | 18.7k | (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), |
1175 | | } |
1176 | 3.49M | } |
1177 | | |
    /// Compile the given expression such that it matches at least `min` times,
    /// but no more than `max` times.
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_bounded(
        &self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> Result<ThompsonRef, BuildError> {
        // The first 'min' copies are mandatory, so compile them as a plain
        // concatenation.
        let prefix = self.c_exactly(expr, min)?;
        if min == max {
            return Ok(prefix);
        }

        // It is tempting here to compile the rest here as a concatenation
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
        // were `aaa?a?a?`. The problem here is that it leads to this program:
        //
        //     >000000: 61 => 01
        //     000001: 61 => 02
        //     000002: union(03, 04)
        //     000003: 61 => 04
        //     000004: union(05, 06)
        //     000005: 61 => 06
        //     000006: union(07, 08)
        //     000007: 61 => 08
        //     000008: MATCH
        //
        // And effectively, once you hit state 2, the epsilon closure will
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
        // to instead compile it like so:
        //
        //     >000000: 61 => 01
        //     000001: 61 => 02
        //     000002: union(03, 08)
        //     000003: 61 => 04
        //     000004: union(05, 08)
        //     000005: 61 => 06
        //     000006: union(07, 08)
        //     000007: 61 => 08
        //     000008: MATCH
        //
        // So that the epsilon closure of state 2 is now just 3 and 8.
        let empty = self.add_empty()?;
        let mut prev_end = prefix.end;
        for _ in min..max {
            // Each optional copy is a union that either enters the copy or
            // jumps straight to the shared 'empty' exit state. Greediness
            // picks which alternative is preferred.
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let compiled = self.c(expr)?;
            self.patch(prev_end, union)?;
            self.patch(union, compiled.start)?;
            self.patch(union, empty)?;
            prev_end = compiled.end;
        }
        self.patch(prev_end, empty)?;
        Ok(ThompsonRef { start: prefix.start, end: empty })
    }
1242 | | |
    /// Compile the given expression such that it may be matched `n` or more
    /// times, where `n` can be any integer. (Although a particularly large
    /// integer is likely to run afoul of any configured size limits.)
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // 'x+': compile 'x' once and follow it with a union that either
            // loops back to the start of 'x' or proceeds. No empty-string
            // trickery is needed since one 'x' is always consumed first.
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // 'x{n,}': n-1 mandatory copies, then one final copy with a
            // union that loops back to it ('x+' on the last copy).
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }
1324 | | |
1325 | | /// Compile the given expression such that it may be matched zero or one |
1326 | | /// times. |
1327 | | /// |
1328 | | /// When `greedy` is true, then the preference is for the expression to |
1329 | | /// match as much as possible. Otherwise, it will match as little as |
1330 | | /// possible. |
1331 | 1.53M | fn c_zero_or_one( |
1332 | 1.53M | &self, |
1333 | 1.53M | expr: &Hir, |
1334 | 1.53M | greedy: bool, |
1335 | 1.53M | ) -> Result<ThompsonRef, BuildError> { |
1336 | 1.53M | let union = |
1337 | 1.53M | if greedy { self.add_union() } else { self.add_union_reverse() }?; |
1338 | 1.53M | let compiled = self.c(expr)?; |
1339 | 1.53M | let empty = self.add_empty()?; |
1340 | 1.53M | self.patch(union, compiled.start)?; |
1341 | 1.53M | self.patch(union, empty)?; |
1342 | 1.53M | self.patch(compiled.end, empty)?; |
1343 | 1.53M | Ok(ThompsonRef { start: union, end: empty }) |
1344 | 1.53M | } |
1345 | | |
1346 | | /// Compile the given HIR expression exactly `n` times. |
1347 | 287k | fn c_exactly( |
1348 | 287k | &self, |
1349 | 287k | expr: &Hir, |
1350 | 287k | n: u32, |
1351 | 287k | ) -> Result<ThompsonRef, BuildError> { |
1352 | 55.3M | let it = (0..n).map(|_| self.c(expr)); |
1353 | 287k | self.c_concat(it) |
1354 | 287k | } |
1355 | | |
1356 | | /// Compile the given byte oriented character class. |
1357 | | /// |
1358 | | /// This uses "sparse" states to represent an alternation between ranges in |
1359 | | /// this character class. We can use "sparse" states instead of stitching |
1360 | | /// together a "union" state because all ranges in a character class have |
1361 | | /// equal priority *and* are non-overlapping (thus, only one can match, so |
1362 | | /// there's never a question of priority in the first place). This saves a |
1363 | | /// fair bit of overhead when traversing an NFA. |
1364 | | /// |
1365 | | /// This routine compiles an empty character class into a "fail" state. |
1366 | 3.90M | fn c_byte_class( |
1367 | 3.90M | &self, |
1368 | 3.90M | cls: &hir::ClassBytes, |
1369 | 3.90M | ) -> Result<ThompsonRef, BuildError> { |
1370 | 3.90M | let end = self.add_empty()?; |
1371 | 3.90M | let mut trans = Vec::with_capacity(cls.ranges().len()); |
1372 | 6.18M | for r in cls.iter() { |
1373 | 6.18M | trans.push(Transition { |
1374 | 6.18M | start: r.start(), |
1375 | 6.18M | end: r.end(), |
1376 | 6.18M | next: end, |
1377 | 6.18M | }); |
1378 | 6.18M | } |
1379 | 3.90M | Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) |
1380 | 3.90M | } |
1381 | | |
    /// Compile the given Unicode character class.
    ///
    /// This routine specifically tries to use various types of compression,
    /// since UTF-8 automata of large classes can get quite large. The specific
    /// type of compression used depends on forward vs reverse compilation, and
    /// whether NFA shrinking is enabled or not.
    ///
    /// Aside from repetitions causing lots of repeated groups, this is likely
    /// the single most expensive part of regex compilation. Therefore, a large
    /// part of the expense of compilation may be reduced by disabling Unicode
    /// in the pattern.
    ///
    /// This routine compiles an empty character class into a "fail" state.
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // The unwraps below are OK because we've verified that this
                // class only contains ASCII codepoints.
                trans.push(Transition {
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
                    start: u8::try_from(u32::from(r.start())).unwrap(),
                    end: u8::try_from(u32::from(r.end())).unwrap(),
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut builder = self.builder.borrow_mut();
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c =
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
                // The trie yields the reversed sequences in sorted order,
                // which is exactly what the UTF-8 compiler requires.
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut builder = self.builder.borrow_mut();
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c =
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
        // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
        // But the integration test suite should still pass.
        //
        // One good example of the substantial difference this can make is to
        // compare and contrast performance of the Pike VM when the code below
        // is active vs the code above. Here's an example to try:
        //
        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
        //
        // With Unicode classes generated below, this search takes about 45s on
        // my machine. But with the compressed version above, the search takes
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
        // to the compression, but also because of the utilization of 'sparse'
        // NFA states. They lead to much less state shuffling during the NFA
        // search.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alt_iter(it)
        */
    }
1515 | | |
    /// Compile the given Unicode character class in reverse with suffix
    /// caching.
    ///
    /// This is a "quick" way to compile large Unicode classes into reverse
    /// UTF-8 automata while doing a small amount of compression on that
    /// automata by reusing common suffixes.
    ///
    /// A more comprehensive compression scheme can be accomplished by using
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
    ///
    /// This is the technique used when "NFA shrinking" is disabled.
    ///
    /// (This also tries to use "sparse" states where possible, just like
    /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Build each sequence from its last byte range backwards, so
                // that a suffix shared with a previously compiled sequence is
                // found in the cache and reused instead of rebuilt.
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
1569 | | |
1570 | | /// Compile the given HIR look-around assertion to an NFA look-around |
1571 | | /// assertion. |
1572 | 5.76M | fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> { |
1573 | 5.76M | let look = match *anchor { |
1574 | 477k | hir::Look::Start => Look::Start, |
1575 | 425k | hir::Look::End => Look::End, |
1576 | 55.7k | hir::Look::StartLF => Look::StartLF, |
1577 | 92.3k | hir::Look::EndLF => Look::EndLF, |
1578 | 71.4k | hir::Look::StartCRLF => Look::StartCRLF, |
1579 | 68.9k | hir::Look::EndCRLF => Look::EndCRLF, |
1580 | 1.00M | hir::Look::WordAscii => Look::WordAscii, |
1581 | 335k | hir::Look::WordAsciiNegate => Look::WordAsciiNegate, |
1582 | 928k | hir::Look::WordUnicode => Look::WordUnicode, |
1583 | 189k | hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, |
1584 | 506k | hir::Look::WordStartAscii => Look::WordStartAscii, |
1585 | 296k | hir::Look::WordEndAscii => Look::WordEndAscii, |
1586 | 156k | hir::Look::WordStartUnicode => Look::WordStartUnicode, |
1587 | 176k | hir::Look::WordEndUnicode => Look::WordEndUnicode, |
1588 | 321k | hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, |
1589 | 280k | hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, |
1590 | 96.5k | hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, |
1591 | 283k | hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, |
1592 | | }; |
1593 | 5.76M | let id = self.add_look(look)?; |
1594 | 5.76M | Ok(ThompsonRef { start: id, end: id }) |
1595 | 5.76M | } |
1596 | | |
1597 | | /// Compile the given byte string to a concatenation of bytes. |
1598 | 53.8M | fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> { |
1599 | 65.0M | self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b))) |
1600 | 53.8M | } |
1601 | | |
1602 | | /// Compile a "range" state with one transition that may only be followed |
1603 | | /// if the input byte is in the (inclusive) range given. |
1604 | | /// |
1605 | | /// Both the `start` and `end` locations point to the state created. |
1606 | | /// Callers will likely want to keep the `start`, but patch the `end` to |
1607 | | /// point to some other state. |
1608 | 113M | fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> { |
1609 | 113M | let id = self.add_range(start, end)?; |
1610 | 113M | Ok(ThompsonRef { start: id, end: id }) |
1611 | 113M | } |
1612 | | |
1613 | | /// Compile an "empty" state with one unconditional epsilon transition. |
1614 | | /// |
1615 | | /// Both the `start` and `end` locations point to the state created. |
1616 | | /// Callers will likely want to keep the `start`, but patch the `end` to |
1617 | | /// point to some other state. |
1618 | 1.10M | fn c_empty(&self) -> Result<ThompsonRef, BuildError> { |
1619 | 1.10M | let id = self.add_empty()?; |
1620 | 1.10M | Ok(ThompsonRef { start: id, end: id }) |
1621 | 1.10M | } |
1622 | | |
1623 | | /// Compile a "fail" state that can never have any outgoing transitions. |
1624 | 0 | fn c_fail(&self) -> Result<ThompsonRef, BuildError> { |
1625 | 0 | let id = self.add_fail()?; |
1626 | 0 | Ok(ThompsonRef { start: id, end: id }) |
1627 | 0 | } |
1628 | | |
1629 | | // The below helpers are meant to be simple wrappers around the |
1630 | | // corresponding Builder methods. For the most part, they let us write |
1631 | | // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where |
1632 | | // the latter is a mouthful. Some of the methods do inject a little bit |
1633 | | // of extra logic. e.g., Flipping look-around operators when compiling in |
1634 | | // reverse mode. |
1635 | | |
1636 | 190M | fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> { |
1637 | 190M | self.builder.borrow_mut().patch(from, to) |
1638 | 190M | } |
1639 | | |
1640 | 138k | fn start_pattern(&self) -> Result<PatternID, BuildError> { |
1641 | 138k | self.builder.borrow_mut().start_pattern() |
1642 | 138k | } |
1643 | | |
1644 | 136k | fn finish_pattern( |
1645 | 136k | &self, |
1646 | 136k | start_id: StateID, |
1647 | 136k | ) -> Result<PatternID, BuildError> { |
1648 | 136k | self.builder.borrow_mut().finish_pattern(start_id) |
1649 | 136k | } |
1650 | | |
1651 | 12.7M | fn add_empty(&self) -> Result<StateID, BuildError> { |
1652 | 12.7M | self.builder.borrow_mut().add_empty() |
1653 | 12.7M | } |
1654 | | |
1655 | 113M | fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> { |
1656 | 113M | self.builder.borrow_mut().add_range(Transition { |
1657 | 113M | start, |
1658 | 113M | end, |
1659 | 113M | next: StateID::ZERO, |
1660 | 113M | }) |
1661 | 113M | } |
1662 | | |
1663 | 5.83M | fn add_sparse( |
1664 | 5.83M | &self, |
1665 | 5.83M | ranges: Vec<Transition>, |
1666 | 5.83M | ) -> Result<StateID, BuildError> { |
1667 | 5.83M | self.builder.borrow_mut().add_sparse(ranges) |
1668 | 5.83M | } |
1669 | | |
1670 | 5.76M | fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> { |
1671 | 5.76M | if self.is_reverse() { |
1672 | 2.81M | look = look.reversed(); |
1673 | 2.94M | } |
1674 | 5.76M | self.builder.borrow_mut().add_look(StateID::ZERO, look) |
1675 | 5.76M | } |
1676 | | |
1677 | 9.13M | fn add_union(&self) -> Result<StateID, BuildError> { |
1678 | 9.13M | self.builder.borrow_mut().add_union(vec![]) |
1679 | 9.13M | } |
1680 | | |
1681 | 2.80M | fn add_union_reverse(&self) -> Result<StateID, BuildError> { |
1682 | 2.80M | self.builder.borrow_mut().add_union_reverse(vec![]) |
1683 | 2.80M | } |
1684 | | |
1685 | 2.86M | fn add_capture_start( |
1686 | 2.86M | &self, |
1687 | 2.86M | capture_index: u32, |
1688 | 2.86M | name: Option<&str>, |
1689 | 2.86M | ) -> Result<StateID, BuildError> { |
1690 | 2.86M | let name = name.map(Arc::from); |
1691 | 2.86M | self.builder.borrow_mut().add_capture_start( |
1692 | 2.86M | StateID::ZERO, |
1693 | 2.86M | capture_index, |
1694 | 2.86M | name, |
1695 | 2.86M | ) |
1696 | 2.86M | } |
1697 | | |
1698 | 2.86M | fn add_capture_end( |
1699 | 2.86M | &self, |
1700 | 2.86M | capture_index: u32, |
1701 | 2.86M | ) -> Result<StateID, BuildError> { |
1702 | 2.86M | self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index) |
1703 | 2.86M | } |
1704 | | |
1705 | 0 | fn add_fail(&self) -> Result<StateID, BuildError> { |
1706 | 0 | self.builder.borrow_mut().add_fail() |
1707 | 0 | } |
1708 | | |
1709 | 136k | fn add_match(&self) -> Result<StateID, BuildError> { |
1710 | 136k | self.builder.borrow_mut().add_match() |
1711 | 136k | } |
1712 | | |
    /// Returns true when this compiler is configured to build a reverse NFA.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1716 | | } |
1717 | | |
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial state of this sub-graph.
    pub(crate) start: StateID,
    /// The final state of this sub-graph.
    pub(crate) end: StateID,
}
1726 | | |
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Reusable scratch space: the stack of uncompiled nodes and the bounded
    /// cache of previously compiled states.
    state: &'a mut Utf8State,
    /// The shared end state that every compiled UTF-8 sequence leads to.
    target: StateID,
}
1753 | | |
/// Reusable scratch state for `Utf8Compiler`.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache of previously compiled states, keyed by their
    /// transitions, used to reuse equivalent states.
    compiled: Utf8BoundedMap,
    /// The stack of nodes that have not yet been compiled.
    uncompiled: Vec<Utf8Node>,
}
1759 | | |
/// A node on the `Utf8Compiler`'s stack of not-yet-compiled states.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose targets are already known.
    trans: Vec<Transition>,
    /// The most recently added byte range, whose target state is not yet
    /// known. It is completed by `set_last_transition`.
    last: Option<Utf8LastTransition>,
}
1765 | | |
/// The byte range of a pending transition whose target state has not yet
/// been determined. (`Utf8Node::set_last_transition` fills in the target.)
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    start: u8,
    end: u8,
}
1771 | | |
1772 | | impl Utf8State { |
1773 | 506k | fn new() -> Utf8State { |
1774 | 506k | Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] } |
1775 | 506k | } |
1776 | | |
1777 | 2.39M | fn clear(&mut self) { |
1778 | 2.39M | self.compiled.clear(); |
1779 | 2.39M | self.uncompiled.clear(); |
1780 | 2.39M | } |
1781 | | } |
1782 | | |
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes states into `builder`, using
    /// `state` as (cleared) reusable scratch space.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish the automaton: compile all remaining uncompiled nodes and
    /// return the resulting sub-graph.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add the next UTF-8 byte range sequence to this automaton. Sequences
    /// must be added in lexicographic order.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges are shared with the current stack of
        // uncompiled nodes; that shared prefix is kept as-is.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        assert!(prefix_len < ranges.len());
        // Everything deeper than the shared prefix can no longer change, so
        // compile it, then push the new suffix as fresh uncompiled nodes.
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all uncompiled nodes deeper than `from`, linking each popped
    /// node to the state compiled after it, and complete the pending
    /// transition of the node left on top.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Compile a node's transitions into a sparse NFA state, reusing a
    /// previously compiled equivalent state if the bounded cache has one.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Append `ranges` as new pending work: the first range becomes the top
    /// node's pending transition and each subsequent range becomes a new
    /// uncompiled node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push an empty uncompiled node (used as the root of the automaton).
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, completing its pending transition with
    /// `next`, and return its transitions for compilation.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node, which must be the only remaining node and must
    /// have no pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Complete the top node's pending transition with `next`, leaving the
    /// node itself on the stack.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1888 | | |
1889 | | impl Utf8Node { |
1890 | 95.0M | fn set_last_transition(&mut self, next: StateID) { |
1891 | 95.0M | if let Some(last) = self.last.take() { |
1892 | 92.6M | self.trans.push(Transition { |
1893 | 92.6M | start: last.start, |
1894 | 92.6M | end: last.end, |
1895 | 92.6M | next, |
1896 | 92.6M | }); |
1897 | 92.6M | } |
1898 | 95.0M | } |
1899 | | } |
1900 | | |
#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    /// Build an NFA for `pattern` with captures and the unanchored
    /// prefix disabled, so assertions can target the bare compiled
    /// states.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    /// Shorthand for constructing a `PatternID`.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    /// Shorthand for constructing a `StateID`.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    /// A single-byte transition state.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    /// A byte-range transition state (inclusive on both ends).
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    /// A sparse state built from `(start, end, next)` triples.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    /// A look-around assertion state.
    fn s_look(look: Look, next: usize) -> State {
        let next = sid(next);
        State::Look { look, next }
    }

    /// A binary union (two-way unconditional epsilon split) state.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    /// An n-way union state with the given alternates.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    /// A capture-slot recording state.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    /// The dead state: matches nothing.
    fn s_fail() -> State {
        State::Fail
    }

    /// A match state for the given pattern ID.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }

    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        // U+2603 SNOWMAN encodes to the three UTF-8 bytes asserted
        // below (0xE2 0x98 0x83). (The literal had been garbled to "ā"
        // in a bad re-encoding; "ā" is only two bytes, 0xC4 0x81, and
        // cannot produce this state sequence.)
        assert_eq!(
            build("☃").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        // As in compile_literal: the non-ASCII class member is U+2603
        // SNOWMAN (UTF-8: 0xE2 0x98 0x83), matching the asserted byte
        // states below.
        assert_eq!(
            build(r"[a-z☃]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}