Coverage Report

Created: 2025-12-12 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/nfa/thompson/compiler.rs
Line
Count
Source
1
use core::{borrow::Borrow, cell::RefCell};
2
3
use alloc::{sync::Arc, vec, vec::Vec};
4
5
use regex_syntax::{
6
    hir::{self, Hir},
7
    utf8::{Utf8Range, Utf8Sequences},
8
    ParserBuilder,
9
};
10
11
use crate::{
12
    nfa::thompson::{
13
        builder::Builder,
14
        error::BuildError,
15
        literal_trie::LiteralTrie,
16
        map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17
        nfa::{Transition, NFA},
18
        range_trie::RangeTrie,
19
    },
20
    util::{
21
        look::{Look, LookMatcher},
22
        primitives::{PatternID, StateID},
23
    },
24
};
25
26
/// The configuration used for a Thompson NFA compiler.
27
#[derive(Clone, Debug, Default)]
28
pub struct Config {
29
    utf8: Option<bool>,
30
    reverse: Option<bool>,
31
    nfa_size_limit: Option<Option<usize>>,
32
    shrink: Option<bool>,
33
    which_captures: Option<WhichCaptures>,
34
    look_matcher: Option<LookMatcher>,
35
    #[cfg(test)]
36
    unanchored_prefix: Option<bool>,
37
}
38
39
impl Config {
40
    /// Return a new default Thompson NFA compiler configuration.
41
72.7k
    pub fn new() -> Config {
42
72.7k
        Config::default()
43
72.7k
    }
44
45
    /// Whether to enable UTF-8 mode during search or not.
46
    ///
47
    /// A regex engine is said to be in UTF-8 mode when it guarantees that
48
    /// all matches returned by it have spans consisting of only valid UTF-8.
49
    /// That is, it is impossible for a match span to be returned that
50
    /// contains any invalid UTF-8.
51
    ///
52
    /// UTF-8 mode generally consists of two things:
53
    ///
54
    /// 1. Whether the NFA's states are constructed such that all paths to a
55
    /// match state that consume at least one byte always correspond to valid
56
    /// UTF-8.
57
    /// 2. Whether all paths to a match state that do _not_ consume any bytes
58
    /// should always correspond to valid UTF-8 boundaries.
59
    ///
60
    /// (1) is a guarantee made by whoever constructs the NFA.
61
    /// If you're parsing a regex from its concrete syntax, then
62
    /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63
    /// this guarantee for you. It does it by returning an error if the regex
64
    /// pattern could ever report a non-empty match span that contains invalid
65
    /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66
    /// successfully parses, then you're guaranteed that the corresponding NFA
67
    /// will only ever report non-empty match spans containing valid UTF-8.
68
    ///
69
    /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70
    /// state graph itself. Consider, for example, the regex `a*`. It matches
71
    /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
72
    /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73
    /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74
    /// guarantee must be made at a higher level than the NFA state graph
75
    /// itself. This crate deals with this case in each regex engine. Namely,
76
    /// when a zero-width match that splits a codepoint is found and UTF-8
77
    /// mode enabled, then it is ignored and the engine moves on looking for
78
    /// the next match.
79
    ///
80
    /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81
    /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82
    /// engines that empty matches that split codepoints should be banned.
83
    ///
84
    /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85
    /// it only makes sense to enable this option when you *know* your haystack
86
    /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87
    /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88
    /// behavior**.
89
    ///
90
    /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91
    /// simultaneously *disabling* this option. That would ensure all non-empty
92
    /// match spans are valid UTF-8, but that empty match spans may still split
93
    /// a codepoint or match at other places that aren't valid UTF-8.
94
    ///
95
    /// In general, this mode is only relevant if your regex can match the
96
    /// empty string. Most regexes don't.
97
    ///
98
    /// This is enabled by default.
99
    ///
100
    /// # Example
101
    ///
102
    /// This example shows how UTF-8 mode can impact the match spans that may
103
    /// be reported in certain cases.
104
    ///
105
    /// ```
106
    /// use regex_automata::{
107
    ///     nfa::thompson::{self, pikevm::PikeVM},
108
    ///     Match, Input,
109
    /// };
110
    ///
111
    /// let re = PikeVM::new("")?;
112
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113
    ///
114
    /// // UTF-8 mode is enabled by default.
115
    /// let mut input = Input::new("☃");
116
    /// re.search(&mut cache, &input, &mut caps);
117
    /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118
    ///
119
    /// // Even though an empty regex matches at 1..1, our next match is
120
    /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121
    /// // three bytes long).
122
    /// input.set_start(1);
123
    /// re.search(&mut cache, &input, &mut caps);
124
    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125
    ///
126
    /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127
    /// let re = PikeVM::builder()
128
    ///     .thompson(thompson::Config::new().utf8(false))
129
    ///     .build("")?;
130
    /// re.search(&mut cache, &input, &mut caps);
131
    /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132
    ///
133
    /// input.set_start(2);
134
    /// re.search(&mut cache, &input, &mut caps);
135
    /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136
    ///
137
    /// input.set_start(3);
138
    /// re.search(&mut cache, &input, &mut caps);
139
    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140
    ///
141
    /// input.set_start(4);
142
    /// re.search(&mut cache, &input, &mut caps);
143
    /// assert_eq!(None, caps.get_match());
144
    ///
145
    /// # Ok::<(), Box<dyn std::error::Error>>(())
146
    /// ```
147
72.7k
    pub fn utf8(mut self, yes: bool) -> Config {
148
72.7k
        self.utf8 = Some(yes);
149
72.7k
        self
150
72.7k
    }
151
152
    /// Reverse the NFA.
153
    ///
154
    /// A NFA reversal is performed by reversing all of the concatenated
155
    /// sub-expressions in the original pattern, recursively. (Look around
156
    /// operators are also inverted.) The resulting NFA can be used to match
157
    /// the pattern starting from the end of a string instead of the beginning
158
    /// of a string.
159
    ///
160
    /// Reversing the NFA is useful for building a reverse DFA, which is most
161
    /// useful for finding the start of a match after its ending position has
162
    /// been found. NFA execution engines typically do not work on reverse
163
    /// NFAs. For example, currently, the Pike VM reports the starting location
164
    /// of matches without a reverse NFA.
165
    ///
166
    /// Currently, enabling this setting requires disabling the
167
    /// [`captures`](Config::captures) setting. If both are enabled, then the
168
    /// compiler will return an error. It is expected that this limitation will
169
    /// be lifted in the future.
170
    ///
171
    /// This is disabled by default.
172
    ///
173
    /// # Example
174
    ///
175
    /// This example shows how to build a DFA from a reverse NFA, and then use
176
    /// the DFA to search backwards.
177
    ///
178
    /// ```
179
    /// use regex_automata::{
180
    ///     dfa::{self, Automaton},
181
    ///     nfa::thompson::{NFA, WhichCaptures},
182
    ///     HalfMatch, Input,
183
    /// };
184
    ///
185
    /// let dfa = dfa::dense::Builder::new()
186
    ///     .thompson(NFA::config()
187
    ///         .which_captures(WhichCaptures::None)
188
    ///         .reverse(true)
189
    ///     )
190
    ///     .build("baz[0-9]+")?;
191
    /// let expected = Some(HalfMatch::must(0, 3));
192
    /// assert_eq!(
193
    ///     expected,
194
    ///     dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195
    /// );
196
    ///
197
    /// # Ok::<(), Box<dyn std::error::Error>>(())
198
    /// ```
199
70.7k
    pub fn reverse(mut self, yes: bool) -> Config {
200
70.7k
        self.reverse = Some(yes);
201
70.7k
        self
202
70.7k
    }
203
204
    /// Sets an approximate size limit on the total heap used by the NFA being
205
    /// compiled.
206
    ///
207
    /// This permits imposing constraints on the size of a compiled NFA. This
208
    /// may be useful in contexts where the regex pattern is untrusted and one
209
    /// wants to avoid using too much memory.
210
    ///
211
    /// This size limit does not apply to auxiliary heap used during
212
    /// compilation that is not part of the built NFA.
213
    ///
214
    /// Note that this size limit is applied during compilation in order for
215
    /// the limit to prevent too much heap from being used. However, the
216
    /// implementation may use an intermediate NFA representation that is
217
    /// otherwise slightly bigger than the final public form. Since the size
218
    /// limit may be applied to an intermediate representation, there is not
219
    /// necessarily a precise correspondence between the configured size limit
220
    /// and the heap usage of the final NFA.
221
    ///
222
    /// There is no size limit by default.
223
    ///
224
    /// # Example
225
    ///
226
    /// This example demonstrates how Unicode mode can greatly increase the
227
    /// size of the NFA.
228
    ///
229
    /// ```
230
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231
    /// use regex_automata::nfa::thompson::NFA;
232
    ///
233
    /// // 300KB isn't enough!
234
    /// NFA::compiler()
235
    ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
236
    ///     .build(r"\w{20}")
237
    ///     .unwrap_err();
238
    ///
239
    /// // ... but 500KB probably is.
240
    /// let nfa = NFA::compiler()
241
    ///     .configure(NFA::config().nfa_size_limit(Some(500_000)))
242
    ///     .build(r"\w{20}")?;
243
    ///
244
    /// assert_eq!(nfa.pattern_len(), 1);
245
    ///
246
    /// # Ok::<(), Box<dyn std::error::Error>>(())
247
    /// ```
248
72.7k
    pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249
72.7k
        self.nfa_size_limit = Some(bytes);
250
72.7k
        self
251
72.7k
    }
252
253
    /// Apply best effort heuristics to shrink the NFA at the expense of more
254
    /// time/memory.
255
    ///
256
    /// Generally speaking, if one is using an NFA to compile a DFA, then the
257
    /// extra time used to shrink the NFA will be more than made up for during
258
    /// DFA construction (potentially by a lot). In other words, enabling this
259
    /// can substantially decrease the overall amount of time it takes to build
260
    /// a DFA.
261
    ///
262
    /// A reason to keep this disabled is if you want to compile an NFA and
263
    /// start using it as quickly as possible without needing to build a DFA,
264
    /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265
    /// an NFA simulation or for a lazy DFA.
266
    ///
267
    /// NFA shrinking is currently most useful when compiling a reverse
268
    /// NFA with large Unicode character classes. In particular, it trades
269
    /// additional CPU time during NFA compilation in favor of generating fewer
270
    /// NFA states.
271
    ///
272
    /// This is disabled by default because it can increase compile times
273
    /// quite a bit if you aren't building a full DFA.
274
    ///
275
    /// # Example
276
    ///
277
    /// This example shows that NFA shrinking can lead to substantial space
278
    /// savings in some cases. Notice that, as noted above, we build a reverse
279
    /// DFA and use a pattern with a large Unicode character class.
280
    ///
281
    /// ```
282
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283
    /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284
    ///
285
    /// // Currently we have to disable captures when enabling reverse NFA.
286
    /// let config = NFA::config()
287
    ///     .which_captures(WhichCaptures::None)
288
    ///     .reverse(true);
289
    /// let not_shrunk = NFA::compiler()
290
    ///     .configure(config.clone().shrink(false))
291
    ///     .build(r"\w")?;
292
    /// let shrunk = NFA::compiler()
293
    ///     .configure(config.clone().shrink(true))
294
    ///     .build(r"\w")?;
295
    ///
296
    /// // While a specific shrink factor is not guaranteed, the savings can be
297
    /// // considerable in some cases.
298
    /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299
    ///
300
    /// # Ok::<(), Box<dyn std::error::Error>>(())
301
    /// ```
302
72.7k
    pub fn shrink(mut self, yes: bool) -> Config {
303
72.7k
        self.shrink = Some(yes);
304
72.7k
        self
305
72.7k
    }
306
307
    /// Whether to include 'Capture' states in the NFA.
308
    ///
309
    /// Currently, enabling this setting requires disabling the
310
    /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311
    /// compiler will return an error. It is expected that this limitation will
312
    /// be lifted in the future.
313
    ///
314
    /// This is enabled by default.
315
    ///
316
    /// # Example
317
    ///
318
    /// This example demonstrates that some regex engines, like the Pike VM,
319
    /// require capturing states to be present in the NFA to report match
320
    /// offsets.
321
    ///
322
    /// (Note that since this method is deprecated, the example below uses
323
    /// [`Config::which_captures`] to disable capture states.)
324
    ///
325
    /// ```
326
    /// use regex_automata::nfa::thompson::{
327
    ///     pikevm::PikeVM,
328
    ///     NFA,
329
    ///     WhichCaptures,
330
    /// };
331
    ///
332
    /// let re = PikeVM::builder()
333
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
334
    ///     .build(r"[a-z]+")?;
335
    /// let mut cache = re.create_cache();
336
    ///
337
    /// assert!(re.is_match(&mut cache, "abc"));
338
    /// assert_eq!(None, re.find(&mut cache, "abc"));
339
    ///
340
    /// # Ok::<(), Box<dyn std::error::Error>>(())
341
    /// ```
342
    #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343
0
    pub fn captures(self, yes: bool) -> Config {
344
0
        self.which_captures(if yes {
345
0
            WhichCaptures::All
346
        } else {
347
0
            WhichCaptures::None
348
        })
349
0
    }
350
351
    /// Configures what kinds of capture groups are compiled into
352
    /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353
    /// Thompson NFA.
354
    ///
355
    /// Currently, using any option except for [`WhichCaptures::None`] requires
356
    /// disabling the [`reverse`](Config::reverse) setting. If both are
357
    /// enabled, then the compiler will return an error. It is expected that
358
    /// this limitation will be lifted in the future.
359
    ///
360
    /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
362
    /// overhead of capture states for explicit groups. Usually this occurs
363
    /// when one wants to use the `PikeVM` only for determining the overall
364
    /// match. Otherwise, the `PikeVM` could use much more memory than is
365
    /// necessary.
366
    ///
367
    /// # Example
368
    ///
369
    /// This example demonstrates that some regex engines, like the Pike VM,
370
    /// require capturing states to be present in the NFA to report match
371
    /// offsets.
372
    ///
373
    /// ```
374
    /// use regex_automata::nfa::thompson::{
375
    ///     pikevm::PikeVM,
376
    ///     NFA,
377
    ///     WhichCaptures,
378
    /// };
379
    ///
380
    /// let re = PikeVM::builder()
381
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
382
    ///     .build(r"[a-z]+")?;
383
    /// let mut cache = re.create_cache();
384
    ///
385
    /// assert!(re.is_match(&mut cache, "abc"));
386
    /// assert_eq!(None, re.find(&mut cache, "abc"));
387
    ///
388
    /// # Ok::<(), Box<dyn std::error::Error>>(())
389
    /// ```
390
    ///
391
    /// The same applies to the bounded backtracker:
392
    ///
393
    /// ```
394
    /// use regex_automata::nfa::thompson::{
395
    ///     backtrack::BoundedBacktracker,
396
    ///     NFA,
397
    ///     WhichCaptures,
398
    /// };
399
    ///
400
    /// let re = BoundedBacktracker::builder()
401
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
402
    ///     .build(r"[a-z]+")?;
403
    /// let mut cache = re.create_cache();
404
    ///
405
    /// assert!(re.try_is_match(&mut cache, "abc")?);
406
    /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407
    ///
408
    /// # Ok::<(), Box<dyn std::error::Error>>(())
409
    /// ```
410
138k
    pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411
138k
        self.which_captures = Some(which_captures);
412
138k
        self
413
138k
    }
414
415
    /// Sets the look-around matcher that should be used with this NFA.
416
    ///
417
    /// A look-around matcher determines how to match look-around assertions.
418
    /// In particular, some assertions are configurable. For example, the
419
    /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420
    /// from the default of `\n` to any other byte.
421
    ///
422
    /// # Example
423
    ///
424
    /// This shows how to change the line terminator for multi-line assertions.
425
    ///
426
    /// ```
427
    /// use regex_automata::{
428
    ///     nfa::thompson::{self, pikevm::PikeVM},
429
    ///     util::look::LookMatcher,
430
    ///     Match, Input,
431
    /// };
432
    ///
433
    /// let mut lookm = LookMatcher::new();
434
    /// lookm.set_line_terminator(b'\x00');
435
    ///
436
    /// let re = PikeVM::builder()
437
    ///     .thompson(thompson::Config::new().look_matcher(lookm))
438
    ///     .build(r"(?m)^[a-z]+$")?;
439
    /// let mut cache = re.create_cache();
440
    ///
441
    /// // Multi-line assertions now use NUL as a terminator.
442
    /// assert_eq!(
443
    ///     Some(Match::must(0, 1..4)),
444
    ///     re.find(&mut cache, b"\x00abc\x00"),
445
    /// );
446
    /// // ... and \n is no longer recognized as a terminator.
447
    /// assert_eq!(
448
    ///     None,
449
    ///     re.find(&mut cache, b"\nabc\n"),
450
    /// );
451
    ///
452
    /// # Ok::<(), Box<dyn std::error::Error>>(())
453
    /// ```
454
72.7k
    pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455
72.7k
        self.look_matcher = Some(m);
456
72.7k
        self
457
72.7k
    }
458
459
    /// Whether to compile an unanchored prefix into this NFA.
460
    ///
461
    /// This is enabled by default. It is made available for tests only to make
462
    /// it easier to unit test the output of the compiler.
463
    #[cfg(test)]
464
    fn unanchored_prefix(mut self, yes: bool) -> Config {
465
        self.unanchored_prefix = Some(yes);
466
        self
467
    }
468
469
    /// Returns whether this configuration has enabled UTF-8 mode.
470
138k
    pub fn get_utf8(&self) -> bool {
471
138k
        self.utf8.unwrap_or(true)
472
138k
    }
473
474
    /// Returns whether this configuration has enabled reverse NFA compilation.
475
189M
    pub fn get_reverse(&self) -> bool {
476
189M
        self.reverse.unwrap_or(false)
477
189M
    }
478
479
    /// Return the configured NFA size limit, if it exists, in the number of
480
    /// bytes of heap used.
481
138k
    pub fn get_nfa_size_limit(&self) -> Option<usize> {
482
138k
        self.nfa_size_limit.unwrap_or(None)
483
138k
    }
484
485
    /// Return whether NFA shrinking is enabled.
486
1.13M
    pub fn get_shrink(&self) -> bool {
487
1.13M
        self.shrink.unwrap_or(false)
488
1.13M
    }
489
490
    /// Return whether NFA compilation is configured to produce capture states.
491
    #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492
0
    pub fn get_captures(&self) -> bool {
493
0
        self.get_which_captures().is_any()
494
0
    }
495
496
    /// Return what kinds of capture states will be compiled into an NFA.
497
4.99M
    pub fn get_which_captures(&self) -> WhichCaptures {
498
4.99M
        self.which_captures.unwrap_or(WhichCaptures::All)
499
4.99M
    }
500
501
    /// Return the look-around matcher for this NFA.
502
138k
    pub fn get_look_matcher(&self) -> LookMatcher {
503
138k
        self.look_matcher.clone().unwrap_or(LookMatcher::default())
504
138k
    }
505
506
    /// Return whether NFA compilation is configured to include an unanchored
507
    /// prefix.
508
    ///
509
    /// This is always false when not in test mode.
510
138k
    fn get_unanchored_prefix(&self) -> bool {
511
        #[cfg(test)]
512
        {
513
            self.unanchored_prefix.unwrap_or(true)
514
        }
515
        #[cfg(not(test))]
516
        {
517
138k
            true
518
        }
519
138k
    }
520
521
    /// Overwrite the default configuration such that the options in `o` are
522
    /// always used. If an option in `o` is not set, then the corresponding
523
    /// option in `self` is used. If it's not set in `self` either, then it
524
    /// remains not set.
525
138k
    pub(crate) fn overwrite(&self, o: Config) -> Config {
526
        Config {
527
138k
            utf8: o.utf8.or(self.utf8),
528
138k
            reverse: o.reverse.or(self.reverse),
529
138k
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
530
138k
            shrink: o.shrink.or(self.shrink),
531
138k
            which_captures: o.which_captures.or(self.which_captures),
532
138k
            look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
533
            #[cfg(test)]
534
            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
535
        }
536
138k
    }
537
}
538
539
/// A configuration indicating which kinds of
540
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
541
///
542
/// This configuration can be used with [`Config::which_captures`] to control
543
/// which capture states are compiled into a Thompson NFA.
544
///
545
/// The default configuration is [`WhichCaptures::All`].
546
#[derive(Clone, Copy, Debug)]
547
pub enum WhichCaptures {
548
    /// All capture states, including those corresponding to both implicit and
549
    /// explicit capture groups, are included in the Thompson NFA.
550
    All,
551
    /// Only capture states corresponding to implicit capture groups are
552
    /// included. Implicit capture groups appear in every pattern implicitly
553
    /// and correspond to the overall match of a pattern.
554
    ///
555
    /// This is useful when one only cares about the overall match of a
556
    /// pattern. By excluding capture states from explicit capture groups,
557
    /// one might be able to reduce the memory usage of a multi-pattern regex
558
    /// substantially if it was otherwise written to have many explicit capture
559
    /// groups.
560
    Implicit,
561
    /// No capture states are compiled into the Thompson NFA.
562
    ///
563
    /// This is useful when capture states are either not needed (for example,
564
    /// if one is only trying to build a DFA) or if they aren't supported (for
565
    /// example, a reverse NFA).
566
    ///
567
    /// # Warning
568
    ///
569
    /// Callers must be exceedingly careful when using this
570
    /// option. In particular, not all regex engines support
571
    /// reporting match spans when using this option (for example,
572
    /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) or
573
    /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)).
574
    ///
575
    /// Perhaps more confusingly, using this option with such an
576
    /// engine means that an `is_match` routine could report `true`
577
    /// when `find` reports `None`. This is generally not something
578
    /// that _should_ happen, but the low level control provided by
579
    /// this crate makes it possible.
580
    ///
581
    /// Similarly, any regex engine (like [`meta::Regex`](crate::meta::Regex))
582
    /// should always return `None` from `find` routines when this option is
583
    /// used, even if _some_ of its internal engines could find the match
584
    /// boundaries. This is because inputs from user data could influence
585
    /// engine selection, and thus influence whether a match is found or not.
586
    /// Indeed, `meta::Regex::find` will always return `None` when configured
587
    /// with this option.
588
    None,
589
}
590
591
impl Default for WhichCaptures {
592
0
    fn default() -> WhichCaptures {
593
0
        WhichCaptures::All
594
0
    }
595
}
596
597
impl WhichCaptures {
598
    /// Returns true if this configuration indicates that no capture states
599
    /// should be produced in an NFA.
600
70.7k
    pub fn is_none(&self) -> bool {
601
70.7k
        matches!(*self, WhichCaptures::None)
602
70.7k
    }
603
604
    /// Returns true if this configuration indicates that some capture states
605
    /// should be added to an NFA. Note that this might only include capture
606
    /// states for implicit capture groups.
607
70.7k
    pub fn is_any(&self) -> bool {
608
70.7k
        !self.is_none()
609
70.7k
    }
610
}
611
612
/*
613
This compiler below uses Thompson's construction algorithm. The compiler takes
614
a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
615
is structured in a way that permits it to be executed by a virtual machine and
616
also used to efficiently build a DFA.
617
618
The compiler deals with a slightly expanded set of NFA states than what is
619
in a final NFA (as exhibited by builder::State and nfa::State). Notably a
620
compiler state includes an empty node that has exactly one unconditional
621
epsilon transition to the next state. In other words, it's a "goto" instruction
622
if one views Thompson's NFA as a set of bytecode instructions. These goto
623
instructions are removed in a subsequent phase before returning the NFA to the
624
caller. The purpose of these empty nodes is that they make the construction
625
algorithm substantially simpler to implement. We remove them before returning
626
to the caller because they can represent substantial overhead when traversing
627
the NFA graph (either while searching using the NFA directly or while building
628
a DFA).
629
630
In the future, it would be nice to provide a Glushkov compiler as well, as it
631
would work well as a bit-parallel NFA for smaller regexes. But the Thompson
632
construction is one I'm more familiar with and seems more straight-forward to
633
deal with when it comes to large Unicode character classes.
634
635
Internally, the compiler uses interior mutability to improve composition in the
636
face of the borrow checker. In particular, we'd really like to be able to write
637
things like this:
638
639
    self.c_concat(exprs.iter().map(|e| self.c(e)))
640
641
Which elegantly uses iterators to build up a sequence of compiled regex
642
sub-expressions and then hands it off to the concatenating compiler routine.
643
Without interior mutability, the borrow checker won't let us borrow `self`
644
mutably both inside and outside the closure at the same time.
645
*/
646
647
/// A builder for compiling an NFA from a regex's high-level intermediate
648
/// representation (HIR).
649
///
650
/// This compiler provides a way to translate a parsed regex pattern into an
651
/// NFA state graph. The NFA state graph can either be used directly to execute
652
/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
653
///
654
/// This compiler provides APIs both for compiling regex patterns directly from
655
/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
656
///
657
/// This compiler has various options that may be configured via
658
/// [`thompson::Config`](Config).
659
///
660
/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
661
/// A `Builder` provides a lower level API that is uncoupled from a regex
662
/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
663
/// together an NFA by hand. See its docs for examples.
664
///
665
/// # Example: compilation from concrete syntax
666
///
667
/// This shows how to compile an NFA from a pattern string while setting a size
668
/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
669
///
670
/// ```
671
/// use regex_automata::{
672
///     nfa::thompson::{NFA, pikevm::PikeVM},
673
///     Match,
674
/// };
675
///
676
/// let config = NFA::config().nfa_size_limit(Some(1_000));
677
/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
678
///
679
/// let re = PikeVM::new_from_nfa(nfa)?;
680
/// let mut cache = re.create_cache();
681
/// let mut caps = re.create_captures();
682
/// let expected = Some(Match::must(0, 3..4));
683
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
684
/// assert_eq!(expected, caps.get_match());
685
///
686
/// # Ok::<(), Box<dyn std::error::Error>>(())
687
/// ```
688
///
689
/// # Example: compilation from HIR
690
///
691
/// This shows how to hand assemble a regular expression via its HIR, and then
692
/// compile an NFA directly from it.
693
///
694
/// ```
695
/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
696
/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
697
///
698
/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
699
///     ClassBytesRange::new(b'0', b'9'),
700
///     ClassBytesRange::new(b'A', b'Z'),
701
///     ClassBytesRange::new(b'_', b'_'),
702
///     ClassBytesRange::new(b'a', b'z'),
703
/// ])));
704
///
705
/// let config = NFA::config().nfa_size_limit(Some(1_000));
706
/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
707
///
708
/// let re = PikeVM::new_from_nfa(nfa)?;
709
/// let mut cache = re.create_cache();
710
/// let mut caps = re.create_captures();
711
/// let expected = Some(Match::must(0, 3..4));
712
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
713
/// assert_eq!(expected, caps.get_match());
714
///
715
/// # Ok::<(), Box<dyn std::error::Error>>(())
716
/// ```
717
#[derive(Clone, Debug)]
718
pub struct Compiler {
719
    /// A regex parser, used when compiling an NFA directly from a pattern
720
    /// string.
721
    parser: ParserBuilder,
722
    /// The compiler configuration.
723
    config: Config,
724
    /// The builder for actually constructing an NFA. This provides a
725
    /// convenient abstraction for writing a compiler.
726
    builder: RefCell<Builder>,
727
    /// State used for compiling character classes to UTF-8 byte automata.
728
    /// State is not retained between character class compilations. This just
729
    /// serves to amortize allocation to the extent possible.
730
    utf8_state: RefCell<Utf8State>,
731
    /// State used for arranging character classes in reverse into a trie.
732
    trie_state: RefCell<RangeTrie>,
733
    /// State used for caching common suffixes when compiling reverse UTF-8
734
    /// automata (for Unicode character classes).
735
    utf8_suffix: RefCell<Utf8SuffixMap>,
736
}
737
738
impl Compiler {
739
    /// Create a new NFA builder with its default configuration.
740
506k
    pub fn new() -> Compiler {
741
506k
        Compiler {
742
506k
            parser: ParserBuilder::new(),
743
506k
            config: Config::default(),
744
506k
            builder: RefCell::new(Builder::new()),
745
506k
            utf8_state: RefCell::new(Utf8State::new()),
746
506k
            trie_state: RefCell::new(RangeTrie::new()),
747
506k
            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
748
506k
        }
749
506k
    }
750
751
    /// Compile the given regular expression pattern into an NFA.
752
    ///
753
    /// If there was a problem parsing the regex, then that error is returned.
754
    ///
755
    /// Otherwise, if there was a problem building the NFA, then an error is
756
    /// returned. The only error that can occur is if the compiled regex would
757
    /// exceed the size limits configured on this builder, or if any part of
758
    /// the NFA would exceed the integer representations used. (For example,
759
    /// too many states might plausibly occur on a 16-bit target.)
760
    ///
761
    /// # Example
762
    ///
763
    /// ```
764
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
765
    ///
766
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
767
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
768
    ///
769
    /// let re = PikeVM::new_from_nfa(nfa)?;
770
    /// let mut cache = re.create_cache();
771
    /// let mut caps = re.create_captures();
772
    /// let expected = Some(Match::must(0, 3..4));
773
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
774
    /// assert_eq!(expected, caps.get_match());
775
    ///
776
    /// # Ok::<(), Box<dyn std::error::Error>>(())
777
    /// ```
778
0
    pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
779
0
        self.build_many(&[pattern])
780
0
    }
781
782
    /// Compile the given regular expression patterns into a single NFA.
783
    ///
784
    /// When matches are returned, the pattern ID corresponds to the index of
785
    /// the pattern in the slice given.
786
    ///
787
    /// # Example
788
    ///
789
    /// ```
790
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
791
    ///
792
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
793
    /// let nfa = NFA::compiler().configure(config).build_many(&[
794
    ///     r"(?-u)\s",
795
    ///     r"(?-u)\w",
796
    /// ])?;
797
    ///
798
    /// let re = PikeVM::new_from_nfa(nfa)?;
799
    /// let mut cache = re.create_cache();
800
    /// let mut caps = re.create_captures();
801
    /// let expected = Some(Match::must(1, 1..2));
802
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
803
    /// assert_eq!(expected, caps.get_match());
804
    ///
805
    /// # Ok::<(), Box<dyn std::error::Error>>(())
806
    /// ```
807
0
    pub fn build_many<P: AsRef<str>>(
808
0
        &self,
809
0
        patterns: &[P],
810
0
    ) -> Result<NFA, BuildError> {
811
0
        let mut hirs = vec![];
812
0
        for p in patterns {
813
0
            hirs.push(
814
0
                self.parser
815
0
                    .build()
816
0
                    .parse(p.as_ref())
817
0
                    .map_err(BuildError::syntax)?,
818
            );
819
0
            debug!("parsed: {:?}", p.as_ref());
820
        }
821
0
        self.build_many_from_hir(&hirs)
822
0
    }
823
824
    /// Compile the given high level intermediate representation of a regular
825
    /// expression into an NFA.
826
    ///
827
    /// If there was a problem building the NFA, then an error is returned. The
828
    /// only error that can occur is if the compiled regex would exceed the
829
    /// size limits configured on this builder, or if any part of the NFA would
830
    /// exceed the integer representations used. (For example, too many states
831
    /// might plausibly occur on a 16-bit target.)
832
    ///
833
    /// # Example
834
    ///
835
    /// ```
836
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
837
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
838
    ///
839
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
840
    ///     ClassBytesRange::new(b'0', b'9'),
841
    ///     ClassBytesRange::new(b'A', b'Z'),
842
    ///     ClassBytesRange::new(b'_', b'_'),
843
    ///     ClassBytesRange::new(b'a', b'z'),
844
    /// ])));
845
    ///
846
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
847
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
848
    ///
849
    /// let re = PikeVM::new_from_nfa(nfa)?;
850
    /// let mut cache = re.create_cache();
851
    /// let mut caps = re.create_captures();
852
    /// let expected = Some(Match::must(0, 3..4));
853
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
854
    /// assert_eq!(expected, caps.get_match());
855
    ///
856
    /// # Ok::<(), Box<dyn std::error::Error>>(())
857
    /// ```
858
4.80k
    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
859
4.80k
        self.build_many_from_hir(&[expr])
860
4.80k
    }
861
862
    /// Compile the given high level intermediate representations of regular
863
    /// expressions into a single NFA.
864
    ///
865
    /// When matches are returned, the pattern ID corresponds to the index of
866
    /// the pattern in the slice given.
867
    ///
868
    /// # Example
869
    ///
870
    /// ```
871
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
872
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
873
    ///
874
    /// let hirs = &[
875
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
876
    ///         ClassBytesRange::new(b'\t', b'\r'),
877
    ///         ClassBytesRange::new(b' ', b' '),
878
    ///     ]))),
879
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
880
    ///         ClassBytesRange::new(b'0', b'9'),
881
    ///         ClassBytesRange::new(b'A', b'Z'),
882
    ///         ClassBytesRange::new(b'_', b'_'),
883
    ///         ClassBytesRange::new(b'a', b'z'),
884
    ///     ]))),
885
    /// ];
886
    ///
887
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
888
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
889
    ///
890
    /// let re = PikeVM::new_from_nfa(nfa)?;
891
    /// let mut cache = re.create_cache();
892
    /// let mut caps = re.create_captures();
893
    /// let expected = Some(Match::must(1, 1..2));
894
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
895
    /// assert_eq!(expected, caps.get_match());
896
    ///
897
    /// # Ok::<(), Box<dyn std::error::Error>>(())
898
    /// ```
899
138k
    pub fn build_many_from_hir<H: Borrow<Hir>>(
900
138k
        &self,
901
138k
        exprs: &[H],
902
138k
    ) -> Result<NFA, BuildError> {
903
138k
        self.compile(exprs)
904
138k
    }
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::build_many_from_hir::<regex_syntax::hir::Hir>
<regex_automata::nfa::thompson::compiler::Compiler>::build_many_from_hir::<&regex_syntax::hir::Hir>
Line
Count
Source
899
138k
    pub fn build_many_from_hir<H: Borrow<Hir>>(
900
138k
        &self,
901
138k
        exprs: &[H],
902
138k
    ) -> Result<NFA, BuildError> {
903
138k
        self.compile(exprs)
904
138k
    }
905
906
    /// Apply the given NFA configuration options to this builder.
907
    ///
908
    /// # Example
909
    ///
910
    /// ```
911
    /// use regex_automata::nfa::thompson::NFA;
912
    ///
913
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
914
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
915
    /// assert_eq!(nfa.pattern_len(), 1);
916
    ///
917
    /// # Ok::<(), Box<dyn std::error::Error>>(())
918
    /// ```
919
138k
    pub fn configure(&mut self, config: Config) -> &mut Compiler {
920
138k
        self.config = self.config.overwrite(config);
921
138k
        self
922
138k
    }
923
924
    /// Set the syntax configuration for this builder using
925
    /// [`syntax::Config`](crate::util::syntax::Config).
926
    ///
927
    /// This permits setting things like case insensitivity, Unicode and multi
928
    /// line mode.
929
    ///
930
    /// This syntax configuration only applies when an NFA is built directly
931
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
932
    /// settings are ignored.
933
    ///
934
    /// # Example
935
    ///
936
    /// ```
937
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
938
    ///
939
    /// let syntax_config = syntax::Config::new().unicode(false);
940
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
941
    /// // If Unicode were enabled, the number of states would be much bigger.
942
    /// assert!(nfa.states().len() < 15);
943
    ///
944
    /// # Ok::<(), Box<dyn std::error::Error>>(())
945
    /// ```
946
0
    pub fn syntax(
947
0
        &mut self,
948
0
        config: crate::util::syntax::Config,
949
0
    ) -> &mut Compiler {
950
0
        config.apply(&mut self.parser);
951
0
        self
952
0
    }
953
}
954
955
impl Compiler {
956
    /// Compile the sequence of HIR expressions given. Pattern IDs are
957
    /// allocated starting from 0, in correspondence with the slice given.
958
    ///
959
    /// It is legal to provide an empty slice. In that case, the NFA returned
960
    /// has no patterns and will never match anything.
961
138k
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
962
138k
        if exprs.len() > PatternID::LIMIT {
963
0
            return Err(BuildError::too_many_patterns(exprs.len()));
964
138k
        }
965
138k
        if self.config.get_reverse()
966
70.7k
            && self.config.get_which_captures().is_any()
967
        {
968
0
            return Err(BuildError::unsupported_captures());
969
138k
        }
970
971
138k
        self.builder.borrow_mut().clear();
972
138k
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
973
138k
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
974
138k
        self.builder
975
138k
            .borrow_mut()
976
138k
            .set_look_matcher(self.config.get_look_matcher());
977
138k
        self.builder
978
138k
            .borrow_mut()
979
138k
            .set_size_limit(self.config.get_nfa_size_limit())?;
980
981
        // We always add an unanchored prefix unless we were specifically told
982
        // not to (for tests only), or if we know that the regex is anchored
983
        // for all matches. When an unanchored prefix is not added, then the
984
        // NFA's anchored and unanchored start states are equivalent.
985
138k
        let all_anchored = exprs.iter().all(|e| {
986
138k
            let props = e.borrow().properties();
987
138k
            if self.config.get_reverse() {
988
70.7k
                props.look_set_suffix().contains(hir::Look::End)
989
            } else {
990
67.9k
                props.look_set_prefix().contains(hir::Look::Start)
991
            }
992
138k
        });
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>::{closure#0}
<regex_automata::nfa::thompson::compiler::Compiler>::compile::<&regex_syntax::hir::Hir>::{closure#0}
Line
Count
Source
985
138k
        let all_anchored = exprs.iter().all(|e| {
986
138k
            let props = e.borrow().properties();
987
138k
            if self.config.get_reverse() {
988
70.7k
                props.look_set_suffix().contains(hir::Look::End)
989
            } else {
990
67.9k
                props.look_set_prefix().contains(hir::Look::Start)
991
            }
992
138k
        });
993
138k
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
994
138k
        let unanchored_prefix = if anchored {
995
3.36k
            self.c_empty()?
996
        } else {
997
135k
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
998
        };
999
1000
138k
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
1001
138k
            let _ = self.start_pattern()?;
1002
138k
            let one = self.c_cap(0, None, e.borrow())?;
1003
136k
            let match_state_id = self.add_match()?;
1004
136k
            self.patch(one.end, match_state_id)?;
1005
136k
            let _ = self.finish_pattern(one.start)?;
1006
136k
            Ok(ThompsonRef { start: one.start, end: match_state_id })
1007
138k
        }))?;
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>::{closure#1}
<regex_automata::nfa::thompson::compiler::Compiler>::compile::<&regex_syntax::hir::Hir>::{closure#1}
Line
Count
Source
1000
138k
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
1001
138k
            let _ = self.start_pattern()?;
1002
138k
            let one = self.c_cap(0, None, e.borrow())?;
1003
136k
            let match_state_id = self.add_match()?;
1004
136k
            self.patch(one.end, match_state_id)?;
1005
136k
            let _ = self.finish_pattern(one.start)?;
1006
136k
            Ok(ThompsonRef { start: one.start, end: match_state_id })
1007
138k
        }))?;
1008
136k
        self.patch(unanchored_prefix.end, compiled.start)?;
1009
136k
        let nfa = self
1010
136k
            .builder
1011
136k
            .borrow_mut()
1012
136k
            .build(compiled.start, unanchored_prefix.start)?;
1013
1014
136k
        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
1015
136k
        Ok(nfa)
1016
138k
    }
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>
<regex_automata::nfa::thompson::compiler::Compiler>::compile::<&regex_syntax::hir::Hir>
Line
Count
Source
961
138k
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
962
138k
        if exprs.len() > PatternID::LIMIT {
963
0
            return Err(BuildError::too_many_patterns(exprs.len()));
964
138k
        }
965
138k
        if self.config.get_reverse()
966
70.7k
            && self.config.get_which_captures().is_any()
967
        {
968
0
            return Err(BuildError::unsupported_captures());
969
138k
        }
970
971
138k
        self.builder.borrow_mut().clear();
972
138k
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
973
138k
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
974
138k
        self.builder
975
138k
            .borrow_mut()
976
138k
            .set_look_matcher(self.config.get_look_matcher());
977
138k
        self.builder
978
138k
            .borrow_mut()
979
138k
            .set_size_limit(self.config.get_nfa_size_limit())?;
980
981
        // We always add an unanchored prefix unless we were specifically told
982
        // not to (for tests only), or if we know that the regex is anchored
983
        // for all matches. When an unanchored prefix is not added, then the
984
        // NFA's anchored and unanchored start states are equivalent.
985
138k
        let all_anchored = exprs.iter().all(|e| {
986
            let props = e.borrow().properties();
987
            if self.config.get_reverse() {
988
                props.look_set_suffix().contains(hir::Look::End)
989
            } else {
990
                props.look_set_prefix().contains(hir::Look::Start)
991
            }
992
        });
993
138k
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
994
138k
        let unanchored_prefix = if anchored {
995
3.36k
            self.c_empty()?
996
        } else {
997
135k
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
998
        };
999
1000
138k
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
1001
            let _ = self.start_pattern()?;
1002
            let one = self.c_cap(0, None, e.borrow())?;
1003
            let match_state_id = self.add_match()?;
1004
            self.patch(one.end, match_state_id)?;
1005
            let _ = self.finish_pattern(one.start)?;
1006
            Ok(ThompsonRef { start: one.start, end: match_state_id })
1007
2.13k
        }))?;
1008
136k
        self.patch(unanchored_prefix.end, compiled.start)?;
1009
136k
        let nfa = self
1010
136k
            .builder
1011
136k
            .borrow_mut()
1012
136k
            .build(compiled.start, unanchored_prefix.start)?;
1013
1014
136k
        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
1015
136k
        Ok(nfa)
1016
138k
    }
1017
1018
    /// Compile an arbitrary HIR expression.
1019
82.4M
    fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
1020
        use regex_syntax::hir::{Class, HirKind::*};
1021
1022
82.4M
        match *expr.kind() {
1023
1.09M
            Empty => self.c_empty(),
1024
53.8M
            Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1025
3.90M
            Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1026
5.46M
            Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1027
5.76M
            Look(ref look) => self.c_look(look),
1028
3.49M
            Repetition(ref rep) => self.c_repetition(rep),
1029
4.78M
            Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1030
4.30M
            Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1031
3.10M
            Alternation(ref es) => self.c_alt_slice(es),
1032
        }
1033
82.4M
    }
1034
1035
    /// Compile a concatenation of the sub-expressions yielded by the given
1036
    /// iterator. If the iterator yields no elements, then this compiles down
1037
    /// to an "empty" state that always matches.
1038
    ///
1039
    /// If the compiler is in reverse mode, then the expressions given are
1040
    /// automatically compiled in reverse.
1041
55.0M
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1042
55.0M
    where
1043
55.0M
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1044
    {
1045
55.0M
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1046
55.0M
        let ThompsonRef { start, mut end } = match first {
1047
55.0M
            Some(result) => result?,
1048
9.94k
            None => return self.c_empty(),
1049
        };
1050
        loop {
1051
124M
            let next =
1052
124M
                if self.is_reverse() { it.next_back() } else { it.next() };
1053
124M
            let compiled = match next {
1054
69.5M
                Some(result) => result?,
1055
55.0M
                None => break,
1056
            };
1057
69.5M
            self.patch(end, compiled.start)?;
1058
69.5M
            end = compiled.end;
1059
        }
1060
55.0M
        Ok(ThompsonRef { start, end })
1061
55.0M
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<u8>>, <regex_automata::nfa::thompson::compiler::Compiler>::c_literal::{closure#0}>>
Line
Count
Source
1041
53.8M
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1042
53.8M
    where
1043
53.8M
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1044
    {
1045
53.8M
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1046
53.8M
        let ThompsonRef { start, mut end } = match first {
1047
53.8M
            Some(result) => result?,
1048
0
            None => return self.c_empty(),
1049
        };
1050
        loop {
1051
65.0M
            let next =
1052
65.0M
                if self.is_reverse() { it.next_back() } else { it.next() };
1053
65.0M
            let compiled = match next {
1054
11.1M
                Some(result) => result?,
1055
53.8M
                None => break,
1056
            };
1057
11.1M
            self.patch(end, compiled.start)?;
1058
11.1M
            end = compiled.end;
1059
        }
1060
53.8M
        Ok(ThompsonRef { start, end })
1061
53.8M
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_automata::nfa::thompson::compiler::Compiler>::c_exactly::{closure#0}>>
Line
Count
Source
1041
287k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1042
287k
    where
1043
287k
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1044
    {
1045
287k
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1046
287k
        let ThompsonRef { start, mut end } = match first {
1047
277k
            Some(result) => result?,
1048
9.94k
            None => return self.c_empty(),
1049
        };
1050
        loop {
1051
55.3M
            let next =
1052
55.3M
                if self.is_reverse() { it.next_back() } else { it.next() };
1053
55.3M
            let compiled = match next {
1054
55.0M
                Some(result) => result?,
1055
270k
                None => break,
1056
            };
1057
55.0M
            self.patch(end, compiled.start)?;
1058
55.0M
            end = compiled.end;
1059
        }
1060
270k
        Ok(ThompsonRef { start, end })
1061
287k
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::c::{closure#0}>>
Line
Count
Source
1041
928k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1042
928k
    where
1043
928k
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1044
    {
1045
928k
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1046
928k
        let ThompsonRef { start, mut end } = match first {
1047
928k
            Some(result) => result?,
1048
0
            None => return self.c_empty(),
1049
        };
1050
        loop {
1051
4.30M
            let next =
1052
4.30M
                if self.is_reverse() { it.next_back() } else { it.next() };
1053
4.30M
            let compiled = match next {
1054
3.37M
                Some(result) => result?,
1055
927k
                None => break,
1056
            };
1057
3.37M
            self.patch(end, compiled.start)?;
1058
3.37M
            end = compiled.end;
1059
        }
1060
927k
        Ok(ThompsonRef { start, end })
1061
928k
    }
1062
1063
    /// Compile an alternation of the given HIR values.
1064
    ///
1065
    /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1066
    /// of an iterator of compiled NFA sub-graphs. The point of accepting a
1067
    /// slice here is that it opens up some optimization opportunities. For
1068
    /// example, if all of the HIR values are literals, then this routine might
1069
    /// re-shuffle them to make NFA epsilon closures substantially faster.
1070
3.10M
    fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1071
        // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1072
3.10M
        let literal_count = exprs
1073
3.10M
            .iter()
1074
10.9M
            .filter(|e| {
1075
10.9M
                matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1076
10.9M
            })
1077
3.10M
            .count();
1078
3.10M
        if literal_count <= 1 || literal_count < exprs.len() {
1079
10.1M
            return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1080
252k
        }
1081
1082
252k
        let mut trie = if self.is_reverse() {
1083
66.4k
            LiteralTrie::reverse()
1084
        } else {
1085
186k
            LiteralTrie::forward()
1086
        };
1087
750k
        for expr in exprs.iter() {
1088
750k
            let literal = match *expr.kind() {
1089
750k
                hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1090
0
                _ => unreachable!(),
1091
            };
1092
750k
            trie.add(literal)?;
1093
        }
1094
252k
        trie.compile(&mut self.builder.borrow_mut())
1095
3.10M
    }
1096
1097
    /// Compile an alternation, where each element yielded by the given
1098
    /// iterator represents an item in the alternation. If the iterator yields
1099
    /// no elements, then this compiles down to a "fail" state.
1100
    ///
1101
    /// In an alternation, expressions appearing earlier are "preferred" at
1102
    /// match time over expressions appearing later. At least, this is true
1103
    /// when using "leftmost first" match semantics. (If "leftmost longest" are
1104
    /// ever added in the future, then this preference order of priority would
1105
    /// not apply in that mode.)
1106
2.99M
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1107
2.99M
    where
1108
2.99M
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1109
    {
1110
2.99M
        let first = match it.next() {
1111
0
            None => return self.c_fail(),
1112
2.99M
            Some(result) => result?,
1113
        };
1114
2.98M
        let second = match it.next() {
1115
136k
            None => return Ok(first),
1116
2.85M
            Some(result) => result?,
1117
        };
1118
1119
2.85M
        let union = self.add_union()?;
1120
2.85M
        let end = self.add_empty()?;
1121
2.85M
        self.patch(union, first.start)?;
1122
2.85M
        self.patch(first.end, end)?;
1123
2.85M
        self.patch(union, second.start)?;
1124
2.85M
        self.patch(second.end, end)?;
1125
7.33M
        for result in it {
1126
4.48M
            let compiled = result?;
1127
4.48M
            self.patch(union, compiled.start)?;
1128
4.48M
            self.patch(compiled.end, end)?;
1129
        }
1130
2.85M
        Ok(ThompsonRef { start: union, end })
1131
2.99M
    }
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::compile<regex_syntax::hir::Hir>::{closure#1}>>
<regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::c_alt_slice::{closure#1}>>
Line
Count
Source
1106
2.85M
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1107
2.85M
    where
1108
2.85M
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1109
    {
1110
2.85M
        let first = match it.next() {
1111
0
            None => return self.c_fail(),
1112
2.85M
            Some(result) => result?,
1113
        };
1114
2.85M
        let second = match it.next() {
1115
0
            None => return Ok(first),
1116
2.85M
            Some(result) => result?,
1117
        };
1118
1119
2.85M
        let union = self.add_union()?;
1120
2.85M
        let end = self.add_empty()?;
1121
2.85M
        self.patch(union, first.start)?;
1122
2.85M
        self.patch(first.end, end)?;
1123
2.85M
        self.patch(union, second.start)?;
1124
2.85M
        self.patch(second.end, end)?;
1125
7.33M
        for result in it {
1126
4.48M
            let compiled = result?;
1127
4.48M
            self.patch(union, compiled.start)?;
1128
4.48M
            self.patch(compiled.end, end)?;
1129
        }
1130
2.85M
        Ok(ThompsonRef { start: union, end })
1131
2.85M
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<&regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::compile<&regex_syntax::hir::Hir>::{closure#1}>>
Line
Count
Source
1106
138k
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1107
138k
    where
1108
138k
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1109
    {
1110
138k
        let first = match it.next() {
1111
0
            None => return self.c_fail(),
1112
138k
            Some(result) => result?,
1113
        };
1114
136k
        let second = match it.next() {
1115
136k
            None => return Ok(first),
1116
0
            Some(result) => result?,
1117
        };
1118
1119
0
        let union = self.add_union()?;
1120
0
        let end = self.add_empty()?;
1121
0
        self.patch(union, first.start)?;
1122
0
        self.patch(first.end, end)?;
1123
0
        self.patch(union, second.start)?;
1124
0
        self.patch(second.end, end)?;
1125
0
        for result in it {
1126
0
            let compiled = result?;
1127
0
            self.patch(union, compiled.start)?;
1128
0
            self.patch(compiled.end, end)?;
1129
        }
1130
0
        Ok(ThompsonRef { start: union, end })
1131
138k
    }
1132
1133
    /// Compile the given capture sub-expression. `expr` should be the
1134
    /// sub-expression contained inside the capture. If "capture" states are
1135
    /// enabled, then they are added as appropriate.
1136
    ///
1137
    /// This accepts the pieces of a capture instead of a `hir::Capture` so
1138
    /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1139
    /// adding the entire pattern as if it were a group in order to create
1140
    /// appropriate "capture" states in the NFA.
1141
4.92M
    fn c_cap(
1142
4.92M
        &self,
1143
4.92M
        index: u32,
1144
4.92M
        name: Option<&str>,
1145
4.92M
        expr: &Hir,
1146
4.92M
    ) -> Result<ThompsonRef, BuildError> {
1147
4.92M
        match self.config.get_which_captures() {
1148
            // No capture states means we always skip them.
1149
2.05M
            WhichCaptures::None => return self.c(expr),
1150
            // Implicit captures states means we only add when index==0 since
1151
            // index==0 implies the group is implicit.
1152
0
            WhichCaptures::Implicit if index > 0 => return self.c(expr),
1153
2.86M
            _ => {}
1154
        }
1155
1156
2.86M
        let start = self.add_capture_start(index, name)?;
1157
2.86M
        let inner = self.c(expr)?;
1158
2.86M
        let end = self.add_capture_end(index)?;
1159
2.86M
        self.patch(start, inner.start)?;
1160
2.86M
        self.patch(inner.end, end)?;
1161
2.86M
        Ok(ThompsonRef { start, end })
1162
4.92M
    }
1163
1164
    /// Compile the given repetition expression. This handles all types of
1165
    /// repetitions and greediness.
1166
3.49M
    fn c_repetition(
1167
3.49M
        &self,
1168
3.49M
        rep: &hir::Repetition,
1169
3.49M
    ) -> Result<ThompsonRef, BuildError> {
1170
3.49M
        match (rep.min, rep.max) {
1171
1.53M
            (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1172
1.72M
            (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1173
229k
            (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1174
18.7k
            (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1175
        }
1176
3.49M
    }
1177
1178
    /// Compile the given expression such that it matches at least `min` times,
    /// but no more than `max` times.
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_bounded(
        &self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> Result<ThompsonRef, BuildError> {
        // The mandatory part: 'min' copies of 'expr' concatenated together.
        let prefix = self.c_exactly(expr, min)?;
        if min == max {
            return Ok(prefix);
        }

        // It is tempting here to compile the rest here as a concatenation
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
        // were `aaa?a?a?`. The problem here is that it leads to this program:
        //
        //     >000000: 61 => 01
        //      000001: 61 => 02
        //      000002: union(03, 04)
        //      000003: 61 => 04
        //      000004: union(05, 06)
        //      000005: 61 => 06
        //      000006: union(07, 08)
        //      000007: 61 => 08
        //      000008: MATCH
        //
        // And effectively, once you hit state 2, the epsilon closure will
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
        // to instead compile it like so:
        //
        //     >000000: 61 => 01
        //      000001: 61 => 02
        //      000002: union(03, 08)
        //      000003: 61 => 04
        //      000004: union(05, 08)
        //      000005: 61 => 06
        //      000006: union(07, 08)
        //      000007: 61 => 08
        //      000008: MATCH
        //
        // So that the epsilon closure of state 2 is now just 3 and 8.
        let empty = self.add_empty()?;
        let mut prev_end = prefix.end;
        // Each optional copy gets a union that chooses between matching one
        // more copy of 'expr' and jumping straight to the shared 'empty' exit
        // state. Greediness decides the priority order of those two choices.
        for _ in min..max {
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let compiled = self.c(expr)?;
            self.patch(prev_end, union)?;
            self.patch(union, compiled.start)?;
            self.patch(union, empty)?;
            prev_end = compiled.end;
        }
        // The last copy's end also flows into the shared exit state.
        self.patch(prev_end, empty)?;
        Ok(ThompsonRef { start: prefix.start, end: empty })
    }
1242
1243
    /// Compile the given expression such that it may be matched `n` or more
    /// times, where `n` can be any integer. (Although a particularly large
    /// integer is likely to run afoul of any configured size limits.)
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                // Loop back into the union so the sub-expression may repeat;
                // the union is both entry and exit of this sub-graph.
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            // The '+' part: repeat 'expr' one or more times.
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            // The '?' part: make the whole '+' construction optional.
            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // 'x{1,}' is just 'x+': compile 'x' once and loop back through a
            // union that may repeat it.
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // 'x{n,}' with n >= 2: 'n-1' mandatory copies, followed by one
            // final copy that may repeat indefinitely.
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }
1324
1325
    /// Compile the given expression such that it may be matched zero or one
1326
    /// times.
1327
    ///
1328
    /// When `greedy` is true, then the preference is for the expression to
1329
    /// match as much as possible. Otherwise, it will match as little as
1330
    /// possible.
1331
1.53M
    fn c_zero_or_one(
1332
1.53M
        &self,
1333
1.53M
        expr: &Hir,
1334
1.53M
        greedy: bool,
1335
1.53M
    ) -> Result<ThompsonRef, BuildError> {
1336
1.53M
        let union =
1337
1.53M
            if greedy { self.add_union() } else { self.add_union_reverse() }?;
1338
1.53M
        let compiled = self.c(expr)?;
1339
1.53M
        let empty = self.add_empty()?;
1340
1.53M
        self.patch(union, compiled.start)?;
1341
1.53M
        self.patch(union, empty)?;
1342
1.53M
        self.patch(compiled.end, empty)?;
1343
1.53M
        Ok(ThompsonRef { start: union, end: empty })
1344
1.53M
    }
1345
1346
    /// Compile the given HIR expression exactly `n` times.
1347
287k
    fn c_exactly(
1348
287k
        &self,
1349
287k
        expr: &Hir,
1350
287k
        n: u32,
1351
287k
    ) -> Result<ThompsonRef, BuildError> {
1352
55.3M
        let it = (0..n).map(|_| self.c(expr));
1353
287k
        self.c_concat(it)
1354
287k
    }
1355
1356
    /// Compile the given byte oriented character class.
1357
    ///
1358
    /// This uses "sparse" states to represent an alternation between ranges in
1359
    /// this character class. We can use "sparse" states instead of stitching
1360
    /// together a "union" state because all ranges in a character class have
1361
    /// equal priority *and* are non-overlapping (thus, only one can match, so
1362
    /// there's never a question of priority in the first place). This saves a
1363
    /// fair bit of overhead when traversing an NFA.
1364
    ///
1365
    /// This routine compiles an empty character class into a "fail" state.
1366
3.90M
    fn c_byte_class(
1367
3.90M
        &self,
1368
3.90M
        cls: &hir::ClassBytes,
1369
3.90M
    ) -> Result<ThompsonRef, BuildError> {
1370
3.90M
        let end = self.add_empty()?;
1371
3.90M
        let mut trans = Vec::with_capacity(cls.ranges().len());
1372
6.18M
        for r in cls.iter() {
1373
6.18M
            trans.push(Transition {
1374
6.18M
                start: r.start(),
1375
6.18M
                end: r.end(),
1376
6.18M
                next: end,
1377
6.18M
            });
1378
6.18M
        }
1379
3.90M
        Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1380
3.90M
    }
1381
1382
    /// Compile the given Unicode character class.
    ///
    /// This routine specifically tries to use various types of compression,
    /// since UTF-8 automata of large classes can get quite large. The specific
    /// type of compression used depends on forward vs reverse compilation, and
    /// whether NFA shrinking is enabled or not.
    ///
    /// Aside from repetitions causing lots of repeated groups, this is like
    /// the single most expensive part of regex compilation. Therefore, a large
    /// part of the expense of compilation may be reduced by disabling Unicode
    /// in the pattern.
    ///
    /// This routine compiles an empty character class into a "fail" state.
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // The unwraps below are OK because we've verified that this
                // class only contains ASCII codepoints.
                trans.push(Transition {
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
                    start: u8::try_from(u32::from(r.start())).unwrap(),
                    end: u8::try_from(u32::from(r.end())).unwrap(),
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut builder = self.builder.borrow_mut();
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c =
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut builder = self.builder.borrow_mut();
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c =
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
        // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
        // But the integration test suite should still pass.
        //
        // One good example of the substantial difference this can make is to
        // compare and contrast performance of the Pike VM when the code below
        // is active vs the code above. Here's an example to try:
        //
        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
        //
        // With Unicode classes generated below, this search takes about 45s on
        // my machine. But with the compressed version above, the search takes
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
        // to the compression, but also because of the utilization of 'sparse'
        // NFA states. They lead to much less state shuffling during the NFA
        // search.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alt_iter(it)
        */
    }
1515
1516
    /// Compile the given Unicode character class in reverse with suffix
    /// caching.
    ///
    /// This is a "quick" way to compile large Unicode classes into reverse
    /// UTF-8 automata while doing a small amount of compression on that
    /// automata by reusing common suffixes.
    ///
    /// A more comprehensive compression scheme can be accomplished by using
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
    ///
    /// This is the technique used when "NFA shrinking" is disabled.
    ///
    /// (This also tries to use "sparse" states where possible, just like
    /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        // Every UTF-8 sequence is an alternate branching off of 'union' and
        // ending in the shared 'alt_end' state.
        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Build each sequence back-to-front so that identical
                // suffixes (keyed on the byte range plus the state it flows
                // into) can be shared between sequences.
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        // Cache hit: reuse the previously built state.
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
1569
1570
    /// Compile the given HIR look-around assertion to an NFA look-around
1571
    /// assertion.
1572
5.76M
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
1573
5.76M
        let look = match *anchor {
1574
477k
            hir::Look::Start => Look::Start,
1575
425k
            hir::Look::End => Look::End,
1576
55.7k
            hir::Look::StartLF => Look::StartLF,
1577
92.3k
            hir::Look::EndLF => Look::EndLF,
1578
71.4k
            hir::Look::StartCRLF => Look::StartCRLF,
1579
68.9k
            hir::Look::EndCRLF => Look::EndCRLF,
1580
1.00M
            hir::Look::WordAscii => Look::WordAscii,
1581
335k
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
1582
928k
            hir::Look::WordUnicode => Look::WordUnicode,
1583
189k
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1584
506k
            hir::Look::WordStartAscii => Look::WordStartAscii,
1585
296k
            hir::Look::WordEndAscii => Look::WordEndAscii,
1586
156k
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
1587
176k
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
1588
321k
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
1589
280k
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
1590
96.5k
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
1591
283k
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
1592
        };
1593
5.76M
        let id = self.add_look(look)?;
1594
5.76M
        Ok(ThompsonRef { start: id, end: id })
1595
5.76M
    }
1596
1597
    /// Compile the given byte string to a concatenation of bytes.
1598
53.8M
    fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1599
65.0M
        self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1600
53.8M
    }
1601
1602
    /// Compile a "range" state with one transition that may only be followed
1603
    /// if the input byte is in the (inclusive) range given.
1604
    ///
1605
    /// Both the `start` and `end` locations point to the state created.
1606
    /// Callers will likely want to keep the `start`, but patch the `end` to
1607
    /// point to some other state.
1608
113M
    fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1609
113M
        let id = self.add_range(start, end)?;
1610
113M
        Ok(ThompsonRef { start: id, end: id })
1611
113M
    }
1612
1613
    /// Compile an "empty" state with one unconditional epsilon transition.
1614
    ///
1615
    /// Both the `start` and `end` locations point to the state created.
1616
    /// Callers will likely want to keep the `start`, but patch the `end` to
1617
    /// point to some other state.
1618
1.10M
    fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1619
1.10M
        let id = self.add_empty()?;
1620
1.10M
        Ok(ThompsonRef { start: id, end: id })
1621
1.10M
    }
1622
1623
    /// Compile a "fail" state that can never have any outgoing transitions.
1624
0
    fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1625
0
        let id = self.add_fail()?;
1626
0
        Ok(ThompsonRef { start: id, end: id })
1627
0
    }
1628
1629
    // The below helpers are meant to be simple wrappers around the
1630
    // corresponding Builder methods. For the most part, they let us write
1631
    // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1632
    // the latter is a mouthful. Some of the methods do inject a little bit
1633
    // of extra logic. e.g., Flipping look-around operators when compiling in
1634
    // reverse mode.
1635
1636
190M
    /// Delegate to the underlying builder: make `from` point at `to`.
    fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.patch(from, to)
    }
1639
1640
138k
    /// Delegate to the underlying builder's `start_pattern`.
    fn start_pattern(&self) -> Result<PatternID, BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.start_pattern()
    }
1643
1644
136k
    fn finish_pattern(
1645
136k
        &self,
1646
136k
        start_id: StateID,
1647
136k
    ) -> Result<PatternID, BuildError> {
1648
136k
        self.builder.borrow_mut().finish_pattern(start_id)
1649
136k
    }
1650
1651
12.7M
    /// Add an "empty" state via the underlying builder.
    fn add_empty(&self) -> Result<StateID, BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.add_empty()
    }
1654
1655
113M
    fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1656
113M
        self.builder.borrow_mut().add_range(Transition {
1657
113M
            start,
1658
113M
            end,
1659
113M
            next: StateID::ZERO,
1660
113M
        })
1661
113M
    }
1662
1663
5.83M
    fn add_sparse(
1664
5.83M
        &self,
1665
5.83M
        ranges: Vec<Transition>,
1666
5.83M
    ) -> Result<StateID, BuildError> {
1667
5.83M
        self.builder.borrow_mut().add_sparse(ranges)
1668
5.83M
    }
1669
1670
5.76M
    fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1671
5.76M
        if self.is_reverse() {
1672
2.81M
            look = look.reversed();
1673
2.94M
        }
1674
5.76M
        self.builder.borrow_mut().add_look(StateID::ZERO, look)
1675
5.76M
    }
1676
1677
9.13M
    /// Add an empty "union" state (alternates tried in order) via the
    /// underlying builder.
    fn add_union(&self) -> Result<StateID, BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.add_union(Vec::new())
    }
1680
1681
2.80M
    /// Add an empty "reverse union" state (alternates tried in reverse
    /// priority order) via the underlying builder.
    fn add_union_reverse(&self) -> Result<StateID, BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.add_union_reverse(Vec::new())
    }
1684
1685
2.86M
    fn add_capture_start(
1686
2.86M
        &self,
1687
2.86M
        capture_index: u32,
1688
2.86M
        name: Option<&str>,
1689
2.86M
    ) -> Result<StateID, BuildError> {
1690
2.86M
        let name = name.map(Arc::from);
1691
2.86M
        self.builder.borrow_mut().add_capture_start(
1692
2.86M
            StateID::ZERO,
1693
2.86M
            capture_index,
1694
2.86M
            name,
1695
2.86M
        )
1696
2.86M
    }
1697
1698
2.86M
    fn add_capture_end(
1699
2.86M
        &self,
1700
2.86M
        capture_index: u32,
1701
2.86M
    ) -> Result<StateID, BuildError> {
1702
2.86M
        self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1703
2.86M
    }
1704
1705
0
    /// Add a "fail" state via the underlying builder.
    fn add_fail(&self) -> Result<StateID, BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.add_fail()
    }
1708
1709
136k
    /// Add a "match" state via the underlying builder.
    fn add_match(&self) -> Result<StateID, BuildError> {
        let mut builder = self.builder.borrow_mut();
        builder.add_match()
    }
1712
1713
189M
    /// Returns true when this compiler is configured to build a reverse NFA.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1716
}
1717
1718
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The entry state of the compiled sub-graph.
    pub(crate) start: StateID,
    /// The exit state of the compiled sub-graph. Callers typically `patch`
    /// this state to point at whatever comes next.
    pub(crate) end: StateID,
}
1726
1727
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Scratch state (the compiled-state cache and the stack of uncompiled
    /// nodes), borrowed from the regex compiler so allocations are reused.
    state: &'a mut Utf8State,
    /// The shared final state that all compiled UTF-8 paths lead to.
    target: StateID,
}
1753
1754
/// Scratch state used by `Utf8Compiler`. It lives on the regex compiler
/// (behind a `RefCell`) so that its allocations can be reused across the
/// compilation of many character classes.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache mapping previously compiled transition sets to their
    /// state IDs, which enables reuse of equivalent states.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been frozen into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1759
1760
/// A node on the `Utf8State::uncompiled` stack.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose targets have already been determined.
    trans: Vec<Transition>,
    /// The most recently added byte range for this node, whose target state
    /// has not yet been assigned. `None` only for a freshly pushed node.
    last: Option<Utf8LastTransition>,
}
1765
1766
/// A pending byte range whose destination state has not yet been determined.
/// It is converted into a full `Transition` by `Utf8Node::set_last_transition`.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// Start of the byte range (inclusive).
    start: u8,
    /// End of the byte range (inclusive).
    end: u8,
}
1771
1772
impl Utf8State {
1773
506k
    fn new() -> Utf8State {
1774
506k
        Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1775
506k
    }
1776
1777
2.39M
    fn clear(&mut self) {
1778
2.39M
        self.compiled.clear();
1779
2.39M
        self.uncompiled.clear();
1780
2.39M
    }
1781
}
1782
1783
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes states into the given builder.
    ///
    /// This allocates the shared final state, resets the scratch state and
    /// seeds the uncompiled stack with a single empty root node.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish the automaton: freeze all remaining uncompiled nodes and return
    /// a `ThompsonRef` from the compiled root to the shared final state.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 sequence (a list of byte ranges) to the automaton.
    ///
    /// Sequences must be fed in lexicographic order (Daciuk's algorithm; see
    /// the type docs). This finds the prefix shared with the pending
    /// uncompiled nodes, compiles everything below that prefix and then
    /// pushes the new suffix.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // A sequence identical to the previous one would have a full-length
        // shared prefix, which this construction cannot accept.
        assert!(prefix_len < ranges.len());
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Freeze and compile every uncompiled node above position `from`,
    /// threading each compiled state into the pending transition of the node
    /// below it. The node at `from` keeps its pending transition but has its
    /// target filled in.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Turn a frozen node into a sparse NFA state, reusing a previously
    /// compiled equivalent state from the bounded cache when possible.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given non-empty suffix of byte ranges onto the uncompiled
    /// stack. The first range becomes the pending transition of the current
    /// top node; each remaining range becomes a new node of its own.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions and no pending range.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, pointing its pending transition at
    /// `next`, and return its now-complete transition list.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. Only valid once everything above it has been
    /// compiled, i.e., when the root is the sole remaining node with no
    /// pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Point the pending transition of the current top node at `next`
    /// without popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1888
1889
impl Utf8Node {
1890
95.0M
    fn set_last_transition(&mut self, next: StateID) {
1891
95.0M
        if let Some(last) = self.last.take() {
1892
92.6M
            self.trans.push(Transition {
1893
92.6M
                start: last.start,
1894
92.6M
                end: last.end,
1895
92.6M
                next,
1896
92.6M
            });
1897
92.6M
        }
1898
95.0M
    }
1899
}
1900
1901
#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    // Compile `pattern` into an NFA with captures and the unanchored
    // prefix disabled, so the expected state lists below stay minimal.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    // Shorthand for constructing a PatternID; panics on invalid IDs, which
    // is fine inside tests.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    // Shorthand for constructing a StateID; panics on invalid IDs.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    // A state with a single-byte transition to state `next`.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    // A state with one inclusive byte-range transition to state `next`.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    // A sparse state built from `(start, end, next)` triples.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    // A look-around assertion state.
    fn s_look(look: Look, next: usize) -> State {
        let next = sid(next);
        State::Look { look, next }
    }

    // A union state with exactly two unconditional epsilon transitions.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    // A union state with an arbitrary number of epsilon transitions.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    // A capture state recording group `index`/`slot` for pattern `pattern`.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    // A state that never matches anything.
    fn s_fail() -> State {
        State::Fail
    }

    // A match state for pattern `id`.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }

    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        assert_eq!(
            build("☃").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-z☃]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}