Coverage Report

Created: 2025-07-23 07:16

/src/regex/regex-automata/src/nfa/thompson/compiler.rs
Line
Count
Source (jump to first uncovered line)
1
use core::{borrow::Borrow, cell::RefCell};
2
3
use alloc::{sync::Arc, vec, vec::Vec};
4
5
use regex_syntax::{
6
    hir::{self, Hir},
7
    utf8::{Utf8Range, Utf8Sequences},
8
    ParserBuilder,
9
};
10
11
use crate::{
12
    nfa::thompson::{
13
        builder::Builder,
14
        error::BuildError,
15
        literal_trie::LiteralTrie,
16
        map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17
        nfa::{Transition, NFA},
18
        range_trie::RangeTrie,
19
    },
20
    util::{
21
        look::{Look, LookMatcher},
22
        primitives::{PatternID, StateID},
23
    },
24
};
25
26
/// The configuration used for a Thompson NFA compiler.
27
#[derive(Clone, Debug, Default)]
28
pub struct Config {
29
    utf8: Option<bool>,
30
    reverse: Option<bool>,
31
    nfa_size_limit: Option<Option<usize>>,
32
    shrink: Option<bool>,
33
    which_captures: Option<WhichCaptures>,
34
    look_matcher: Option<LookMatcher>,
35
    #[cfg(test)]
36
    unanchored_prefix: Option<bool>,
37
}
38
39
impl Config {
40
    /// Return a new default Thompson NFA compiler configuration.
41
72.5k
    pub fn new() -> Config {
42
72.5k
        Config::default()
43
72.5k
    }
44
45
    /// Whether to enable UTF-8 mode during search or not.
46
    ///
47
    /// A regex engine is said to be in UTF-8 mode when it guarantees that
48
    /// all matches returned by it have spans consisting of only valid UTF-8.
49
    /// That is, it is impossible for a match span to be returned that
50
    /// contains any invalid UTF-8.
51
    ///
52
    /// UTF-8 mode generally consists of two things:
53
    ///
54
    /// 1. Whether the NFA's states are constructed such that all paths to a
55
    /// match state that consume at least one byte always correspond to valid
56
    /// UTF-8.
57
    /// 2. Whether all paths to a match state that do _not_ consume any bytes
58
    /// should always correspond to valid UTF-8 boundaries.
59
    ///
60
    /// (1) is a guarantee made by whoever constructs the NFA.
61
    /// If you're parsing a regex from its concrete syntax, then
62
    /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63
    /// this guarantee for you. It does it by returning an error if the regex
64
    /// pattern could every report a non-empty match span that contains invalid
65
    /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66
    /// successfully parses, then you're guaranteed that the corresponding NFA
67
    /// will only ever report non-empty match spans containing valid UTF-8.
68
    ///
69
    /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70
    /// state graph itself. Consider, for example, the regex `a*`. It matches
71
    /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
72
    /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73
    /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74
    /// guarantee must be made at a higher level than the NFA state graph
75
    /// itself. This crate deals with this case in each regex engine. Namely,
76
    /// when a zero-width match that splits a codepoint is found and UTF-8
77
    /// mode enabled, then it is ignored and the engine moves on looking for
78
    /// the next match.
79
    ///
80
    /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81
    /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82
    /// engines that empty matches that split codepoints should be banned.
83
    ///
84
    /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85
    /// it only makes sense to enable this option when you *know* your haystack
86
    /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87
    /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88
    /// behavior**.
89
    ///
90
    /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91
    /// simultaneously *disabling* this option. That would ensure all non-empty
92
    /// match spans are valid UTF-8, but that empty match spans may still split
93
    /// a codepoint or match at other places that aren't valid UTF-8.
94
    ///
95
    /// In general, this mode is only relevant if your regex can match the
96
    /// empty string. Most regexes don't.
97
    ///
98
    /// This is enabled by default.
99
    ///
100
    /// # Example
101
    ///
102
    /// This example shows how UTF-8 mode can impact the match spans that may
103
    /// be reported in certain cases.
104
    ///
105
    /// ```
106
    /// use regex_automata::{
107
    ///     nfa::thompson::{self, pikevm::PikeVM},
108
    ///     Match, Input,
109
    /// };
110
    ///
111
    /// let re = PikeVM::new("")?;
112
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113
    ///
114
    /// // UTF-8 mode is enabled by default.
115
    /// let mut input = Input::new("☃");
116
    /// re.search(&mut cache, &input, &mut caps);
117
    /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118
    ///
119
    /// // Even though an empty regex matches at 1..1, our next match is
120
    /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121
    /// // three bytes long).
122
    /// input.set_start(1);
123
    /// re.search(&mut cache, &input, &mut caps);
124
    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125
    ///
126
    /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127
    /// let re = PikeVM::builder()
128
    ///     .thompson(thompson::Config::new().utf8(false))
129
    ///     .build("")?;
130
    /// re.search(&mut cache, &input, &mut caps);
131
    /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132
    ///
133
    /// input.set_start(2);
134
    /// re.search(&mut cache, &input, &mut caps);
135
    /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136
    ///
137
    /// input.set_start(3);
138
    /// re.search(&mut cache, &input, &mut caps);
139
    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140
    ///
141
    /// input.set_start(4);
142
    /// re.search(&mut cache, &input, &mut caps);
143
    /// assert_eq!(None, caps.get_match());
144
    ///
145
    /// # Ok::<(), Box<dyn std::error::Error>>(())
146
    /// ```
147
72.5k
    pub fn utf8(mut self, yes: bool) -> Config {
148
72.5k
        self.utf8 = Some(yes);
149
72.5k
        self
150
72.5k
    }
151
152
    /// Reverse the NFA.
153
    ///
154
    /// A NFA reversal is performed by reversing all of the concatenated
155
    /// sub-expressions in the original pattern, recursively. (Look around
156
    /// operators are also inverted.) The resulting NFA can be used to match
157
    /// the pattern starting from the end of a string instead of the beginning
158
    /// of a string.
159
    ///
160
    /// Reversing the NFA is useful for building a reverse DFA, which is most
161
    /// useful for finding the start of a match after its ending position has
162
    /// been found. NFA execution engines typically do not work on reverse
163
    /// NFAs. For example, currently, the Pike VM reports the starting location
164
    /// of matches without a reverse NFA.
165
    ///
166
    /// Currently, enabling this setting requires disabling the
167
    /// [`captures`](Config::captures) setting. If both are enabled, then the
168
    /// compiler will return an error. It is expected that this limitation will
169
    /// be lifted in the future.
170
    ///
171
    /// This is disabled by default.
172
    ///
173
    /// # Example
174
    ///
175
    /// This example shows how to build a DFA from a reverse NFA, and then use
176
    /// the DFA to search backwards.
177
    ///
178
    /// ```
179
    /// use regex_automata::{
180
    ///     dfa::{self, Automaton},
181
    ///     nfa::thompson::{NFA, WhichCaptures},
182
    ///     HalfMatch, Input,
183
    /// };
184
    ///
185
    /// let dfa = dfa::dense::Builder::new()
186
    ///     .thompson(NFA::config()
187
    ///         .which_captures(WhichCaptures::None)
188
    ///         .reverse(true)
189
    ///     )
190
    ///     .build("baz[0-9]+")?;
191
    /// let expected = Some(HalfMatch::must(0, 3));
192
    /// assert_eq!(
193
    ///     expected,
194
    ///     dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195
    /// );
196
    ///
197
    /// # Ok::<(), Box<dyn std::error::Error>>(())
198
    /// ```
199
70.6k
    pub fn reverse(mut self, yes: bool) -> Config {
200
70.6k
        self.reverse = Some(yes);
201
70.6k
        self
202
70.6k
    }
203
204
    /// Sets an approximate size limit on the total heap used by the NFA being
205
    /// compiled.
206
    ///
207
    /// This permits imposing constraints on the size of a compiled NFA. This
208
    /// may be useful in contexts where the regex pattern is untrusted and one
209
    /// wants to avoid using too much memory.
210
    ///
211
    /// This size limit does not apply to auxiliary heap used during
212
    /// compilation that is not part of the built NFA.
213
    ///
214
    /// Note that this size limit is applied during compilation in order for
215
    /// the limit to prevent too much heap from being used. However, the
216
    /// implementation may use an intermediate NFA representation that is
217
    /// otherwise slightly bigger than the final public form. Since the size
218
    /// limit may be applied to an intermediate representation, there is not
219
    /// necessarily a precise correspondence between the configured size limit
220
    /// and the heap usage of the final NFA.
221
    ///
222
    /// There is no size limit by default.
223
    ///
224
    /// # Example
225
    ///
226
    /// This example demonstrates how Unicode mode can greatly increase the
227
    /// size of the NFA.
228
    ///
229
    /// ```
230
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231
    /// use regex_automata::nfa::thompson::NFA;
232
    ///
233
    /// // 400KB isn't enough!
234
    /// NFA::compiler()
235
    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
236
    ///     .build(r"\w{20}")
237
    ///     .unwrap_err();
238
    ///
239
    /// // ... but 500KB probably is.
240
    /// let nfa = NFA::compiler()
241
    ///     .configure(NFA::config().nfa_size_limit(Some(500_000)))
242
    ///     .build(r"\w{20}")?;
243
    ///
244
    /// assert_eq!(nfa.pattern_len(), 1);
245
    ///
246
    /// # Ok::<(), Box<dyn std::error::Error>>(())
247
    /// ```
248
72.5k
    pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249
72.5k
        self.nfa_size_limit = Some(bytes);
250
72.5k
        self
251
72.5k
    }
252
253
    /// Apply best effort heuristics to shrink the NFA at the expense of more
254
    /// time/memory.
255
    ///
256
    /// Generally speaking, if one is using an NFA to compile a DFA, then the
257
    /// extra time used to shrink the NFA will be more than made up for during
258
    /// DFA construction (potentially by a lot). In other words, enabling this
259
    /// can substantially decrease the overall amount of time it takes to build
260
    /// a DFA.
261
    ///
262
    /// A reason to keep this disabled is if you want to compile an NFA and
263
    /// start using it as quickly as possible without needing to build a DFA,
264
    /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265
    /// an NFA simulation or for a lazy DFA.
266
    ///
267
    /// NFA shrinking is currently most useful when compiling a reverse
268
    /// NFA with large Unicode character classes. In particular, it trades
269
    /// additional CPU time during NFA compilation in favor of generating fewer
270
    /// NFA states.
271
    ///
272
    /// This is disabled by default because it can increase compile times
273
    /// quite a bit if you aren't building a full DFA.
274
    ///
275
    /// # Example
276
    ///
277
    /// This example shows that NFA shrinking can lead to substantial space
278
    /// savings in some cases. Notice that, as noted above, we build a reverse
279
    /// DFA and use a pattern with a large Unicode character class.
280
    ///
281
    /// ```
282
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283
    /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284
    ///
285
    /// // Currently we have to disable captures when enabling reverse NFA.
286
    /// let config = NFA::config()
287
    ///     .which_captures(WhichCaptures::None)
288
    ///     .reverse(true);
289
    /// let not_shrunk = NFA::compiler()
290
    ///     .configure(config.clone().shrink(false))
291
    ///     .build(r"\w")?;
292
    /// let shrunk = NFA::compiler()
293
    ///     .configure(config.clone().shrink(true))
294
    ///     .build(r"\w")?;
295
    ///
296
    /// // While a specific shrink factor is not guaranteed, the savings can be
297
    /// // considerable in some cases.
298
    /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299
    ///
300
    /// # Ok::<(), Box<dyn std::error::Error>>(())
301
    /// ```
302
72.5k
    pub fn shrink(mut self, yes: bool) -> Config {
303
72.5k
        self.shrink = Some(yes);
304
72.5k
        self
305
72.5k
    }
306
307
    /// Whether to include 'Capture' states in the NFA.
308
    ///
309
    /// Currently, enabling this setting requires disabling the
310
    /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311
    /// compiler will return an error. It is expected that this limitation will
312
    /// be lifted in the future.
313
    ///
314
    /// This is enabled by default.
315
    ///
316
    /// # Example
317
    ///
318
    /// This example demonstrates that some regex engines, like the Pike VM,
319
    /// require capturing states to be present in the NFA to report match
320
    /// offsets.
321
    ///
322
    /// (Note that since this method is deprecated, the example below uses
323
    /// [`Config::which_captures`] to disable capture states.)
324
    ///
325
    /// ```
326
    /// use regex_automata::nfa::thompson::{
327
    ///     pikevm::PikeVM,
328
    ///     NFA,
329
    ///     WhichCaptures,
330
    /// };
331
    ///
332
    /// let re = PikeVM::builder()
333
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
334
    ///     .build(r"[a-z]+")?;
335
    /// let mut cache = re.create_cache();
336
    ///
337
    /// assert!(re.is_match(&mut cache, "abc"));
338
    /// assert_eq!(None, re.find(&mut cache, "abc"));
339
    ///
340
    /// # Ok::<(), Box<dyn std::error::Error>>(())
341
    /// ```
342
    #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343
0
    pub fn captures(self, yes: bool) -> Config {
344
0
        self.which_captures(if yes {
345
0
            WhichCaptures::All
346
        } else {
347
0
            WhichCaptures::None
348
        })
349
0
    }
350
351
    /// Configures what kinds of capture groups are compiled into
352
    /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353
    /// Thompson NFA.
354
    ///
355
    /// Currently, using any option except for [`WhichCaptures::None`] requires
356
    /// disabling the [`reverse`](Config::reverse) setting. If both are
357
    /// enabled, then the compiler will return an error. It is expected that
358
    /// this limitation will be lifted in the future.
359
    ///
360
    /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361
    /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
362
    /// overhead of capture states for explicit groups. Usually this occurs
363
    /// when one wants to use the `PikeVM` only for determining the overall
364
    /// match. Otherwise, the `PikeVM` could use much more memory than is
365
    /// necessary.
366
    ///
367
    /// # Example
368
    ///
369
    /// This example demonstrates that some regex engines, like the Pike VM,
370
    /// require capturing states to be present in the NFA to report match
371
    /// offsets.
372
    ///
373
    /// ```
374
    /// use regex_automata::nfa::thompson::{
375
    ///     pikevm::PikeVM,
376
    ///     NFA,
377
    ///     WhichCaptures,
378
    /// };
379
    ///
380
    /// let re = PikeVM::builder()
381
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
382
    ///     .build(r"[a-z]+")?;
383
    /// let mut cache = re.create_cache();
384
    ///
385
    /// assert!(re.is_match(&mut cache, "abc"));
386
    /// assert_eq!(None, re.find(&mut cache, "abc"));
387
    ///
388
    /// # Ok::<(), Box<dyn std::error::Error>>(())
389
    /// ```
390
    ///
391
    /// The same applies to the bounded backtracker:
392
    ///
393
    /// ```
394
    /// use regex_automata::nfa::thompson::{
395
    ///     backtrack::BoundedBacktracker,
396
    ///     NFA,
397
    ///     WhichCaptures,
398
    /// };
399
    ///
400
    /// let re = BoundedBacktracker::builder()
401
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
402
    ///     .build(r"[a-z]+")?;
403
    /// let mut cache = re.create_cache();
404
    ///
405
    /// assert!(re.try_is_match(&mut cache, "abc")?);
406
    /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407
    ///
408
    /// # Ok::<(), Box<dyn std::error::Error>>(())
409
    /// ```
410
138k
    pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411
138k
        self.which_captures = Some(which_captures);
412
138k
        self
413
138k
    }
414
415
    /// Sets the look-around matcher that should be used with this NFA.
416
    ///
417
    /// A look-around matcher determines how to match look-around assertions.
418
    /// In particular, some assertions are configurable. For example, the
419
    /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420
    /// from the default of `\n` to any other byte.
421
    ///
422
    /// # Example
423
    ///
424
    /// This shows how to change the line terminator for multi-line assertions.
425
    ///
426
    /// ```
427
    /// use regex_automata::{
428
    ///     nfa::thompson::{self, pikevm::PikeVM},
429
    ///     util::look::LookMatcher,
430
    ///     Match, Input,
431
    /// };
432
    ///
433
    /// let mut lookm = LookMatcher::new();
434
    /// lookm.set_line_terminator(b'\x00');
435
    ///
436
    /// let re = PikeVM::builder()
437
    ///     .thompson(thompson::Config::new().look_matcher(lookm))
438
    ///     .build(r"(?m)^[a-z]+$")?;
439
    /// let mut cache = re.create_cache();
440
    ///
441
    /// // Multi-line assertions now use NUL as a terminator.
442
    /// assert_eq!(
443
    ///     Some(Match::must(0, 1..4)),
444
    ///     re.find(&mut cache, b"\x00abc\x00"),
445
    /// );
446
    /// // ... and \n is no longer recognized as a terminator.
447
    /// assert_eq!(
448
    ///     None,
449
    ///     re.find(&mut cache, b"\nabc\n"),
450
    /// );
451
    ///
452
    /// # Ok::<(), Box<dyn std::error::Error>>(())
453
    /// ```
454
72.5k
    pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455
72.5k
        self.look_matcher = Some(m);
456
72.5k
        self
457
72.5k
    }
458
459
    /// Whether to compile an unanchored prefix into this NFA.
460
    ///
461
    /// This is enabled by default. It is made available for tests only to make
462
    /// it easier to unit test the output of the compiler.
463
    #[cfg(test)]
464
    fn unanchored_prefix(mut self, yes: bool) -> Config {
465
        self.unanchored_prefix = Some(yes);
466
        self
467
    }
468
469
    /// Returns whether this configuration has enabled UTF-8 mode.
470
138k
    pub fn get_utf8(&self) -> bool {
471
138k
        self.utf8.unwrap_or(true)
472
138k
    }
473
474
    /// Returns whether this configuration has enabled reverse NFA compilation.
475
174M
    pub fn get_reverse(&self) -> bool {
476
174M
        self.reverse.unwrap_or(false)
477
174M
    }
478
479
    /// Return the configured NFA size limit, if it exists, in the number of
480
    /// bytes of heap used.
481
138k
    pub fn get_nfa_size_limit(&self) -> Option<usize> {
482
138k
        self.nfa_size_limit.unwrap_or(None)
483
138k
    }
484
485
    /// Return whether NFA shrinking is enabled.
486
1.02M
    pub fn get_shrink(&self) -> bool {
487
1.02M
        self.shrink.unwrap_or(false)
488
1.02M
    }
489
490
    /// Return whether NFA compilation is configured to produce capture states.
491
    #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492
0
    pub fn get_captures(&self) -> bool {
493
0
        self.get_which_captures().is_any()
494
0
    }
495
496
    /// Return what kinds of capture states will be compiled into an NFA.
497
4.39M
    pub fn get_which_captures(&self) -> WhichCaptures {
498
4.39M
        self.which_captures.unwrap_or(WhichCaptures::All)
499
4.39M
    }
500
501
    /// Return the look-around matcher for this NFA.
502
138k
    pub fn get_look_matcher(&self) -> LookMatcher {
503
138k
        self.look_matcher.clone().unwrap_or(LookMatcher::default())
504
138k
    }
505
506
    /// Return whether NFA compilation is configured to include an unanchored
507
    /// prefix.
508
    ///
509
    /// This is always false when not in test mode.
510
138k
    fn get_unanchored_prefix(&self) -> bool {
511
138k
        #[cfg(test)]
512
138k
        {
513
138k
            self.unanchored_prefix.unwrap_or(true)
514
138k
        }
515
138k
        #[cfg(not(test))]
516
138k
        {
517
138k
            true
518
138k
        }
519
138k
    }
520
521
    /// Overwrite the default configuration such that the options in `o` are
522
    /// always used. If an option in `o` is not set, then the corresponding
523
    /// option in `self` is used. If it's not set in `self` either, then it
524
    /// remains not set.
525
138k
    pub(crate) fn overwrite(&self, o: Config) -> Config {
526
138k
        Config {
527
138k
            utf8: o.utf8.or(self.utf8),
528
138k
            reverse: o.reverse.or(self.reverse),
529
138k
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
530
138k
            shrink: o.shrink.or(self.shrink),
531
138k
            which_captures: o.which_captures.or(self.which_captures),
532
138k
            look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
533
138k
            #[cfg(test)]
534
138k
            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
535
138k
        }
536
138k
    }
537
}
538
539
/// A configuration indicating which kinds of
540
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
541
///
542
/// This configuration can be used with [`Config::which_captures`] to control
543
/// which capture states are compiled into a Thompson NFA.
544
///
545
/// The default configuration is [`WhichCaptures::All`].
546
#[derive(Clone, Copy, Debug)]
547
pub enum WhichCaptures {
548
    /// All capture states, including those corresponding to both implicit and
549
    /// explicit capture groups, are included in the Thompson NFA.
550
    All,
551
    /// Only capture states corresponding to implicit capture groups are
552
    /// included. Implicit capture groups appear in every pattern implicitly
553
    /// and correspond to the overall match of a pattern.
554
    ///
555
    /// This is useful when one only cares about the overall match of a
556
    /// pattern. By excluding capture states from explicit capture groups,
557
    /// one might be able to reduce the memory usage of a multi-pattern regex
558
    /// substantially if it was otherwise written to have many explicit capture
559
    /// groups.
560
    Implicit,
561
    /// No capture states are compiled into the Thompson NFA.
562
    ///
563
    /// This is useful when capture states are either not needed (for example,
564
    /// if one is only trying to build a DFA) or if they aren't supported (for
565
    /// example, a reverse NFA).
566
    None,
567
}
568
569
impl Default for WhichCaptures {
570
0
    fn default() -> WhichCaptures {
571
0
        WhichCaptures::All
572
0
    }
573
}
574
575
impl WhichCaptures {
576
    /// Returns true if this configuration indicates that no capture states
577
    /// should be produced in an NFA.
578
70.6k
    pub fn is_none(&self) -> bool {
579
70.6k
        matches!(*self, WhichCaptures::None)
580
70.6k
    }
581
582
    /// Returns true if this configuration indicates that some capture states
583
    /// should be added to an NFA. Note that this might only include capture
584
    /// states for implicit capture groups.
585
70.6k
    pub fn is_any(&self) -> bool {
586
70.6k
        !self.is_none()
587
70.6k
    }
588
}
589
590
/*
591
This compiler below uses Thompson's construction algorithm. The compiler takes
592
a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
593
is structured in a way that permits it to be executed by a virtual machine and
594
also used to efficiently build a DFA.
595
596
The compiler deals with a slightly expanded set of NFA states than what is
597
in a final NFA (as exhibited by builder::State and nfa::State). Notably a
598
compiler state includes an empty node that has exactly one unconditional
599
epsilon transition to the next state. In other words, it's a "goto" instruction
600
if one views Thompson's NFA as a set of bytecode instructions. These goto
601
instructions are removed in a subsequent phase before returning the NFA to the
602
caller. The purpose of these empty nodes is that they make the construction
603
algorithm substantially simpler to implement. We remove them before returning
604
to the caller because they can represent substantial overhead when traversing
605
the NFA graph (either while searching using the NFA directly or while building
606
a DFA).
607
608
In the future, it would be nice to provide a Glushkov compiler as well, as it
609
would work well as a bit-parallel NFA for smaller regexes. But the Thompson
610
construction is one I'm more familiar with and seems more straight-forward to
611
deal with when it comes to large Unicode character classes.
612
613
Internally, the compiler uses interior mutability to improve composition in the
614
face of the borrow checker. In particular, we'd really like to be able to write
615
things like this:
616
617
    self.c_concat(exprs.iter().map(|e| self.c(e)))
618
619
Which elegantly uses iterators to build up a sequence of compiled regex
620
sub-expressions and then hands it off to the concatenating compiler routine.
621
Without interior mutability, the borrow checker won't let us borrow `self`
622
mutably both inside and outside the closure at the same time.
623
*/
624
625
/// A builder for compiling an NFA from a regex's high-level intermediate
626
/// representation (HIR).
627
///
628
/// This compiler provides a way to translate a parsed regex pattern into an
629
/// NFA state graph. The NFA state graph can either be used directly to execute
630
/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
631
///
632
/// This compiler provides APIs both for compiling regex patterns directly from
633
/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
634
///
635
/// This compiler has various options that may be configured via
636
/// [`thompson::Config`](Config).
637
///
638
/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
639
/// A `Builder` provides a lower level API that is uncoupled from a regex
640
/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
641
/// together an NFA by hand. See its docs for examples.
642
///
643
/// # Example: compilation from concrete syntax
644
///
645
/// This shows how to compile an NFA from a pattern string while setting a size
646
/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
647
///
648
/// ```
649
/// use regex_automata::{
650
///     nfa::thompson::{NFA, pikevm::PikeVM},
651
///     Match,
652
/// };
653
///
654
/// let config = NFA::config().nfa_size_limit(Some(1_000));
655
/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
656
///
657
/// let re = PikeVM::new_from_nfa(nfa)?;
658
/// let mut cache = re.create_cache();
659
/// let mut caps = re.create_captures();
660
/// let expected = Some(Match::must(0, 3..4));
661
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
662
/// assert_eq!(expected, caps.get_match());
663
///
664
/// # Ok::<(), Box<dyn std::error::Error>>(())
665
/// ```
666
///
667
/// # Example: compilation from HIR
668
///
669
/// This shows how to hand assemble a regular expression via its HIR, and then
670
/// compile an NFA directly from it.
671
///
672
/// ```
673
/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
674
/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
675
///
676
/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
677
///     ClassBytesRange::new(b'0', b'9'),
678
///     ClassBytesRange::new(b'A', b'Z'),
679
///     ClassBytesRange::new(b'_', b'_'),
680
///     ClassBytesRange::new(b'a', b'z'),
681
/// ])));
682
///
683
/// let config = NFA::config().nfa_size_limit(Some(1_000));
684
/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
685
///
686
/// let re = PikeVM::new_from_nfa(nfa)?;
687
/// let mut cache = re.create_cache();
688
/// let mut caps = re.create_captures();
689
/// let expected = Some(Match::must(0, 3..4));
690
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
691
/// assert_eq!(expected, caps.get_match());
692
///
693
/// # Ok::<(), Box<dyn std::error::Error>>(())
694
/// ```
695
#[derive(Clone, Debug)]
696
pub struct Compiler {
697
    /// A regex parser, used when compiling an NFA directly from a pattern
698
    /// string.
699
    parser: ParserBuilder,
700
    /// The compiler configuration.
701
    config: Config,
702
    /// The builder for actually constructing an NFA. This provides a
703
    /// convenient abstraction for writing a compiler.
704
    builder: RefCell<Builder>,
705
    /// State used for compiling character classes to UTF-8 byte automata.
706
    /// State is not retained between character class compilations. This just
707
    /// serves to amortize allocation to the extent possible.
708
    utf8_state: RefCell<Utf8State>,
709
    /// State used for arranging character classes in reverse into a trie.
710
    trie_state: RefCell<RangeTrie>,
711
    /// State used for caching common suffixes when compiling reverse UTF-8
712
    /// automata (for Unicode character classes).
713
    utf8_suffix: RefCell<Utf8SuffixMap>,
714
}
715
716
impl Compiler {
717
    /// Create a new NFA builder with its default configuration.
718
504k
    pub fn new() -> Compiler {
719
504k
        Compiler {
720
504k
            parser: ParserBuilder::new(),
721
504k
            config: Config::default(),
722
504k
            builder: RefCell::new(Builder::new()),
723
504k
            utf8_state: RefCell::new(Utf8State::new()),
724
504k
            trie_state: RefCell::new(RangeTrie::new()),
725
504k
            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
726
504k
        }
727
504k
    }
728
729
    /// Compile the given regular expression pattern into an NFA.
730
    ///
731
    /// If there was a problem parsing the regex, then that error is returned.
732
    ///
733
    /// Otherwise, if there was a problem building the NFA, then an error is
734
    /// returned. The only error that can occur is if the compiled regex would
735
    /// exceed the size limits configured on this builder, or if any part of
736
    /// the NFA would exceed the integer representations used. (For example,
737
    /// too many states might plausibly occur on a 16-bit target.)
738
    ///
739
    /// # Example
740
    ///
741
    /// ```
742
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
743
    ///
744
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
745
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
746
    ///
747
    /// let re = PikeVM::new_from_nfa(nfa)?;
748
    /// let mut cache = re.create_cache();
749
    /// let mut caps = re.create_captures();
750
    /// let expected = Some(Match::must(0, 3..4));
751
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
752
    /// assert_eq!(expected, caps.get_match());
753
    ///
754
    /// # Ok::<(), Box<dyn std::error::Error>>(())
755
    /// ```
756
0
    pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
757
0
        self.build_many(&[pattern])
758
0
    }
759
760
    /// Compile the given regular expression patterns into a single NFA.
761
    ///
762
    /// When matches are returned, the pattern ID corresponds to the index of
763
    /// the pattern in the slice given.
764
    ///
765
    /// # Example
766
    ///
767
    /// ```
768
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
769
    ///
770
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
771
    /// let nfa = NFA::compiler().configure(config).build_many(&[
772
    ///     r"(?-u)\s",
773
    ///     r"(?-u)\w",
774
    /// ])?;
775
    ///
776
    /// let re = PikeVM::new_from_nfa(nfa)?;
777
    /// let mut cache = re.create_cache();
778
    /// let mut caps = re.create_captures();
779
    /// let expected = Some(Match::must(1, 1..2));
780
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
781
    /// assert_eq!(expected, caps.get_match());
782
    ///
783
    /// # Ok::<(), Box<dyn std::error::Error>>(())
784
    /// ```
785
0
    pub fn build_many<P: AsRef<str>>(
786
0
        &self,
787
0
        patterns: &[P],
788
0
    ) -> Result<NFA, BuildError> {
789
0
        let mut hirs = vec![];
790
0
        for p in patterns {
791
0
            hirs.push(
792
0
                self.parser
793
0
                    .build()
794
0
                    .parse(p.as_ref())
795
0
                    .map_err(BuildError::syntax)?,
796
            );
797
0
            debug!("parsed: {:?}", p.as_ref());
798
        }
799
0
        self.build_many_from_hir(&hirs)
800
0
    }
801
802
    /// Compile the given high level intermediate representation of a regular
803
    /// expression into an NFA.
804
    ///
805
    /// If there was a problem building the NFA, then an error is returned. The
806
    /// only error that can occur is if the compiled regex would exceed the
807
    /// size limits configured on this builder, or if any part of the NFA would
808
    /// exceed the integer representations used. (For example, too many states
809
    /// might plausibly occur on a 16-bit target.)
810
    ///
811
    /// # Example
812
    ///
813
    /// ```
814
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
815
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
816
    ///
817
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
818
    ///     ClassBytesRange::new(b'0', b'9'),
819
    ///     ClassBytesRange::new(b'A', b'Z'),
820
    ///     ClassBytesRange::new(b'_', b'_'),
821
    ///     ClassBytesRange::new(b'a', b'z'),
822
    /// ])));
823
    ///
824
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
825
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
826
    ///
827
    /// let re = PikeVM::new_from_nfa(nfa)?;
828
    /// let mut cache = re.create_cache();
829
    /// let mut caps = re.create_captures();
830
    /// let expected = Some(Match::must(0, 3..4));
831
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
832
    /// assert_eq!(expected, caps.get_match());
833
    ///
834
    /// # Ok::<(), Box<dyn std::error::Error>>(())
835
    /// ```
836
4.75k
    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
837
4.75k
        self.build_many_from_hir(&[expr])
838
4.75k
    }
839
840
    /// Compile the given high level intermediate representations of regular
841
    /// expressions into a single NFA.
842
    ///
843
    /// When matches are returned, the pattern ID corresponds to the index of
844
    /// the pattern in the slice given.
845
    ///
846
    /// # Example
847
    ///
848
    /// ```
849
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
850
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
851
    ///
852
    /// let hirs = &[
853
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
854
    ///         ClassBytesRange::new(b'\t', b'\r'),
855
    ///         ClassBytesRange::new(b' ', b' '),
856
    ///     ]))),
857
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
858
    ///         ClassBytesRange::new(b'0', b'9'),
859
    ///         ClassBytesRange::new(b'A', b'Z'),
860
    ///         ClassBytesRange::new(b'_', b'_'),
861
    ///         ClassBytesRange::new(b'a', b'z'),
862
    ///     ]))),
863
    /// ];
864
    ///
865
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
866
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
867
    ///
868
    /// let re = PikeVM::new_from_nfa(nfa)?;
869
    /// let mut cache = re.create_cache();
870
    /// let mut caps = re.create_captures();
871
    /// let expected = Some(Match::must(1, 1..2));
872
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
873
    /// assert_eq!(expected, caps.get_match());
874
    ///
875
    /// # Ok::<(), Box<dyn std::error::Error>>(())
876
    /// ```
877
138k
    pub fn build_many_from_hir<H: Borrow<Hir>>(
878
138k
        &self,
879
138k
        exprs: &[H],
880
138k
    ) -> Result<NFA, BuildError> {
881
138k
        self.compile(exprs)
882
138k
    }
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::build_many_from_hir::<regex_syntax::hir::Hir>
<regex_automata::nfa::thompson::compiler::Compiler>::build_many_from_hir::<&regex_syntax::hir::Hir>
Line
Count
Source
877
138k
    pub fn build_many_from_hir<H: Borrow<Hir>>(
878
138k
        &self,
879
138k
        exprs: &[H],
880
138k
    ) -> Result<NFA, BuildError> {
881
138k
        self.compile(exprs)
882
138k
    }
883
884
    /// Apply the given NFA configuration options to this builder.
885
    ///
886
    /// # Example
887
    ///
888
    /// ```
889
    /// use regex_automata::nfa::thompson::NFA;
890
    ///
891
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
892
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
893
    /// assert_eq!(nfa.pattern_len(), 1);
894
    ///
895
    /// # Ok::<(), Box<dyn std::error::Error>>(())
896
    /// ```
897
138k
    pub fn configure(&mut self, config: Config) -> &mut Compiler {
898
138k
        self.config = self.config.overwrite(config);
899
138k
        self
900
138k
    }
901
902
    /// Set the syntax configuration for this builder using
903
    /// [`syntax::Config`](crate::util::syntax::Config).
904
    ///
905
    /// This permits setting things like case insensitivity, Unicode and multi
906
    /// line mode.
907
    ///
908
    /// This syntax configuration only applies when an NFA is built directly
909
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
910
    /// settings are ignored.
911
    ///
912
    /// # Example
913
    ///
914
    /// ```
915
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
916
    ///
917
    /// let syntax_config = syntax::Config::new().unicode(false);
918
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
919
    /// // If Unicode were enabled, the number of states would be much bigger.
920
    /// assert!(nfa.states().len() < 15);
921
    ///
922
    /// # Ok::<(), Box<dyn std::error::Error>>(())
923
    /// ```
924
0
    pub fn syntax(
925
0
        &mut self,
926
0
        config: crate::util::syntax::Config,
927
0
    ) -> &mut Compiler {
928
0
        config.apply(&mut self.parser);
929
0
        self
930
0
    }
931
}
932
933
impl Compiler {
934
    /// Compile the sequence of HIR expressions given. Pattern IDs are
935
    /// allocated starting from 0, in correspondence with the slice given.
936
    ///
937
    /// It is legal to provide an empty slice. In that case, the NFA returned
938
    /// has no patterns and will never match anything.
939
138k
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
940
138k
        if exprs.len() > PatternID::LIMIT {
941
0
            return Err(BuildError::too_many_patterns(exprs.len()));
942
138k
        }
943
138k
        if self.config.get_reverse()
944
70.6k
            && self.config.get_which_captures().is_any()
945
        {
946
0
            return Err(BuildError::unsupported_captures());
947
138k
        }
948
138k
949
138k
        self.builder.borrow_mut().clear();
950
138k
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
951
138k
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
952
138k
        self.builder
953
138k
            .borrow_mut()
954
138k
            .set_look_matcher(self.config.get_look_matcher());
955
138k
        self.builder
956
138k
            .borrow_mut()
957
138k
            .set_size_limit(self.config.get_nfa_size_limit())?;
958
959
        // We always add an unanchored prefix unless we were specifically told
960
        // not to (for tests only), or if we know that the regex is anchored
961
        // for all matches. When an unanchored prefix is not added, then the
962
        // NFA's anchored and unanchored start states are equivalent.
963
138k
        let all_anchored = exprs.iter().all(|e| {
964
138k
            let props = e.borrow().properties();
965
138k
            if self.config.get_reverse() {
966
70.6k
                props.look_set_suffix().contains(hir::Look::End)
967
            } else {
968
67.8k
                props.look_set_prefix().contains(hir::Look::Start)
969
            }
970
138k
        });
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>::{closure#0}
<regex_automata::nfa::thompson::compiler::Compiler>::compile::<&regex_syntax::hir::Hir>::{closure#0}
Line
Count
Source
963
138k
        let all_anchored = exprs.iter().all(|e| {
964
138k
            let props = e.borrow().properties();
965
138k
            if self.config.get_reverse() {
966
70.6k
                props.look_set_suffix().contains(hir::Look::End)
967
            } else {
968
67.8k
                props.look_set_prefix().contains(hir::Look::Start)
969
            }
970
138k
        });
971
138k
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
972
138k
        let unanchored_prefix = if anchored {
973
3.14k
            self.c_empty()?
974
        } else {
975
135k
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
976
        };
977
978
138k
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
979
138k
            let _ = self.start_pattern()?;
980
138k
            let one = self.c_cap(0, None, e.borrow())?;
981
136k
            let match_state_id = self.add_match()?;
982
136k
            self.patch(one.end, match_state_id)?;
983
136k
            let _ = self.finish_pattern(one.start)?;
984
136k
            Ok(ThompsonRef { start: one.start, end: match_state_id })
985
138k
        }))?;
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>::{closure#1}
<regex_automata::nfa::thompson::compiler::Compiler>::compile::<&regex_syntax::hir::Hir>::{closure#1}
Line
Count
Source
978
138k
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
979
138k
            let _ = self.start_pattern()?;
980
138k
            let one = self.c_cap(0, None, e.borrow())?;
981
136k
            let match_state_id = self.add_match()?;
982
136k
            self.patch(one.end, match_state_id)?;
983
136k
            let _ = self.finish_pattern(one.start)?;
984
136k
            Ok(ThompsonRef { start: one.start, end: match_state_id })
985
138k
        }))?;
986
136k
        self.patch(unanchored_prefix.end, compiled.start)?;
987
136k
        let nfa = self
988
136k
            .builder
989
136k
            .borrow_mut()
990
136k
            .build(compiled.start, unanchored_prefix.start)?;
991
992
136k
        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
993
136k
        Ok(nfa)
994
138k
    }
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::compile::<regex_syntax::hir::Hir>
<regex_automata::nfa::thompson::compiler::Compiler>::compile::<&regex_syntax::hir::Hir>
Line
Count
Source
939
138k
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
940
138k
        if exprs.len() > PatternID::LIMIT {
941
0
            return Err(BuildError::too_many_patterns(exprs.len()));
942
138k
        }
943
138k
        if self.config.get_reverse()
944
70.6k
            && self.config.get_which_captures().is_any()
945
        {
946
0
            return Err(BuildError::unsupported_captures());
947
138k
        }
948
138k
949
138k
        self.builder.borrow_mut().clear();
950
138k
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
951
138k
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
952
138k
        self.builder
953
138k
            .borrow_mut()
954
138k
            .set_look_matcher(self.config.get_look_matcher());
955
138k
        self.builder
956
138k
            .borrow_mut()
957
138k
            .set_size_limit(self.config.get_nfa_size_limit())?;
958
959
        // We always add an unanchored prefix unless we were specifically told
960
        // not to (for tests only), or if we know that the regex is anchored
961
        // for all matches. When an unanchored prefix is not added, then the
962
        // NFA's anchored and unanchored start states are equivalent.
963
138k
        let all_anchored = exprs.iter().all(|e| {
964
            let props = e.borrow().properties();
965
            if self.config.get_reverse() {
966
                props.look_set_suffix().contains(hir::Look::End)
967
            } else {
968
                props.look_set_prefix().contains(hir::Look::Start)
969
            }
970
138k
        });
971
138k
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
972
138k
        let unanchored_prefix = if anchored {
973
3.14k
            self.c_empty()?
974
        } else {
975
135k
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
976
        };
977
978
138k
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
979
            let _ = self.start_pattern()?;
980
            let one = self.c_cap(0, None, e.borrow())?;
981
            let match_state_id = self.add_match()?;
982
            self.patch(one.end, match_state_id)?;
983
            let _ = self.finish_pattern(one.start)?;
984
            Ok(ThompsonRef { start: one.start, end: match_state_id })
985
138k
        }))?;
986
136k
        self.patch(unanchored_prefix.end, compiled.start)?;
987
136k
        let nfa = self
988
136k
            .builder
989
136k
            .borrow_mut()
990
136k
            .build(compiled.start, unanchored_prefix.start)?;
991
992
136k
        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
993
136k
        Ok(nfa)
994
138k
    }
995
996
    /// Compile an arbitrary HIR expression.
997
76.0M
    fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
998
        use regex_syntax::hir::{Class, HirKind::*};
999
1000
76.0M
        match *expr.kind() {
1001
1.18M
            Empty => self.c_empty(),
1002
48.8M
            Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1003
3.59M
            Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1004
5.38M
            Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1005
5.48M
            Look(ref look) => self.c_look(look),
1006
3.72M
            Repetition(ref rep) => self.c_repetition(rep),
1007
4.18M
            Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1008
4.43M
            Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1009
2.74M
            Alternation(ref es) => self.c_alt_slice(es),
1010
        }
1011
76.0M
    }
1012
1013
    /// Compile a concatenation of the sub-expressions yielded by the given
1014
    /// iterator. If the iterator yields no elements, then this compiles down
1015
    /// to an "empty" state that always matches.
1016
    ///
1017
    /// If the compiler is in reverse mode, then the expressions given are
1018
    /// automatically compiled in reverse.
1019
50.0M
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1020
50.0M
    where
1021
50.0M
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1022
50.0M
    {
1023
50.0M
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1024
50.0M
        let ThompsonRef { start, mut end } = match first {
1025
50.0M
            Some(result) => result?,
1026
5.65k
            None => return self.c_empty(),
1027
        };
1028
        loop {
1029
115M
            let next =
1030
115M
                if self.is_reverse() { it.next_back() } else { it.next() };
1031
115M
            let compiled = match next {
1032
65.1M
                Some(result) => result?,
1033
50.0M
                None => break,
1034
            };
1035
65.1M
            self.patch(end, compiled.start)?;
1036
65.1M
            end = compiled.end;
1037
        }
1038
50.0M
        Ok(ThompsonRef { start, end })
1039
50.0M
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<u8>>, <regex_automata::nfa::thompson::compiler::Compiler>::c_literal::{closure#0}>>
Line
Count
Source
1019
48.8M
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1020
48.8M
    where
1021
48.8M
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1022
48.8M
    {
1023
48.8M
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1024
48.8M
        let ThompsonRef { start, mut end } = match first {
1025
48.8M
            Some(result) => result?,
1026
0
            None => return self.c_empty(),
1027
        };
1028
        loop {
1029
60.1M
            let next =
1030
60.1M
                if self.is_reverse() { it.next_back() } else { it.next() };
1031
60.1M
            let compiled = match next {
1032
11.3M
                Some(result) => result?,
1033
48.8M
                None => break,
1034
            };
1035
11.3M
            self.patch(end, compiled.start)?;
1036
11.3M
            end = compiled.end;
1037
        }
1038
48.8M
        Ok(ThompsonRef { start, end })
1039
48.8M
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_automata::nfa::thompson::compiler::Compiler>::c_exactly::{closure#0}>>
Line
Count
Source
1019
271k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1020
271k
    where
1021
271k
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1022
271k
    {
1023
271k
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1024
271k
        let ThompsonRef { start, mut end } = match first {
1025
265k
            Some(result) => result?,
1026
5.65k
            None => return self.c_empty(),
1027
        };
1028
        loop {
1029
50.5M
            let next =
1030
50.5M
                if self.is_reverse() { it.next_back() } else { it.next() };
1031
50.5M
            let compiled = match next {
1032
50.2M
                Some(result) => result?,
1033
259k
                None => break,
1034
            };
1035
50.2M
            self.patch(end, compiled.start)?;
1036
50.2M
            end = compiled.end;
1037
        }
1038
259k
        Ok(ThompsonRef { start, end })
1039
271k
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::c::{closure#0}>>
Line
Count
Source
1019
958k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1020
958k
    where
1021
958k
        I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1022
958k
    {
1023
958k
        let first = if self.is_reverse() { it.next_back() } else { it.next() };
1024
958k
        let ThompsonRef { start, mut end } = match first {
1025
958k
            Some(result) => result?,
1026
0
            None => return self.c_empty(),
1027
        };
1028
        loop {
1029
4.43M
            let next =
1030
4.43M
                if self.is_reverse() { it.next_back() } else { it.next() };
1031
4.43M
            let compiled = match next {
1032
3.47M
                Some(result) => result?,
1033
956k
                None => break,
1034
            };
1035
3.47M
            self.patch(end, compiled.start)?;
1036
3.47M
            end = compiled.end;
1037
        }
1038
956k
        Ok(ThompsonRef { start, end })
1039
958k
    }
1040
1041
    /// Compile an alternation of the given HIR values.
1042
    ///
1043
    /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1044
    /// of an iterator of compiled NFA subgraphs. The point of accepting a
1045
    /// slice here is that it opens up some optimization opportunities. For
1046
    /// example, if all of the HIR values are literals, then this routine might
1047
    /// re-shuffle them to make NFA epsilon closures substantially faster.
1048
2.74M
    fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1049
2.74M
        // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1050
2.74M
        let literal_count = exprs
1051
2.74M
            .iter()
1052
10.4M
            .filter(|e| {
1053
10.4M
                matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1054
10.4M
            })
1055
2.74M
            .count();
1056
2.74M
        if literal_count <= 1 || literal_count < exprs.len() {
1057
9.62M
            return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1058
266k
        }
1059
1060
266k
        let mut trie = if self.is_reverse() {
1061
66.2k
            LiteralTrie::reverse()
1062
        } else {
1063
200k
            LiteralTrie::forward()
1064
        };
1065
838k
        for expr in exprs.iter() {
1066
838k
            let literal = match *expr.kind() {
1067
838k
                hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1068
0
                _ => unreachable!(),
1069
            };
1070
838k
            trie.add(literal)?;
1071
        }
1072
266k
        trie.compile(&mut self.builder.borrow_mut())
1073
2.74M
    }
1074
1075
    /// Compile an alternation, where each element yielded by the given
1076
    /// iterator represents an item in the alternation. If the iterator yields
1077
    /// no elements, then this compiles down to a "fail" state.
1078
    ///
1079
    /// In an alternation, expressions appearing earlier are "preferred" at
1080
    /// match time over expressions appearing later. At least, this is true
1081
    /// when using "leftmost first" match semantics. (If "leftmost longest" are
1082
    /// ever added in the future, then this preference order of priority would
1083
    /// not apply in that mode.)
1084
2.61M
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1085
2.61M
    where
1086
2.61M
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1087
2.61M
    {
1088
2.61M
        let first = match it.next() {
1089
0
            None => return self.c_fail(),
1090
2.61M
            Some(result) => result?,
1091
        };
1092
2.61M
        let second = match it.next() {
1093
136k
            None => return Ok(first),
1094
2.47M
            Some(result) => result?,
1095
        };
1096
1097
2.47M
        let union = self.add_union()?;
1098
2.47M
        let end = self.add_empty()?;
1099
2.47M
        self.patch(union, first.start)?;
1100
2.47M
        self.patch(first.end, end)?;
1101
2.47M
        self.patch(union, second.start)?;
1102
2.47M
        self.patch(second.end, end)?;
1103
7.14M
        for result in it {
1104
4.66M
            let compiled = result?;
1105
4.66M
            self.patch(union, compiled.start)?;
1106
4.66M
            self.patch(compiled.end, end)?;
1107
        }
1108
2.47M
        Ok(ThompsonRef { start: union, end })
1109
2.61M
    }
Unexecuted instantiation: <regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::compile<regex_syntax::hir::Hir>::{closure#1}>>
<regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::c_alt_slice::{closure#1}>>
Line
Count
Source
1084
2.47M
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1085
2.47M
    where
1086
2.47M
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1087
2.47M
    {
1088
2.47M
        let first = match it.next() {
1089
0
            None => return self.c_fail(),
1090
2.47M
            Some(result) => result?,
1091
        };
1092
2.47M
        let second = match it.next() {
1093
0
            None => return Ok(first),
1094
2.47M
            Some(result) => result?,
1095
        };
1096
1097
2.47M
        let union = self.add_union()?;
1098
2.47M
        let end = self.add_empty()?;
1099
2.47M
        self.patch(union, first.start)?;
1100
2.47M
        self.patch(first.end, end)?;
1101
2.47M
        self.patch(union, second.start)?;
1102
2.47M
        self.patch(second.end, end)?;
1103
7.14M
        for result in it {
1104
4.66M
            let compiled = result?;
1105
4.66M
            self.patch(union, compiled.start)?;
1106
4.66M
            self.patch(compiled.end, end)?;
1107
        }
1108
2.47M
        Ok(ThompsonRef { start: union, end })
1109
2.47M
    }
<regex_automata::nfa::thompson::compiler::Compiler>::c_alt_iter::<core::iter::adapters::map::Map<core::slice::iter::Iter<&regex_syntax::hir::Hir>, <regex_automata::nfa::thompson::compiler::Compiler>::compile<&regex_syntax::hir::Hir>::{closure#1}>>
Line
Count
Source
1084
138k
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1085
138k
    where
1086
138k
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1087
138k
    {
1088
138k
        let first = match it.next() {
1089
0
            None => return self.c_fail(),
1090
138k
            Some(result) => result?,
1091
        };
1092
136k
        let second = match it.next() {
1093
136k
            None => return Ok(first),
1094
0
            Some(result) => result?,
1095
        };
1096
1097
0
        let union = self.add_union()?;
1098
0
        let end = self.add_empty()?;
1099
0
        self.patch(union, first.start)?;
1100
0
        self.patch(first.end, end)?;
1101
0
        self.patch(union, second.start)?;
1102
0
        self.patch(second.end, end)?;
1103
0
        for result in it {
1104
0
            let compiled = result?;
1105
0
            self.patch(union, compiled.start)?;
1106
0
            self.patch(compiled.end, end)?;
1107
        }
1108
0
        Ok(ThompsonRef { start: union, end })
1109
138k
    }
1110
1111
    /// Compile the given capture sub-expression. `expr` should be the
1112
    /// sub-expression contained inside the capture. If "capture" states are
1113
    /// enabled, then they are added as appropriate.
1114
    ///
1115
    /// This accepts the pieces of a capture instead of a `hir::Capture` so
1116
    /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1117
    /// adding the entire pattern as if it were a group in order to create
1118
    /// appropriate "capture" states in the NFA.
1119
4.32M
    fn c_cap(
1120
4.32M
        &self,
1121
4.32M
        index: u32,
1122
4.32M
        name: Option<&str>,
1123
4.32M
        expr: &Hir,
1124
4.32M
    ) -> Result<ThompsonRef, BuildError> {
1125
4.32M
        match self.config.get_which_captures() {
1126
            // No capture states means we always skip them.
1127
1.77M
            WhichCaptures::None => return self.c(expr),
1128
            // Implicit captures states means we only add when index==0 since
1129
            // index==0 implies the group is implicit.
1130
0
            WhichCaptures::Implicit if index > 0 => return self.c(expr),
1131
2.55M
            _ => {}
1132
        }
1133
1134
2.55M
        let start = self.add_capture_start(index, name)?;
1135
2.55M
        let inner = self.c(expr)?;
1136
2.54M
        let end = self.add_capture_end(index)?;
1137
2.54M
        self.patch(start, inner.start)?;
1138
2.54M
        self.patch(inner.end, end)?;
1139
2.54M
        Ok(ThompsonRef { start, end })
1140
4.32M
    }
1141
1142
    /// Compile the given repetition expression. This handles all types of
1143
    /// repetitions and greediness.
1144
3.72M
    fn c_repetition(
1145
3.72M
        &self,
1146
3.72M
        rep: &hir::Repetition,
1147
3.72M
    ) -> Result<ThompsonRef, BuildError> {
1148
3.72M
        match (rep.min, rep.max) {
1149
1.48M
            (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1150
2.02M
            (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1151
217k
            (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1152
11.7k
            (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1153
        }
1154
3.72M
    }
1155
1156
    /// Compile the given expression such that it matches at least `min` times,
1157
    /// but no more than `max` times.
1158
    ///
1159
    /// When `greedy` is true, then the preference is for the expression to
1160
    /// match as much as possible. Otherwise, it will match as little as
1161
    /// possible.
1162
11.7k
    fn c_bounded(
1163
11.7k
        &self,
1164
11.7k
        expr: &Hir,
1165
11.7k
        greedy: bool,
1166
11.7k
        min: u32,
1167
11.7k
        max: u32,
1168
11.7k
    ) -> Result<ThompsonRef, BuildError> {
1169
11.7k
        let prefix = self.c_exactly(expr, min)?;
1170
10.7k
        if min == max {
1171
0
            return Ok(prefix);
1172
10.7k
        }
1173
1174
        // It is tempting here to compile the rest here as a concatenation
1175
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
1176
        // were `aaa?a?a?`. The problem here is that it leads to this program:
1177
        //
1178
        //     >000000: 61 => 01
1179
        //      000001: 61 => 02
1180
        //      000002: union(03, 04)
1181
        //      000003: 61 => 04
1182
        //      000004: union(05, 06)
1183
        //      000005: 61 => 06
1184
        //      000006: union(07, 08)
1185
        //      000007: 61 => 08
1186
        //      000008: MATCH
1187
        //
1188
        // And effectively, once you hit state 2, the epsilon closure will
1189
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
1190
        // to instead compile it like so:
1191
        //
1192
        //     >000000: 61 => 01
1193
        //      000001: 61 => 02
1194
        //      000002: union(03, 08)
1195
        //      000003: 61 => 04
1196
        //      000004: union(05, 08)
1197
        //      000005: 61 => 06
1198
        //      000006: union(07, 08)
1199
        //      000007: 61 => 08
1200
        //      000008: MATCH
1201
        //
1202
        // So that the epsilon closure of state 2 is now just 3 and 8.
1203
10.7k
        let empty = self.add_empty()?;
1204
10.7k
        let mut prev_end = prefix.end;
1205
10.7k
        for _ in min..max {
1206
3.51M
            let union = if greedy {
1207
2.62M
                self.add_union()
1208
            } else {
1209
891k
                self.add_union_reverse()
1210
28
            }?;
1211
3.51M
            let compiled = self.c(expr)?;
1212
3.51M
            self.patch(prev_end, union)?;
1213
3.51M
            self.patch(union, compiled.start)?;
1214
3.51M
            self.patch(union, empty)?;
1215
3.51M
            prev_end = compiled.end;
1216
        }
1217
10.0k
        self.patch(prev_end, empty)?;
1218
10.0k
        Ok(ThompsonRef { start: prefix.start, end: empty })
1219
11.7k
    }
1220
1221
    /// Compile the given expression such that it may be matched `n` or more
1222
    /// times, where `n` can be any integer. (Although a particularly large
1223
    /// integer is likely to run afoul of any configured size limits.)
1224
    ///
1225
    /// When `greedy` is true, then the preference is for the expression to
1226
    /// match as much as possible. Otherwise, it will match as little as
1227
    /// possible.
1228
2.15M
    fn c_at_least(
1229
2.15M
        &self,
1230
2.15M
        expr: &Hir,
1231
2.15M
        greedy: bool,
1232
2.15M
        n: u32,
1233
2.15M
    ) -> Result<ThompsonRef, BuildError> {
1234
2.15M
        if n == 0 {
1235
            // When the expression cannot match the empty string, then we
1236
            // can get away with something much simpler: just one 'alt'
1237
            // instruction that optionally repeats itself. But if the expr
1238
            // can match the empty string... see below.
1239
1.04M
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
1240
802k
                let union = if greedy {
1241
373k
                    self.add_union()
1242
                } else {
1243
428k
                    self.add_union_reverse()
1244
6
                }?;
1245
802k
                let compiled = self.c(expr)?;
1246
802k
                self.patch(union, compiled.start)?;
1247
802k
                self.patch(compiled.end, union)?;
1248
802k
                return Ok(ThompsonRef { start: union, end: union });
1249
240k
            }
1250
1251
            // What's going on here? Shouldn't x* be simpler than this? It
1252
            // turns out that when implementing leftmost-first (Perl-like)
1253
            // match semantics, x* results in an incorrect preference order
1254
            // when computing the transitive closure of states if and only if
1255
            // 'x' can match the empty string. So instead, we compile x* as
1256
            // (x+)?, which preserves the correct preference order.
1257
            //
1258
            // See: https://github.com/rust-lang/regex/issues/779
1259
240k
            let compiled = self.c(expr)?;
1260
239k
            let plus = if greedy {
1261
164k
                self.add_union()
1262
            } else {
1263
75.1k
                self.add_union_reverse()
1264
6
            }?;
1265
239k
            self.patch(compiled.end, plus)?;
1266
239k
            self.patch(plus, compiled.start)?;
1267
1268
239k
            let question = if greedy {
1269
164k
                self.add_union()
1270
            } else {
1271
75.1k
                self.add_union_reverse()
1272
4
            }?;
1273
239k
            let empty = self.add_empty()?;
1274
239k
            self.patch(question, compiled.start)?;
1275
239k
            self.patch(question, empty)?;
1276
239k
            self.patch(plus, empty)?;
1277
239k
            Ok(ThompsonRef { start: question, end: empty })
1278
1.11M
        } else if n == 1 {
1279
1.06M
            let compiled = self.c(expr)?;
1280
1.05M
            let union = if greedy {
1281
716k
                self.add_union()
1282
            } else {
1283
340k
                self.add_union_reverse()
1284
22
            }?;
1285
1.05M
            self.patch(compiled.end, union)?;
1286
1.05M
            self.patch(union, compiled.start)?;
1287
1.05M
            Ok(ThompsonRef { start: compiled.start, end: union })
1288
        } else {
1289
53.5k
            let prefix = self.c_exactly(expr, n - 1)?;
1290
52.2k
            let last = self.c(expr)?;
1291
52.1k
            let union = if greedy {
1292
17.9k
                self.add_union()
1293
            } else {
1294
34.2k
                self.add_union_reverse()
1295
3
            }?;
1296
52.1k
            self.patch(prefix.end, last.start)?;
1297
52.1k
            self.patch(last.end, union)?;
1298
52.1k
            self.patch(union, last.start)?;
1299
52.1k
            Ok(ThompsonRef { start: prefix.start, end: union })
1300
        }
1301
2.15M
    }
1302
1303
    /// Compile the given expression such that it may be matched zero or one
1304
    /// times.
1305
    ///
1306
    /// When `greedy` is true, then the preference is for the expression to
1307
    /// match as much as possible. Otherwise, it will match as little as
1308
    /// possible.
1309
1.48M
    fn c_zero_or_one(
1310
1.48M
        &self,
1311
1.48M
        expr: &Hir,
1312
1.48M
        greedy: bool,
1313
1.48M
    ) -> Result<ThompsonRef, BuildError> {
1314
1.48M
        let union =
1315
1.48M
            if greedy { self.add_union() } else { self.add_union_reverse() }?;
1316
1.48M
        let compiled = self.c(expr)?;
1317
1.48M
        let empty = self.add_empty()?;
1318
1.48M
        self.patch(union, compiled.start)?;
1319
1.48M
        self.patch(union, empty)?;
1320
1.48M
        self.patch(compiled.end, empty)?;
1321
1.48M
        Ok(ThompsonRef { start: union, end: empty })
1322
1.48M
    }
1323
1324
    /// Compile the given HIR expression exactly `n` times.
1325
271k
    fn c_exactly(
1326
271k
        &self,
1327
271k
        expr: &Hir,
1328
271k
        n: u32,
1329
271k
    ) -> Result<ThompsonRef, BuildError> {
1330
50.5M
        let it = (0..n).map(|_| self.c(expr));
1331
271k
        self.c_concat(it)
1332
271k
    }
1333
1334
    /// Compile the given byte oriented character class.
1335
    ///
1336
    /// This uses "sparse" states to represent an alternation between ranges in
1337
    /// this character class. We can use "sparse" states instead of stitching
1338
    /// together a "union" state because all ranges in a character class have
1339
    /// equal priority *and* are non-overlapping (thus, only one can match, so
1340
    /// there's never a question of priority in the first place). This saves a
1341
    /// fair bit of overhead when traversing an NFA.
1342
    ///
1343
    /// This routine compiles an empty character class into a "fail" state.
1344
3.59M
    fn c_byte_class(
1345
3.59M
        &self,
1346
3.59M
        cls: &hir::ClassBytes,
1347
3.59M
    ) -> Result<ThompsonRef, BuildError> {
1348
3.59M
        let end = self.add_empty()?;
1349
3.59M
        let mut trans = Vec::with_capacity(cls.ranges().len());
1350
5.74M
        for r in cls.iter() {
1351
5.74M
            trans.push(Transition {
1352
5.74M
                start: r.start(),
1353
5.74M
                end: r.end(),
1354
5.74M
                next: end,
1355
5.74M
            });
1356
5.74M
        }
1357
3.59M
        Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1358
3.59M
    }
1359
1360
    /// Compile the given Unicode character class.
1361
    ///
1362
    /// This routine specifically tries to use various types of compression,
1363
    /// since UTF-8 automata of large classes can get quite large. The specific
1364
    /// type of compression used depends on forward vs reverse compilation, and
1365
    /// whether NFA shrinking is enabled or not.
1366
    ///
1367
    /// Aside from repetitions causing lots of repeat group, this is like the
1368
    /// single most expensive part of regex compilation. Therefore, a large part
1369
    /// of the expense of compilation may be reduce by disabling Unicode in the
1370
    /// pattern.
1371
    ///
1372
    /// This routine compiles an empty character class into a "fail" state.
1373
5.38M
    fn c_unicode_class(
1374
5.38M
        &self,
1375
5.38M
        cls: &hir::ClassUnicode,
1376
5.38M
    ) -> Result<ThompsonRef, BuildError> {
1377
5.38M
        // If all we have are ASCII ranges wrapped in a Unicode package, then
1378
5.38M
        // there is zero reason to bring out the big guns. We can fit all ASCII
1379
5.38M
        // ranges within a single sparse state.
1380
5.38M
        if cls.is_ascii() {
1381
2.00M
            let end = self.add_empty()?;
1382
2.00M
            let mut trans = Vec::with_capacity(cls.ranges().len());
1383
4.64M
            for r in cls.iter() {
1384
4.64M
                // The unwraps below are OK because we've verified that this
1385
4.64M
                // class only contains ASCII codepoints.
1386
4.64M
                trans.push(Transition {
1387
4.64M
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
1388
4.64M
                    start: u8::try_from(u32::from(r.start())).unwrap(),
1389
4.64M
                    end: u8::try_from(u32::from(r.end())).unwrap(),
1390
4.64M
                    next: end,
1391
4.64M
                });
1392
4.64M
            }
1393
2.00M
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1394
3.37M
        } else if self.is_reverse() {
1395
1.02M
            if !self.config.get_shrink() {
1396
                // When we don't want to spend the extra time shrinking, we
1397
                // compile the UTF-8 automaton in reverse using something like
1398
                // the "naive" approach, but will attempt to re-use common
1399
                // suffixes.
1400
1.02M
                self.c_unicode_class_reverse_with_suffix(cls)
1401
            } else {
1402
                // When we want to shrink our NFA for reverse UTF-8 automata,
1403
                // we cannot feed UTF-8 sequences directly to the UTF-8
1404
                // compiler, since the UTF-8 compiler requires all sequences
1405
                // to be lexicographically sorted. Instead, we organize our
1406
                // sequences into a range trie, which can then output our
1407
                // sequences in the correct order. Unfortunately, building the
1408
                // range trie is fairly expensive (but not nearly as expensive
1409
                // as building a DFA). Hence the reason why the 'shrink' option
1410
                // exists, so that this path can be toggled off. For example,
1411
                // we might want to turn this off if we know we won't be
1412
                // compiling a DFA.
1413
0
                let mut trie = self.trie_state.borrow_mut();
1414
0
                trie.clear();
1415
1416
0
                for rng in cls.iter() {
1417
0
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
1418
0
                        seq.reverse();
1419
0
                        trie.insert(seq.as_slice());
1420
0
                    }
1421
                }
1422
0
                let mut builder = self.builder.borrow_mut();
1423
0
                let mut utf8_state = self.utf8_state.borrow_mut();
1424
0
                let mut utf8c =
1425
0
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1426
0
                trie.iter(|seq| {
1427
0
                    utf8c.add(&seq)?;
1428
0
                    Ok(())
1429
0
                })?;
1430
0
                utf8c.finish()
1431
            }
1432
        } else {
1433
            // In the forward direction, we always shrink our UTF-8 automata
1434
            // because we can stream it right into the UTF-8 compiler. There
1435
            // is almost no downside (in either memory or time) to using this
1436
            // approach.
1437
2.34M
            let mut builder = self.builder.borrow_mut();
1438
2.34M
            let mut utf8_state = self.utf8_state.borrow_mut();
1439
2.34M
            let mut utf8c =
1440
2.34M
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1441
26.2M
            for rng in cls.iter() {
1442
43.2M
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
1443
43.2M
                    utf8c.add(seq.as_slice())?;
1444
                }
1445
            }
1446
2.34M
            utf8c.finish()
1447
        }
1448
1449
        // For reference, the code below is the "naive" version of compiling a
1450
        // UTF-8 automaton. It is deliciously simple (and works for both the
1451
        // forward and reverse cases), but will unfortunately produce very
1452
        // large NFAs. When compiling a forward automaton, the size difference
1453
        // can sometimes be an order of magnitude. For example, the '\w' regex
1454
        // will generate about ~3000 NFA states using the naive approach below,
1455
        // but only 283 states when using the approach above. This is because
1456
        // the approach above actually compiles a *minimal* (or near minimal,
1457
        // because of the bounded hashmap for reusing equivalent states) UTF-8
1458
        // automaton.
1459
        //
1460
        // The code below is kept as a reference point in order to make it
1461
        // easier to understand the higher level goal here. Although, it will
1462
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
1463
        // use it, some of the tests in this module will fail because they look
1464
        // for terser byte code produce by the more optimized handling above.
1465
        // But the integration test suite should still pass.
1466
        //
1467
        // One good example of the substantial difference this can make is to
1468
        // compare and contrast performance of the Pike VM when the code below
1469
        // is active vs the code above. Here's an example to try:
1470
        //
1471
        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
1472
        //
1473
        // With Unicode classes generated below, this search takes about 45s on
1474
        // my machine. But with the compressed version above, the search takes
1475
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
1476
        // to the compression, but also because of the utilization of 'sparse'
1477
        // NFA states. They lead to much less state shuffling during the NFA
1478
        // search.
1479
        /*
1480
        let it = cls
1481
            .iter()
1482
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1483
            .map(|seq| {
1484
                let it = seq
1485
                    .as_slice()
1486
                    .iter()
1487
                    .map(|rng| self.c_range(rng.start, rng.end));
1488
                self.c_concat(it)
1489
            });
1490
        self.c_alt_iter(it)
1491
        */
1492
5.38M
    }
1493
1494
    /// Compile the given Unicode character class in reverse with suffix
1495
    /// caching.
1496
    ///
1497
    /// This is a "quick" way to compile large Unicode classes into reverse
1498
    /// UTF-8 automata while doing a small amount of compression on that
1499
    /// automata by reusing common suffixes.
1500
    ///
1501
    /// A more comprehensive compression scheme can be accomplished by using
1502
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
1503
    /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`.
1504
    ///
1505
    /// This is the technique used when "NFA shrinking" is disabled.
1506
    ///
1507
    /// (This also tries to use "sparse" states where possible, just like
1508
    /// `c_byte_class` does.)
1509
1.02M
    fn c_unicode_class_reverse_with_suffix(
1510
1.02M
        &self,
1511
1.02M
        cls: &hir::ClassUnicode,
1512
1.02M
    ) -> Result<ThompsonRef, BuildError> {
1513
1.02M
        // N.B. It would likely be better to cache common *prefixes* in the
1514
1.02M
        // reverse direction, but it's not quite clear how to do that. The
1515
1.02M
        // advantage of caching suffixes is that it does give us a win, and
1516
1.02M
        // has a very small additional overhead.
1517
1.02M
        let mut cache = self.utf8_suffix.borrow_mut();
1518
1.02M
        cache.clear();
1519
1520
1.02M
        let union = self.add_union()?;
1521
1.02M
        let alt_end = self.add_empty()?;
1522
13.7M
        for urng in cls.iter() {
1523
23.2M
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
1524
23.2M
                let mut end = alt_end;
1525
73.0M
                for brng in seq.as_slice() {
1526
73.0M
                    let key = Utf8SuffixKey {
1527
73.0M
                        from: end,
1528
73.0M
                        start: brng.start,
1529
73.0M
                        end: brng.end,
1530
73.0M
                    };
1531
73.0M
                    let hash = cache.hash(&key);
1532
73.0M
                    if let Some(id) = cache.get(&key, hash) {
1533
26.5M
                        end = id;
1534
26.5M
                        continue;
1535
46.5M
                    }
1536
1537
46.5M
                    let compiled = self.c_range(brng.start, brng.end)?;
1538
46.5M
                    self.patch(compiled.end, end)?;
1539
46.5M
                    end = compiled.start;
1540
46.5M
                    cache.set(key, hash, end);
1541
                }
1542
23.2M
                self.patch(union, end)?;
1543
            }
1544
        }
1545
1.02M
        Ok(ThompsonRef { start: union, end: alt_end })
1546
1.02M
    }
1547
1548
    /// Compile the given HIR look-around assertion to an NFA look-around
1549
    /// assertion.
1550
5.48M
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
1551
5.48M
        let look = match *anchor {
1552
530k
            hir::Look::Start => Look::Start,
1553
386k
            hir::Look::End => Look::End,
1554
48.3k
            hir::Look::StartLF => Look::StartLF,
1555
97.2k
            hir::Look::EndLF => Look::EndLF,
1556
66.8k
            hir::Look::StartCRLF => Look::StartCRLF,
1557
63.3k
            hir::Look::EndCRLF => Look::EndCRLF,
1558
873k
            hir::Look::WordAscii => Look::WordAscii,
1559
301k
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
1560
1.04M
            hir::Look::WordUnicode => Look::WordUnicode,
1561
183k
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1562
412k
            hir::Look::WordStartAscii => Look::WordStartAscii,
1563
221k
            hir::Look::WordEndAscii => Look::WordEndAscii,
1564
155k
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
1565
200k
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
1566
228k
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
1567
255k
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
1568
101k
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
1569
318k
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
1570
        };
1571
5.48M
        let id = self.add_look(look)?;
1572
5.48M
        Ok(ThompsonRef { start: id, end: id })
1573
5.48M
    }
1574
1575
    /// Compile the given byte string to a concatenation of bytes.
1576
48.8M
    fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1577
60.1M
        self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1578
48.8M
    }
1579
1580
    /// Compile a "range" state with one transition that may only be followed
1581
    /// if the input byte is in the (inclusive) range given.
1582
    ///
1583
    /// Both the `start` and `end` locations point to the state created.
1584
    /// Callers will likely want to keep the `start`, but patch the `end` to
1585
    /// point to some other state.
1586
106M
    fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1587
106M
        let id = self.add_range(start, end)?;
1588
106M
        Ok(ThompsonRef { start: id, end: id })
1589
106M
    }
1590
1591
    /// Compile an "empty" state with one unconditional epsilon transition.
1592
    ///
1593
    /// Both the `start` and `end` locations point to the state created.
1594
    /// Callers will likely want to keep the `start`, but patch the `end` to
1595
    /// point to some other state.
1596
1.19M
    fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1597
1.19M
        let id = self.add_empty()?;
1598
1.19M
        Ok(ThompsonRef { start: id, end: id })
1599
1.19M
    }
1600
1601
    /// Compile a "fail" state that can never have any outgoing transitions.
1602
0
    fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1603
0
        let id = self.add_fail()?;
1604
0
        Ok(ThompsonRef { start: id, end: id })
1605
0
    }
1606
1607
    // The below helpers are meant to be simple wrappers around the
1608
    // corresponding Builder methods. For the most part, they let us write
1609
    // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1610
    // the latter is a mouthful. Some of the methods do inject a little bit
1611
    // of extra logic. e.g., Flipping look-around operators when compiling in
1612
    // reverse mode.
1613
1614
179M
    fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1615
179M
        self.builder.borrow_mut().patch(from, to)
1616
179M
    }
1617
1618
138k
    fn start_pattern(&self) -> Result<PatternID, BuildError> {
1619
138k
        self.builder.borrow_mut().start_pattern()
1620
138k
    }
1621
1622
136k
    fn finish_pattern(
1623
136k
        &self,
1624
136k
        start_id: StateID,
1625
136k
    ) -> Result<PatternID, BuildError> {
1626
136k
        self.builder.borrow_mut().finish_pattern(start_id)
1627
136k
    }
1628
1629
12.0M
    fn add_empty(&self) -> Result<StateID, BuildError> {
1630
12.0M
        self.builder.borrow_mut().add_empty()
1631
12.0M
    }
1632
1633
106M
    fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1634
106M
        self.builder.borrow_mut().add_range(Transition {
1635
106M
            start,
1636
106M
            end,
1637
106M
            next: StateID::ZERO,
1638
106M
        })
1639
106M
    }
1640
1641
5.59M
    fn add_sparse(
1642
5.59M
        &self,
1643
5.59M
        ranges: Vec<Transition>,
1644
5.59M
    ) -> Result<StateID, BuildError> {
1645
5.59M
        self.builder.borrow_mut().add_sparse(ranges)
1646
5.59M
    }
1647
1648
5.48M
    fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1649
5.48M
        if self.is_reverse() {
1650
2.68M
            look = look.reversed();
1651
2.80M
        }
1652
5.48M
        self.builder.borrow_mut().add_look(StateID::ZERO, look)
1653
5.48M
    }
1654
1655
8.52M
    fn add_union(&self) -> Result<StateID, BuildError> {
1656
8.52M
        self.builder.borrow_mut().add_union(vec![])
1657
8.52M
    }
1658
1659
2.37M
    fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1660
2.37M
        self.builder.borrow_mut().add_union_reverse(vec![])
1661
2.37M
    }
1662
1663
2.55M
    fn add_capture_start(
1664
2.55M
        &self,
1665
2.55M
        capture_index: u32,
1666
2.55M
        name: Option<&str>,
1667
2.55M
    ) -> Result<StateID, BuildError> {
1668
2.55M
        let name = name.map(|n| Arc::from(n));
1669
2.55M
        self.builder.borrow_mut().add_capture_start(
1670
2.55M
            StateID::ZERO,
1671
2.55M
            capture_index,
1672
2.55M
            name,
1673
2.55M
        )
1674
2.55M
    }
1675
1676
2.54M
    fn add_capture_end(
1677
2.54M
        &self,
1678
2.54M
        capture_index: u32,
1679
2.54M
    ) -> Result<StateID, BuildError> {
1680
2.54M
        self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1681
2.54M
    }
1682
1683
0
    fn add_fail(&self) -> Result<StateID, BuildError> {
1684
0
        self.builder.borrow_mut().add_fail()
1685
0
    }
1686
1687
136k
    fn add_match(&self) -> Result<StateID, BuildError> {
1688
136k
        self.builder.borrow_mut().add_match()
1689
136k
    }
1690
1691
174M
    fn is_reverse(&self) -> bool {
1692
174M
        self.config.get_reverse()
1693
174M
    }
1694
}
1695
1696
/// A value that represents the result of compiling a sub-expression of a
1697
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
1698
/// has an initial state at `start` and a final state at `end`.
1699
#[derive(Clone, Copy, Debug)]
1700
pub(crate) struct ThompsonRef {
1701
    pub(crate) start: StateID,
1702
    pub(crate) end: StateID,
1703
}
1704
1705
/// A UTF-8 compiler based on Daciuk's algorithm for compilining minimal DFAs
1706
/// from a lexicographically sorted sequence of strings in linear time.
1707
///
1708
/// The trick here is that any Unicode codepoint range can be converted to
1709
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
1710
/// together via an alternation is trivial, and indeed, it works. However,
1711
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
1712
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
1713
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
1714
/// minimal because we use a bounded cache of previously build DFA states.)
1715
///
1716
/// The drawback is that this sadly doesn't work for reverse automata, since
1717
/// the ranges are no longer in lexicographic order. For that, we invented the
1718
/// range trie (which gets its own module). Once a range trie is built, we then
1719
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
1720
///
1721
/// The high level idea is described here:
1722
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
1723
///
1724
/// There is also another implementation of this in the `fst` crate.
1725
#[derive(Debug)]
1726
struct Utf8Compiler<'a> {
1727
    builder: &'a mut Builder,
1728
    state: &'a mut Utf8State,
1729
    target: StateID,
1730
}
1731
1732
#[derive(Clone, Debug)]
1733
struct Utf8State {
1734
    compiled: Utf8BoundedMap,
1735
    uncompiled: Vec<Utf8Node>,
1736
}
1737
1738
#[derive(Clone, Debug)]
1739
struct Utf8Node {
1740
    trans: Vec<Transition>,
1741
    last: Option<Utf8LastTransition>,
1742
}
1743
1744
#[derive(Clone, Debug)]
1745
struct Utf8LastTransition {
1746
    start: u8,
1747
    end: u8,
1748
}
1749
1750
impl Utf8State {
1751
504k
    fn new() -> Utf8State {
1752
504k
        Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1753
504k
    }
1754
1755
2.34M
    fn clear(&mut self) {
1756
2.34M
        self.compiled.clear();
1757
2.34M
        self.uncompiled.clear();
1758
2.34M
    }
1759
}
1760
1761
impl<'a> Utf8Compiler<'a> {
1762
2.34M
    fn new(
1763
2.34M
        builder: &'a mut Builder,
1764
2.34M
        state: &'a mut Utf8State,
1765
2.34M
    ) -> Result<Utf8Compiler<'a>, BuildError> {
1766
2.34M
        let target = builder.add_empty()?;
1767
2.34M
        state.clear();
1768
2.34M
        let mut utf8c = Utf8Compiler { builder, state, target };
1769
2.34M
        utf8c.add_empty();
1770
2.34M
        Ok(utf8c)
1771
2.34M
    }
1772
1773
2.34M
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
1774
2.34M
        self.compile_from(0)?;
1775
2.34M
        let node = self.pop_root();
1776
2.34M
        let start = self.compile(node)?;
1777
2.34M
        Ok(ThompsonRef { start, end: self.target })
1778
2.34M
    }
1779
1780
43.2M
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
1781
43.2M
        let prefix_len = ranges
1782
43.2M
            .iter()
1783
43.2M
            .zip(&self.state.uncompiled)
1784
87.5M
            .take_while(|&(range, node)| {
1785
87.5M
                node.last.as_ref().map_or(false, |t| {
1786
85.2M
                    (t.start, t.end) == (range.start, range.end)
1787
87.5M
                })
1788
87.5M
            })
1789
43.2M
            .count();
1790
43.2M
        assert!(prefix_len < ranges.len());
1791
43.2M
        self.compile_from(prefix_len)?;
1792
43.2M
        self.add_suffix(&ranges[prefix_len..]);
1793
43.2M
        Ok(())
1794
43.2M
    }
1795
1796
45.5M
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
1797
45.5M
        let mut next = self.target;
1798
92.2M
        while from + 1 < self.state.uncompiled.len() {
1799
46.7M
            let node = self.pop_freeze(next);
1800
46.7M
            next = self.compile(node)?;
1801
        }
1802
45.5M
        self.top_last_freeze(next);
1803
45.5M
        Ok(())
1804
45.5M
    }
1805
1806
49.0M
    fn compile(
1807
49.0M
        &mut self,
1808
49.0M
        node: Vec<Transition>,
1809
49.0M
    ) -> Result<StateID, BuildError> {
1810
49.0M
        let hash = self.state.compiled.hash(&node);
1811
49.0M
        if let Some(id) = self.state.compiled.get(&node, hash) {
1812
25.8M
            return Ok(id);
1813
23.2M
        }
1814
23.2M
        let id = self.builder.add_sparse(node.clone())?;
1815
23.2M
        self.state.compiled.set(node, hash, id);
1816
23.2M
        Ok(id)
1817
49.0M
    }
1818
1819
43.2M
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
1820
43.2M
        assert!(!ranges.is_empty());
1821
43.2M
        let last = self
1822
43.2M
            .state
1823
43.2M
            .uncompiled
1824
43.2M
            .len()
1825
43.2M
            .checked_sub(1)
1826
43.2M
            .expect("non-empty nodes");
1827
43.2M
        assert!(self.state.uncompiled[last].last.is_none());
1828
43.2M
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
1829
43.2M
            start: ranges[0].start,
1830
43.2M
            end: ranges[0].end,
1831
43.2M
        });
1832
46.7M
        for r in &ranges[1..] {
1833
46.7M
            self.state.uncompiled.push(Utf8Node {
1834
46.7M
                trans: vec![],
1835
46.7M
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
1836
46.7M
            });
1837
46.7M
        }
1838
43.2M
    }
1839
1840
2.34M
    fn add_empty(&mut self) {
1841
2.34M
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
1842
2.34M
    }
1843
1844
46.7M
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
1845
46.7M
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
1846
46.7M
        uncompiled.set_last_transition(next);
1847
46.7M
        uncompiled.trans
1848
46.7M
    }
1849
1850
2.34M
    fn pop_root(&mut self) -> Vec<Transition> {
1851
2.34M
        assert_eq!(self.state.uncompiled.len(), 1);
1852
2.34M
        assert!(self.state.uncompiled[0].last.is_none());
1853
2.34M
        self.state.uncompiled.pop().expect("non-empty nodes").trans
1854
2.34M
    }
1855
1856
45.5M
    fn top_last_freeze(&mut self, next: StateID) {
1857
45.5M
        let last = self
1858
45.5M
            .state
1859
45.5M
            .uncompiled
1860
45.5M
            .len()
1861
45.5M
            .checked_sub(1)
1862
45.5M
            .expect("non-empty nodes");
1863
45.5M
        self.state.uncompiled[last].set_last_transition(next);
1864
45.5M
    }
1865
}
1866
1867
impl Utf8Node {
1868
92.2M
    fn set_last_transition(&mut self, next: StateID) {
1869
92.2M
        if let Some(last) = self.last.take() {
1870
89.9M
            self.trans.push(Transition {
1871
89.9M
                start: last.start,
1872
89.9M
                end: last.end,
1873
89.9M
                next,
1874
89.9M
            });
1875
89.9M
        }
1876
92.2M
    }
1877
}
1878
1879
#[cfg(test)]
1880
mod tests {
1881
    use alloc::vec;
1882
1883
    use crate::{
1884
        nfa::thompson::{SparseTransitions, State},
1885
        util::primitives::SmallIndex,
1886
    };
1887
1888
    use super::*;
1889
1890
    fn build(pattern: &str) -> NFA {
1891
        NFA::compiler()
1892
            .configure(
1893
                NFA::config()
1894
                    .which_captures(WhichCaptures::None)
1895
                    .unanchored_prefix(false),
1896
            )
1897
            .build(pattern)
1898
            .unwrap()
1899
    }
1900
1901
    fn pid(id: usize) -> PatternID {
1902
        PatternID::new(id).unwrap()
1903
    }
1904
1905
    fn sid(id: usize) -> StateID {
1906
        StateID::new(id).unwrap()
1907
    }
1908
1909
    fn s_byte(byte: u8, next: usize) -> State {
1910
        let next = sid(next);
1911
        let trans = Transition { start: byte, end: byte, next };
1912
        State::ByteRange { trans }
1913
    }
1914
1915
    fn s_range(start: u8, end: u8, next: usize) -> State {
1916
        let next = sid(next);
1917
        let trans = Transition { start, end, next };
1918
        State::ByteRange { trans }
1919
    }
1920
1921
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
1922
        let transitions = transitions
1923
            .iter()
1924
            .map(|&(start, end, next)| Transition {
1925
                start,
1926
                end,
1927
                next: sid(next),
1928
            })
1929
            .collect();
1930
        State::Sparse(SparseTransitions { transitions })
1931
    }
1932
1933
    fn s_look(look: Look, next: usize) -> State {
1934
        let next = sid(next);
1935
        State::Look { look, next }
1936
    }
1937
1938
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
1939
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
1940
    }
1941
1942
    fn s_union(alts: &[usize]) -> State {
1943
        State::Union {
1944
            alternates: alts
1945
                .iter()
1946
                .map(|&id| sid(id))
1947
                .collect::<Vec<StateID>>()
1948
                .into_boxed_slice(),
1949
        }
1950
    }
1951
1952
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
1953
        State::Capture {
1954
            next: sid(next),
1955
            pattern_id: pid(pattern),
1956
            group_index: SmallIndex::new(index).unwrap(),
1957
            slot: SmallIndex::new(slot).unwrap(),
1958
        }
1959
    }
1960
1961
    fn s_fail() -> State {
1962
        State::Fail
1963
    }
1964
1965
    fn s_match(id: usize) -> State {
1966
        State::Match { pattern_id: pid(id) }
1967
    }
1968
1969
    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
1970
    // prefix.
1971
    #[test]
1972
    fn compile_unanchored_prefix() {
1973
        let nfa = NFA::compiler()
1974
            .configure(NFA::config().which_captures(WhichCaptures::None))
1975
            .build(r"a")
1976
            .unwrap();
1977
        assert_eq!(
1978
            nfa.states(),
1979
            &[
1980
                s_bin_union(2, 1),
1981
                s_range(0, 255, 0),
1982
                s_byte(b'a', 3),
1983
                s_match(0),
1984
            ]
1985
        );
1986
    }
1987
1988
    #[test]
1989
    fn compile_no_unanchored_prefix_with_start_anchor() {
1990
        let nfa = NFA::compiler()
1991
            .configure(NFA::config().which_captures(WhichCaptures::None))
1992
            .build(r"^a")
1993
            .unwrap();
1994
        assert_eq!(
1995
            nfa.states(),
1996
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
1997
        );
1998
    }
1999
2000
    #[test]
2001
    fn compile_yes_unanchored_prefix_with_end_anchor() {
2002
        let nfa = NFA::compiler()
2003
            .configure(NFA::config().which_captures(WhichCaptures::None))
2004
            .build(r"a$")
2005
            .unwrap();
2006
        assert_eq!(
2007
            nfa.states(),
2008
            &[
2009
                s_bin_union(2, 1),
2010
                s_range(0, 255, 0),
2011
                s_byte(b'a', 3),
2012
                s_look(Look::End, 4),
2013
                s_match(0),
2014
            ]
2015
        );
2016
    }
2017
2018
    #[test]
2019
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
2020
        let nfa = NFA::compiler()
2021
            .configure(
2022
                NFA::config()
2023
                    .reverse(true)
2024
                    .which_captures(WhichCaptures::None),
2025
            )
2026
            .build(r"^a")
2027
            .unwrap();
2028
        assert_eq!(
2029
            nfa.states(),
2030
            &[
2031
                s_bin_union(2, 1),
2032
                s_range(0, 255, 0),
2033
                s_byte(b'a', 3),
2034
                // Anchors get flipped in a reverse automaton.
2035
                s_look(Look::End, 4),
2036
                s_match(0),
2037
            ],
2038
        );
2039
    }
2040
2041
    #[test]
2042
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
2043
        let nfa = NFA::compiler()
2044
            .configure(
2045
                NFA::config()
2046
                    .reverse(true)
2047
                    .which_captures(WhichCaptures::None),
2048
            )
2049
            .build(r"a$")
2050
            .unwrap();
2051
        assert_eq!(
2052
            nfa.states(),
2053
            &[
2054
                // Anchors get flipped in a reverse automaton.
2055
                s_look(Look::Start, 1),
2056
                s_byte(b'a', 2),
2057
                s_match(0),
2058
            ],
2059
        );
2060
    }
2061
2062
    #[test]
2063
    fn compile_empty() {
2064
        assert_eq!(build("").states(), &[s_match(0),]);
2065
    }
2066
2067
    #[test]
2068
    fn compile_literal() {
2069
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
2070
        assert_eq!(
2071
            build("ab").states(),
2072
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
2073
        );
2074
        assert_eq!(
2075
            build("☃").states(),
2076
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
2077
        );
2078
2079
        // Check that non-UTF-8 literals work.
2080
        let nfa = NFA::compiler()
2081
            .configure(
2082
                NFA::config()
2083
                    .which_captures(WhichCaptures::None)
2084
                    .unanchored_prefix(false),
2085
            )
2086
            .syntax(crate::util::syntax::Config::new().utf8(false))
2087
            .build(r"(?-u)\xFF")
2088
            .unwrap();
2089
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
2090
    }
2091
2092
    #[test]
2093
    fn compile_class_ascii() {
2094
        assert_eq!(
2095
            build(r"[a-z]").states(),
2096
            &[s_range(b'a', b'z', 1), s_match(0),]
2097
        );
2098
        assert_eq!(
2099
            build(r"[x-za-c]").states(),
2100
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
2101
        );
2102
    }
2103
2104
    #[test]
2105
    #[cfg(not(miri))]
2106
    fn compile_class_unicode() {
2107
        assert_eq!(
2108
            build(r"[\u03B1-\u03B4]").states(),
2109
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
2110
        );
2111
        assert_eq!(
2112
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
2113
            &[
2114
                s_range(0xB1, 0xB4, 5),
2115
                s_range(0x99, 0x9E, 5),
2116
                s_byte(0xA4, 1),
2117
                s_byte(0x9F, 2),
2118
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
2119
                s_match(0),
2120
            ]
2121
        );
2122
        assert_eq!(
2123
            build(r"[a-z☃]").states(),
2124
            &[
2125
                s_byte(0x83, 3),
2126
                s_byte(0x98, 0),
2127
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
2128
                s_match(0),
2129
            ]
2130
        );
2131
    }
2132
2133
    #[test]
2134
    fn compile_repetition() {
2135
        assert_eq!(
2136
            build(r"a?").states(),
2137
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
2138
        );
2139
        assert_eq!(
2140
            build(r"a??").states(),
2141
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
2142
        );
2143
    }
2144
2145
    #[test]
2146
    fn compile_group() {
2147
        assert_eq!(
2148
            build(r"ab+").states(),
2149
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
2150
        );
2151
        assert_eq!(
2152
            build(r"(ab)").states(),
2153
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
2154
        );
2155
        assert_eq!(
2156
            build(r"(ab)+").states(),
2157
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
2158
        );
2159
    }
2160
2161
    #[test]
2162
    fn compile_alternation() {
2163
        assert_eq!(
2164
            build(r"a|b").states(),
2165
            &[s_range(b'a', b'b', 1), s_match(0)]
2166
        );
2167
        assert_eq!(
2168
            build(r"ab|cd").states(),
2169
            &[
2170
                s_byte(b'b', 3),
2171
                s_byte(b'd', 3),
2172
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
2173
                s_match(0)
2174
            ],
2175
        );
2176
        assert_eq!(
2177
            build(r"|b").states(),
2178
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
2179
        );
2180
        assert_eq!(
2181
            build(r"a|").states(),
2182
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
2183
        );
2184
    }
2185
2186
    // This tests the use of a non-binary union, i.e., a state with more than
2187
    // 2 unconditional epsilon transitions. The only place they tend to appear
2188
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
2189
    // and 'sparse' tend to cover all other cases of alternation.
2190
    #[test]
2191
    fn compile_non_binary_union() {
2192
        let nfa = NFA::compiler()
2193
            .configure(
2194
                NFA::config()
2195
                    .which_captures(WhichCaptures::None)
2196
                    .reverse(true)
2197
                    .shrink(false)
2198
                    .unanchored_prefix(false),
2199
            )
2200
            .build(r"[\u1000\u2000\u3000]")
2201
            .unwrap();
2202
        assert_eq!(
2203
            nfa.states(),
2204
            &[
2205
                s_union(&[3, 6, 9]),
2206
                s_byte(0xE1, 10),
2207
                s_byte(0x80, 1),
2208
                s_byte(0x80, 2),
2209
                s_byte(0xE2, 10),
2210
                s_byte(0x80, 4),
2211
                s_byte(0x80, 5),
2212
                s_byte(0xE3, 10),
2213
                s_byte(0x80, 7),
2214
                s_byte(0x80, 8),
2215
                s_match(0),
2216
            ]
2217
        );
2218
    }
2219
2220
    #[test]
2221
    fn compile_many_start_pattern() {
2222
        let nfa = NFA::compiler()
2223
            .configure(
2224
                NFA::config()
2225
                    .which_captures(WhichCaptures::None)
2226
                    .unanchored_prefix(false),
2227
            )
2228
            .build_many(&["a", "b"])
2229
            .unwrap();
2230
        assert_eq!(
2231
            nfa.states(),
2232
            &[
2233
                s_byte(b'a', 1),
2234
                s_match(0),
2235
                s_byte(b'b', 3),
2236
                s_match(1),
2237
                s_bin_union(0, 2),
2238
            ]
2239
        );
2240
        assert_eq!(nfa.start_anchored().as_usize(), 4);
2241
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
2242
        // Test that the start states for each individual pattern are correct.
2243
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
2244
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
2245
    }
2246
2247
    // This tests that our compiler can handle an empty character class. At the
2248
    // time of writing, the regex parser forbids it, so the only way to test it
2249
    // is to provide a hand written HIR.
2250
    #[test]
2251
    fn empty_class_bytes() {
2252
        use regex_syntax::hir::{Class, ClassBytes, Hir};
2253
2254
        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
2255
        let config = NFA::config()
2256
            .which_captures(WhichCaptures::None)
2257
            .unanchored_prefix(false);
2258
        let nfa =
2259
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
2260
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
2261
    }
2262
2263
    // Like empty_class_bytes, but for a Unicode class.
2264
    #[test]
2265
    fn empty_class_unicode() {
2266
        use regex_syntax::hir::{Class, ClassUnicode, Hir};
2267
2268
        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
2269
        let config = NFA::config()
2270
            .which_captures(WhichCaptures::None)
2271
            .unanchored_prefix(false);
2272
        let nfa =
2273
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
2274
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
2275
    }
2276
2277
    #[test]
2278
    fn compile_captures_all() {
2279
        let nfa = NFA::compiler()
2280
            .configure(
2281
                NFA::config()
2282
                    .unanchored_prefix(false)
2283
                    .which_captures(WhichCaptures::All),
2284
            )
2285
            .build("a(b)c")
2286
            .unwrap();
2287
        assert_eq!(
2288
            nfa.states(),
2289
            &[
2290
                s_cap(1, 0, 0, 0),
2291
                s_byte(b'a', 2),
2292
                s_cap(3, 0, 1, 2),
2293
                s_byte(b'b', 4),
2294
                s_cap(5, 0, 1, 3),
2295
                s_byte(b'c', 6),
2296
                s_cap(7, 0, 0, 1),
2297
                s_match(0)
2298
            ]
2299
        );
2300
        let ginfo = nfa.group_info();
2301
        assert_eq!(2, ginfo.all_group_len());
2302
    }
2303
2304
    #[test]
2305
    fn compile_captures_implicit() {
2306
        let nfa = NFA::compiler()
2307
            .configure(
2308
                NFA::config()
2309
                    .unanchored_prefix(false)
2310
                    .which_captures(WhichCaptures::Implicit),
2311
            )
2312
            .build("a(b)c")
2313
            .unwrap();
2314
        assert_eq!(
2315
            nfa.states(),
2316
            &[
2317
                s_cap(1, 0, 0, 0),
2318
                s_byte(b'a', 2),
2319
                s_byte(b'b', 3),
2320
                s_byte(b'c', 4),
2321
                s_cap(5, 0, 0, 1),
2322
                s_match(0)
2323
            ]
2324
        );
2325
        let ginfo = nfa.group_info();
2326
        assert_eq!(1, ginfo.all_group_len());
2327
    }
2328
2329
    #[test]
2330
    fn compile_captures_none() {
2331
        let nfa = NFA::compiler()
2332
            .configure(
2333
                NFA::config()
2334
                    .unanchored_prefix(false)
2335
                    .which_captures(WhichCaptures::None),
2336
            )
2337
            .build("a(b)c")
2338
            .unwrap();
2339
        assert_eq!(
2340
            nfa.states(),
2341
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
2342
        );
2343
        let ginfo = nfa.group_info();
2344
        assert_eq!(0, ginfo.all_group_len());
2345
    }
2346
}