Coverage Report

Created: 2025-07-23 07:16

/src/regex/regex-lite/src/nfa.rs
Line
Count
Source (jump to first uncovered line)
1
use core::{cell::RefCell, mem::size_of};
2
3
use alloc::{string::String, sync::Arc, vec, vec::Vec};
4
5
use crate::{
6
    error::Error,
7
    hir::{self, Hir, HirKind},
8
    int::U32,
9
};
10
11
pub(crate) type StateID = u32;
12
13
#[derive(Clone, Copy, Debug)]
14
pub(crate) struct Config {
15
    pub(crate) size_limit: Option<usize>,
16
}
17
18
impl Default for Config {
19
7.54k
    fn default() -> Config {
20
7.54k
        Config { size_limit: Some(10 * (1 << 20)) }
21
7.54k
    }
22
}
23
24
#[derive(Clone)]
25
pub(crate) struct NFA {
26
    /// The pattern string this NFA was generated from.
27
    ///
28
    /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯
29
    pattern: String,
30
    /// The states that make up this NFA.
31
    states: Vec<State>,
32
    /// The ID of the start state.
33
    start: StateID,
34
    /// Whether this NFA can only match at the beginning of a haystack.
35
    is_start_anchored: bool,
36
    /// Whether this NFA can match the empty string.
37
    is_match_empty: bool,
38
    /// If every match has the same number of matching capture groups, then
39
    /// this corresponds to the number of groups.
40
    static_explicit_captures_len: Option<usize>,
41
    /// A map from capture group name to its corresponding index.
42
    cap_name_to_index: CaptureNameMap,
43
    /// A map from capture group index to the corresponding name, if one
44
    /// exists.
45
    cap_index_to_name: Vec<Option<Arc<str>>>,
46
    /// Heap memory used indirectly by NFA states and other things (like the
47
    /// various capturing group representations above). Since each state
48
    /// might use a different amount of heap, we need to keep track of this
49
    /// incrementally.
50
    memory_extra: usize,
51
}
52
53
impl NFA {
54
    /// Creates a new NFA from the given configuration and HIR.
55
5.43k
    pub(crate) fn new(
56
5.43k
        config: Config,
57
5.43k
        pattern: String,
58
5.43k
        hir: &Hir,
59
5.43k
    ) -> Result<NFA, Error> {
60
5.43k
        Compiler::new(config, pattern).compile(hir)
61
5.43k
    }
62
63
    /// Returns the pattern string used to construct this NFA.
64
0
    pub(crate) fn pattern(&self) -> &str {
65
0
        &self.pattern
66
0
    }
67
68
    /// Returns the state corresponding to the given ID.
69
    ///
70
    /// # Panics
71
    ///
72
    /// If the ID does not refer to a valid state, then this panics.
73
76.8M
    pub(crate) fn state(&self, id: StateID) -> &State {
74
76.8M
        &self.states[id.as_usize()]
75
76.8M
    }
76
77
    /// Returns the total number of states in this NFA.
78
20.3k
    pub(crate) fn len(&self) -> usize {
79
20.3k
        self.states.len()
80
20.3k
    }
81
82
    /// Returns the ID of the starting state for this NFA.
83
5.09k
    pub(crate) fn start(&self) -> StateID {
84
5.09k
        self.start
85
5.09k
    }
86
87
    /// Returns the capture group index for the corresponding named group.
88
    /// If no such group with the given name exists, then `None` is returned.
89
0
    pub(crate) fn to_index(&self, name: &str) -> Option<usize> {
90
0
        self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize())
91
0
    }
92
93
    /*
94
    /// Returns the capture group name for the corresponding index.
95
    /// If no such group with the given index, then `None` is returned.
96
    pub(crate) fn to_name(&self, index: usize) -> Option<&str> {
97
        self.cap_index_to_name.get(index)?.as_deref()
98
    }
99
    */
100
101
    /// Returns an iterator over all of the capture groups, along with their
102
    /// names if they exist, in this NFA.
103
0
    pub(crate) fn capture_names(&self) -> CaptureNames<'_> {
104
0
        CaptureNames { it: self.cap_index_to_name.iter() }
105
0
    }
106
107
    /// Returns the total number of capture groups, including the first and
108
    /// implicit group, in this NFA.
109
10.1k
    pub(crate) fn group_len(&self) -> usize {
110
10.1k
        self.cap_index_to_name.len()
111
10.1k
    }
112
113
    /// Returns true if and only if this NFA can only match at the beginning of
114
    /// a haystack.
115
5.09k
    pub(crate) fn is_start_anchored(&self) -> bool {
116
5.09k
        self.is_start_anchored
117
5.09k
    }
118
119
    /// If the pattern always reports the same number of matching capture groups
120
    /// for every match, then this returns the number of those groups. This
121
    /// doesn't include the implicit group found in every pattern.
122
0
    pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
123
0
        self.static_explicit_captures_len
124
0
    }
125
126
    /// Returns the heap memory usage, in bytes, used by this NFA.
127
1.64M
    fn memory_usage(&self) -> usize {
128
1.64M
        (self.states.len() * size_of::<State>())
129
1.64M
            + (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>())
130
1.64M
            + self.memory_extra
131
1.64M
    }
132
}
133
134
impl core::fmt::Debug for NFA {
135
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
136
0
        writeln!(f, "NFA(")?;
137
0
        writeln!(f, "pattern: {}", self.pattern)?;
138
0
        for (sid, state) in self.states.iter().enumerate() {
139
0
            writeln!(f, "{:07?}: {:?}", sid, state)?;
140
        }
141
0
        writeln!(f, ")")?;
142
0
        Ok(())
143
0
    }
144
}
145
146
/// An iterator over all capture groups in an NFA.
147
///
148
/// If a particular group has a name, then it is yielded. Otherwise, `None`
149
/// is yielded.
150
#[derive(Clone, Debug)]
151
pub(crate) struct CaptureNames<'a> {
152
    it: core::slice::Iter<'a, Option<Arc<str>>>,
153
}
154
155
impl<'a> Iterator for CaptureNames<'a> {
156
    type Item = Option<&'a str>;
157
158
0
    fn next(&mut self) -> Option<Option<&'a str>> {
159
0
        self.it.next().map(|n| n.as_deref())
160
0
    }
161
}
162
163
#[derive(Clone, Eq, PartialEq)]
164
pub(crate) enum State {
165
    Char { target: StateID, ch: char },
166
    Ranges { target: StateID, ranges: Vec<(char, char)> },
167
    Splits { targets: Vec<StateID>, reverse: bool },
168
    Goto { target: StateID, look: Option<hir::Look> },
169
    Capture { target: StateID, slot: u32 },
170
    Fail,
171
    Match,
172
}
173
174
impl State {
175
    /// Returns the heap memory usage of this NFA state in bytes.
176
1.19M
    fn memory_usage(&self) -> usize {
177
1.19M
        match *self {
178
            State::Char { .. }
179
            | State::Goto { .. }
180
            | State::Capture { .. }
181
            | State::Fail { .. }
182
873k
            | State::Match => 0,
183
200k
            State::Splits { ref targets, .. } => {
184
200k
                targets.len() * size_of::<StateID>()
185
            }
186
119k
            State::Ranges { ref ranges, .. } => {
187
119k
                ranges.len() * size_of::<(char, char)>()
188
            }
189
        }
190
1.19M
    }
191
192
    /// Returns an iterator over the given split targets. The order of the
193
    /// iterator yields elements in reverse when `reverse` is true.
194
0
    pub(crate) fn iter_splits<'a>(
195
0
        splits: &'a [StateID],
196
0
        reverse: bool,
197
0
    ) -> impl Iterator<Item = StateID> + 'a {
198
0
        let mut it = splits.iter();
199
0
        core::iter::from_fn(move || {
200
0
            if reverse { it.next_back() } else { it.next() }.copied()
201
0
        })
202
0
    }
203
}
204
205
impl core::fmt::Debug for State {
206
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
207
0
        match *self {
208
0
            State::Char { target, ch } => {
209
0
                write!(f, "{:?} => {:?}", ch, target)
210
            }
211
0
            State::Ranges { target, ref ranges } => {
212
0
                for (i, &(start, end)) in ranges.iter().enumerate() {
213
0
                    if i > 0 {
214
0
                        write!(f, ", ")?;
215
0
                    }
216
0
                    write!(f, "{:?}-{:?} => {:?}", start, end, target)?;
217
                }
218
0
                Ok(())
219
            }
220
0
            State::Splits { ref targets, reverse } => {
221
0
                write!(f, "splits(")?;
222
0
                for (i, sid) in
223
0
                    State::iter_splits(targets, reverse).enumerate()
224
                {
225
0
                    if i > 0 {
226
0
                        write!(f, ", ")?;
227
0
                    }
228
0
                    write!(f, "{:?}", sid)?;
229
                }
230
0
                write!(f, ")")
231
            }
232
0
            State::Goto { target, look: None } => {
233
0
                write!(f, "goto({:?})", target)
234
            }
235
0
            State::Goto { target, look: Some(look) } => {
236
0
                write!(f, "{:?} => {:?}", look, target)
237
            }
238
0
            State::Capture { target, slot } => {
239
0
                write!(f, "capture(slot={:?}) => {:?}", slot, target,)
240
            }
241
0
            State::Fail => write!(f, "FAIL"),
242
            State::Match => {
243
0
                write!(f, "MATCH")
244
            }
245
        }
246
0
    }
247
}
248
249
/// A map from capture group name to its corresponding capture group index.
250
///
251
/// We define a type alias here so that we can transparently use a `HashMap`
252
/// whenever it's available. We do so presumably because it's faster, although
253
/// there are no benchmarks verifying this.
254
#[cfg(feature = "std")]
255
type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>;
256
#[cfg(not(feature = "std"))]
257
type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>;
258
259
#[derive(Debug)]
260
struct Compiler {
261
    config: Config,
262
    nfa: RefCell<NFA>,
263
}
264
265
impl Compiler {
266
5.43k
    fn new(config: Config, pattern: String) -> Compiler {
267
5.43k
        let nfa = RefCell::new(NFA {
268
5.43k
            pattern,
269
5.43k
            states: vec![],
270
5.43k
            start: 0,
271
5.43k
            is_start_anchored: false,
272
5.43k
            is_match_empty: false,
273
5.43k
            static_explicit_captures_len: None,
274
5.43k
            cap_name_to_index: CaptureNameMap::default(),
275
5.43k
            cap_index_to_name: vec![],
276
5.43k
            memory_extra: 0,
277
5.43k
        });
278
5.43k
        Compiler { config, nfa }
279
5.43k
    }
280
281
5.43k
    fn compile(self, hir: &Hir) -> Result<NFA, Error> {
282
5.43k
        self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored();
283
5.43k
        self.nfa.borrow_mut().is_match_empty = hir.is_match_empty();
284
5.43k
        self.nfa.borrow_mut().static_explicit_captures_len =
285
5.43k
            hir.static_explicit_captures_len();
286
5.43k
        let compiled = self.c_capture(0, None, hir)?;
287
5.10k
        let mat = self.add(State::Match)?;
288
5.09k
        self.patch(compiled.end, mat)?;
289
5.09k
        self.nfa.borrow_mut().start = compiled.start;
290
5.09k
        Ok(self.nfa.into_inner())
291
5.43k
    }
292
293
970k
    fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> {
294
970k
        match *hir.kind() {
295
102k
            HirKind::Empty => self.c_empty(),
296
436k
            HirKind::Char(ch) => self.c_char(ch),
297
121k
            HirKind::Class(ref class) => self.c_class(class),
298
124k
            HirKind::Look(ref look) => self.c_look(look),
299
87.6k
            HirKind::Repetition(ref rep) => self.c_repetition(rep),
300
70.0k
            HirKind::Capture(ref cap) => {
301
70.0k
                self.c_capture(cap.index, cap.name.as_deref(), &cap.sub)
302
            }
303
19.7k
            HirKind::Concat(ref subs) => {
304
333k
                self.c_concat(subs.iter().map(|s| self.c(s)))
305
            }
306
7.47k
            HirKind::Alternation(ref subs) => {
307
65.4k
                self.c_alternation(subs.iter().map(|s| self.c(s)))
308
            }
309
        }
310
970k
    }
311
312
    /// Compile a "fail" state that can never be transitioned out of.
313
0
    fn c_fail(&self) -> Result<ThompsonRef, Error> {
314
0
        let id = self.add(State::Fail)?;
315
0
        Ok(ThompsonRef { start: id, end: id })
316
0
    }
317
318
    /// Compile an "empty" state with one unconditional epsilon transition.
319
    ///
320
    /// Both the `start` and `end` locations point to the state created.
321
    /// Callers will likely want to keep the `start`, but patch the `end` to
322
    /// point to some other state.
323
103k
    fn c_empty(&self) -> Result<ThompsonRef, Error> {
324
103k
        let id = self.add_empty()?;
325
103k
        Ok(ThompsonRef { start: id, end: id })
326
103k
    }
327
328
    /// Compile the given literal char to an NFA.
329
436k
    fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
330
436k
        let id = self.add(State::Char { target: 0, ch })?;
331
436k
        Ok(ThompsonRef { start: id, end: id })
332
436k
    }
333
334
    /// Compile the given character class into an NFA.
335
    ///
336
    /// If the class is empty, then this compiles to a `Fail` state.
337
121k
    fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> {
338
121k
        let id = if class.ranges.is_empty() {
339
            // Technically using an explicit fail state probably isn't
340
            // necessary. Because if you try to match against an empty Ranges,
341
            // then it should turn up with nothing regardless of input, and
342
            // thus "acts" like a Fail state. But it's better to be more
343
            // explicit, and there's no real cost to doing so.
344
1.80k
            self.add(State::Fail)
345
        } else {
346
119k
            let ranges =
347
362k
                class.ranges.iter().map(|r| (r.start, r.end)).collect();
348
119k
            self.add(State::Ranges { target: 0, ranges })
349
30
        }?;
350
121k
        Ok(ThompsonRef { start: id, end: id })
351
121k
    }
352
353
    /// Compile the given HIR look-around assertion to an NFA look-around
354
    /// assertion.
355
124k
    fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> {
356
124k
        let id = self.add(State::Goto { target: 0, look: Some(*look) })?;
357
124k
        Ok(ThompsonRef { start: id, end: id })
358
124k
    }
359
360
    /// Compile the given repetition expression. This handles all types of
361
    /// repetitions and greediness.
362
87.6k
    fn c_repetition(
363
87.6k
        &self,
364
87.6k
        rep: &hir::Repetition,
365
87.6k
    ) -> Result<ThompsonRef, Error> {
366
87.6k
        match (rep.min, rep.max) {
367
26.5k
            (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
368
51.0k
            (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
369
10.0k
            (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
370
2.86k
            (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
371
        }
372
87.6k
    }
373
374
    /// Compile the given expression such that it matches at least `min` times,
375
    /// but no more than `max` times.
376
    ///
377
    /// When `greedy` is true, then the preference is for the expression to
378
    /// match as much as possible. Otherwise, it will match as little as
379
    /// possible.
380
2.86k
    fn c_bounded(
381
2.86k
        &self,
382
2.86k
        hir: &Hir,
383
2.86k
        greedy: bool,
384
2.86k
        min: u32,
385
2.86k
        max: u32,
386
2.86k
    ) -> Result<ThompsonRef, Error> {
387
2.86k
        let prefix = self.c_exactly(hir, min)?;
388
2.79k
        if min == max {
389
0
            return Ok(prefix);
390
2.79k
        }
391
392
        // It is tempting here to compile the rest here as a concatenation
393
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
394
        // were `aaa?a?a?`. The problem here is that it leads to this program:
395
        //
396
        //     >000000: 61 => 01
397
        //      000001: 61 => 02
398
        //      000002: union(03, 04)
399
        //      000003: 61 => 04
400
        //      000004: union(05, 06)
401
        //      000005: 61 => 06
402
        //      000006: union(07, 08)
403
        //      000007: 61 => 08
404
        //      000008: MATCH
405
        //
406
        // And effectively, once you hit state 2, the epsilon closure will
407
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
408
        // to instead compile it like so:
409
        //
410
        //     >000000: 61 => 01
411
        //      000001: 61 => 02
412
        //      000002: union(03, 08)
413
        //      000003: 61 => 04
414
        //      000004: union(05, 08)
415
        //      000005: 61 => 06
416
        //      000006: union(07, 08)
417
        //      000007: 61 => 08
418
        //      000008: MATCH
419
        //
420
        // So that the epsilon closure of state 2 is now just 3 and 8.
421
2.79k
        let empty = self.add_empty()?;
422
2.78k
        let mut prev_end = prefix.end;
423
2.78k
        for _ in min..max {
424
101k
            let splits =
425
101k
                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
426
101k
            let compiled = self.c(hir)?;
427
101k
            self.patch(prev_end, splits)?;
428
101k
            self.patch(splits, compiled.start)?;
429
101k
            self.patch(splits, empty)?;
430
101k
            prev_end = compiled.end;
431
        }
432
2.64k
        self.patch(prev_end, empty)?;
433
2.64k
        Ok(ThompsonRef { start: prefix.start, end: empty })
434
2.86k
    }
435
436
    /// Compile the given expression such that it may be matched `n` or more
437
    /// times, where `n` can be any integer. (Although a particularly large
438
    /// integer is likely to run afoul of any configured size limits.)
439
    ///
440
    /// When `greedy` is true, then the preference is for the expression to
441
    /// match as much as possible. Otherwise, it will match as little as
442
    /// possible.
443
51.0k
    fn c_at_least(
444
51.0k
        &self,
445
51.0k
        hir: &Hir,
446
51.0k
        greedy: bool,
447
51.0k
        n: u32,
448
51.0k
    ) -> Result<ThompsonRef, Error> {
449
51.0k
        if n == 0 {
450
            // When the expression cannot match the empty string, then we
451
            // can get away with something much simpler: just one 'alt'
452
            // instruction that optionally repeats itself. But if the expr
453
            // can match the empty string... see below.
454
37.0k
            if !hir.is_match_empty() {
455
22.2k
                let splits = self.add(State::Splits {
456
22.2k
                    targets: vec![],
457
22.2k
                    reverse: !greedy,
458
22.2k
                })?;
459
22.2k
                let compiled = self.c(hir)?;
460
22.1k
                self.patch(splits, compiled.start)?;
461
22.1k
                self.patch(compiled.end, splits)?;
462
22.1k
                return Ok(ThompsonRef { start: splits, end: splits });
463
14.8k
            }
464
465
            // What's going on here? Shouldn't x* be simpler than this? It
466
            // turns out that when implementing leftmost-first (Perl-like)
467
            // match semantics, x* results in an incorrect preference order
468
            // when computing the transitive closure of states if and only if
469
            // 'x' can match the empty string. So instead, we compile x* as
470
            // (x+)?, which preserves the correct preference order.
471
            //
472
            // See: https://github.com/rust-lang/regex/issues/779
473
14.8k
            let compiled = self.c(hir)?;
474
14.7k
            let plus =
475
14.7k
                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
476
14.7k
            self.patch(compiled.end, plus)?;
477
14.7k
            self.patch(plus, compiled.start)?;
478
479
14.7k
            let question =
480
14.7k
                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
481
14.7k
            let empty = self.add_empty()?;
482
14.7k
            self.patch(question, compiled.start)?;
483
14.7k
            self.patch(question, empty)?;
484
14.7k
            self.patch(plus, empty)?;
485
14.7k
            Ok(ThompsonRef { start: question, end: empty })
486
13.9k
        } else if n == 1 {
487
10.8k
            let compiled = self.c(hir)?;
488
10.6k
            let splits =
489
10.6k
                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
490
10.6k
            self.patch(compiled.end, splits)?;
491
10.6k
            self.patch(splits, compiled.start)?;
492
10.6k
            Ok(ThompsonRef { start: compiled.start, end: splits })
493
        } else {
494
3.17k
            let prefix = self.c_exactly(hir, n - 1)?;
495
3.08k
            let last = self.c(hir)?;
496
3.07k
            let splits =
497
3.07k
                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
498
3.07k
            self.patch(prefix.end, last.start)?;
499
3.07k
            self.patch(last.end, splits)?;
500
3.07k
            self.patch(splits, last.start)?;
501
3.07k
            Ok(ThompsonRef { start: prefix.start, end: splits })
502
        }
503
51.0k
    }
504
505
    /// Compile the given expression such that it may be matched zero or one
506
    /// times.
507
    ///
508
    /// When `greedy` is true, then the preference is for the expression to
509
    /// match as much as possible. Otherwise, it will match as little as
510
    /// possible.
511
26.5k
    fn c_zero_or_one(
512
26.5k
        &self,
513
26.5k
        hir: &Hir,
514
26.5k
        greedy: bool,
515
26.5k
    ) -> Result<ThompsonRef, Error> {
516
26.5k
        let splits =
517
26.5k
            self.add(State::Splits { targets: vec![], reverse: !greedy })?;
518
26.5k
        let compiled = self.c(hir)?;
519
26.4k
        let empty = self.add_empty()?;
520
26.4k
        self.patch(splits, compiled.start)?;
521
26.4k
        self.patch(splits, empty)?;
522
26.4k
        self.patch(compiled.end, empty)?;
523
26.4k
        Ok(ThompsonRef { start: splits, end: empty })
524
26.5k
    }
525
526
    /// Compile the given HIR expression exactly `n` times.
527
13.2k
    fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> {
528
317k
        self.c_concat((0..n).map(|_| self.c(hir)))
529
13.2k
    }
530
531
    /// Compile the given expression and insert capturing states at the
532
    /// beginning and end of it. The slot for the capture states is computed
533
    /// from the index.
534
75.4k
    fn c_capture(
535
75.4k
        &self,
536
75.4k
        index: u32,
537
75.4k
        name: Option<&str>,
538
75.4k
        hir: &Hir,
539
75.4k
    ) -> Result<ThompsonRef, Error> {
540
75.4k
        // For discontiguous indices, push placeholders for earlier capture
541
75.4k
        // groups that weren't explicitly added. This can happen, for example,
542
75.4k
        // with patterns like '(a){0}(a)' where '(a){0}' is completely removed
543
75.4k
        // from the pattern.
544
75.4k
        let existing_groups_len = self.nfa.borrow().cap_index_to_name.len();
545
142k
        for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) {
546
142k
            self.nfa.borrow_mut().cap_index_to_name.push(None);
547
142k
        }
548
75.4k
        if index.as_usize() >= existing_groups_len {
549
26.7k
            if let Some(name) = name {
550
6.52k
                let name = Arc::from(name);
551
6.52k
                let mut nfa = self.nfa.borrow_mut();
552
6.52k
                nfa.cap_name_to_index.insert(Arc::clone(&name), index);
553
6.52k
                nfa.cap_index_to_name.push(Some(Arc::clone(&name)));
554
6.52k
                // This is an approximation.
555
6.52k
                nfa.memory_extra += name.len() + size_of::<u32>();
556
20.2k
            } else {
557
20.2k
                self.nfa.borrow_mut().cap_index_to_name.push(None);
558
20.2k
            }
559
48.6k
        }
560
561
75.4k
        let Some(slot) = index.checked_mul(2) else {
562
0
            return Err(Error::new("capture group slots exhausted"));
563
        };
564
75.4k
        let start = self.add(State::Capture { target: 0, slot })?;
565
75.4k
        let inner = self.c(hir)?;
566
74.9k
        let Some(slot) = slot.checked_add(1) else {
567
0
            return Err(Error::new("capture group slots exhausted"));
568
        };
569
74.9k
        let end = self.add(State::Capture { target: 0, slot })?;
570
74.9k
        self.patch(start, inner.start)?;
571
74.9k
        self.patch(inner.end, end)?;
572
573
74.9k
        Ok(ThompsonRef { start, end })
574
75.4k
    }
575
576
    /// Compile a concatenation of the sub-expressions yielded by the given
577
    /// iterator. If the iterator yields no elements, then this compiles down
578
    /// to an "empty" state that always matches.
579
32.9k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
580
32.9k
    where
581
32.9k
        I: Iterator<Item = Result<ThompsonRef, Error>>,
582
32.9k
    {
583
32.9k
        let ThompsonRef { start, mut end } = match it.next() {
584
31.9k
            Some(result) => result?,
585
1.06k
            None => return self.c_empty(),
586
        };
587
650k
        for result in it {
588
619k
            let compiled = result?;
589
618k
            self.patch(end, compiled.start)?;
590
618k
            end = compiled.end;
591
        }
592
31.3k
        Ok(ThompsonRef { start, end })
593
32.9k
    }
<regex_lite::nfa::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_lite::nfa::Compiler>::c_exactly::{closure#0}>>
Line
Count
Source
579
13.2k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
580
13.2k
    where
581
13.2k
        I: Iterator<Item = Result<ThompsonRef, Error>>,
582
13.2k
    {
583
13.2k
        let ThompsonRef { start, mut end } = match it.next() {
584
12.2k
            Some(result) => result?,
585
1.06k
            None => return self.c_empty(),
586
        };
587
317k
        for result in it {
588
305k
            let compiled = result?;
589
305k
            self.patch(end, compiled.start)?;
590
305k
            end = compiled.end;
591
        }
592
11.7k
        Ok(ThompsonRef { start, end })
593
13.2k
    }
<regex_lite::nfa::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_lite::hir::Hir>, <regex_lite::nfa::Compiler>::c::{closure#0}>>
Line
Count
Source
579
19.7k
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
580
19.7k
    where
581
19.7k
        I: Iterator<Item = Result<ThompsonRef, Error>>,
582
19.7k
    {
583
19.7k
        let ThompsonRef { start, mut end } = match it.next() {
584
19.7k
            Some(result) => result?,
585
0
            None => return self.c_empty(),
586
        };
587
332k
        for result in it {
588
313k
            let compiled = result?;
589
313k
            self.patch(end, compiled.start)?;
590
313k
            end = compiled.end;
591
        }
592
19.5k
        Ok(ThompsonRef { start, end })
593
19.7k
    }
594
595
    /// Compile an alternation, where each element yielded by the given
596
    /// iterator represents an item in the alternation. If the iterator yields
597
    /// no elements, then this compiles down to a "fail" state.
598
    ///
599
    /// In an alternation, expressions appearing earlier are "preferred" at
600
    /// match time over expressions appearing later. (This is currently always
601
    /// true, as this crate only supports leftmost-first semantics.)
602
7.47k
    fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
603
7.47k
    where
604
7.47k
        I: Iterator<Item = Result<ThompsonRef, Error>>,
605
7.47k
    {
606
7.47k
        let first = match it.next() {
607
0
            None => return self.c_fail(),
608
7.47k
            Some(result) => result?,
609
        };
610
7.43k
        let second = match it.next() {
611
0
            None => return Ok(first),
612
7.43k
            Some(result) => result?,
613
        };
614
615
7.40k
        let splits =
616
7.40k
            self.add(State::Splits { targets: vec![], reverse: false })?;
617
7.40k
        let end = self.add_empty()?;
618
7.40k
        self.patch(splits, first.start)?;
619
7.40k
        self.patch(first.end, end)?;
620
7.40k
        self.patch(splits, second.start)?;
621
7.40k
        self.patch(second.end, end)?;
622
57.8k
        for result in it {
623
50.5k
            let compiled = result?;
624
50.4k
            self.patch(splits, compiled.start)?;
625
50.4k
            self.patch(compiled.end, end)?;
626
        }
627
7.33k
        Ok(ThompsonRef { start: splits, end })
628
7.47k
    }
629
630
    /// A convenience routine for adding an empty state, also known as an
631
    /// unconditional epsilon transition. These are quite useful for making
632
    /// NFA construction simpler.
633
    ///
634
    /// (In the regex crate, we do a second pass to remove these, but don't
635
    /// bother with that here.)
636
155k
    fn add_empty(&self) -> Result<StateID, Error> {
637
155k
        self.add(State::Goto { target: 0, look: None })
638
155k
    }
639
640
    /// The common implementation of "add a state." It handles the common
641
    /// error cases of state ID exhausting (by owning state ID allocation) and
642
    /// whether the size limit has been exceeded.
643
1.19M
    fn add(&self, state: State) -> Result<StateID, Error> {
644
1.19M
        let id = u32::try_from(self.nfa.borrow().states.len())
645
1.19M
            .map_err(|_| Error::new("exhausted state IDs, too many states"))?;
646
1.19M
        self.nfa.borrow_mut().memory_extra += state.memory_usage();
647
1.19M
        self.nfa.borrow_mut().states.push(state);
648
1.19M
        self.check_size_limit()?;
649
1.19M
        Ok(id)
650
1.19M
    }
651
652
    /// Add a transition from one state to another.
653
    ///
654
    /// This routine is called "patch" since it is very common to add the
655
    /// states you want, typically with "dummy" state ID transitions, and then
656
    /// "patch" in the real state IDs later. This is because you don't always
657
    /// know all of the necessary state IDs to add because they might not
658
    /// exist yet.
659
    ///
660
    /// # Errors
661
    ///
662
    /// This may error if patching leads to an increase in heap usage beyond
663
    /// the configured size limit. Heap usage only grows when patching adds a
664
    /// new transition (as in the case of a "splits" state).
665
1.43M
    fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
666
1.43M
        let mut new_memory_extra = self.nfa.borrow().memory_extra;
667
1.43M
        match self.nfa.borrow_mut().states[from.as_usize()] {
668
436k
            State::Char { ref mut target, .. } => {
669
436k
                *target = to;
670
436k
            }
671
119k
            State::Ranges { ref mut target, .. } => {
672
119k
                *target = to;
673
119k
            }
674
451k
            State::Splits { ref mut targets, .. } => {
675
451k
                targets.push(to);
676
451k
                new_memory_extra += size_of::<StateID>();
677
451k
            }
678
279k
            State::Goto { ref mut target, .. } => {
679
279k
                *target = to;
680
279k
            }
681
149k
            State::Capture { ref mut target, .. } => {
682
149k
                *target = to;
683
149k
            }
684
1.80k
            State::Fail | State::Match => {}
685
        }
686
1.43M
        if new_memory_extra != self.nfa.borrow().memory_extra {
687
451k
            self.nfa.borrow_mut().memory_extra = new_memory_extra;
688
451k
            self.check_size_limit()?;
689
987k
        }
690
1.43M
        Ok(())
691
1.43M
    }
692
693
    /// Checks that the current heap memory usage of the NFA being compiled
694
    /// doesn't exceed the configured size limit. If it does, an error is
695
    /// returned.
696
1.64M
    fn check_size_limit(&self) -> Result<(), Error> {
697
1.64M
        if let Some(limit) = self.config.size_limit {
698
1.64M
            if self.nfa.borrow().memory_usage() > limit {
699
339
                return Err(Error::new("compiled regex exceeded size limit"));
700
1.64M
            }
701
0
        }
702
1.64M
        Ok(())
703
1.64M
    }
704
}
705
706
/// A value that represents the result of compiling a sub-expression of a
707
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
708
/// has an initial state at `start` and a final state at `end`.
709
#[derive(Clone, Copy, Debug)]
710
struct ThompsonRef {
711
    start: StateID,
712
    end: StateID,
713
}