Coverage Report

Created: 2025-11-16 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/nfa/thompson/builder.rs
Line
Count
Source
1
use core::mem;
2
3
use alloc::{sync::Arc, vec, vec::Vec};
4
5
use crate::{
6
    nfa::thompson::{
7
        error::BuildError,
8
        nfa::{self, SparseTransitions, Transition, NFA},
9
    },
10
    util::{
11
        look::{Look, LookMatcher},
12
        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
13
    },
14
};
15
16
/// An intermediate NFA state used during construction.
17
///
18
/// During construction of an NFA, it is often convenient to work with states
19
/// that are amenable to mutation and other carry more information than we
20
/// otherwise need once an NFA has been built. This type represents those
21
/// needs.
22
///
23
/// Once construction is finished, the builder will convert these states to a
24
/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not
25
/// only results in a simpler representation, but in some cases, entire classes
26
/// of states are completely removed (such as [`State::Empty`]).
27
#[derive(Clone, Debug, Eq, PartialEq)]
28
enum State {
29
    /// An empty state whose only purpose is to forward the automaton to
30
    /// another state via an unconditional epsilon transition.
31
    ///
32
    /// Unconditional epsilon transitions are quite useful during the
33
    /// construction of an NFA, as they permit the insertion of no-op
34
    /// placeholders that make it easier to compose NFA sub-graphs. When
35
    /// the Thompson NFA builder produces a final NFA, all unconditional
36
    /// epsilon transitions are removed, and state identifiers are remapped
37
    /// accordingly.
38
    Empty {
39
        /// The next state that this state should transition to.
40
        next: StateID,
41
    },
42
    /// A state that only transitions to another state if the current input
43
    /// byte is in a particular range of bytes.
44
    ByteRange { trans: Transition },
45
    /// A state with possibly many transitions, represented in a sparse
46
    /// fashion. Transitions must be ordered lexicographically by input range
47
    /// and be non-overlapping. As such, this may only be used when every
48
    /// transition has equal priority. (In practice, this is only used for
49
    /// encoding large UTF-8 automata.) In contrast, a `Union` state has each
50
    /// alternate in order of priority. Priority is used to implement greedy
51
    /// matching and also alternations themselves, e.g., `abc|a` where `abc`
52
    /// has priority over `a`.
53
    ///
54
    /// To clarify, it is possible to remove `Sparse` and represent all things
55
    /// that `Sparse` is used for via `Union`. But this creates a more bloated
56
    /// NFA with more epsilon transitions than is necessary in the special case
57
    /// of character classes.
58
    Sparse { transitions: Vec<Transition> },
59
    /// A conditional epsilon transition satisfied via some sort of
60
    /// look-around.
61
    Look { look: Look, next: StateID },
62
    /// An empty state that records the start of a capture location. This is an
63
    /// unconditional epsilon transition like `Empty`, except it can be used to
64
    /// record position information for a capture group when using the NFA for
65
    /// search.
66
    CaptureStart {
67
        /// The ID of the pattern that this capture was defined.
68
        pattern_id: PatternID,
69
        /// The capture group index that this capture state corresponds to.
70
        /// The capture group index is always relative to its corresponding
71
        /// pattern. Therefore, in the presence of multiple patterns, both the
72
        /// pattern ID and the capture group index are required to uniquely
73
        /// identify a capturing group.
74
        group_index: SmallIndex,
75
        /// The next state that this state should transition to.
76
        next: StateID,
77
    },
78
    /// An empty state that records the end of a capture location. This is an
79
    /// unconditional epsilon transition like `Empty`, except it can be used to
80
    /// record position information for a capture group when using the NFA for
81
    /// search.
82
    CaptureEnd {
83
        /// The ID of the pattern that this capture was defined.
84
        pattern_id: PatternID,
85
        /// The capture group index that this capture state corresponds to.
86
        /// The capture group index is always relative to its corresponding
87
        /// pattern. Therefore, in the presence of multiple patterns, both the
88
        /// pattern ID and the capture group index are required to uniquely
89
        /// identify a capturing group.
90
        group_index: SmallIndex,
91
        /// The next state that this state should transition to.
92
        next: StateID,
93
    },
94
    /// An alternation such that there exists an epsilon transition to all
95
    /// states in `alternates`, where matches found via earlier transitions
96
    /// are preferred over later transitions.
97
    Union { alternates: Vec<StateID> },
98
    /// An alternation such that there exists an epsilon transition to all
99
    /// states in `alternates`, where matches found via later transitions are
100
    /// preferred over earlier transitions.
101
    ///
102
    /// This "reverse" state exists for convenience during compilation that
103
    /// permits easy construction of non-greedy combinations of NFA states. At
104
    /// the end of compilation, Union and UnionReverse states are merged into
105
    /// one Union type of state, where the latter has its epsilon transitions
106
    /// reversed to reflect the priority inversion.
107
    ///
108
    /// The "convenience" here arises from the fact that as new states are
109
    /// added to the list of `alternates`, we would like that add operation
110
    /// to be amortized constant time. But if we used a `Union`, we'd need to
111
    /// prepend the state, which takes O(n) time. There are other approaches we
112
    /// could use to solve this, but this seems simple enough.
113
    UnionReverse { alternates: Vec<StateID> },
114
    /// A state that cannot be transitioned out of. This is useful for cases
115
    /// where you want to prevent matching from occurring. For example, if your
116
    /// regex parser permits empty character classes, then one could choose a
117
    /// `Fail` state to represent it.
118
    Fail,
119
    /// A match state. There is at most one such occurrence of this state in
120
    /// an NFA for each pattern compiled into the NFA. At time of writing, a
121
    /// match state is always produced for every pattern given, but in theory,
122
    /// if a pattern can never lead to a match, then the match state could be
123
    /// omitted.
124
    ///
125
    /// `pattern_id` refers to the ID of the pattern itself, which corresponds
126
    /// to the pattern's index (starting at 0).
127
    Match { pattern_id: PatternID },
128
}
129
130
impl State {
131
    /// If this state is an unconditional epsilon transition, then this returns
132
    /// the target of the transition.
133
26.5M
    fn goto(&self) -> Option<StateID> {
134
6.28M
        match *self {
135
8.80M
            State::Empty { next } => Some(next),
136
6.28M
            State::Union { ref alternates } if alternates.len() == 1 => {
137
308k
                Some(alternates[0])
138
            }
139
758k
            State::UnionReverse { ref alternates }
140
758k
                if alternates.len() == 1 =>
141
            {
142
0
                Some(alternates[0])
143
            }
144
17.4M
            _ => None,
145
        }
146
26.5M
    }
147
148
    /// Returns the heap memory usage, in bytes, of this state.
149
182M
    fn memory_usage(&self) -> usize {
150
182M
        match *self {
151
            State::Empty { .. }
152
            | State::ByteRange { .. }
153
            | State::Look { .. }
154
            | State::CaptureStart { .. }
155
            | State::CaptureEnd { .. }
156
            | State::Fail
157
139M
            | State::Match { .. } => 0,
158
29.4M
            State::Sparse { ref transitions } => {
159
29.4M
                transitions.len() * mem::size_of::<Transition>()
160
            }
161
10.2M
            State::Union { ref alternates } => {
162
10.2M
                alternates.len() * mem::size_of::<StateID>()
163
            }
164
2.75M
            State::UnionReverse { ref alternates } => {
165
2.75M
                alternates.len() * mem::size_of::<StateID>()
166
            }
167
        }
168
182M
    }
169
}
170
171
/// An abstraction for building Thompson NFAs by hand.
172
///
173
/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler)
174
/// uses internally to translate a regex's high-level intermediate
175
/// representation into an [`NFA`].
176
///
177
/// The primary function of this builder is to abstract away the internal
178
/// representation of an NFA and make it difficult to produce NFAs are that
179
/// internally invalid or inconsistent. This builder also provides a way to
180
/// add "empty" states (which can be thought of as unconditional epsilon
181
/// transitions), despite the fact that [`thompson::State`](nfa::State) does
182
/// not have any "empty" representation. The advantage of "empty" states is
183
/// that they make the code for constructing a Thompson NFA logically simpler.
184
///
185
/// Many of the routines on this builder may panic or return errors. Generally
186
/// speaking, panics occur when an invalid sequence of method calls were made,
187
/// where as an error occurs if things get too big. (Where "too big" might mean
188
/// exhausting identifier space or using up too much heap memory in accordance
189
/// with the configured [`size_limit`](Builder::set_size_limit).)
190
///
191
/// # Overview
192
///
193
/// ## Adding multiple patterns
194
///
195
/// Each pattern you add to an NFA should correspond to a pair of
196
/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with
197
/// calls inbetween that add NFA states for that pattern. NFA states may be
198
/// added without first calling `start_pattern`, with the exception of adding
199
/// capturing states.
200
///
201
/// ## Adding NFA states
202
///
203
/// Here is a very brief overview of each of the methods that add NFA states.
204
/// Every method adds a single state.
205
///
206
/// * [`add_empty`](Builder::add_empty): Add a state with a single
207
/// unconditional epsilon transition to another state.
208
/// * [`add_union`](Builder::add_union): Adds a state with unconditional
209
/// epsilon transitions to two or more states, with earlier transitions
210
/// preferred over later ones.
211
/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with
212
/// unconditional epsilon transitions to two or more states, with later
213
/// transitions preferred over earlier ones.
214
/// * [`add_range`](Builder::add_range): Adds a state with a single transition
215
/// to another state that can only be followed if the current input byte is
216
/// within the range given.
217
/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more
218
/// range transitions to other states, where a transition is only followed
219
/// if the current input byte is within one of the ranges. All transitions
220
/// in this state have equal priority, and the corresponding ranges must be
221
/// non-overlapping.
222
/// * [`add_look`](Builder::add_look): Adds a state with a single *conditional*
223
/// epsilon transition to another state, where the condition depends on a
224
/// limited look-around property.
225
/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with
226
/// a single unconditional epsilon transition that also instructs an NFA
227
/// simulation to record the current input position to a specific location in
228
/// memory. This is intended to represent the starting location of a capturing
229
/// group.
230
/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with
231
/// a single unconditional epsilon transition that also instructs an NFA
232
/// simulation to record the current input position to a specific location in
233
/// memory. This is intended to represent the ending location of a capturing
234
/// group.
235
/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to
236
/// another state.
237
/// * [`add_match`](Builder::add_match): Add a state that indicates a match has
238
/// been found for a particular pattern. A match state is a final state with
239
/// no outgoing transitions.
240
///
241
/// ## Setting transitions between NFA states
242
///
243
/// The [`Builder::patch`] method creates a transition from one state to the
244
/// next. If the `from` state corresponds to a state that supports multiple
245
/// outgoing transitions (such as "union"), then this adds the corresponding
246
/// transition. Otherwise, it sets the single transition. (This routine panics
247
/// if `from` corresponds to a state added by `add_sparse`, since sparse states
248
/// need more specialized handling.)
249
///
250
/// # Example
251
///
252
/// This annotated example shows how to hand construct the regex `[a-z]+`
253
/// (without an unanchored prefix).
254
///
255
/// ```
256
/// use regex_automata::{
257
///     nfa::thompson::{pikevm::PikeVM, Builder, Transition},
258
///     util::primitives::StateID,
259
///     Match,
260
/// };
261
///
262
/// let mut builder = Builder::new();
263
/// // Before adding NFA states for our pattern, we need to tell the builder
264
/// // that we are starting the pattern.
265
/// builder.start_pattern()?;
266
/// // Since we use the Pike VM below for searching, we need to add capturing
267
/// // states. If you're just going to build a DFA from the NFA, then capturing
268
/// // states do not need to be added.
269
/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
270
/// let range = builder.add_range(Transition {
271
///     // We don't know the state ID of the 'next' state yet, so we just fill
272
///     // in a dummy 'ZERO' value.
273
///     start: b'a', end: b'z', next: StateID::ZERO,
274
/// })?;
275
/// // This state will point back to 'range', but also enable us to move ahead.
276
/// // That is, this implements the '+' repetition operator. We add 'range' and
277
/// // then 'end' below to this alternation.
278
/// let alt = builder.add_union(vec![])?;
279
/// // The final state before the match state, which serves to capture the
280
/// // end location of the match.
281
/// let end = builder.add_capture_end(StateID::ZERO, 0)?;
282
/// // The match state for our pattern.
283
/// let mat = builder.add_match()?;
284
/// // Now we fill in the transitions between states.
285
/// builder.patch(start, range)?;
286
/// builder.patch(range, alt)?;
287
/// // If we added 'end' before 'range', then we'd implement non-greedy
288
/// // matching, i.e., '+?'.
289
/// builder.patch(alt, range)?;
290
/// builder.patch(alt, end)?;
291
/// builder.patch(end, mat)?;
292
/// // We must explicitly finish pattern and provide the starting state ID for
293
/// // this particular pattern.
294
/// builder.finish_pattern(start)?;
295
/// // Finally, when we build the NFA, we provide the anchored and unanchored
296
/// // starting state IDs. Since we didn't bother with an unanchored prefix
297
/// // here, we only support anchored searching. Thus, both starting states are
298
/// // the same.
299
/// let nfa = builder.build(start, start)?;
300
///
301
/// // Now build a Pike VM from our NFA, and use it for searching. This shows
302
/// // how we can use a regex engine without ever worrying about syntax!
303
/// let re = PikeVM::new_from_nfa(nfa)?;
304
/// let mut cache = re.create_cache();
305
/// let mut caps = re.create_captures();
306
/// let expected = Some(Match::must(0, 0..3));
307
/// re.captures(&mut cache, "foo0", &mut caps);
308
/// assert_eq!(expected, caps.get_match());
309
///
310
/// # Ok::<(), Box<dyn std::error::Error>>(())
311
/// ```
312
#[derive(Clone, Debug, Default)]
313
pub struct Builder {
314
    /// The ID of the pattern that we're currently building.
315
    ///
316
    /// Callers are required to set (and unset) this by calling
317
    /// {start,finish}_pattern. Otherwise, most methods will panic.
318
    pattern_id: Option<PatternID>,
319
    /// A sequence of intermediate NFA states. Once a state is added to this
320
    /// sequence, it is assigned a state ID equivalent to its index. Once a
321
    /// state is added, it is still expected to be mutated, e.g., to set its
322
    /// transition to a state that didn't exist at the time it was added.
323
    states: Vec<State>,
324
    /// The starting states for each individual pattern. Starting at any
325
    /// of these states will result in only an anchored search for the
326
    /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
327
    /// contains a single regex, then `start_pattern[0]` and `start_anchored`
328
    /// are always equivalent.
329
    start_pattern: Vec<StateID>,
330
    /// A map from pattern ID to capture group index to name. (If no name
331
    /// exists, then a None entry is present. Thus, all capturing groups are
332
    /// present in this mapping.)
333
    ///
334
    /// The outer vec is indexed by pattern ID, while the inner vec is indexed
335
    /// by capture index offset for the corresponding pattern.
336
    ///
337
    /// The first capture group for each pattern is always unnamed and is thus
338
    /// always None.
339
    captures: Vec<Vec<Option<Arc<str>>>>,
340
    /// The combined memory used by each of the 'State's in 'states'. This
341
    /// only includes heap usage by each state, and not the size of the state
342
    /// itself. In other words, this tracks heap memory used that isn't
343
    /// captured via `size_of::<State>() * states.len()`.
344
    memory_states: usize,
345
    /// Whether this NFA only matches UTF-8 and whether regex engines using
346
    /// this NFA for searching should report empty matches that split a
347
    /// codepoint.
348
    utf8: bool,
349
    /// Whether this NFA should be matched in reverse or not.
350
    reverse: bool,
351
    /// The matcher to use for look-around assertions.
352
    look_matcher: LookMatcher,
353
    /// A size limit to respect when building an NFA. If the total heap memory
354
    /// of the intermediate NFA states exceeds (or would exceed) this amount,
355
    /// then an error is returned.
356
    size_limit: Option<usize>,
357
}
358
359
impl Builder {
360
    /// Create a new builder for hand-assembling NFAs.
361
501k
    pub fn new() -> Builder {
362
501k
        Builder::default()
363
501k
    }
364
365
    /// Clear this builder.
366
    ///
367
    /// Clearing removes all state associated with building an NFA, but does
368
    /// not reset configuration (such as size limits and whether the NFA
369
    /// should only match UTF-8). After clearing, the builder can be reused to
370
    /// assemble an entirely new NFA.
371
137k
    pub fn clear(&mut self) {
372
137k
        self.pattern_id = None;
373
137k
        self.states.clear();
374
137k
        self.start_pattern.clear();
375
137k
        self.captures.clear();
376
137k
        self.memory_states = 0;
377
137k
    }
378
379
    /// Assemble a [`NFA`] from the states added so far.
380
    ///
381
    /// After building an NFA, more states may be added and `build` may be
382
    /// called again. To reuse a builder to produce an entirely new NFA from
383
    /// scratch, call the [`clear`](Builder::clear) method first.
384
    ///
385
    /// `start_anchored` refers to the ID of the starting state that anchored
386
    /// searches should use. That is, searches who matches are limited to the
387
    /// starting position of the search.
388
    ///
389
    /// `start_unanchored` refers to the ID of the starting state that
390
    /// unanchored searches should use. This permits searches to report matches
391
    /// that start after the beginning of the search. In cases where unanchored
392
    /// searches are not supported, the unanchored starting state ID must be
393
    /// the same as the anchored starting state ID.
394
    ///
395
    /// # Errors
396
    ///
397
    /// This returns an error if there was a problem producing the final NFA.
398
    /// In particular, this might include an error if the capturing groups
399
    /// added to this builder violate any of the invariants documented on
400
    /// [`GroupInfo`](crate::util::captures::GroupInfo).
401
    ///
402
    /// # Panics
403
    ///
404
    /// If `start_pattern` was called, then `finish_pattern` must be called
405
    /// before `build`, otherwise this panics.
406
    ///
407
    /// This may panic for other invalid uses of a builder. For example, if
408
    /// a "start capture" state was added without a corresponding "end capture"
409
    /// state.
410
135k
    pub fn build(
411
135k
        &self,
412
135k
        start_anchored: StateID,
413
135k
        start_unanchored: StateID,
414
135k
    ) -> Result<NFA, BuildError> {
415
135k
        assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
416
135k
        debug!(
417
0
            "intermediate NFA compilation via builder is complete, \
418
0
             intermediate NFA size: {} states, {} bytes on heap",
419
0
            self.states.len(),
420
0
            self.memory_usage(),
421
        );
422
423
135k
        let mut nfa = nfa::Inner::default();
424
135k
        nfa.set_utf8(self.utf8);
425
135k
        nfa.set_reverse(self.reverse);
426
135k
        nfa.set_look_matcher(self.look_matcher.clone());
427
        // A set of compiler internal state IDs that correspond to states
428
        // that are exclusively epsilon transitions, i.e., goto instructions,
429
        // combined with the state that they point to. This is used to
430
        // record said states while transforming the compiler's internal NFA
431
        // representation to the external form.
432
135k
        let mut empties = vec![];
433
        // A map used to re-map state IDs when translating this builder's
434
        // internal NFA state representation to the final NFA representation.
435
135k
        let mut remap = vec![];
436
135k
        remap.resize(self.states.len(), StateID::ZERO);
437
438
135k
        nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
439
135k
        nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
440
        // The idea here is to convert our intermediate states to their final
441
        // form. The only real complexity here is the process of converting
442
        // transitions, which are expressed in terms of state IDs. The new
443
        // set of states will be smaller because of partial epsilon removal,
444
        // so the state IDs will not be the same.
445
129M
        for (sid, state) in self.states.iter().with_state_ids() {
446
129M
            match *state {
447
10.5M
                State::Empty { next } => {
448
10.5M
                    // Since we're removing empty states, we need to handle
449
10.5M
                    // them later since we don't yet know which new state this
450
10.5M
                    // empty state will be mapped to.
451
10.5M
                    empties.push((sid, next));
452
10.5M
                }
453
84.3M
                State::ByteRange { trans } => {
454
84.3M
                    remap[sid] = nfa.add(nfa::State::ByteRange { trans });
455
84.3M
                }
456
16.4M
                State::Sparse { ref transitions } => {
457
16.4M
                    remap[sid] = match transitions.len() {
458
345k
                        0 => nfa.add(nfa::State::Fail),
459
8.56M
                        1 => nfa.add(nfa::State::ByteRange {
460
8.56M
                            trans: transitions[0],
461
8.56M
                        }),
462
                        _ => {
463
7.57M
                            let transitions =
464
7.57M
                                transitions.to_vec().into_boxed_slice();
465
7.57M
                            let sparse = SparseTransitions { transitions };
466
7.57M
                            nfa.add(nfa::State::Sparse(sparse))
467
                        }
468
                    }
469
                }
470
5.38M
                State::Look { look, next } => {
471
5.38M
                    remap[sid] = nfa.add(nfa::State::Look { look, next });
472
5.38M
                }
473
2.00M
                State::CaptureStart { pattern_id, group_index, next } => {
474
2.00M
                    // We can't remove this empty state because of the side
475
2.00M
                    // effect of capturing an offset for this capture slot.
476
2.00M
                    let slot = nfa
477
2.00M
                        .group_info()
478
2.00M
                        .slot(pattern_id, group_index.as_usize())
479
2.00M
                        .expect("invalid capture index");
480
2.00M
                    let slot =
481
2.00M
                        SmallIndex::new(slot).expect("a small enough slot");
482
2.00M
                    remap[sid] = nfa.add(nfa::State::Capture {
483
2.00M
                        next,
484
2.00M
                        pattern_id,
485
2.00M
                        group_index,
486
2.00M
                        slot,
487
2.00M
                    });
488
2.00M
                }
489
2.00M
                State::CaptureEnd { pattern_id, group_index, next } => {
490
2.00M
                    // We can't remove this empty state because of the side
491
2.00M
                    // effect of capturing an offset for this capture slot.
492
2.00M
                    // Also, this always succeeds because we check that all
493
2.00M
                    // slot indices are valid for all capture indices when they
494
2.00M
                    // are initially added.
495
2.00M
                    let slot = nfa
496
2.00M
                        .group_info()
497
2.00M
                        .slot(pattern_id, group_index.as_usize())
498
2.00M
                        .expect("invalid capture index")
499
2.00M
                        .checked_add(1)
500
2.00M
                        .unwrap();
501
2.00M
                    let slot =
502
2.00M
                        SmallIndex::new(slot).expect("a small enough slot");
503
2.00M
                    remap[sid] = nfa.add(nfa::State::Capture {
504
2.00M
                        next,
505
2.00M
                        pattern_id,
506
2.00M
                        group_index,
507
2.00M
                        slot,
508
2.00M
                    });
509
2.00M
                }
510
7.46M
                State::Union { ref alternates } => {
511
7.46M
                    if alternates.is_empty() {
512
0
                        remap[sid] = nfa.add(nfa::State::Fail);
513
7.46M
                    } else if alternates.len() == 1 {
514
917k
                        empties.push((sid, alternates[0]));
515
917k
                        remap[sid] = alternates[0];
516
6.54M
                    } else if alternates.len() == 2 {
517
4.17M
                        remap[sid] = nfa.add(nfa::State::BinaryUnion {
518
4.17M
                            alt1: alternates[0],
519
4.17M
                            alt2: alternates[1],
520
4.17M
                        });
521
4.17M
                    } else {
522
2.37M
                        let alternates =
523
2.37M
                            alternates.to_vec().into_boxed_slice();
524
2.37M
                        remap[sid] = nfa.add(nfa::State::Union { alternates });
525
2.37M
                    }
526
                }
527
1.07M
                State::UnionReverse { ref alternates } => {
528
1.07M
                    if alternates.is_empty() {
529
0
                        remap[sid] = nfa.add(nfa::State::Fail);
530
1.07M
                    } else if alternates.len() == 1 {
531
0
                        empties.push((sid, alternates[0]));
532
0
                        remap[sid] = alternates[0];
533
1.07M
                    } else if alternates.len() == 2 {
534
1.07M
                        remap[sid] = nfa.add(nfa::State::BinaryUnion {
535
1.07M
                            alt1: alternates[1],
536
1.07M
                            alt2: alternates[0],
537
1.07M
                        });
538
1.07M
                    } else {
539
0
                        let mut alternates =
540
0
                            alternates.to_vec().into_boxed_slice();
541
0
                        alternates.reverse();
542
0
                        remap[sid] = nfa.add(nfa::State::Union { alternates });
543
0
                    }
544
                }
545
0
                State::Fail => {
546
0
                    remap[sid] = nfa.add(nfa::State::Fail);
547
0
                }
548
135k
                State::Match { pattern_id } => {
549
135k
                    remap[sid] = nfa.add(nfa::State::Match { pattern_id });
550
135k
                }
551
            }
552
        }
553
        // Some of the new states still point to empty state IDs, so we need to
554
        // follow each of them and remap the empty state IDs to their non-empty
555
        // state IDs.
556
        //
557
        // We also keep track of which states we've already mapped. This helps
558
        // avoid quadratic behavior in a long chain of empty states. For
559
        // example, in 'a{0}{50000}'.
560
135k
        let mut remapped = vec![false; self.states.len()];
561
11.4M
        for &(empty_id, empty_next) in empties.iter() {
562
11.4M
            if remapped[empty_id] {
563
2.71M
                continue;
564
8.71M
            }
565
            // empty states can point to other empty states, forming a chain.
566
            // So we must follow the chain until the end, which must end at
567
            // a non-empty state, and therefore, a state that is correctly
568
            // remapped. We are guaranteed to terminate because our compiler
569
            // never builds a loop among only empty states.
570
8.71M
            let mut new_next = empty_next;
571
13.2M
            while let Some(next) = self.states[new_next].goto() {
572
4.55M
                new_next = next;
573
4.55M
            }
574
8.71M
            remap[empty_id] = remap[new_next];
575
8.71M
            remapped[empty_id] = true;
576
577
            // Now that we've remapped the main 'empty_id' above, we re-follow
578
            // the chain from above and remap every empty state we found along
579
            // the way to our ultimate non-empty target. We are careful to set
580
            // 'remapped' to true for each such state. We thus will not need
581
            // to re-compute this chain for any subsequent empty states in
582
            // 'empties' that are part of this chain.
583
8.71M
            let mut next2 = empty_next;
584
13.2M
            while let Some(next) = self.states[next2].goto() {
585
4.55M
                remap[next2] = remap[new_next];
586
4.55M
                remapped[next2] = true;
587
4.55M
                next2 = next;
588
4.55M
            }
589
        }
590
        // Finally remap all of the state IDs.
591
135k
        nfa.remap(&remap);
592
135k
        let final_nfa = nfa.into_nfa();
593
135k
        debug!(
594
0
            "NFA compilation via builder complete, \
595
0
             final NFA size: {} states, {} bytes on heap, \
596
0
             has empty? {:?}, utf8? {:?}",
597
0
            final_nfa.states().len(),
598
0
            final_nfa.memory_usage(),
599
0
            final_nfa.has_empty(),
600
0
            final_nfa.is_utf8(),
601
        );
602
135k
        Ok(final_nfa)
603
135k
    }
604
605
    /// Start the assembly of a pattern in this NFA.
606
    ///
607
    /// Upon success, this returns the identifier for the new pattern.
608
    /// Identifiers start at `0` and are incremented by 1 for each new pattern.
609
    ///
610
    /// It is necessary to call this routine before adding capturing states.
611
    /// Otherwise, any other NFA state may be added before starting a pattern.
612
    ///
613
    /// # Errors
614
    ///
615
    /// If the pattern identifier space is exhausted, then this returns an
616
    /// error.
617
    ///
618
    /// # Panics
619
    ///
620
    /// If this is called while assembling another pattern (i.e., before
621
    /// `finish_pattern` is called), then this panics.
622
137k
    pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> {
623
137k
        assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
624
625
137k
        let proposed = self.start_pattern.len();
626
137k
        let pid = PatternID::new(proposed)
627
137k
            .map_err(|_| BuildError::too_many_patterns(proposed))?;
628
137k
        self.pattern_id = Some(pid);
629
        // This gets filled in when 'finish_pattern' is called.
630
137k
        self.start_pattern.push(StateID::ZERO);
631
137k
        Ok(pid)
632
137k
    }
633
634
    /// Finish the assembly of a pattern in this NFA.
635
    ///
636
    /// Upon success, this returns the identifier for the new pattern.
637
    /// Identifiers start at `0` and are incremented by 1 for each new
638
    /// pattern. This is the same identifier returned by the corresponding
639
    /// `start_pattern` call.
640
    ///
641
    /// Note that `start_pattern` and `finish_pattern` pairs cannot be
642
    /// interleaved or nested. A correct `finish_pattern` call _always_
643
    /// corresponds to the most recently called `start_pattern` routine.
644
    ///
645
    /// # Errors
646
    ///
647
    /// This currently never returns an error, but this is subject to change.
648
    ///
649
    /// # Panics
650
    ///
651
    /// If this is called without a corresponding `start_pattern` call, then
652
    /// this panics.
653
135k
    pub fn finish_pattern(
654
135k
        &mut self,
655
135k
        start_id: StateID,
656
135k
    ) -> Result<PatternID, BuildError> {
657
135k
        let pid = self.current_pattern_id();
658
135k
        self.start_pattern[pid] = start_id;
659
135k
        self.pattern_id = None;
660
135k
        Ok(pid)
661
135k
    }
662
663
    /// Returns the pattern identifier of the current pattern.
664
    ///
665
    /// # Panics
666
    ///
667
    /// If this doesn't occur after a `start_pattern` call and before the
668
    /// corresponding `finish_pattern` call, then this panics.
669
5.82M
    pub fn current_pattern_id(&self) -> PatternID {
670
5.82M
        self.pattern_id.expect("must call 'start_pattern' first")
671
5.82M
    }
672
673
    /// Returns the number of patterns added to this builder so far.
674
    ///
675
    /// This only includes patterns that have had `finish_pattern` called
676
    /// for them.
677
0
    pub fn pattern_len(&self) -> usize {
678
0
        self.start_pattern.len()
679
0
    }
680
681
    /// Add an "empty" NFA state.
682
    ///
683
    /// An "empty" NFA state is a state with a single unconditional epsilon
684
    /// transition to another NFA state. Such empty states are removed before
685
    /// building the final [`NFA`] (which has no such "empty" states), but they
686
    /// can be quite useful in the construction process of an NFA.
687
    ///
688
    /// # Errors
689
    ///
690
    /// This returns an error if the state identifier space is exhausted, or if
691
    /// the configured heap size limit has been exceeded.
692
14.9M
    pub fn add_empty(&mut self) -> Result<StateID, BuildError> {
693
14.9M
        self.add(State::Empty { next: StateID::ZERO })
694
14.9M
    }
695
696
    /// Add a "union" NFA state.
697
    ///
698
    /// A "union" NFA state that contains zero or more unconditional epsilon
699
    /// transitions to other NFA states. The order of these transitions
700
    /// reflects a priority order where earlier transitions are preferred over
701
    /// later transitions.
702
    ///
703
    /// Callers may provide an empty set of alternates to this method call, and
704
    /// then later add transitions via `patch`. At final build time, a "union"
705
    /// state with no alternates is converted to a "fail" state, and a "union"
706
    /// state with exactly one alternate is treated as if it were an "empty"
707
    /// state.
708
    ///
709
    /// # Errors
710
    ///
711
    /// This returns an error if the state identifier space is exhausted, or if
712
    /// the configured heap size limit has been exceeded.
713
10.2M
    pub fn add_union(
714
10.2M
        &mut self,
715
10.2M
        alternates: Vec<StateID>,
716
10.2M
    ) -> Result<StateID, BuildError> {
717
10.2M
        self.add(State::Union { alternates })
718
10.2M
    }
719
720
    /// Add a "reverse union" NFA state.
721
    ///
722
    /// A "reverse union" NFA state contains zero or more unconditional epsilon
723
    /// transitions to other NFA states. The order of these transitions
724
    /// reflects a priority order where later transitions are preferred
725
    /// over earlier transitions. This is an inverted priority order when
726
    /// compared to `add_union`. This is useful, for example, for implementing
727
    /// non-greedy repetition operators.
728
    ///
729
    /// Callers may provide an empty set of alternates to this method call, and
730
    /// then later add transitions via `patch`. At final build time, a "reverse
731
    /// union" state with no alternates is converted to a "fail" state, and a
732
    /// "reverse union" state with exactly one alternate is treated as if it
733
    /// were an "empty" state.
734
    ///
735
    /// # Errors
736
    ///
737
    /// This returns an error if the state identifier space is exhausted, or if
738
    /// the configured heap size limit has been exceeded.
739
2.75M
    pub fn add_union_reverse(
740
2.75M
        &mut self,
741
2.75M
        alternates: Vec<StateID>,
742
2.75M
    ) -> Result<StateID, BuildError> {
743
2.75M
        self.add(State::UnionReverse { alternates })
744
2.75M
    }
745
746
    /// Add a "range" NFA state.
747
    ///
748
    /// A "range" NFA state is a state with one outgoing transition to another
749
    /// state, where that transition may only be followed if the current input
750
    /// byte falls between a range of bytes given.
751
    ///
752
    /// # Errors
753
    ///
754
    /// This returns an error if the state identifier space is exhausted, or if
755
    /// the configured heap size limit has been exceeded.
756
113M
    pub fn add_range(
757
113M
        &mut self,
758
113M
        trans: Transition,
759
113M
    ) -> Result<StateID, BuildError> {
760
113M
        self.add(State::ByteRange { trans })
761
113M
    }
762
763
    /// Add a "sparse" NFA state.
764
    ///
765
    /// A "sparse" NFA state contains zero or more outgoing transitions, where
766
    /// the transition to be followed (if any) is chosen based on whether the
767
    /// current input byte falls in the range of one such transition. The
768
    /// transitions given *must* be non-overlapping and in ascending order. (A
769
    /// "sparse" state with no transitions is equivalent to a "fail" state.)
770
    ///
771
    /// A "sparse" state is like adding a "union" state and pointing it at a
772
    /// bunch of "range" states, except that the different alternates have
773
    /// equal priority.
774
    ///
775
    /// Note that a "sparse" state is the only state that cannot be patched.
776
    /// This is because a "sparse" state has many transitions, each of which
777
    /// may point to a different NFA state. Moreover, adding more such
778
    /// transitions requires more than just an NFA state ID to point to. It
779
    /// also requires a byte range. The `patch` routine does not support the
780
    /// additional information required. Therefore, callers must ensure that
781
    /// all outgoing transitions for this state are included when `add_sparse`
782
    /// is called. There is no way to add more later.
783
    ///
784
    /// # Errors
785
    ///
786
    /// This returns an error if the state identifier space is exhausted, or if
787
    /// the configured heap size limit has been exceeded.
788
    ///
789
    /// # Panics
790
    ///
791
    /// This routine _may_ panic if the transitions given overlap or are not
792
    /// in ascending order.
793
29.4M
    pub fn add_sparse(
794
29.4M
        &mut self,
795
29.4M
        transitions: Vec<Transition>,
796
29.4M
    ) -> Result<StateID, BuildError> {
797
29.4M
        self.add(State::Sparse { transitions })
798
29.4M
    }
799
800
    /// Add a "look" NFA state.
801
    ///
802
    /// A "look" NFA state corresponds to a state with exactly one
803
    /// *conditional* epsilon transition to another NFA state. Namely, it
804
    /// represents one of a small set of simplistic look-around operators.
805
    ///
806
    /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
807
    /// and then change it later with [`patch`](Builder::patch).
808
    ///
809
    /// # Errors
810
    ///
811
    /// This returns an error if the state identifier space is exhausted, or if
812
    /// the configured heap size limit has been exceeded.
813
5.54M
    pub fn add_look(
814
5.54M
        &mut self,
815
5.54M
        next: StateID,
816
5.54M
        look: Look,
817
5.54M
    ) -> Result<StateID, BuildError> {
818
5.54M
        self.add(State::Look { look, next })
819
5.54M
    }
820
821
    /// Add a "start capture" NFA state.
822
    ///
823
    /// A "start capture" NFA state corresponds to a state with exactly one
824
    /// outgoing unconditional epsilon transition to another state. Unlike
825
    /// "empty" states, a "start capture" state also carries with it an
826
    /// instruction for saving the current position of input to a particular
827
    /// location in memory. NFA simulations, like the Pike VM, may use this
828
    /// information to report the match locations of capturing groups in a
829
    /// regex pattern.
830
    ///
831
    /// If the corresponding capturing group has a name, then callers should
832
    /// include it here.
833
    ///
834
    /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
835
    /// and then change it later with [`patch`](Builder::patch).
836
    ///
837
    /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
838
    /// end states may be interleaved. Indeed, it is typical for many "start
839
    /// capture" NFA states to appear before the first "end capture" state.
840
    ///
841
    /// # Errors
842
    ///
843
    /// This returns an error if the state identifier space is exhausted, or if
844
    /// the configured heap size limit has been exceeded or if the given
845
    /// capture index overflows `usize`.
846
    ///
847
    /// While the above are the only conditions in which this routine can
848
    /// currently return an error, it is possible to call this method with an
849
    /// inputs that results in the final `build()` step failing to produce an
850
    /// NFA. For example, if one adds two distinct capturing groups with the
851
    /// same name, then that will result in `build()` failing with an error.
852
    ///
853
    /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
854
    /// more information on what qualifies as valid capturing groups.
855
    ///
856
    /// # Example
857
    ///
858
    /// This example shows that an error occurs when one tries to add multiple
859
    /// capturing groups with the same name to the same pattern.
860
    ///
861
    /// ```
862
    /// use regex_automata::{
863
    ///     nfa::thompson::Builder,
864
    ///     util::primitives::StateID,
865
    /// };
866
    ///
867
    /// let name = Some(std::sync::Arc::from("foo"));
868
    /// let mut builder = Builder::new();
869
    /// builder.start_pattern()?;
870
    /// // 0th capture group should always be unnamed.
871
    /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
872
    /// // OK
873
    /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?;
874
    /// // This is not OK, but 'add_capture_start' still succeeds. We don't
875
    /// // get an error until we call 'build' below. Without this call, the
876
    /// // call to 'build' below would succeed.
877
    /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?;
878
    /// // Finish our pattern so we can try to build the NFA.
879
    /// builder.finish_pattern(start)?;
880
    /// let result = builder.build(start, start);
881
    /// assert!(result.is_err());
882
    ///
883
    /// # Ok::<(), Box<dyn std::error::Error>>(())
884
    /// ```
885
    ///
886
    /// However, adding multiple capturing groups with the same name to
887
    /// distinct patterns is okay:
888
    ///
889
    /// ```
890
    /// use std::sync::Arc;
891
    ///
892
    /// use regex_automata::{
893
    ///     nfa::thompson::{pikevm::PikeVM, Builder, Transition},
894
    ///     util::{
895
    ///         captures::Captures,
896
    ///         primitives::{PatternID, StateID},
897
    ///     },
898
    ///     Span,
899
    /// };
900
    ///
901
    /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
902
    /// let mut builder = Builder::new();
903
    /// // We compile them to support an unanchored search, which requires
904
    /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern.
905
    /// let unanchored_prefix = builder.add_union_reverse(vec![])?;
906
    /// let any = builder.add_range(Transition {
907
    ///     start: b'\x00', end: b'\xFF', next: StateID::ZERO,
908
    /// })?;
909
    /// builder.patch(unanchored_prefix, any)?;
910
    /// builder.patch(any, unanchored_prefix)?;
911
    ///
912
    /// // Compile an alternation that permits matching multiple patterns.
913
    /// let alt = builder.add_union(vec![])?;
914
    /// builder.patch(unanchored_prefix, alt)?;
915
    ///
916
    /// // Compile '(?P<foo>[a-z]+)'.
917
    /// builder.start_pattern()?;
918
    /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
919
    /// // N.B. 0th capture group must always be unnamed.
920
    /// let foo_start0 = builder.add_capture_start(
921
    ///     StateID::ZERO, 1, Some(Arc::from("foo")),
922
    /// )?;
923
    /// let lowercase = builder.add_range(Transition {
924
    ///     start: b'a', end: b'z', next: StateID::ZERO,
925
    /// })?;
926
    /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?;
927
    /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
928
    /// let match0 = builder.add_match()?;
929
    /// builder.patch(start0, foo_start0)?;
930
    /// builder.patch(foo_start0, lowercase)?;
931
    /// builder.patch(lowercase, foo_end0)?;
932
    /// builder.patch(foo_end0, end0)?;
933
    /// builder.patch(end0, match0)?;
934
    /// builder.finish_pattern(start0)?;
935
    ///
936
    /// // Compile '(?P<foo>[A-Z]+)'.
937
    /// builder.start_pattern()?;
938
    /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
939
    /// // N.B. 0th capture group must always be unnamed.
940
    /// let foo_start1 = builder.add_capture_start(
941
    ///     StateID::ZERO, 1, Some(Arc::from("foo")),
942
    /// )?;
943
    /// let uppercase = builder.add_range(Transition {
944
    ///     start: b'A', end: b'Z', next: StateID::ZERO,
945
    /// })?;
946
    /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?;
947
    /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
948
    /// let match1 = builder.add_match()?;
949
    /// builder.patch(start1, foo_start1)?;
950
    /// builder.patch(foo_start1, uppercase)?;
951
    /// builder.patch(uppercase, foo_end1)?;
952
    /// builder.patch(foo_end1, end1)?;
953
    /// builder.patch(end1, match1)?;
954
    /// builder.finish_pattern(start1)?;
955
    ///
956
    /// // Now add the patterns to our alternation that we started above.
957
    /// builder.patch(alt, start0)?;
958
    /// builder.patch(alt, start1)?;
959
    ///
960
    /// // Finally build the NFA. The first argument is the anchored starting
961
    /// // state (the pattern alternation) where as the second is the
962
    /// // unanchored starting state (the unanchored prefix).
963
    /// let nfa = builder.build(alt, unanchored_prefix)?;
964
    ///
965
    /// // Now build a Pike VM from our NFA and access the 'foo' capture
966
    /// // group regardless of which pattern matched, since it is defined
967
    /// // for both patterns.
968
    /// let vm = PikeVM::new_from_nfa(nfa)?;
969
    /// let mut cache = vm.create_cache();
970
    /// let caps: Vec<Captures> =
971
    ///     vm.captures_iter(&mut cache, "0123aAaAA").collect();
972
    /// assert_eq!(5, caps.len());
973
    ///
974
    /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern());
975
    /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo"));
976
    ///
977
    /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern());
978
    /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo"));
979
    ///
980
    /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern());
981
    /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo"));
982
    ///
983
    /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern());
984
    /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo"));
985
    ///
986
    /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern());
987
    /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo"));
988
    ///
989
    /// # Ok::<(), Box<dyn std::error::Error>>(())
990
    /// ```
991
2.78M
    pub fn add_capture_start(
992
2.78M
        &mut self,
993
2.78M
        next: StateID,
994
2.78M
        group_index: u32,
995
2.78M
        name: Option<Arc<str>>,
996
2.78M
    ) -> Result<StateID, BuildError> {
997
2.78M
        let pid = self.current_pattern_id();
998
2.78M
        let group_index = match SmallIndex::try_from(group_index) {
999
            Err(_) => {
1000
0
                return Err(BuildError::invalid_capture_index(group_index))
1001
            }
1002
2.78M
            Ok(group_index) => group_index,
1003
        };
1004
        // Make sure we have space to insert our (pid,index)|-->name mapping.
1005
2.78M
        if pid.as_usize() >= self.captures.len() {
1006
67.2k
            for _ in 0..=(pid.as_usize() - self.captures.len()) {
1007
67.2k
                self.captures.push(vec![]);
1008
67.2k
            }
1009
2.71M
        }
1010
        // In the case where 'group_index < self.captures[pid].len()', it means
1011
        // that we are adding a duplicate capture group. This is somewhat
1012
        // weird, but permissible because the capture group itself can be
1013
        // repeated in the syntax. For example, '([a-z]){4}' will produce 4
1014
        // capture groups. In practice, only the last will be set at search
1015
        // time when a match occurs. For duplicates, we don't need to push
1016
        // anything other than a CaptureStart NFA state.
1017
2.78M
        if group_index.as_usize() >= self.captures[pid].len() {
1018
            // For discontiguous indices, push placeholders for earlier capture
1019
            // groups that weren't explicitly added.
1020
167k
            for _ in 0..(group_index.as_usize() - self.captures[pid].len()) {
1021
2.85k
                self.captures[pid].push(None);
1022
2.85k
            }
1023
167k
            self.captures[pid].push(name);
1024
2.61M
        }
1025
2.78M
        self.add(State::CaptureStart { pattern_id: pid, group_index, next })
1026
2.78M
    }
1027
1028
    /// Add a "end capture" NFA state.
1029
    ///
1030
    /// A "end capture" NFA state corresponds to a state with exactly one
1031
    /// outgoing unconditional epsilon transition to another state. Unlike
1032
    /// "empty" states, a "end capture" state also carries with it an
1033
    /// instruction for saving the current position of input to a particular
1034
    /// location in memory. NFA simulations, like the Pike VM, may use this
1035
    /// information to report the match locations of capturing groups in a
1036
    ///
1037
    /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
1038
    /// and then change it later with [`patch`](Builder::patch).
1039
    ///
1040
    /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
1041
    /// end states may be interleaved. Indeed, it is typical for many "start
1042
    /// capture" NFA states to appear before the first "end capture" state.
1043
    ///
1044
    /// # Errors
1045
    ///
1046
    /// This returns an error if the state identifier space is exhausted, or if
1047
    /// the configured heap size limit has been exceeded or if the given
1048
    /// capture index overflows `usize`.
1049
    ///
1050
    /// While the above are the only conditions in which this routine can
1051
    /// currently return an error, it is possible to call this method with an
1052
    /// inputs that results in the final `build()` step failing to produce an
1053
    /// NFA. For example, if one adds two distinct capturing groups with the
1054
    /// same name, then that will result in `build()` failing with an error.
1055
    ///
1056
    /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
1057
    /// more information on what qualifies as valid capturing groups.
1058
2.77M
    pub fn add_capture_end(
1059
2.77M
        &mut self,
1060
2.77M
        next: StateID,
1061
2.77M
        group_index: u32,
1062
2.77M
    ) -> Result<StateID, BuildError> {
1063
2.77M
        let pid = self.current_pattern_id();
1064
2.77M
        let group_index = match SmallIndex::try_from(group_index) {
1065
            Err(_) => {
1066
0
                return Err(BuildError::invalid_capture_index(group_index))
1067
            }
1068
2.77M
            Ok(group_index) => group_index,
1069
        };
1070
2.77M
        self.add(State::CaptureEnd { pattern_id: pid, group_index, next })
1071
2.77M
    }
1072
1073
    /// Adds a "fail" NFA state.
1074
    ///
1075
    /// A "fail" state is simply a state that has no outgoing transitions. It
1076
    /// acts as a way to cause a search to stop without reporting a match.
1077
    /// For example, one way to represent an NFA with zero patterns is with a
1078
    /// single "fail" state.
1079
    ///
1080
    /// # Errors
1081
    ///
1082
    /// This returns an error if the state identifier space is exhausted, or if
1083
    /// the configured heap size limit has been exceeded.
1084
0
    pub fn add_fail(&mut self) -> Result<StateID, BuildError> {
1085
0
        self.add(State::Fail)
1086
0
    }
1087
1088
    /// Adds a "match" NFA state.
1089
    ///
1090
    /// A "match" state has no outgoing transitions (just like a "fail"
1091
    /// state), but it has special significance in that if a search enters
1092
    /// this state, then a match has been found. The match state that is added
1093
    /// automatically has the current pattern ID associated with it. This is
1094
    /// used to report the matching pattern ID at search time.
1095
    ///
1096
    /// # Errors
1097
    ///
1098
    /// This returns an error if the state identifier space is exhausted, or if
1099
    /// the configured heap size limit has been exceeded.
1100
    ///
1101
    /// # Panics
1102
    ///
1103
    /// This must be called after a `start_pattern` call but before the
1104
    /// corresponding `finish_pattern` call. Otherwise, it panics.
1105
135k
    pub fn add_match(&mut self) -> Result<StateID, BuildError> {
1106
135k
        let pattern_id = self.current_pattern_id();
1107
135k
        let sid = self.add(State::Match { pattern_id })?;
1108
135k
        Ok(sid)
1109
135k
    }
1110
1111
    /// The common implementation of "add a state." It handles the common
1112
    /// error cases of state ID exhausting (by owning state ID allocation) and
1113
    /// whether the size limit has been exceeded.
1114
182M
    fn add(&mut self, state: State) -> Result<StateID, BuildError> {
1115
182M
        let id = StateID::new(self.states.len())
1116
182M
            .map_err(|_| BuildError::too_many_states(self.states.len()))?;
1117
182M
        self.memory_states += state.memory_usage();
1118
182M
        self.states.push(state);
1119
182M
        self.check_size_limit()?;
1120
182M
        Ok(id)
1121
182M
    }
1122
1123
    /// Add a transition from one state to another.
1124
    ///
1125
    /// This routine is called "patch" since it is very common to add the
1126
    /// states you want, typically with "dummy" state ID transitions, and then
1127
    /// "patch" in the real state IDs later. This is because you don't always
1128
    /// know all of the necessary state IDs to add because they might not
1129
    /// exist yet.
1130
    ///
1131
    /// # Errors
1132
    ///
1133
    /// This may error if patching leads to an increase in heap usage beyond
1134
    /// the configured size limit. Heap usage only grows when patching adds a
1135
    /// new transition (as in the case of a "union" state).
1136
    ///
1137
    /// # Panics
1138
    ///
1139
    /// This panics if `from` corresponds to a "sparse" state. When "sparse"
1140
    /// states are added, there is no way to patch them after-the-fact. (If you
1141
    /// have a use case where this would be helpful, please file an issue. It
1142
    /// will likely require a new API.)
1143
186M
    pub fn patch(
1144
186M
        &mut self,
1145
186M
        from: StateID,
1146
186M
        to: StateID,
1147
186M
    ) -> Result<(), BuildError> {
1148
186M
        let old_memory_states = self.memory_states;
1149
186M
        match self.states[from] {
1150
14.9M
            State::Empty { ref mut next } => {
1151
14.9M
                *next = to;
1152
14.9M
            }
1153
111M
            State::ByteRange { ref mut trans } => {
1154
111M
                trans.next = to;
1155
111M
            }
1156
            State::Sparse { .. } => {
1157
0
                panic!("cannot patch from a sparse NFA state")
1158
            }
1159
5.54M
            State::Look { ref mut next, .. } => {
1160
5.54M
                *next = to;
1161
5.54M
            }
1162
42.8M
            State::Union { ref mut alternates } => {
1163
42.8M
                alternates.push(to);
1164
42.8M
                self.memory_states += mem::size_of::<StateID>();
1165
42.8M
            }
1166
5.49M
            State::UnionReverse { ref mut alternates } => {
1167
5.49M
                alternates.push(to);
1168
5.49M
                self.memory_states += mem::size_of::<StateID>();
1169
5.49M
            }
1170
2.77M
            State::CaptureStart { ref mut next, .. } => {
1171
2.77M
                *next = to;
1172
2.77M
            }
1173
2.77M
            State::CaptureEnd { ref mut next, .. } => {
1174
2.77M
                *next = to;
1175
2.77M
            }
1176
0
            State::Fail => {}
1177
0
            State::Match { .. } => {}
1178
        }
1179
186M
        if old_memory_states != self.memory_states {
1180
48.3M
            self.check_size_limit()?;
1181
138M
        }
1182
186M
        Ok(())
1183
186M
    }
1184
1185
    /// Set whether the NFA produced by this builder should only match UTF-8.
1186
    ///
1187
    /// This should be set when both of the following are true:
1188
    ///
1189
    /// 1. The caller guarantees that the NFA created by this build will only
1190
    /// report non-empty matches with spans that are valid UTF-8.
1191
    /// 2. The caller desires regex engines using this NFA to avoid reporting
1192
    /// empty matches with a span that splits a valid UTF-8 encoded codepoint.
1193
    ///
1194
    /// Property (1) is not checked. Instead, this requires the caller to
1195
    /// promise that it is true. Property (2) corresponds to the behavior of
1196
    /// regex engines using the NFA created by this builder. Namely, there
1197
    /// is no way in the NFA's graph itself to say that empty matches found
1198
    /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries.
1199
    /// Instead, this option is used to communicate the UTF-8 semantic to regex
1200
    /// engines that will typically implement it as a post-processing step by
1201
    /// filtering out empty matches that don't fall on UTF-8 boundaries.
1202
    ///
1203
    /// If you're building an NFA from an HIR (and not using a
1204
    /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can
1205
    /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8)
1206
    /// option to guarantee that if the HIR detects a non-empty match, then it
1207
    /// is guaranteed to be valid UTF-8.
1208
    ///
1209
    /// Note that property (2) does *not* specify the behavior of executing
1210
    /// a search on a haystack that is not valid UTF-8. Therefore, if you're
1211
    /// *not* running this NFA on strings that are guaranteed to be valid
1212
    /// UTF-8, you almost certainly do not want to enable this option.
1213
    /// Similarly, if you are running the NFA on strings that *are* guaranteed
1214
    /// to be valid UTF-8, then you almost certainly want to enable this option
1215
    /// unless you can guarantee that your NFA will never produce a zero-width
1216
    /// match.
1217
    ///
1218
    /// It is disabled by default.
1219
137k
    pub fn set_utf8(&mut self, yes: bool) {
1220
137k
        self.utf8 = yes;
1221
137k
    }
1222
1223
    /// Returns whether UTF-8 mode is enabled for this builder.
1224
    ///
1225
    /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
1226
0
    pub fn get_utf8(&self) -> bool {
1227
0
        self.utf8
1228
0
    }
1229
1230
    /// Sets whether the NFA produced by this builder should be matched in
1231
    /// reverse or not. Generally speaking, when enabled, the NFA produced
1232
    /// should be matched by moving backwards through a haystack, from a higher
1233
    /// memory address to a lower memory address.
1234
    ///
1235
    /// See also [`NFA::is_reverse`] for more details.
1236
    ///
1237
    /// This is disabled by default, which means NFAs are by default matched
1238
    /// in the forward direction.
1239
137k
    pub fn set_reverse(&mut self, yes: bool) {
1240
137k
        self.reverse = yes;
1241
137k
    }
1242
1243
    /// Returns whether reverse mode is enabled for this builder.
1244
    ///
1245
    /// See [`Builder::set_reverse`] for more details about what "reverse mode"
1246
    /// is.
1247
0
    pub fn get_reverse(&self) -> bool {
1248
0
        self.reverse
1249
0
    }
1250
1251
    /// Sets the look-around matcher that should be used for the resulting NFA.
1252
    ///
1253
    /// A look-around matcher can be used to configure how look-around
1254
    /// assertions are matched. For example, a matcher might carry
1255
    /// configuration that changes the line terminator used for `(?m:^)` and
1256
    /// `(?m:$)` assertions.
1257
137k
    pub fn set_look_matcher(&mut self, m: LookMatcher) {
1258
137k
        self.look_matcher = m;
1259
137k
    }
1260
1261
    /// Returns the look-around matcher used for this builder.
1262
    ///
1263
    /// If a matcher was not explicitly set, then `LookMatcher::default()` is
1264
    /// returned.
1265
0
    pub fn get_look_matcher(&self) -> &LookMatcher {
1266
0
        &self.look_matcher
1267
0
    }
1268
1269
    /// Set the size limit on this builder.
1270
    ///
1271
    /// Setting the size limit will also check whether the NFA built so far
1272
    /// fits within the given size limit. If it doesn't, then an error is
1273
    /// returned.
1274
    ///
1275
    /// By default, there is no configured size limit.
1276
137k
    pub fn set_size_limit(
1277
137k
        &mut self,
1278
137k
        limit: Option<usize>,
1279
137k
    ) -> Result<(), BuildError> {
1280
137k
        self.size_limit = limit;
1281
137k
        self.check_size_limit()
1282
137k
    }
1283
1284
    /// Return the currently configured size limit.
1285
    ///
1286
    /// By default, this returns `None`, which corresponds to no configured
1287
    /// size limit.
1288
0
    pub fn get_size_limit(&self) -> Option<usize> {
1289
0
        self.size_limit
1290
0
    }
1291
1292
    /// Returns the heap memory usage, in bytes, used by the NFA states added
1293
    /// so far.
1294
    ///
1295
    /// Note that this is an approximation of how big the final NFA will be.
1296
    /// In practice, the final NFA will likely be a bit smaller because of
1297
    /// its simpler state representation. (For example, using things like
1298
    /// `Box<[StateID]>` instead of `Vec<StateID>`.)
1299
230M
    pub fn memory_usage(&self) -> usize {
1300
230M
        self.states.len() * mem::size_of::<State>() + self.memory_states
1301
230M
    }
1302
1303
230M
    fn check_size_limit(&self) -> Result<(), BuildError> {
1304
230M
        if let Some(limit) = self.size_limit {
1305
230M
            if self.memory_usage() > limit {
1306
2.09k
                return Err(BuildError::exceeded_size_limit(limit));
1307
230M
            }
1308
0
        }
1309
230M
        Ok(())
1310
230M
    }
1311
}
1312
1313
#[cfg(test)]
1314
mod tests {
1315
    use super::*;
1316
1317
    // This asserts that a builder state doesn't have its size changed. It is
1318
    // *really* easy to accidentally increase the size, and thus potentially
1319
    // dramatically increase the memory usage of NFA builder.
1320
    //
1321
    // This assert doesn't mean we absolutely cannot increase the size of a
1322
    // builder state. We can. It's just here to make sure we do it knowingly
1323
    // and intentionally.
1324
    //
1325
    // A builder state is unfortunately a little bigger than an NFA state,
1326
    // since we really want to support adding things to a pre-existing state.
1327
    // i.e., We use Vec<thing> instead of Box<[thing]>. So we end up using an
1328
    // extra 8 bytes per state. Sad, but at least it gets freed once the NFA
1329
    // is built.
1330
    #[test]
1331
    fn state_has_small_size() {
1332
        #[cfg(target_pointer_width = "64")]
1333
        assert_eq!(32, core::mem::size_of::<State>());
1334
        #[cfg(target_pointer_width = "32")]
1335
        assert_eq!(16, core::mem::size_of::<State>());
1336
    }
1337
}