Coverage Report

Created: 2025-11-16 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.1.10/src/nfa/compiler.rs
Line
Count
Source
1
// This module provides an NFA compiler using Thompson's construction
2
// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
3
// graph as output. The NFA graph is structured in a way that permits it to be
4
// executed by a virtual machine and also used to efficiently build a DFA.
5
//
6
// The compiler deals with a slightly expanded set of NFA states that notably
7
// includes an empty node that has exactly one epsilon transition to the next
8
// state. In other words, it's a "goto" instruction if one views Thompson's NFA
9
// as a set of bytecode instructions. These goto instructions are removed in
10
// a subsequent phase before returning the NFA to the caller. The purpose of
11
// these empty nodes is that they make the construction algorithm substantially
12
// simpler to implement. We remove them before returning to the caller because
13
// they can represent substantial overhead when traversing the NFA graph
14
// (either while searching using the NFA directly or while building a DFA).
15
//
16
// In the future, it would be nice to provide a Glushkov compiler as well,
17
// as it would work well as a bit-parallel NFA for smaller regexes. But
18
// the Thompson construction is one I'm more familiar with and seems more
19
// straight-forward to deal with when it comes to large Unicode character
20
// classes.
21
//
22
// Internally, the compiler uses interior mutability to improve composition
23
// in the face of the borrow checker. In particular, we'd really like to be
24
// able to write things like this:
25
//
26
//     self.c_concat(exprs.iter().map(|e| self.c(e)))
27
//
28
// Which elegantly uses iterators to build up a sequence of compiled regex
29
// sub-expressions and then hands it off to the concatenating compiler
30
// routine. Without interior mutability, the borrow checker won't let us
31
// borrow `self` mutably both inside and outside the closure at the same
32
// time.
33
34
use std::cell::RefCell;
35
use std::mem;
36
37
use regex_syntax::hir::{self, Hir, HirKind};
38
use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
39
40
use classes::ByteClassSet;
41
use error::{Error, Result};
42
use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
43
use nfa::range_trie::RangeTrie;
44
use nfa::{State, StateID, Transition, NFA};
45
46
/// Config knobs for the NFA compiler. See the builder's methods for more
47
/// docs on each one.
48
#[derive(Clone, Copy, Debug)]
49
struct Config {
50
    anchored: bool,
51
    allow_invalid_utf8: bool,
52
    reverse: bool,
53
    shrink: bool,
54
}
55
56
impl Default for Config {
57
0
    fn default() -> Config {
58
0
        Config {
59
0
            anchored: false,
60
0
            allow_invalid_utf8: false,
61
0
            reverse: false,
62
0
            shrink: true,
63
0
        }
64
0
    }
65
}
66
67
/// A builder for compiling an NFA.
68
#[derive(Clone, Debug)]
69
pub struct Builder {
70
    config: Config,
71
}
72
73
impl Builder {
74
    /// Create a new NFA builder with its default configuration.
75
0
    pub fn new() -> Builder {
76
0
        Builder { config: Config::default() }
77
0
    }
78
79
    /// Compile the given high level intermediate representation of a regular
80
    /// expression into an NFA.
81
    ///
82
    /// If there was a problem building the NFA, then an error is returned.
83
    /// For example, if the regex uses unsupported features (such as zero-width
84
    /// assertions), then an error is returned.
85
0
    pub fn build(&self, expr: &Hir) -> Result<NFA> {
86
0
        let mut nfa = NFA::always_match();
87
0
        self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
88
0
        Ok(nfa)
89
0
    }
90
91
    /// Compile the given high level intermediate representation of a regular
92
    /// expression into the NFA given using the given compiler. Callers may
93
    /// prefer this over `build` if they would like to reuse allocations while
94
    /// compiling many regular expressions.
95
    ///
96
    /// On success, the given NFA is completely overwritten with the NFA
97
    /// produced by the compiler.
98
    ///
99
    /// If there was a problem building the NFA, then an error is returned. For
100
    /// example, if the regex uses unsupported features (such as zero-width
101
    /// assertions), then an error is returned. When an error is returned,
102
    /// the contents of `nfa` are unspecified and should not be relied upon.
103
    /// However, it can still be reused in subsequent calls to this method.
104
0
    pub fn build_with(
105
0
        &self,
106
0
        compiler: &mut Compiler,
107
0
        nfa: &mut NFA,
108
0
        expr: &Hir,
109
0
    ) -> Result<()> {
110
0
        compiler.clear();
111
0
        compiler.configure(self.config);
112
0
        compiler.compile(nfa, expr)
113
0
    }
114
115
    /// Set whether matching must be anchored at the beginning of the input.
116
    ///
117
    /// When enabled, a match must begin at the start of the input. When
118
    /// disabled, the NFA will act as if the pattern started with a `.*?`,
119
    /// which enables a match to appear anywhere.
120
    ///
121
    /// By default this is disabled.
122
0
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
123
0
        self.config.anchored = yes;
124
0
        self
125
0
    }
126
127
    /// When enabled, the builder will permit the construction of an NFA that
128
    /// may match invalid UTF-8.
129
    ///
130
    /// When disabled (the default), the builder is guaranteed to produce a
131
    /// regex that will only ever match valid UTF-8 (otherwise, the builder
132
    /// will return an error).
133
0
    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
134
0
        self.config.allow_invalid_utf8 = yes;
135
0
        self
136
0
    }
137
138
    /// Reverse the NFA.
139
    ///
140
    /// A NFA reversal is performed by reversing all of the concatenated
141
    /// sub-expressions in the original pattern, recursively. The resulting
142
    /// NFA can be used to match the pattern starting from the end of a string
143
    /// instead of the beginning of a string.
144
    ///
145
    /// Reversing the NFA is useful for building a reverse DFA, which is most
146
    /// useful for finding the start of a match.
147
0
    pub fn reverse(&mut self, yes: bool) -> &mut Builder {
148
0
        self.config.reverse = yes;
149
0
        self
150
0
    }
151
152
    /// Apply best effort heuristics to shrink the NFA at the expense of more
153
    /// time/memory.
154
    ///
155
    /// This is enabled by default. Generally speaking, if one is using an NFA
156
    /// to compile DFA, then the extra time used to shrink the NFA will be
157
    /// more than made up for during DFA construction (potentially by a lot).
158
    /// In other words, enabling this can substantially decrease the overall
159
    /// amount of time it takes to build a DFA.
160
    ///
161
    /// The only reason to disable this if you want to compile an NFA and start
162
    /// using it as quickly as possible without needing to build a DFA.
163
0
    pub fn shrink(&mut self, yes: bool) -> &mut Builder {
164
0
        self.config.shrink = yes;
165
0
        self
166
0
    }
167
}
168
169
/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
170
/// construction. Namely, this compiler permits epsilon transitions between
171
/// states.
172
///
173
/// Users of this crate cannot use a compiler directly. Instead, all one can
174
/// do is create one and use it via the
175
/// [`Builder::build_with`](struct.Builder.html#method.build_with)
176
/// method. This permits callers to reuse compilers in order to amortize
177
/// allocations.
178
#[derive(Clone, Debug)]
179
pub struct Compiler {
180
    /// The set of compiled NFA states. Once a state is compiled, it is
181
    /// assigned a state ID equivalent to its index in this list. Subsequent
182
    /// compilation can modify previous states by adding new transitions.
183
    states: RefCell<Vec<CState>>,
184
    /// The configuration from the builder.
185
    config: Config,
186
    /// State used for compiling character classes to UTF-8 byte automata.
187
    /// State is not retained between character class compilations. This just
188
    /// serves to amortize allocation to the extent possible.
189
    utf8_state: RefCell<Utf8State>,
190
    /// State used for arranging character classes in reverse into a trie.
191
    trie_state: RefCell<RangeTrie>,
192
    /// State used for caching common suffixes when compiling reverse UTF-8
193
    /// automata (for Unicode character classes).
194
    utf8_suffix: RefCell<Utf8SuffixMap>,
195
    /// A map used to re-map state IDs when translating the compiler's internal
196
    /// NFA state representation to the external NFA representation.
197
    remap: RefCell<Vec<StateID>>,
198
    /// A set of compiler internal state IDs that correspond to states that are
199
    /// exclusively epsilon transitions, i.e., goto instructions, combined with
200
    /// the state that they point to. This is used to record said states while
201
    /// transforming the compiler's internal NFA representation to the external
202
    /// form.
203
    empties: RefCell<Vec<(StateID, StateID)>>,
204
}
205
206
/// A compiler intermediate state representation for an NFA that is only used
207
/// during compilation. Once compilation is done, `CState`s are converted to
208
/// `State`s, which have a much simpler representation.
209
#[derive(Clone, Debug, Eq, PartialEq)]
210
enum CState {
211
    /// An empty state whose only purpose is to forward the automaton to
212
    /// another state via en epsilon transition. These are useful during
213
    /// compilation but are otherwise removed at the end.
214
    Empty { next: StateID },
215
    /// A state that only transitions to `next` if the current input byte is
216
    /// in the range `[start, end]` (inclusive on both ends).
217
    Range { range: Transition },
218
    /// A state with possibly many transitions, represented in a sparse
219
    /// fashion. Transitions are ordered lexicographically by input range.
220
    /// As such, this may only be used when every transition has equal
221
    /// priority. (In practice, this is only used for encoding large UTF-8
222
    /// automata.)
223
    Sparse { ranges: Vec<Transition> },
224
    /// An alternation such that there exists an epsilon transition to all
225
    /// states in `alternates`, where matches found via earlier transitions
226
    /// are preferred over later transitions.
227
    Union { alternates: Vec<StateID> },
228
    /// An alternation such that there exists an epsilon transition to all
229
    /// states in `alternates`, where matches found via later transitions
230
    /// are preferred over earlier transitions.
231
    ///
232
    /// This "reverse" state exists for convenience during compilation that
233
    /// permits easy construction of non-greedy combinations of NFA states.
234
    /// At the end of compilation, Union and UnionReverse states are merged
235
    /// into one Union type of state, where the latter has its epsilon
236
    /// transitions reversed to reflect the priority inversion.
237
    UnionReverse { alternates: Vec<StateID> },
238
    /// A match state. There is exactly one such occurrence of this state in
239
    /// an NFA.
240
    Match,
241
}
242
243
/// A value that represents the result of compiling a sub-expression of a
244
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
245
/// has an initial state at `start` and a final state at `end`.
246
#[derive(Clone, Copy, Debug)]
247
pub struct ThompsonRef {
248
    start: StateID,
249
    end: StateID,
250
}
251
252
impl Compiler {
253
    /// Create a new compiler.
254
0
    pub fn new() -> Compiler {
255
0
        Compiler {
256
0
            states: RefCell::new(vec![]),
257
0
            config: Config::default(),
258
0
            utf8_state: RefCell::new(Utf8State::new()),
259
0
            trie_state: RefCell::new(RangeTrie::new()),
260
0
            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
261
0
            remap: RefCell::new(vec![]),
262
0
            empties: RefCell::new(vec![]),
263
0
        }
264
0
    }
265
266
    /// Clear any memory used by this compiler such that it is ready to compile
267
    /// a new regex.
268
    ///
269
    /// It is preferrable to reuse a compiler if possible in order to reuse
270
    /// allocations.
271
0
    fn clear(&self) {
272
0
        self.states.borrow_mut().clear();
273
        // We don't need to clear anything else since they are cleared on
274
        // their own and only when they are used.
275
0
    }
276
277
    /// Configure this compiler from the builder's knobs.
278
    ///
279
    /// The compiler is always reconfigured by the builder before using it to
280
    /// build an NFA.
281
0
    fn configure(&mut self, config: Config) {
282
0
        self.config = config;
283
0
    }
284
285
    /// Convert the current intermediate NFA to its final compiled form.
286
0
    fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
287
0
        nfa.anchored = self.config.anchored;
288
289
0
        let mut start = self.add_empty();
290
0
        if !nfa.anchored {
291
0
            let compiled = if self.config.allow_invalid_utf8 {
292
0
                self.c_unanchored_prefix_invalid_utf8()?
293
            } else {
294
0
                self.c_unanchored_prefix_valid_utf8()?
295
            };
296
0
            self.patch(start, compiled.start);
297
0
            start = compiled.end;
298
0
        }
299
0
        let compiled = self.c(&expr)?;
300
0
        let match_id = self.add_match();
301
0
        self.patch(start, compiled.start);
302
0
        self.patch(compiled.end, match_id);
303
0
        self.finish(nfa);
304
0
        Ok(())
305
0
    }
306
307
    /// Finishes the compilation process and populates the provide NFA with
308
    /// the final graph.
309
0
    fn finish(&self, nfa: &mut NFA) {
310
0
        let mut bstates = self.states.borrow_mut();
311
0
        let mut remap = self.remap.borrow_mut();
312
0
        remap.resize(bstates.len(), 0);
313
0
        let mut empties = self.empties.borrow_mut();
314
0
        empties.clear();
315
316
        // We don't reuse allocations here becuase this is what we're
317
        // returning.
318
0
        nfa.states.clear();
319
0
        let mut byteset = ByteClassSet::new();
320
321
        // The idea here is to convert our intermediate states to their final
322
        // form. The only real complexity here is the process of converting
323
        // transitions, which are expressed in terms of state IDs. The new
324
        // set of states will be smaller because of partial epsilon removal,
325
        // so the state IDs will not be the same.
326
0
        for (id, bstate) in bstates.iter_mut().enumerate() {
327
0
            match *bstate {
328
0
                CState::Empty { next } => {
329
0
                    // Since we're removing empty states, we need to handle
330
0
                    // them later since we don't yet know which new state this
331
0
                    // empty state will be mapped to.
332
0
                    empties.push((id, next));
333
0
                }
334
0
                CState::Range { ref range } => {
335
0
                    remap[id] = nfa.states.len();
336
0
                    byteset.set_range(range.start, range.end);
337
0
                    nfa.states.push(State::Range { range: range.clone() });
338
0
                }
339
0
                CState::Sparse { ref mut ranges } => {
340
0
                    remap[id] = nfa.states.len();
341
342
0
                    let ranges = mem::replace(ranges, vec![]);
343
0
                    for r in &ranges {
344
0
                        byteset.set_range(r.start, r.end);
345
0
                    }
346
0
                    nfa.states.push(State::Sparse {
347
0
                        ranges: ranges.into_boxed_slice(),
348
0
                    });
349
                }
350
0
                CState::Union { ref mut alternates } => {
351
0
                    remap[id] = nfa.states.len();
352
0
353
0
                    let alternates = mem::replace(alternates, vec![]);
354
0
                    nfa.states.push(State::Union {
355
0
                        alternates: alternates.into_boxed_slice(),
356
0
                    });
357
0
                }
358
0
                CState::UnionReverse { ref mut alternates } => {
359
0
                    remap[id] = nfa.states.len();
360
0
361
0
                    let mut alternates = mem::replace(alternates, vec![]);
362
0
                    alternates.reverse();
363
0
                    nfa.states.push(State::Union {
364
0
                        alternates: alternates.into_boxed_slice(),
365
0
                    });
366
0
                }
367
0
                CState::Match => {
368
0
                    remap[id] = nfa.states.len();
369
0
                    nfa.states.push(State::Match);
370
0
                }
371
            }
372
        }
373
0
        for &(empty_id, mut empty_next) in empties.iter() {
374
            // empty states can point to other empty states, forming a chain.
375
            // So we must follow the chain until the end, which must end at
376
            // a non-empty state, and therefore, a state that is correctly
377
            // remapped. We are guaranteed to terminate because our compiler
378
            // never builds a loop among empty states.
379
0
            while let CState::Empty { next } = bstates[empty_next] {
380
0
                empty_next = next;
381
0
            }
382
0
            remap[empty_id] = remap[empty_next];
383
        }
384
0
        for state in &mut nfa.states {
385
0
            state.remap(&remap);
386
0
        }
387
        // The compiler always begins the NFA at the first state.
388
0
        nfa.start = remap[0];
389
0
        nfa.byte_classes = byteset.byte_classes();
390
0
    }
391
392
0
    fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
393
0
        match *expr.kind() {
394
            HirKind::Empty => {
395
0
                let id = self.add_empty();
396
0
                Ok(ThompsonRef { start: id, end: id })
397
            }
398
0
            HirKind::Literal(hir::Literal::Unicode(ch)) => {
399
0
                let mut buf = [0; 4];
400
0
                let it = ch
401
0
                    .encode_utf8(&mut buf)
402
0
                    .as_bytes()
403
0
                    .iter()
404
0
                    .map(|&b| Ok(self.c_range(b, b)));
405
0
                self.c_concat(it)
406
            }
407
0
            HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
408
0
            HirKind::Class(hir::Class::Bytes(ref cls)) => {
409
0
                self.c_byte_class(cls)
410
            }
411
0
            HirKind::Class(hir::Class::Unicode(ref cls)) => {
412
0
                self.c_unicode_class(cls)
413
            }
414
0
            HirKind::Repetition(ref rep) => self.c_repetition(rep),
415
0
            HirKind::Group(ref group) => self.c(&*group.hir),
416
0
            HirKind::Concat(ref exprs) => {
417
0
                self.c_concat(exprs.iter().map(|e| self.c(e)))
418
            }
419
0
            HirKind::Alternation(ref exprs) => {
420
0
                self.c_alternation(exprs.iter().map(|e| self.c(e)))
421
            }
422
0
            HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
423
0
            HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
424
        }
425
0
    }
426
427
0
    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
428
0
    where
429
0
        I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
430
    {
431
0
        let first =
432
0
            if self.config.reverse { it.next_back() } else { it.next() };
433
0
        let ThompsonRef { start, mut end } = match first {
434
0
            Some(result) => result?,
435
0
            None => return Ok(self.c_empty()),
436
        };
437
        loop {
438
0
            let next =
439
0
                if self.config.reverse { it.next_back() } else { it.next() };
440
0
            let compiled = match next {
441
0
                Some(result) => result?,
442
0
                None => break,
443
            };
444
0
            self.patch(end, compiled.start);
445
0
            end = compiled.end;
446
        }
447
0
        Ok(ThompsonRef { start, end })
448
0
    }
Unexecuted instantiation: <regex_automata::nfa::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_automata::nfa::compiler::Compiler>::c_exactly::{closure#0}>>
Unexecuted instantiation: <regex_automata::nfa::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::compiler::Compiler>::c::{closure#1}>>
Unexecuted instantiation: <regex_automata::nfa::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<u8>, <regex_automata::nfa::compiler::Compiler>::c::{closure#0}>>
449
450
0
    fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
451
0
    where
452
0
        I: Iterator<Item = Result<ThompsonRef>>,
453
    {
454
0
        let first = it.next().expect("alternations must be non-empty")?;
455
0
        let second = match it.next() {
456
0
            None => return Ok(first),
457
0
            Some(result) => result?,
458
        };
459
460
0
        let union = self.add_union();
461
0
        let end = self.add_empty();
462
0
        self.patch(union, first.start);
463
0
        self.patch(first.end, end);
464
0
        self.patch(union, second.start);
465
0
        self.patch(second.end, end);
466
0
        for result in it {
467
0
            let compiled = result?;
468
0
            self.patch(union, compiled.start);
469
0
            self.patch(compiled.end, end);
470
        }
471
0
        Ok(ThompsonRef { start: union, end })
472
0
    }
473
474
0
    fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
475
0
        match rep.kind {
476
            hir::RepetitionKind::ZeroOrOne => {
477
0
                self.c_zero_or_one(&rep.hir, rep.greedy)
478
            }
479
            hir::RepetitionKind::ZeroOrMore => {
480
0
                self.c_at_least(&rep.hir, rep.greedy, 0)
481
            }
482
            hir::RepetitionKind::OneOrMore => {
483
0
                self.c_at_least(&rep.hir, rep.greedy, 1)
484
            }
485
0
            hir::RepetitionKind::Range(ref rng) => match *rng {
486
0
                hir::RepetitionRange::Exactly(count) => {
487
0
                    self.c_exactly(&rep.hir, count)
488
                }
489
0
                hir::RepetitionRange::AtLeast(m) => {
490
0
                    self.c_at_least(&rep.hir, rep.greedy, m)
491
                }
492
0
                hir::RepetitionRange::Bounded(min, max) => {
493
0
                    self.c_bounded(&rep.hir, rep.greedy, min, max)
494
                }
495
            },
496
        }
497
0
    }
498
499
0
    fn c_bounded(
500
0
        &self,
501
0
        expr: &Hir,
502
0
        greedy: bool,
503
0
        min: u32,
504
0
        max: u32,
505
0
    ) -> Result<ThompsonRef> {
506
0
        let prefix = self.c_exactly(expr, min)?;
507
0
        if min == max {
508
0
            return Ok(prefix);
509
0
        }
510
511
        // It is tempting here to compile the rest here as a concatenation
512
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
513
        // were `aaa?a?a?`. The problem here is that it leads to this program:
514
        //
515
        //     >000000: 61 => 01
516
        //     000001: 61 => 02
517
        //     000002: alt(03, 04)
518
        //     000003: 61 => 04
519
        //     000004: alt(05, 06)
520
        //     000005: 61 => 06
521
        //     000006: alt(07, 08)
522
        //     000007: 61 => 08
523
        //     000008: MATCH
524
        //
525
        // And effectively, once you hit state 2, the epsilon closure will
526
        // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is
527
        // better to instead compile it like so:
528
        //
529
        //     >000000: 61 => 01
530
        //      000001: 61 => 02
531
        //      000002: alt(03, 08)
532
        //      000003: 61 => 04
533
        //      000004: alt(05, 08)
534
        //      000005: 61 => 06
535
        //      000006: alt(07, 08)
536
        //      000007: 61 => 08
537
        //      000008: MATCH
538
        //
539
        // So that the epsilon closure of state 2 is now just 3 and 8.
540
0
        let empty = self.add_empty();
541
0
        let mut prev_end = prefix.end;
542
0
        for _ in min..max {
543
0
            let union = if greedy {
544
0
                self.add_union()
545
            } else {
546
0
                self.add_reverse_union()
547
            };
548
0
            let compiled = self.c(expr)?;
549
0
            self.patch(prev_end, union);
550
0
            self.patch(union, compiled.start);
551
0
            self.patch(union, empty);
552
0
            prev_end = compiled.end;
553
        }
554
0
        self.patch(prev_end, empty);
555
0
        Ok(ThompsonRef { start: prefix.start, end: empty })
556
0
    }
557
558
0
    fn c_at_least(
559
0
        &self,
560
0
        expr: &Hir,
561
0
        greedy: bool,
562
0
        n: u32,
563
0
    ) -> Result<ThompsonRef> {
564
0
        if n == 0 {
565
0
            let union = if greedy {
566
0
                self.add_union()
567
            } else {
568
0
                self.add_reverse_union()
569
            };
570
0
            let compiled = self.c(expr)?;
571
0
            self.patch(union, compiled.start);
572
0
            self.patch(compiled.end, union);
573
0
            Ok(ThompsonRef { start: union, end: union })
574
0
        } else if n == 1 {
575
0
            let compiled = self.c(expr)?;
576
0
            let union = if greedy {
577
0
                self.add_union()
578
            } else {
579
0
                self.add_reverse_union()
580
            };
581
0
            self.patch(compiled.end, union);
582
0
            self.patch(union, compiled.start);
583
0
            Ok(ThompsonRef { start: compiled.start, end: union })
584
        } else {
585
0
            let prefix = self.c_exactly(expr, n - 1)?;
586
0
            let last = self.c(expr)?;
587
0
            let union = if greedy {
588
0
                self.add_union()
589
            } else {
590
0
                self.add_reverse_union()
591
            };
592
0
            self.patch(prefix.end, last.start);
593
0
            self.patch(last.end, union);
594
0
            self.patch(union, last.start);
595
0
            Ok(ThompsonRef { start: prefix.start, end: union })
596
        }
597
0
    }
598
599
0
    fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
600
0
        let union =
601
0
            if greedy { self.add_union() } else { self.add_reverse_union() };
602
0
        let compiled = self.c(expr)?;
603
0
        let empty = self.add_empty();
604
0
        self.patch(union, compiled.start);
605
0
        self.patch(union, empty);
606
0
        self.patch(compiled.end, empty);
607
0
        Ok(ThompsonRef { start: union, end: empty })
608
0
    }
609
610
0
    fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
611
0
        let it = (0..n).map(|_| self.c(expr));
612
0
        self.c_concat(it)
613
0
    }
614
615
0
    fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
616
0
        let end = self.add_empty();
617
0
        let mut trans = Vec::with_capacity(cls.ranges().len());
618
0
        for r in cls.iter() {
619
0
            trans.push(Transition {
620
0
                start: r.start(),
621
0
                end: r.end(),
622
0
                next: end,
623
0
            });
624
0
        }
625
0
        Ok(ThompsonRef { start: self.add_sparse(trans), end })
626
0
    }
627
628
0
    fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
629
        // If all we have are ASCII ranges wrapped in a Unicode package, then
630
        // there is zero reason to bring out the big guns. We can fit all ASCII
631
        // ranges within a single sparse transition.
632
0
        if cls.is_all_ascii() {
633
0
            let end = self.add_empty();
634
0
            let mut trans = Vec::with_capacity(cls.ranges().len());
635
0
            for r in cls.iter() {
636
0
                assert!(r.start() <= '\x7F');
637
0
                assert!(r.end() <= '\x7F');
638
0
                trans.push(Transition {
639
0
                    start: r.start() as u8,
640
0
                    end: r.end() as u8,
641
0
                    next: end,
642
0
                });
643
            }
644
0
            Ok(ThompsonRef { start: self.add_sparse(trans), end })
645
0
        } else if self.config.reverse {
646
0
            if !self.config.shrink {
647
                // When we don't want to spend the extra time shrinking, we
648
                // compile the UTF-8 automaton in reverse using something like
649
                // the "naive" approach, but will attempt to re-use common
650
                // suffixes.
651
0
                self.c_unicode_class_reverse_with_suffix(cls)
652
            } else {
653
                // When we want to shrink our NFA for reverse UTF-8 automata,
654
                // we cannot feed UTF-8 sequences directly to the UTF-8
655
                // compiler, since the UTF-8 compiler requires all sequences
656
                // to be lexicographically sorted. Instead, we organize our
657
                // sequences into a range trie, which can then output our
658
                // sequences in the correct order. Unfortunately, building the
659
                // range trie is fairly expensive (but not nearly as expensive
660
                // as building a DFA). Hence the reason why the 'shrink' option
661
                // exists, so that this path can be toggled off.
662
0
                let mut trie = self.trie_state.borrow_mut();
663
0
                trie.clear();
664
665
0
                for rng in cls.iter() {
666
0
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
667
0
                        seq.reverse();
668
0
                        trie.insert(seq.as_slice());
669
0
                    }
670
                }
671
0
                let mut utf8_state = self.utf8_state.borrow_mut();
672
0
                let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
673
0
                trie.iter(|seq| {
674
0
                    utf8c.add(&seq);
675
0
                });
676
0
                Ok(utf8c.finish())
677
            }
678
        } else {
679
            // In the forward direction, we always shrink our UTF-8 automata
680
            // because we can stream it right into the UTF-8 compiler. There
681
            // is almost no downside (in either memory or time) to using this
682
            // approach.
683
0
            let mut utf8_state = self.utf8_state.borrow_mut();
684
0
            let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
685
0
            for rng in cls.iter() {
686
0
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
687
0
                    utf8c.add(seq.as_slice());
688
0
                }
689
            }
690
0
            Ok(utf8c.finish())
691
        }
692
693
        // For reference, the code below is the "naive" version of compiling a
694
        // UTF-8 automaton. It is deliciously simple (and works for both the
695
        // forward and reverse cases), but will unfortunately produce very
696
        // large NFAs. When compiling a forward automaton, the size difference
697
        // can sometimes be an order of magnitude. For example, the '\w' regex
698
        // will generate about ~3000 NFA states using the naive approach below,
699
        // but only 283 states when using the approach above. This is because
700
        // the approach above actually compiles a *minimal* (or near minimal,
701
        // because of the bounded hashmap) UTF-8 automaton.
702
        //
703
        // The code below is kept as a reference point in order to make it
704
        // easier to understand the higher level goal here.
705
        /*
706
        let it = cls
707
            .iter()
708
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
709
            .map(|seq| {
710
                let it = seq
711
                    .as_slice()
712
                    .iter()
713
                    .map(|rng| Ok(self.c_range(rng.start, rng.end)));
714
                self.c_concat(it)
715
            });
716
        self.c_alternation(it);
717
        */
718
0
    }
719
720
0
    fn c_unicode_class_reverse_with_suffix(
721
0
        &self,
722
0
        cls: &hir::ClassUnicode,
723
0
    ) -> Result<ThompsonRef> {
724
        // N.B. It would likely be better to cache common *prefixes* in the
725
        // reverse direction, but it's not quite clear how to do that. The
726
        // advantage of caching suffixes is that it does give us a win, and
727
        // has a very small additional overhead.
728
0
        let mut cache = self.utf8_suffix.borrow_mut();
729
0
        cache.clear();
730
731
0
        let union = self.add_union();
732
0
        let alt_end = self.add_empty();
733
0
        for urng in cls.iter() {
734
0
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
735
0
                let mut end = alt_end;
736
0
                for brng in seq.as_slice() {
737
0
                    let key = Utf8SuffixKey {
738
0
                        from: end,
739
0
                        start: brng.start,
740
0
                        end: brng.end,
741
0
                    };
742
0
                    let hash = cache.hash(&key);
743
0
                    if let Some(id) = cache.get(&key, hash) {
744
0
                        end = id;
745
0
                        continue;
746
0
                    }
747
748
0
                    let compiled = self.c_range(brng.start, brng.end);
749
0
                    self.patch(compiled.end, end);
750
0
                    end = compiled.start;
751
0
                    cache.set(key, hash, end);
752
                }
753
0
                self.patch(union, end);
754
            }
755
        }
756
0
        Ok(ThompsonRef { start: union, end: alt_end })
757
0
    }
758
759
0
    fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
760
0
        let id = self.add_range(start, end);
761
0
        ThompsonRef { start: id, end: id }
762
0
    }
763
764
0
    fn c_empty(&self) -> ThompsonRef {
765
0
        let id = self.add_empty();
766
0
        ThompsonRef { start: id, end: id }
767
0
    }
768
769
0
    fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
770
0
        self.c(&Hir::repetition(hir::Repetition {
771
0
            kind: hir::RepetitionKind::ZeroOrMore,
772
0
            greedy: false,
773
0
            hir: Box::new(Hir::any(false)),
774
0
        }))
775
0
    }
776
777
0
    fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
778
0
        self.c(&Hir::repetition(hir::Repetition {
779
0
            kind: hir::RepetitionKind::ZeroOrMore,
780
0
            greedy: false,
781
0
            hir: Box::new(Hir::any(true)),
782
0
        }))
783
0
    }
784
785
0
    fn patch(&self, from: StateID, to: StateID) {
786
0
        match self.states.borrow_mut()[from] {
787
0
            CState::Empty { ref mut next } => {
788
0
                *next = to;
789
0
            }
790
0
            CState::Range { ref mut range } => {
791
0
                range.next = to;
792
0
            }
793
            CState::Sparse { .. } => {
794
0
                panic!("cannot patch from a sparse NFA state")
795
            }
796
0
            CState::Union { ref mut alternates } => {
797
0
                alternates.push(to);
798
0
            }
799
0
            CState::UnionReverse { ref mut alternates } => {
800
0
                alternates.push(to);
801
0
            }
802
0
            CState::Match => {}
803
        }
804
0
    }
805
806
0
    fn add_empty(&self) -> StateID {
807
0
        let id = self.states.borrow().len();
808
0
        self.states.borrow_mut().push(CState::Empty { next: 0 });
809
0
        id
810
0
    }
811
812
0
    fn add_range(&self, start: u8, end: u8) -> StateID {
813
0
        let id = self.states.borrow().len();
814
0
        let trans = Transition { start, end, next: 0 };
815
0
        let state = CState::Range { range: trans };
816
0
        self.states.borrow_mut().push(state);
817
0
        id
818
0
    }
819
820
0
    fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
821
0
        if ranges.len() == 1 {
822
0
            let id = self.states.borrow().len();
823
0
            let state = CState::Range { range: ranges[0] };
824
0
            self.states.borrow_mut().push(state);
825
0
            return id;
826
0
        }
827
0
        let id = self.states.borrow().len();
828
0
        let state = CState::Sparse { ranges };
829
0
        self.states.borrow_mut().push(state);
830
0
        id
831
0
    }
832
833
0
    fn add_union(&self) -> StateID {
834
0
        let id = self.states.borrow().len();
835
0
        let state = CState::Union { alternates: vec![] };
836
0
        self.states.borrow_mut().push(state);
837
0
        id
838
0
    }
839
840
0
    fn add_reverse_union(&self) -> StateID {
841
0
        let id = self.states.borrow().len();
842
0
        let state = CState::UnionReverse { alternates: vec![] };
843
0
        self.states.borrow_mut().push(state);
844
0
        id
845
0
    }
846
847
0
    fn add_match(&self) -> StateID {
848
0
        let id = self.states.borrow().len();
849
0
        self.states.borrow_mut().push(CState::Match);
850
0
        id
851
0
    }
852
}
853
854
#[derive(Debug)]
855
struct Utf8Compiler<'a> {
856
    nfac: &'a Compiler,
857
    state: &'a mut Utf8State,
858
    target: StateID,
859
}
860
861
#[derive(Clone, Debug)]
862
struct Utf8State {
863
    compiled: Utf8BoundedMap,
864
    uncompiled: Vec<Utf8Node>,
865
}
866
867
#[derive(Clone, Debug)]
868
struct Utf8Node {
869
    trans: Vec<Transition>,
870
    last: Option<Utf8LastTransition>,
871
}
872
873
#[derive(Clone, Debug)]
874
struct Utf8LastTransition {
875
    start: u8,
876
    end: u8,
877
}
878
879
impl Utf8State {
880
0
    fn new() -> Utf8State {
881
0
        Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
882
0
    }
883
884
0
    fn clear(&mut self) {
885
0
        self.compiled.clear();
886
0
        self.uncompiled.clear();
887
0
    }
888
}
889
890
impl<'a> Utf8Compiler<'a> {
891
0
    fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
892
0
        let target = nfac.add_empty();
893
0
        state.clear();
894
0
        let mut utf8c = Utf8Compiler { nfac, state, target };
895
0
        utf8c.add_empty();
896
0
        utf8c
897
0
    }
898
899
0
    fn finish(&mut self) -> ThompsonRef {
900
0
        self.compile_from(0);
901
0
        let node = self.pop_root();
902
0
        let start = self.compile(node);
903
0
        ThompsonRef { start, end: self.target }
904
0
    }
905
906
0
    fn add(&mut self, ranges: &[Utf8Range]) {
907
0
        let prefix_len = ranges
908
0
            .iter()
909
0
            .zip(&self.state.uncompiled)
910
0
            .take_while(|&(range, node)| {
911
0
                node.last.as_ref().map_or(false, |t| {
912
0
                    (t.start, t.end) == (range.start, range.end)
913
0
                })
914
0
            })
915
0
            .count();
916
0
        assert!(prefix_len < ranges.len());
917
0
        self.compile_from(prefix_len);
918
0
        self.add_suffix(&ranges[prefix_len..]);
919
0
    }
920
921
0
    fn compile_from(&mut self, from: usize) {
922
0
        let mut next = self.target;
923
0
        while from + 1 < self.state.uncompiled.len() {
924
0
            let node = self.pop_freeze(next);
925
0
            next = self.compile(node);
926
0
        }
927
0
        self.top_last_freeze(next);
928
0
    }
929
930
0
    fn compile(&mut self, node: Vec<Transition>) -> StateID {
931
0
        let hash = self.state.compiled.hash(&node);
932
0
        if let Some(id) = self.state.compiled.get(&node, hash) {
933
0
            return id;
934
0
        }
935
0
        let id = self.nfac.add_sparse(node.clone());
936
0
        self.state.compiled.set(node, hash, id);
937
0
        id
938
0
    }
939
940
0
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
941
0
        assert!(!ranges.is_empty());
942
0
        let last = self
943
0
            .state
944
0
            .uncompiled
945
0
            .len()
946
0
            .checked_sub(1)
947
0
            .expect("non-empty nodes");
948
0
        assert!(self.state.uncompiled[last].last.is_none());
949
0
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
950
0
            start: ranges[0].start,
951
0
            end: ranges[0].end,
952
0
        });
953
0
        for r in &ranges[1..] {
954
0
            self.state.uncompiled.push(Utf8Node {
955
0
                trans: vec![],
956
0
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
957
0
            });
958
0
        }
959
0
    }
960
961
0
    fn add_empty(&mut self) {
962
0
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
963
0
    }
964
965
0
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
966
0
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
967
0
        uncompiled.set_last_transition(next);
968
0
        uncompiled.trans
969
0
    }
970
971
0
    fn pop_root(&mut self) -> Vec<Transition> {
972
0
        assert_eq!(self.state.uncompiled.len(), 1);
973
0
        assert!(self.state.uncompiled[0].last.is_none());
974
0
        self.state.uncompiled.pop().expect("non-empty nodes").trans
975
0
    }
976
977
0
    fn top_last_freeze(&mut self, next: StateID) {
978
0
        let last = self
979
0
            .state
980
0
            .uncompiled
981
0
            .len()
982
0
            .checked_sub(1)
983
0
            .expect("non-empty nodes");
984
0
        self.state.uncompiled[last].set_last_transition(next);
985
0
    }
986
}
987
988
impl Utf8Node {
989
0
    fn set_last_transition(&mut self, next: StateID) {
990
0
        if let Some(last) = self.last.take() {
991
0
            self.trans.push(Transition {
992
0
                start: last.start,
993
0
                end: last.end,
994
0
                next,
995
0
            });
996
0
        }
997
0
    }
998
}
999
1000
#[cfg(test)]
1001
mod tests {
1002
    use regex_syntax::hir::Hir;
1003
    use regex_syntax::ParserBuilder;
1004
1005
    use super::{Builder, State, StateID, Transition, NFA};
1006
1007
    fn parse(pattern: &str) -> Hir {
1008
        ParserBuilder::new().build().parse(pattern).unwrap()
1009
    }
1010
1011
    fn build(pattern: &str) -> NFA {
1012
        Builder::new().anchored(true).build(&parse(pattern)).unwrap()
1013
    }
1014
1015
    fn s_byte(byte: u8, next: StateID) -> State {
1016
        let trans = Transition { start: byte, end: byte, next };
1017
        State::Range { range: trans }
1018
    }
1019
1020
    fn s_range(start: u8, end: u8, next: StateID) -> State {
1021
        let trans = Transition { start, end, next };
1022
        State::Range { range: trans }
1023
    }
1024
1025
    fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
1026
        let ranges = ranges
1027
            .iter()
1028
            .map(|&(start, end, next)| Transition { start, end, next })
1029
            .collect();
1030
        State::Sparse { ranges }
1031
    }
1032
1033
    fn s_union(alts: &[StateID]) -> State {
1034
        State::Union { alternates: alts.to_vec().into_boxed_slice() }
1035
    }
1036
1037
    fn s_match() -> State {
1038
        State::Match
1039
    }
1040
1041
    #[test]
1042
    fn errors() {
1043
        // unsupported anchors
1044
        assert!(Builder::new().build(&parse(r"^")).is_err());
1045
        assert!(Builder::new().build(&parse(r"$")).is_err());
1046
        assert!(Builder::new().build(&parse(r"\A")).is_err());
1047
        assert!(Builder::new().build(&parse(r"\z")).is_err());
1048
1049
        // unsupported word boundaries
1050
        assert!(Builder::new().build(&parse(r"\b")).is_err());
1051
        assert!(Builder::new().build(&parse(r"\B")).is_err());
1052
        assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
1053
    }
1054
1055
    // Test that building an unanchored NFA has an appropriate `.*?` prefix.
1056
    #[test]
1057
    fn compile_unanchored_prefix() {
1058
        // When the machine can only match valid UTF-8.
1059
        let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
1060
        // There should be many states since the `.` in `.*?` matches any
1061
        // Unicode scalar value.
1062
        assert_eq!(11, nfa.len());
1063
        assert_eq!(nfa.states[10], s_match());
1064
        assert_eq!(nfa.states[9], s_byte(b'a', 10));
1065
1066
        // When the machine can match invalid UTF-8.
1067
        let nfa = Builder::new()
1068
            .anchored(false)
1069
            .allow_invalid_utf8(true)
1070
            .build(&parse(r"a"))
1071
            .unwrap();
1072
        assert_eq!(
1073
            nfa.states,
1074
            &[
1075
                s_union(&[2, 1]),
1076
                s_range(0, 255, 0),
1077
                s_byte(b'a', 3),
1078
                s_match(),
1079
            ]
1080
        );
1081
    }
1082
1083
    #[test]
1084
    fn compile_empty() {
1085
        assert_eq!(build("").states, &[s_match(),]);
1086
    }
1087
1088
    #[test]
1089
    fn compile_literal() {
1090
        assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
1091
        assert_eq!(
1092
            build("ab").states,
1093
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1094
        );
1095
        assert_eq!(
1096
            build("☃").states,
1097
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
1098
        );
1099
1100
        // Check that non-UTF-8 literals work.
1101
        let hir = ParserBuilder::new()
1102
            .allow_invalid_utf8(true)
1103
            .build()
1104
            .parse(r"(?-u)\xFF")
1105
            .unwrap();
1106
        let nfa = Builder::new()
1107
            .anchored(true)
1108
            .allow_invalid_utf8(true)
1109
            .build(&hir)
1110
            .unwrap();
1111
        assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
1112
    }
1113
1114
    #[test]
1115
    fn compile_class() {
1116
        assert_eq!(
1117
            build(r"[a-z]").states,
1118
            &[s_range(b'a', b'z', 1), s_match(),]
1119
        );
1120
        assert_eq!(
1121
            build(r"[x-za-c]").states,
1122
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
1123
        );
1124
        assert_eq!(
1125
            build(r"[\u03B1-\u03B4]").states,
1126
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
1127
        );
1128
        assert_eq!(
1129
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
1130
            &[
1131
                s_range(0xB1, 0xB4, 5),
1132
                s_range(0x99, 0x9E, 5),
1133
                s_byte(0xA4, 1),
1134
                s_byte(0x9F, 2),
1135
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1136
                s_match(),
1137
            ]
1138
        );
1139
        assert_eq!(
1140
            build(r"[a-z☃]").states,
1141
            &[
1142
                s_byte(0x83, 3),
1143
                s_byte(0x98, 0),
1144
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
1145
                s_match(),
1146
            ]
1147
        );
1148
    }
1149
1150
    #[test]
1151
    fn compile_repetition() {
1152
        assert_eq!(
1153
            build(r"a?").states,
1154
            &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
1155
        );
1156
        assert_eq!(
1157
            build(r"a??").states,
1158
            &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
1159
        );
1160
    }
1161
1162
    #[test]
1163
    fn compile_group() {
1164
        assert_eq!(
1165
            build(r"ab+").states,
1166
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
1167
        );
1168
        assert_eq!(
1169
            build(r"(ab)").states,
1170
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1171
        );
1172
        assert_eq!(
1173
            build(r"(ab)+").states,
1174
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
1175
        );
1176
    }
1177
1178
    #[test]
1179
    fn compile_alternation() {
1180
        assert_eq!(
1181
            build(r"a|b").states,
1182
            &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
1183
        );
1184
        assert_eq!(
1185
            build(r"|b").states,
1186
            &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
1187
        );
1188
        assert_eq!(
1189
            build(r"a|").states,
1190
            &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
1191
        );
1192
    }
1193
}