Coverage Report

Created: 2023-04-25 07:07

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-1.8.1/src/exec.rs
Line
Count
Source (jump to first uncovered line)
1
use std::cell::RefCell;
2
use std::collections::HashMap;
3
use std::panic::AssertUnwindSafe;
4
use std::sync::Arc;
5
6
#[cfg(feature = "perf-literal")]
7
use aho_corasick::{AhoCorasick, MatchKind};
8
use regex_syntax::hir::literal;
9
use regex_syntax::hir::{Hir, Look};
10
use regex_syntax::ParserBuilder;
11
12
use crate::backtrack;
13
use crate::compile::Compiler;
14
#[cfg(feature = "perf-dfa")]
15
use crate::dfa;
16
use crate::error::Error;
17
use crate::input::{ByteInput, CharInput};
18
use crate::literal::LiteralSearcher;
19
use crate::pikevm;
20
use crate::pool::{Pool, PoolGuard};
21
use crate::prog::Program;
22
use crate::re_builder::RegexOptions;
23
use crate::re_bytes;
24
use crate::re_set;
25
use crate::re_trait::{Locations, RegularExpression, Slot};
26
use crate::re_unicode;
27
use crate::utf8::next_utf8;
28
29
/// `Exec` manages the execution of a regular expression.
30
///
31
/// In particular, this manages the various compiled forms of a single regular
32
/// expression and the choice of which matching engine to use to execute a
33
/// regular expression.
34
0
#[derive(Debug)]
35
pub struct Exec {
36
    /// All read only state.
37
    ro: Arc<ExecReadOnly>,
38
    /// A pool of reusable values for the various matching engines.
39
    ///
40
    /// Note that boxing this value is not strictly necessary, but it is an
41
    /// easy way to ensure that T does not bloat the stack sized used by a pool
42
    /// in the case where T is big. And this turns out to be the case at the
43
    /// time of writing for regex's use of this pool. At the time of writing,
44
    /// the size of a Regex on the stack is 856 bytes. Boxing this value
45
    /// reduces that size to 16 bytes.
46
    pool: Box<Pool<ProgramCache>>,
47
}
48
49
/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50
/// means it is no longer Sync, but we can now avoid the overhead of
51
/// synchronization to fetch the cache.
52
0
#[derive(Debug)]
53
pub struct ExecNoSync<'c> {
54
    /// All read only state.
55
    ro: &'c Arc<ExecReadOnly>,
56
    /// Caches for the various matching engines.
57
    cache: PoolGuard<'c, ProgramCache>,
58
}
59
60
/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61
0
#[derive(Debug)]
62
pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63
64
/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65
/// state is determined at compile time and never changes during search.
66
0
#[derive(Debug)]
67
struct ExecReadOnly {
68
    /// The original regular expressions given by the caller to compile.
69
    res: Vec<String>,
70
    /// A compiled program that is used in the NFA simulation and backtracking.
71
    /// It can be byte-based or Unicode codepoint based.
72
    ///
73
    /// N.B. It is not possibly to make this byte-based from the public API.
74
    /// It is only used for testing byte based programs in the NFA simulations.
75
    nfa: Program,
76
    /// A compiled byte based program for DFA execution. This is only used
77
    /// if a DFA can be executed. (Currently, only word boundary assertions are
78
    /// not supported.) Note that this program contains an embedded `.*?`
79
    /// preceding the first capture group, unless the regex is anchored at the
80
    /// beginning.
81
    #[allow(dead_code)]
82
    dfa: Program,
83
    /// The same as above, except the program is reversed (and there is no
84
    /// preceding `.*?`). This is used by the DFA to find the starting location
85
    /// of matches.
86
    #[allow(dead_code)]
87
    dfa_reverse: Program,
88
    /// A set of suffix literals extracted from the regex.
89
    ///
90
    /// Prefix literals are stored on the `Program`, since they are used inside
91
    /// the matching engines.
92
    #[allow(dead_code)]
93
    suffixes: LiteralSearcher,
94
    /// An Aho-Corasick automaton with leftmost-first match semantics.
95
    ///
96
    /// This is only set when the entire regex is a simple unanchored
97
    /// alternation of literals. We could probably use it more circumstances,
98
    /// but this is already hacky enough in this architecture.
99
    ///
100
    /// N.B. We use u32 as a state ID representation under the assumption that
101
    /// if we were to exhaust the ID space, we probably would have long
102
    /// surpassed the compilation size limit.
103
    #[cfg(feature = "perf-literal")]
104
    ac: Option<AhoCorasick>,
105
    /// match_type encodes as much upfront knowledge about how we're going to
106
    /// execute a search as possible.
107
    match_type: MatchType,
108
}
109
110
/// Facilitates the construction of an executor by exposing various knobs
111
/// to control how a regex is executed and what kinds of resources it's
112
/// permitted to use.
113
// `ExecBuilder` is only public via the `internal` module, so avoid deriving
114
// `Debug`.
115
#[allow(missing_debug_implementations)]
116
pub struct ExecBuilder {
117
    options: RegexOptions,
118
    match_type: Option<MatchType>,
119
    bytes: bool,
120
    only_utf8: bool,
121
}
122
123
/// Parsed represents a set of parsed regular expressions and their detected
124
/// literals.
125
struct Parsed {
126
    exprs: Vec<Hir>,
127
    prefixes: literal::Seq,
128
    suffixes: literal::Seq,
129
    bytes: bool,
130
}
131
132
impl ExecBuilder {
133
    /// Create a regex execution builder.
134
    ///
135
    /// This uses default settings for everything except the regex itself,
136
    /// which must be provided. Further knobs can be set by calling methods,
137
    /// and then finally, `build` to actually create the executor.
138
0
    pub fn new(re: &str) -> Self {
139
0
        Self::new_many(&[re])
140
0
    }
141
142
    /// Like new, but compiles the union of the given regular expressions.
143
    ///
144
    /// Note that when compiling 2 or more regular expressions, capture groups
145
    /// are completely unsupported. (This means both `find` and `captures`
146
    /// won't work.)
147
0
    pub fn new_many<I, S>(res: I) -> Self
148
0
    where
149
0
        S: AsRef<str>,
150
0
        I: IntoIterator<Item = S>,
151
0
    {
152
0
        let mut opts = RegexOptions::default();
153
0
        opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
154
0
        Self::new_options(opts)
155
0
    }
156
157
    /// Create a regex execution builder.
158
0
    pub fn new_options(opts: RegexOptions) -> Self {
159
0
        ExecBuilder {
160
0
            options: opts,
161
0
            match_type: None,
162
0
            bytes: false,
163
0
            only_utf8: true,
164
0
        }
165
0
    }
166
167
    /// Set the matching engine to be automatically determined.
168
    ///
169
    /// This is the default state and will apply whatever optimizations are
170
    /// possible, such as running a DFA.
171
    ///
172
    /// This overrides whatever was previously set via the `nfa` or
173
    /// `bounded_backtracking` methods.
174
0
    pub fn automatic(mut self) -> Self {
175
0
        self.match_type = None;
176
0
        self
177
0
    }
178
179
    /// Sets the matching engine to use the NFA algorithm no matter what
180
    /// optimizations are possible.
181
    ///
182
    /// This overrides whatever was previously set via the `automatic` or
183
    /// `bounded_backtracking` methods.
184
0
    pub fn nfa(mut self) -> Self {
185
0
        self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
186
0
        self
187
0
    }
188
189
    /// Sets the matching engine to use a bounded backtracking engine no
190
    /// matter what optimizations are possible.
191
    ///
192
    /// One must use this with care, since the bounded backtracking engine
193
    /// uses memory proportion to `len(regex) * len(text)`.
194
    ///
195
    /// This overrides whatever was previously set via the `automatic` or
196
    /// `nfa` methods.
197
0
    pub fn bounded_backtracking(mut self) -> Self {
198
0
        self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
199
0
        self
200
0
    }
201
202
    /// Compiles byte based programs for use with the NFA matching engines.
203
    ///
204
    /// By default, the NFA engines match on Unicode scalar values. They can
205
    /// be made to use byte based programs instead. In general, the byte based
206
    /// programs are slower because of a less efficient encoding of character
207
    /// classes.
208
    ///
209
    /// Note that this does not impact DFA matching engines, which always
210
    /// execute on bytes.
211
0
    pub fn bytes(mut self, yes: bool) -> Self {
212
0
        self.bytes = yes;
213
0
        self
214
0
    }
215
216
    /// When disabled, the program compiled may match arbitrary bytes.
217
    ///
218
    /// When enabled (the default), all compiled programs exclusively match
219
    /// valid UTF-8 bytes.
220
0
    pub fn only_utf8(mut self, yes: bool) -> Self {
221
0
        self.only_utf8 = yes;
222
0
        self
223
0
    }
224
225
    /// Set the Unicode flag.
226
0
    pub fn unicode(mut self, yes: bool) -> Self {
227
0
        self.options.unicode = yes;
228
0
        self
229
0
    }
230
231
    /// Parse the current set of patterns into their AST and extract literals.
232
0
    fn parse(&self) -> Result<Parsed, Error> {
233
0
        let mut exprs = Vec::with_capacity(self.options.pats.len());
234
0
        let mut prefixes = Some(literal::Seq::empty());
235
0
        let mut suffixes = Some(literal::Seq::empty());
236
0
        let mut bytes = false;
237
0
        let is_set = self.options.pats.len() > 1;
238
        // If we're compiling a regex set and that set has any anchored
239
        // expressions, then disable all literal optimizations.
240
0
        for pat in &self.options.pats {
241
0
            let mut parser = ParserBuilder::new()
242
0
                .octal(self.options.octal)
243
0
                .case_insensitive(self.options.case_insensitive)
244
0
                .multi_line(self.options.multi_line)
245
0
                .dot_matches_new_line(self.options.dot_matches_new_line)
246
0
                .swap_greed(self.options.swap_greed)
247
0
                .ignore_whitespace(self.options.ignore_whitespace)
248
0
                .unicode(self.options.unicode)
249
0
                .utf8(self.only_utf8)
250
0
                .nest_limit(self.options.nest_limit)
251
0
                .build();
252
0
            let expr =
253
0
                parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
254
0
            let props = expr.properties();
255
0
            // This used to just check whether the HIR matched valid UTF-8
256
0
            // or not, but in regex-syntax 0.7, we changed our definition of
257
0
            // "matches valid UTF-8" to exclude zero-width matches. And in
258
0
            // particular, previously, we considered WordAsciiNegate (that
259
0
            // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
260
0
            // matcher engines were built under this assumption and fixing
261
0
            // them is not worth it with the imminent plan to switch over to
262
0
            // regex-automata. So for now, we retain the previous behavior by
263
0
            // just explicitly treating the presence of a negated ASCII word
264
0
            // boundary as forcing use to use a byte oriented automaton.
265
0
            bytes = bytes
266
0
                || !props.is_utf8()
267
0
                || props.look_set().contains(Look::WordAsciiNegate);
268
269
0
            if cfg!(feature = "perf-literal") {
270
0
                if !props.look_set_prefix().contains(Look::Start)
271
0
                    && props.look_set().contains(Look::Start)
272
0
                {
273
0
                    // Partial anchors unfortunately make it hard to use
274
0
                    // prefixes, so disable them.
275
0
                    prefixes = None;
276
0
                } else if is_set
277
0
                    && props.look_set_prefix_any().contains(Look::Start)
278
0
                {
279
0
                    // Regex sets with anchors do not go well with literal
280
0
                    // optimizations.
281
0
                    prefixes = None;
282
0
                } else if props.look_set_prefix_any().contains_word() {
283
0
                    // The new literal extractor ignores look-around while
284
0
                    // the old one refused to extract prefixes from regexes
285
0
                    // that began with a \b. These old creaky regex internals
286
0
                    // can't deal with it, so we drop it.
287
0
                    prefixes = None;
288
0
                } else if props.look_set_prefix_any().contains(Look::StartLF) {
289
0
                    // Similar to the reasoning for word boundaries, this old
290
0
                    // regex engine can't handle literal prefixes with '(?m:^)'
291
0
                    // at the beginning of a regex.
292
0
                    prefixes = None;
293
0
                }
294
295
0
                if !props.look_set_suffix().contains(Look::End)
296
0
                    && props.look_set().contains(Look::End)
297
0
                {
298
0
                    // Partial anchors unfortunately make it hard to use
299
0
                    // suffixes, so disable them.
300
0
                    suffixes = None;
301
0
                } else if is_set
302
0
                    && props.look_set_suffix_any().contains(Look::End)
303
0
                {
304
0
                    // Regex sets with anchors do not go well with literal
305
0
                    // optimizations.
306
0
                    suffixes = None;
307
0
                } else if props.look_set_suffix_any().contains_word() {
308
0
                    // See the prefix case for reasoning here.
309
0
                    suffixes = None;
310
0
                } else if props.look_set_suffix_any().contains(Look::EndLF) {
311
0
                    // See the prefix case for reasoning here.
312
0
                    suffixes = None;
313
0
                }
314
315
0
                let (mut pres, mut suffs) =
316
0
                    if prefixes.is_none() && suffixes.is_none() {
317
0
                        (literal::Seq::infinite(), literal::Seq::infinite())
318
                    } else {
319
0
                        literal_analysis(&expr)
320
                    };
321
                // These old creaky regex internals can't handle cases where
322
                // the literal sequences are exact but there are look-around
323
                // assertions. So we make sure the sequences are inexact if
324
                // there are look-around assertions anywhere. This forces the
325
                // regex engines to run instead of assuming that a literal
326
                // match implies an overall match.
327
0
                if !props.look_set().is_empty() {
328
0
                    pres.make_inexact();
329
0
                    suffs.make_inexact();
330
0
                }
331
0
                prefixes = prefixes.and_then(|mut prefixes| {
332
0
                    prefixes.union(&mut pres);
333
0
                    Some(prefixes)
334
0
                });
335
0
                suffixes = suffixes.and_then(|mut suffixes| {
336
0
                    suffixes.union(&mut suffs);
337
0
                    Some(suffixes)
338
0
                });
339
0
            }
340
0
            exprs.push(expr);
341
        }
342
0
        Ok(Parsed {
343
0
            exprs,
344
0
            prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
345
0
            suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
346
0
            bytes,
347
0
        })
348
0
    }
349
350
    /// Build an executor that can run a regular expression.
351
0
    pub fn build(self) -> Result<Exec, Error> {
352
0
        // Special case when we have no patterns to compile.
353
0
        // This can happen when compiling a regex set.
354
0
        if self.options.pats.is_empty() {
355
0
            let ro = Arc::new(ExecReadOnly {
356
0
                res: vec![],
357
0
                nfa: Program::new(),
358
0
                dfa: Program::new(),
359
0
                dfa_reverse: Program::new(),
360
0
                suffixes: LiteralSearcher::empty(),
361
0
                #[cfg(feature = "perf-literal")]
362
0
                ac: None,
363
0
                match_type: MatchType::Nothing,
364
0
            });
365
0
            let pool = ExecReadOnly::new_pool(&ro);
366
0
            return Ok(Exec { ro, pool });
367
0
        }
368
0
        let parsed = self.parse()?;
369
0
        let mut nfa = Compiler::new()
370
0
            .size_limit(self.options.size_limit)
371
0
            .bytes(self.bytes || parsed.bytes)
372
0
            .only_utf8(self.only_utf8)
373
0
            .compile(&parsed.exprs)?;
374
0
        let mut dfa = Compiler::new()
375
0
            .size_limit(self.options.size_limit)
376
0
            .dfa(true)
377
0
            .only_utf8(self.only_utf8)
378
0
            .compile(&parsed.exprs)?;
379
0
        let mut dfa_reverse = Compiler::new()
380
0
            .size_limit(self.options.size_limit)
381
0
            .dfa(true)
382
0
            .only_utf8(self.only_utf8)
383
0
            .reverse(true)
384
0
            .compile(&parsed.exprs)?;
385
386
        #[cfg(feature = "perf-literal")]
387
0
        let ac = self.build_aho_corasick(&parsed);
388
0
        nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
389
0
        dfa.prefixes = nfa.prefixes.clone();
390
0
        dfa.dfa_size_limit = self.options.dfa_size_limit;
391
0
        dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
392
0
393
0
        let mut ro = ExecReadOnly {
394
0
            res: self.options.pats,
395
0
            nfa,
396
0
            dfa,
397
0
            dfa_reverse,
398
0
            suffixes: LiteralSearcher::suffixes(parsed.suffixes),
399
0
            #[cfg(feature = "perf-literal")]
400
0
            ac,
401
0
            match_type: MatchType::Nothing,
402
0
        };
403
0
        ro.match_type = ro.choose_match_type(self.match_type);
404
0
405
0
        let ro = Arc::new(ro);
406
0
        let pool = ExecReadOnly::new_pool(&ro);
407
0
        Ok(Exec { ro, pool })
408
0
    }
409
410
    #[cfg(feature = "perf-literal")]
411
0
    fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
412
0
        if parsed.exprs.len() != 1 {
413
0
            return None;
414
0
        }
415
0
        let lits = match alternation_literals(&parsed.exprs[0]) {
416
0
            None => return None,
417
0
            Some(lits) => lits,
418
0
        };
419
0
        // If we have a small number of literals, then let Teddy handle
420
0
        // things (see literal/mod.rs).
421
0
        if lits.len() <= 32 {
422
0
            return None;
423
0
        }
424
0
        Some(
425
0
            AhoCorasick::builder()
426
0
                .match_kind(MatchKind::LeftmostFirst)
427
0
                .build(&lits)
428
0
                // This should never happen because we'd long exceed the
429
0
                // compilation limit for regexes first.
430
0
                .expect("AC automaton too big"),
431
0
        )
432
0
    }
433
}
434
435
impl<'c> RegularExpression for ExecNoSyncStr<'c> {
436
    type Text = str;
437
438
0
    fn slots_len(&self) -> usize {
439
0
        self.0.slots_len()
440
0
    }
441
442
0
    fn next_after_empty(&self, text: &str, i: usize) -> usize {
443
0
        next_utf8(text.as_bytes(), i)
444
0
    }
445
446
    #[cfg_attr(feature = "perf-inline", inline(always))]
447
0
    fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
448
0
        self.0.shortest_match_at(text.as_bytes(), start)
449
0
    }
450
451
    #[cfg_attr(feature = "perf-inline", inline(always))]
452
0
    fn is_match_at(&self, text: &str, start: usize) -> bool {
453
0
        self.0.is_match_at(text.as_bytes(), start)
454
0
    }
455
456
    #[cfg_attr(feature = "perf-inline", inline(always))]
457
0
    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
458
0
        self.0.find_at(text.as_bytes(), start)
459
0
    }
460
461
    #[cfg_attr(feature = "perf-inline", inline(always))]
462
0
    fn captures_read_at(
463
0
        &self,
464
0
        locs: &mut Locations,
465
0
        text: &str,
466
0
        start: usize,
467
0
    ) -> Option<(usize, usize)> {
468
0
        self.0.captures_read_at(locs, text.as_bytes(), start)
469
0
    }
470
}
471
472
impl<'c> RegularExpression for ExecNoSync<'c> {
473
    type Text = [u8];
474
475
    /// Returns the number of capture slots in the regular expression. (There
476
    /// are two slots for every capture group, corresponding to possibly empty
477
    /// start and end locations of the capture.)
478
0
    fn slots_len(&self) -> usize {
479
0
        self.ro.nfa.captures.len() * 2
480
0
    }
481
482
0
    fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
483
0
        i + 1
484
0
    }
485
486
    /// Returns the end of a match location, possibly occurring before the
487
    /// end location of the correct leftmost-first match.
488
    #[cfg_attr(feature = "perf-inline", inline(always))]
489
0
    fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
490
0
        if !self.is_anchor_end_match(text) {
491
0
            return None;
492
0
        }
493
0
        match self.ro.match_type {
494
            #[cfg(feature = "perf-literal")]
495
0
            MatchType::Literal(ty) => {
496
0
                self.find_literals(ty, text, start).map(|(_, e)| e)
497
            }
498
            #[cfg(feature = "perf-dfa")]
499
            MatchType::Dfa | MatchType::DfaMany => {
500
0
                match self.shortest_dfa(text, start) {
501
0
                    dfa::Result::Match(end) => Some(end),
502
0
                    dfa::Result::NoMatch(_) => None,
503
0
                    dfa::Result::Quit => self.shortest_nfa(text, start),
504
                }
505
            }
506
            #[cfg(feature = "perf-dfa")]
507
            MatchType::DfaAnchoredReverse => {
508
0
                match dfa::Fsm::reverse(
509
0
                    &self.ro.dfa_reverse,
510
0
                    self.cache.value(),
511
0
                    true,
512
0
                    &text[start..],
513
0
                    text.len() - start,
514
0
                ) {
515
0
                    dfa::Result::Match(_) => Some(text.len()),
516
0
                    dfa::Result::NoMatch(_) => None,
517
0
                    dfa::Result::Quit => self.shortest_nfa(text, start),
518
                }
519
            }
520
            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
521
            MatchType::DfaSuffix => {
522
0
                match self.shortest_dfa_reverse_suffix(text, start) {
523
0
                    dfa::Result::Match(e) => Some(e),
524
0
                    dfa::Result::NoMatch(_) => None,
525
0
                    dfa::Result::Quit => self.shortest_nfa(text, start),
526
                }
527
            }
528
0
            MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
529
0
            MatchType::Nothing => None,
530
        }
531
0
    }
532
533
    /// Returns true if and only if the regex matches text.
534
    ///
535
    /// For single regular expressions, this is equivalent to calling
536
    /// shortest_match(...).is_some().
537
    #[cfg_attr(feature = "perf-inline", inline(always))]
538
0
    fn is_match_at(&self, text: &[u8], start: usize) -> bool {
539
0
        if !self.is_anchor_end_match(text) {
540
0
            return false;
541
0
        }
542
0
        // We need to do this dance because shortest_match relies on the NFA
543
0
        // filling in captures[1], but a RegexSet has no captures. In other
544
0
        // words, a RegexSet can't (currently) use shortest_match. ---AG
545
0
        match self.ro.match_type {
546
            #[cfg(feature = "perf-literal")]
547
0
            MatchType::Literal(ty) => {
548
0
                self.find_literals(ty, text, start).is_some()
549
            }
550
            #[cfg(feature = "perf-dfa")]
551
            MatchType::Dfa | MatchType::DfaMany => {
552
0
                match self.shortest_dfa(text, start) {
553
0
                    dfa::Result::Match(_) => true,
554
0
                    dfa::Result::NoMatch(_) => false,
555
0
                    dfa::Result::Quit => self.match_nfa(text, start),
556
                }
557
            }
558
            #[cfg(feature = "perf-dfa")]
559
            MatchType::DfaAnchoredReverse => {
560
0
                match dfa::Fsm::reverse(
561
0
                    &self.ro.dfa_reverse,
562
0
                    self.cache.value(),
563
0
                    true,
564
0
                    &text[start..],
565
0
                    text.len() - start,
566
0
                ) {
567
0
                    dfa::Result::Match(_) => true,
568
0
                    dfa::Result::NoMatch(_) => false,
569
0
                    dfa::Result::Quit => self.match_nfa(text, start),
570
                }
571
            }
572
            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
573
            MatchType::DfaSuffix => {
574
0
                match self.shortest_dfa_reverse_suffix(text, start) {
575
0
                    dfa::Result::Match(_) => true,
576
0
                    dfa::Result::NoMatch(_) => false,
577
0
                    dfa::Result::Quit => self.match_nfa(text, start),
578
                }
579
            }
580
0
            MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
581
0
            MatchType::Nothing => false,
582
        }
583
0
    }
584
585
    /// Finds the start and end location of the leftmost-first match, starting
586
    /// at the given location.
587
    #[cfg_attr(feature = "perf-inline", inline(always))]
588
0
    fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
589
0
        if !self.is_anchor_end_match(text) {
590
0
            return None;
591
0
        }
592
0
        match self.ro.match_type {
593
            #[cfg(feature = "perf-literal")]
594
0
            MatchType::Literal(ty) => self.find_literals(ty, text, start),
595
            #[cfg(feature = "perf-dfa")]
596
0
            MatchType::Dfa => match self.find_dfa_forward(text, start) {
597
0
                dfa::Result::Match((s, e)) => Some((s, e)),
598
0
                dfa::Result::NoMatch(_) => None,
599
                dfa::Result::Quit => {
600
0
                    self.find_nfa(MatchNfaType::Auto, text, start)
601
                }
602
            },
603
            #[cfg(feature = "perf-dfa")]
604
            MatchType::DfaAnchoredReverse => {
605
0
                match self.find_dfa_anchored_reverse(text, start) {
606
0
                    dfa::Result::Match((s, e)) => Some((s, e)),
607
0
                    dfa::Result::NoMatch(_) => None,
608
                    dfa::Result::Quit => {
609
0
                        self.find_nfa(MatchNfaType::Auto, text, start)
610
                    }
611
                }
612
            }
613
            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
614
            MatchType::DfaSuffix => {
615
0
                match self.find_dfa_reverse_suffix(text, start) {
616
0
                    dfa::Result::Match((s, e)) => Some((s, e)),
617
0
                    dfa::Result::NoMatch(_) => None,
618
                    dfa::Result::Quit => {
619
0
                        self.find_nfa(MatchNfaType::Auto, text, start)
620
                    }
621
                }
622
            }
623
0
            MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
624
0
            MatchType::Nothing => None,
625
            #[cfg(feature = "perf-dfa")]
626
            MatchType::DfaMany => {
627
0
                unreachable!("BUG: RegexSet cannot be used with find")
628
            }
629
        }
630
0
    }
631
632
    /// Finds the start and end location of the leftmost-first match and also
633
    /// fills in all matching capture groups.
634
    ///
635
    /// The number of capture slots given should be equal to the total number
636
    /// of capture slots in the compiled program.
637
    ///
638
    /// Note that the first two slots always correspond to the start and end
639
    /// locations of the overall match.
640
0
    fn captures_read_at(
641
0
        &self,
642
0
        locs: &mut Locations,
643
0
        text: &[u8],
644
0
        start: usize,
645
0
    ) -> Option<(usize, usize)> {
646
0
        let slots = locs.as_slots();
647
0
        for slot in slots.iter_mut() {
648
0
            *slot = None;
649
0
        }
650
        // If the caller unnecessarily uses this, then we try to save them
651
        // from themselves.
652
0
        match slots.len() {
653
0
            0 => return self.find_at(text, start),
654
            2 => {
655
0
                return self.find_at(text, start).map(|(s, e)| {
656
0
                    slots[0] = Some(s);
657
0
                    slots[1] = Some(e);
658
0
                    (s, e)
659
0
                });
660
            }
661
0
            _ => {} // fallthrough
662
0
        }
663
0
        if !self.is_anchor_end_match(text) {
664
0
            return None;
665
0
        }
666
0
        match self.ro.match_type {
667
            #[cfg(feature = "perf-literal")]
668
0
            MatchType::Literal(ty) => {
669
0
                self.find_literals(ty, text, start).and_then(|(s, e)| {
670
0
                    self.captures_nfa_type(
671
0
                        MatchNfaType::Auto,
672
0
                        slots,
673
0
                        text,
674
0
                        s,
675
0
                        e,
676
0
                    )
677
0
                })
678
            }
679
            #[cfg(feature = "perf-dfa")]
680
            MatchType::Dfa => {
681
0
                if self.ro.nfa.is_anchored_start {
682
0
                    self.captures_nfa(slots, text, start)
683
                } else {
684
0
                    match self.find_dfa_forward(text, start) {
685
0
                        dfa::Result::Match((s, e)) => self.captures_nfa_type(
686
0
                            MatchNfaType::Auto,
687
0
                            slots,
688
0
                            text,
689
0
                            s,
690
0
                            e,
691
0
                        ),
692
0
                        dfa::Result::NoMatch(_) => None,
693
                        dfa::Result::Quit => {
694
0
                            self.captures_nfa(slots, text, start)
695
                        }
696
                    }
697
                }
698
            }
699
            #[cfg(feature = "perf-dfa")]
700
            MatchType::DfaAnchoredReverse => {
701
0
                match self.find_dfa_anchored_reverse(text, start) {
702
0
                    dfa::Result::Match((s, e)) => self.captures_nfa_type(
703
0
                        MatchNfaType::Auto,
704
0
                        slots,
705
0
                        text,
706
0
                        s,
707
0
                        e,
708
0
                    ),
709
0
                    dfa::Result::NoMatch(_) => None,
710
0
                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
711
                }
712
            }
713
            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
714
            MatchType::DfaSuffix => {
715
0
                match self.find_dfa_reverse_suffix(text, start) {
716
0
                    dfa::Result::Match((s, e)) => self.captures_nfa_type(
717
0
                        MatchNfaType::Auto,
718
0
                        slots,
719
0
                        text,
720
0
                        s,
721
0
                        e,
722
0
                    ),
723
0
                    dfa::Result::NoMatch(_) => None,
724
0
                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
725
                }
726
            }
727
0
            MatchType::Nfa(ty) => {
728
0
                self.captures_nfa_type(ty, slots, text, start, text.len())
729
            }
730
0
            MatchType::Nothing => None,
731
            #[cfg(feature = "perf-dfa")]
732
            MatchType::DfaMany => {
733
0
                unreachable!("BUG: RegexSet cannot be used with captures")
734
            }
735
        }
736
0
    }
737
}
738
739
impl<'c> ExecNoSync<'c> {
740
    /// Finds the leftmost-first match using only literal search.
741
    #[cfg(feature = "perf-literal")]
742
    #[cfg_attr(feature = "perf-inline", inline(always))]
743
0
    fn find_literals(
744
0
        &self,
745
0
        ty: MatchLiteralType,
746
0
        text: &[u8],
747
0
        start: usize,
748
0
    ) -> Option<(usize, usize)> {
749
0
        use self::MatchLiteralType::*;
750
0
        match ty {
751
            Unanchored => {
752
0
                let lits = &self.ro.nfa.prefixes;
753
0
                lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
754
            }
755
            AnchoredStart => {
756
0
                let lits = &self.ro.nfa.prefixes;
757
0
                if start == 0 || !self.ro.nfa.is_anchored_start {
758
0
                    lits.find_start(&text[start..])
759
0
                        .map(|(s, e)| (start + s, start + e))
760
                } else {
761
0
                    None
762
                }
763
            }
764
            AnchoredEnd => {
765
0
                let lits = &self.ro.suffixes;
766
0
                lits.find_end(&text[start..])
767
0
                    .map(|(s, e)| (start + s, start + e))
768
            }
769
0
            AhoCorasick => self
770
0
                .ro
771
0
                .ac
772
0
                .as_ref()
773
0
                .unwrap()
774
0
                .find(&text[start..])
775
0
                .map(|m| (start + m.start(), start + m.end())),
776
        }
777
0
    }
778
779
    /// Finds the leftmost-first match (start and end) using only the DFA.
780
    ///
781
    /// If the result returned indicates that the DFA quit, then another
782
    /// matching engine should be used.
783
    #[cfg(feature = "perf-dfa")]
784
    #[cfg_attr(feature = "perf-inline", inline(always))]
785
0
    fn find_dfa_forward(
786
0
        &self,
787
0
        text: &[u8],
788
0
        start: usize,
789
0
    ) -> dfa::Result<(usize, usize)> {
790
        use crate::dfa::Result::*;
791
0
        let end = match dfa::Fsm::forward(
792
0
            &self.ro.dfa,
793
0
            self.cache.value(),
794
0
            false,
795
0
            text,
796
0
            start,
797
        ) {
798
0
            NoMatch(i) => return NoMatch(i),
799
0
            Quit => return Quit,
800
0
            Match(end) if start == end => return Match((start, start)),
801
0
            Match(end) => end,
802
0
        };
803
0
        // Now run the DFA in reverse to find the start of the match.
804
0
        match dfa::Fsm::reverse(
805
0
            &self.ro.dfa_reverse,
806
0
            self.cache.value(),
807
0
            false,
808
0
            &text[start..],
809
0
            end - start,
810
0
        ) {
811
0
            Match(s) => Match((start + s, end)),
812
0
            NoMatch(i) => NoMatch(i),
813
0
            Quit => Quit,
814
        }
815
0
    }
816
817
    /// Finds the leftmost-first match (start and end) using only the DFA,
818
    /// but assumes the regex is anchored at the end and therefore starts at
819
    /// the end of the regex and matches in reverse.
820
    ///
821
    /// If the result returned indicates that the DFA quit, then another
822
    /// matching engine should be used.
823
    #[cfg(feature = "perf-dfa")]
824
    #[cfg_attr(feature = "perf-inline", inline(always))]
825
0
    fn find_dfa_anchored_reverse(
826
0
        &self,
827
0
        text: &[u8],
828
0
        start: usize,
829
0
    ) -> dfa::Result<(usize, usize)> {
830
0
        use crate::dfa::Result::*;
831
0
        match dfa::Fsm::reverse(
832
0
            &self.ro.dfa_reverse,
833
0
            self.cache.value(),
834
0
            false,
835
0
            &text[start..],
836
0
            text.len() - start,
837
0
        ) {
838
0
            Match(s) => Match((start + s, text.len())),
839
0
            NoMatch(i) => NoMatch(i),
840
0
            Quit => Quit,
841
        }
842
0
    }
843
844
    /// Finds the end of the shortest match using only the DFA.
845
    #[cfg(feature = "perf-dfa")]
846
    #[cfg_attr(feature = "perf-inline", inline(always))]
847
0
    fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
848
0
        dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
849
0
    }
850
851
    /// Finds the end of the shortest match using only the DFA by scanning for
852
    /// suffix literals.
853
    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
854
    #[cfg_attr(feature = "perf-inline", inline(always))]
855
0
    fn shortest_dfa_reverse_suffix(
856
0
        &self,
857
0
        text: &[u8],
858
0
        start: usize,
859
0
    ) -> dfa::Result<usize> {
860
0
        match self.exec_dfa_reverse_suffix(text, start) {
861
0
            None => self.shortest_dfa(text, start),
862
0
            Some(r) => r.map(|(_, end)| end),
863
        }
864
0
    }
865
866
    /// Finds the end of the shortest match using only the DFA by scanning for
867
    /// suffix literals. It also reports the start of the match.
868
    ///
869
    /// Note that if None is returned, then the optimization gave up to avoid
870
    /// worst case quadratic behavior. A forward scanning DFA should be tried
871
    /// next.
872
    ///
873
    /// If a match is returned and the full leftmost-first match is desired,
874
    /// then a forward scan starting from the beginning of the match must be
875
    /// done.
876
    ///
877
    /// If the result returned indicates that the DFA quit, then another
878
    /// matching engine should be used.
879
    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
880
    #[cfg_attr(feature = "perf-inline", inline(always))]
881
0
    fn exec_dfa_reverse_suffix(
882
0
        &self,
883
0
        text: &[u8],
884
0
        original_start: usize,
885
0
    ) -> Option<dfa::Result<(usize, usize)>> {
886
0
        use crate::dfa::Result::*;
887
0
888
0
        let lcs = self.ro.suffixes.lcs();
889
0
        debug_assert!(lcs.len() >= 1);
890
0
        let mut start = original_start;
891
0
        let mut end = start;
892
0
        let mut last_literal = start;
893
0
        while end <= text.len() {
894
0
            last_literal += match lcs.find(&text[last_literal..]) {
895
0
                None => return Some(NoMatch(text.len())),
896
0
                Some(i) => i,
897
0
            };
898
0
            end = last_literal + lcs.len();
899
0
            match dfa::Fsm::reverse(
900
0
                &self.ro.dfa_reverse,
901
0
                self.cache.value(),
902
0
                false,
903
0
                &text[start..end],
904
0
                end - start,
905
0
            ) {
906
0
                Match(0) | NoMatch(0) => return None,
907
0
                Match(i) => return Some(Match((start + i, end))),
908
0
                NoMatch(i) => {
909
0
                    start += i;
910
0
                    last_literal += 1;
911
0
                    continue;
912
                }
913
0
                Quit => return Some(Quit),
914
            };
915
        }
916
0
        Some(NoMatch(text.len()))
917
0
    }
918
919
    /// Finds the leftmost-first match (start and end) using only the DFA
920
    /// by scanning for suffix literals.
921
    ///
922
    /// If the result returned indicates that the DFA quit, then another
923
    /// matching engine should be used.
924
    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
925
    #[cfg_attr(feature = "perf-inline", inline(always))]
926
0
    fn find_dfa_reverse_suffix(
927
0
        &self,
928
0
        text: &[u8],
929
0
        start: usize,
930
0
    ) -> dfa::Result<(usize, usize)> {
931
        use crate::dfa::Result::*;
932
933
0
        let match_start = match self.exec_dfa_reverse_suffix(text, start) {
934
0
            None => return self.find_dfa_forward(text, start),
935
0
            Some(Match((start, _))) => start,
936
0
            Some(r) => return r,
937
        };
938
        // At this point, we've found a match. The only way to quit now
939
        // without a match is if the DFA gives up (seems unlikely).
940
        //
941
        // Now run the DFA forwards to find the proper end of the match.
942
        // (The suffix literal match can only indicate the earliest
943
        // possible end location, which may appear before the end of the
944
        // leftmost-first match.)
945
0
        match dfa::Fsm::forward(
946
0
            &self.ro.dfa,
947
0
            self.cache.value(),
948
0
            false,
949
0
            text,
950
0
            match_start,
951
0
        ) {
952
0
            NoMatch(_) => panic!("BUG: reverse match implies forward match"),
953
0
            Quit => Quit,
954
0
            Match(e) => Match((match_start, e)),
955
        }
956
0
    }
957
958
    /// Executes the NFA engine to return whether there is a match or not.
959
    ///
960
    /// Ideally, we could use shortest_nfa(...).is_some() and get the same
961
    /// performance characteristics, but regex sets don't have captures, which
962
    /// shortest_nfa depends on.
963
    #[cfg(feature = "perf-dfa")]
964
0
    fn match_nfa(&self, text: &[u8], start: usize) -> bool {
965
0
        self.match_nfa_type(MatchNfaType::Auto, text, start)
966
0
    }
967
968
    /// Like match_nfa, but allows specification of the type of NFA engine.
969
0
    fn match_nfa_type(
970
0
        &self,
971
0
        ty: MatchNfaType,
972
0
        text: &[u8],
973
0
        start: usize,
974
0
    ) -> bool {
975
0
        self.exec_nfa(
976
0
            ty,
977
0
            &mut [false],
978
0
            &mut [],
979
0
            true,
980
0
            false,
981
0
            text,
982
0
            start,
983
0
            text.len(),
984
0
        )
985
0
    }
986
987
    /// Finds the shortest match using an NFA.
988
    #[cfg(feature = "perf-dfa")]
989
0
    fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
990
0
        self.shortest_nfa_type(MatchNfaType::Auto, text, start)
991
0
    }
992
993
    /// Like shortest_nfa, but allows specification of the type of NFA engine.
994
0
    fn shortest_nfa_type(
995
0
        &self,
996
0
        ty: MatchNfaType,
997
0
        text: &[u8],
998
0
        start: usize,
999
0
    ) -> Option<usize> {
1000
0
        let mut slots = [None, None];
1001
0
        if self.exec_nfa(
1002
0
            ty,
1003
0
            &mut [false],
1004
0
            &mut slots,
1005
0
            true,
1006
0
            true,
1007
0
            text,
1008
0
            start,
1009
0
            text.len(),
1010
0
        ) {
1011
0
            slots[1]
1012
        } else {
1013
0
            None
1014
        }
1015
0
    }
1016
1017
    /// Like find, but executes an NFA engine.
1018
0
    fn find_nfa(
1019
0
        &self,
1020
0
        ty: MatchNfaType,
1021
0
        text: &[u8],
1022
0
        start: usize,
1023
0
    ) -> Option<(usize, usize)> {
1024
0
        let mut slots = [None, None];
1025
0
        if self.exec_nfa(
1026
0
            ty,
1027
0
            &mut [false],
1028
0
            &mut slots,
1029
0
            false,
1030
0
            false,
1031
0
            text,
1032
0
            start,
1033
0
            text.len(),
1034
0
        ) {
1035
0
            match (slots[0], slots[1]) {
1036
0
                (Some(s), Some(e)) => Some((s, e)),
1037
0
                _ => None,
1038
            }
1039
        } else {
1040
0
            None
1041
        }
1042
0
    }
1043
1044
    /// Like find_nfa, but fills in captures.
1045
    ///
1046
    /// `slots` should have length equal to `2 * nfa.captures.len()`.
1047
    #[cfg(feature = "perf-dfa")]
1048
0
    fn captures_nfa(
1049
0
        &self,
1050
0
        slots: &mut [Slot],
1051
0
        text: &[u8],
1052
0
        start: usize,
1053
0
    ) -> Option<(usize, usize)> {
1054
0
        self.captures_nfa_type(
1055
0
            MatchNfaType::Auto,
1056
0
            slots,
1057
0
            text,
1058
0
            start,
1059
0
            text.len(),
1060
0
        )
1061
0
    }
1062
1063
    /// Like captures_nfa, but allows specification of type of NFA engine.
1064
0
    fn captures_nfa_type(
1065
0
        &self,
1066
0
        ty: MatchNfaType,
1067
0
        slots: &mut [Slot],
1068
0
        text: &[u8],
1069
0
        start: usize,
1070
0
        end: usize,
1071
0
    ) -> Option<(usize, usize)> {
1072
0
        if self.exec_nfa(
1073
0
            ty,
1074
0
            &mut [false],
1075
0
            slots,
1076
0
            false,
1077
0
            false,
1078
0
            text,
1079
0
            start,
1080
0
            end,
1081
0
        ) {
1082
0
            match (slots[0], slots[1]) {
1083
0
                (Some(s), Some(e)) => Some((s, e)),
1084
0
                _ => None,
1085
            }
1086
        } else {
1087
0
            None
1088
        }
1089
0
    }
1090
1091
0
    fn exec_nfa(
1092
0
        &self,
1093
0
        mut ty: MatchNfaType,
1094
0
        matches: &mut [bool],
1095
0
        slots: &mut [Slot],
1096
0
        quit_after_match: bool,
1097
0
        quit_after_match_with_pos: bool,
1098
0
        text: &[u8],
1099
0
        start: usize,
1100
0
        end: usize,
1101
0
    ) -> bool {
1102
0
        use self::MatchNfaType::*;
1103
0
        if let Auto = ty {
1104
0
            if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1105
0
                ty = Backtrack;
1106
0
            } else {
1107
0
                ty = PikeVM;
1108
0
            }
1109
0
        }
1110
        // The backtracker can't return the shortest match position as it is
1111
        // implemented today. So if someone calls `shortest_match` and we need
1112
        // to run an NFA, then use the PikeVM.
1113
0
        if quit_after_match_with_pos || ty == PikeVM {
1114
0
            self.exec_pikevm(
1115
0
                matches,
1116
0
                slots,
1117
0
                quit_after_match,
1118
0
                text,
1119
0
                start,
1120
0
                end,
1121
0
            )
1122
        } else {
1123
0
            self.exec_backtrack(matches, slots, text, start, end)
1124
        }
1125
0
    }
1126
1127
    /// Always run the NFA algorithm.
1128
0
    fn exec_pikevm(
1129
0
        &self,
1130
0
        matches: &mut [bool],
1131
0
        slots: &mut [Slot],
1132
0
        quit_after_match: bool,
1133
0
        text: &[u8],
1134
0
        start: usize,
1135
0
        end: usize,
1136
0
    ) -> bool {
1137
0
        if self.ro.nfa.uses_bytes() {
1138
0
            pikevm::Fsm::exec(
1139
0
                &self.ro.nfa,
1140
0
                self.cache.value(),
1141
0
                matches,
1142
0
                slots,
1143
0
                quit_after_match,
1144
0
                ByteInput::new(text, self.ro.nfa.only_utf8),
1145
0
                start,
1146
0
                end,
1147
0
            )
1148
        } else {
1149
0
            pikevm::Fsm::exec(
1150
0
                &self.ro.nfa,
1151
0
                self.cache.value(),
1152
0
                matches,
1153
0
                slots,
1154
0
                quit_after_match,
1155
0
                CharInput::new(text),
1156
0
                start,
1157
0
                end,
1158
0
            )
1159
        }
1160
0
    }
1161
1162
    /// Always runs the NFA using bounded backtracking.
1163
0
    fn exec_backtrack(
1164
0
        &self,
1165
0
        matches: &mut [bool],
1166
0
        slots: &mut [Slot],
1167
0
        text: &[u8],
1168
0
        start: usize,
1169
0
        end: usize,
1170
0
    ) -> bool {
1171
0
        if self.ro.nfa.uses_bytes() {
1172
0
            backtrack::Bounded::exec(
1173
0
                &self.ro.nfa,
1174
0
                self.cache.value(),
1175
0
                matches,
1176
0
                slots,
1177
0
                ByteInput::new(text, self.ro.nfa.only_utf8),
1178
0
                start,
1179
0
                end,
1180
0
            )
1181
        } else {
1182
0
            backtrack::Bounded::exec(
1183
0
                &self.ro.nfa,
1184
0
                self.cache.value(),
1185
0
                matches,
1186
0
                slots,
1187
0
                CharInput::new(text),
1188
0
                start,
1189
0
                end,
1190
0
            )
1191
        }
1192
0
    }
1193
1194
    /// Finds which regular expressions match the given text.
1195
    ///
1196
    /// `matches` should have length equal to the number of regexes being
1197
    /// searched.
1198
    ///
1199
    /// This is only useful when one wants to know which regexes in a set
1200
    /// match some text.
1201
0
    pub fn many_matches_at(
1202
0
        &self,
1203
0
        matches: &mut [bool],
1204
0
        text: &[u8],
1205
0
        start: usize,
1206
0
    ) -> bool {
1207
0
        use self::MatchType::*;
1208
0
        if !self.is_anchor_end_match(text) {
1209
0
            return false;
1210
0
        }
1211
0
        match self.ro.match_type {
1212
            #[cfg(feature = "perf-literal")]
1213
0
            Literal(ty) => {
1214
0
                debug_assert_eq!(matches.len(), 1);
1215
0
                matches[0] = self.find_literals(ty, text, start).is_some();
1216
0
                matches[0]
1217
            }
1218
            #[cfg(feature = "perf-dfa")]
1219
            Dfa | DfaAnchoredReverse | DfaMany => {
1220
0
                match dfa::Fsm::forward_many(
1221
0
                    &self.ro.dfa,
1222
0
                    self.cache.value(),
1223
0
                    matches,
1224
0
                    text,
1225
0
                    start,
1226
0
                ) {
1227
0
                    dfa::Result::Match(_) => true,
1228
0
                    dfa::Result::NoMatch(_) => false,
1229
0
                    dfa::Result::Quit => self.exec_nfa(
1230
0
                        MatchNfaType::Auto,
1231
0
                        matches,
1232
0
                        &mut [],
1233
0
                        false,
1234
0
                        false,
1235
0
                        text,
1236
0
                        start,
1237
0
                        text.len(),
1238
0
                    ),
1239
                }
1240
            }
1241
            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1242
            DfaSuffix => {
1243
0
                match dfa::Fsm::forward_many(
1244
0
                    &self.ro.dfa,
1245
0
                    self.cache.value(),
1246
0
                    matches,
1247
0
                    text,
1248
0
                    start,
1249
0
                ) {
1250
0
                    dfa::Result::Match(_) => true,
1251
0
                    dfa::Result::NoMatch(_) => false,
1252
0
                    dfa::Result::Quit => self.exec_nfa(
1253
0
                        MatchNfaType::Auto,
1254
0
                        matches,
1255
0
                        &mut [],
1256
0
                        false,
1257
0
                        false,
1258
0
                        text,
1259
0
                        start,
1260
0
                        text.len(),
1261
0
                    ),
1262
                }
1263
            }
1264
0
            Nfa(ty) => self.exec_nfa(
1265
0
                ty,
1266
0
                matches,
1267
0
                &mut [],
1268
0
                false,
1269
0
                false,
1270
0
                text,
1271
0
                start,
1272
0
                text.len(),
1273
0
            ),
1274
0
            Nothing => false,
1275
        }
1276
0
    }
1277
1278
    #[cfg_attr(feature = "perf-inline", inline(always))]
1279
0
    fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1280
0
        #[cfg(not(feature = "perf-literal"))]
1281
0
        fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1282
0
            true
1283
0
        }
1284
0
1285
0
        #[cfg(feature = "perf-literal")]
1286
0
        fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1287
0
            // Only do this check if the haystack is big (>1MB).
1288
0
            if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1289
0
                let lcs = ro.suffixes.lcs();
1290
0
                if lcs.len() >= 1 && !lcs.is_suffix(text) {
1291
0
                    return false;
1292
0
                }
1293
0
            }
1294
0
            true
1295
0
        }
1296
0
1297
0
        imp(&self.ro, text)
1298
0
    }
1299
1300
0
    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1301
0
        &self.ro.nfa.capture_name_idx
1302
0
    }
1303
}
1304
1305
impl<'c> ExecNoSyncStr<'c> {
1306
0
    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1307
0
        self.0.capture_name_idx()
1308
0
    }
1309
}
1310
1311
impl Exec {
1312
    /// Get a searcher that isn't Sync.
1313
    #[cfg_attr(feature = "perf-inline", inline(always))]
1314
0
    pub fn searcher(&self) -> ExecNoSync<'_> {
1315
0
        ExecNoSync {
1316
0
            ro: &self.ro, // a clone is too expensive here! (and not needed)
1317
0
            cache: self.pool.get(),
1318
0
        }
1319
0
    }
1320
1321
    /// Get a searcher that isn't Sync and can match on &str.
1322
    #[cfg_attr(feature = "perf-inline", inline(always))]
1323
0
    pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
1324
0
        ExecNoSyncStr(self.searcher())
1325
0
    }
1326
1327
    /// Build a Regex from this executor.
1328
0
    pub fn into_regex(self) -> re_unicode::Regex {
1329
0
        re_unicode::Regex::from(self)
1330
0
    }
1331
1332
    /// Build a RegexSet from this executor.
1333
0
    pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1334
0
        re_set::unicode::RegexSet::from(self)
1335
0
    }
1336
1337
    /// Build a Regex from this executor that can match arbitrary bytes.
1338
0
    pub fn into_byte_regex(self) -> re_bytes::Regex {
1339
0
        re_bytes::Regex::from(self)
1340
0
    }
1341
1342
    /// Build a RegexSet from this executor that can match arbitrary bytes.
1343
0
    pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1344
0
        re_set::bytes::RegexSet::from(self)
1345
0
    }
1346
1347
    /// The original regular expressions given by the caller that were
1348
    /// compiled.
1349
0
    pub fn regex_strings(&self) -> &[String] {
1350
0
        &self.ro.res
1351
0
    }
1352
1353
    /// Return a slice of capture names.
1354
    ///
1355
    /// Any capture that isn't named is None.
1356
0
    pub fn capture_names(&self) -> &[Option<String>] {
1357
0
        &self.ro.nfa.captures
1358
0
    }
1359
1360
    /// Return a reference to named groups mapping (from group name to
1361
    /// group position).
1362
0
    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1363
0
        &self.ro.nfa.capture_name_idx
1364
0
    }
1365
1366
    /// If the number of capture groups in every match is always the same, then
1367
    /// return that number. Otherwise return `None`.
1368
0
    pub fn static_captures_len(&self) -> Option<usize> {
1369
0
        self.ro.nfa.static_captures_len
1370
0
    }
1371
}
1372
1373
impl Clone for Exec {
1374
0
    fn clone(&self) -> Exec {
1375
0
        let pool = ExecReadOnly::new_pool(&self.ro);
1376
0
        Exec { ro: self.ro.clone(), pool }
1377
0
    }
1378
}
1379
1380
impl ExecReadOnly {
1381
    fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1382
0
        if let Some(MatchType::Nfa(_)) = hint {
1383
0
            return hint.unwrap();
1384
0
        }
1385
0
        // If the NFA is empty, then we'll never match anything.
1386
0
        if self.nfa.insts.is_empty() {
1387
0
            return MatchType::Nothing;
1388
0
        }
1389
0
        if let Some(literalty) = self.choose_literal_match_type() {
1390
0
            return literalty;
1391
0
        }
1392
0
        if let Some(dfaty) = self.choose_dfa_match_type() {
1393
0
            return dfaty;
1394
0
        }
1395
0
        // We're so totally hosed.
1396
0
        MatchType::Nfa(MatchNfaType::Auto)
1397
0
    }
1398
1399
    /// If a plain literal scan can be used, then a corresponding literal
1400
    /// search type is returned.
1401
0
    fn choose_literal_match_type(&self) -> Option<MatchType> {
1402
0
        #[cfg(not(feature = "perf-literal"))]
1403
0
        fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1404
0
            None
1405
0
        }
1406
0
1407
0
        #[cfg(feature = "perf-literal")]
1408
0
        fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1409
0
            // If our set of prefixes is complete, then we can use it to find
1410
0
            // a match in lieu of a regex engine. This doesn't quite work well
1411
0
            // in the presence of multiple regexes, so only do it when there's
1412
0
            // one.
1413
0
            //
1414
0
            // TODO(burntsushi): Also, don't try to match literals if the regex
1415
0
            // is partially anchored. We could technically do it, but we'd need
1416
0
            // to create two sets of literals: all of them and then the subset
1417
0
            // that aren't anchored. We would then only search for all of them
1418
0
            // when at the beginning of the input and use the subset in all
1419
0
            // other cases.
1420
0
            if ro.res.len() != 1 {
1421
0
                return None;
1422
0
            }
1423
0
            if ro.ac.is_some() {
1424
0
                return Some(MatchType::Literal(
1425
0
                    MatchLiteralType::AhoCorasick,
1426
0
                ));
1427
0
            }
1428
0
            if ro.nfa.prefixes.complete() {
1429
0
                return if ro.nfa.is_anchored_start {
1430
0
                    Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1431
0
                } else {
1432
0
                    Some(MatchType::Literal(MatchLiteralType::Unanchored))
1433
0
                };
1434
0
            }
1435
0
            if ro.suffixes.complete() {
1436
0
                return if ro.nfa.is_anchored_end {
1437
0
                    Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1438
0
                } else {
1439
0
                    // This case shouldn't happen. When the regex isn't
1440
0
                    // anchored, then complete prefixes should imply complete
1441
0
                    // suffixes.
1442
0
                    Some(MatchType::Literal(MatchLiteralType::Unanchored))
1443
0
                };
1444
0
            }
1445
0
            None
1446
0
        }
1447
0
1448
0
        imp(self)
1449
0
    }
1450
1451
    /// If a DFA scan can be used, then choose the appropriate DFA strategy.
1452
0
    fn choose_dfa_match_type(&self) -> Option<MatchType> {
1453
0
        #[cfg(not(feature = "perf-dfa"))]
1454
0
        fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1455
0
            None
1456
0
        }
1457
0
1458
0
        #[cfg(feature = "perf-dfa")]
1459
0
        fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1460
0
            if !dfa::can_exec(&ro.dfa) {
1461
0
                return None;
1462
0
            }
1463
0
            // Regex sets require a slightly specialized path.
1464
0
            if ro.res.len() >= 2 {
1465
0
                return Some(MatchType::DfaMany);
1466
0
            }
1467
0
            // If the regex is anchored at the end but not the start, then
1468
0
            // just match in reverse from the end of the haystack.
1469
0
            if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1470
0
                return Some(MatchType::DfaAnchoredReverse);
1471
0
            }
1472
0
            #[cfg(feature = "perf-literal")]
1473
0
            {
1474
0
                // If there's a longish suffix literal, then it might be faster
1475
0
                // to look for that first.
1476
0
                if ro.should_suffix_scan() {
1477
0
                    return Some(MatchType::DfaSuffix);
1478
0
                }
1479
0
            }
1480
0
            // Fall back to your garden variety forward searching lazy DFA.
1481
0
            Some(MatchType::Dfa)
1482
0
        }
1483
0
1484
0
        imp(self)
1485
0
    }
1486
1487
    /// Returns true if the program is amenable to suffix scanning.
1488
    ///
1489
    /// When this is true, as a heuristic, we assume it is OK to quickly scan
1490
    /// for suffix literals and then do a *reverse* DFA match from any matches
1491
    /// produced by the literal scan. (And then followed by a forward DFA
1492
    /// search, since the previously found suffix literal maybe not actually be
1493
    /// the end of a match.)
1494
    ///
1495
    /// This is a bit of a specialized optimization, but can result in pretty
1496
    /// big performance wins if 1) there are no prefix literals and 2) the
1497
    /// suffix literals are pretty rare in the text. (1) is obviously easy to
1498
    /// account for but (2) is harder. As a proxy, we assume that longer
1499
    /// strings are generally rarer, so we only enable this optimization when
1500
    /// we have a meaty suffix.
1501
    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1502
0
    fn should_suffix_scan(&self) -> bool {
1503
0
        if self.suffixes.is_empty() {
1504
0
            return false;
1505
0
        }
1506
0
        let lcs_len = self.suffixes.lcs().char_len();
1507
0
        lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1508
0
    }
1509
1510
0
    fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1511
0
        let ro = ro.clone();
1512
0
        Box::new(Pool::new(Box::new(move || {
1513
0
            AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1514
0
        })))
1515
0
    }
1516
}
1517
1518
0
#[derive(Clone, Copy, Debug)]
1519
enum MatchType {
1520
    /// A single or multiple literal search. This is only used when the regex
1521
    /// can be decomposed into a literal search.
1522
    #[cfg(feature = "perf-literal")]
1523
    Literal(MatchLiteralType),
1524
    /// A normal DFA search.
1525
    #[cfg(feature = "perf-dfa")]
1526
    Dfa,
1527
    /// A reverse DFA search starting from the end of a haystack.
1528
    #[cfg(feature = "perf-dfa")]
1529
    DfaAnchoredReverse,
1530
    /// A reverse DFA search with suffix literal scanning.
1531
    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1532
    DfaSuffix,
1533
    /// Use the DFA on two or more regular expressions.
1534
    #[cfg(feature = "perf-dfa")]
1535
    DfaMany,
1536
    /// An NFA variant.
1537
    Nfa(MatchNfaType),
1538
    /// No match is ever possible, so don't ever try to search.
1539
    Nothing,
1540
}
1541
1542
0
#[derive(Clone, Copy, Debug)]
1543
#[cfg(feature = "perf-literal")]
1544
enum MatchLiteralType {
1545
    /// Match literals anywhere in text.
1546
    Unanchored,
1547
    /// Match literals only at the start of text.
1548
    AnchoredStart,
1549
    /// Match literals only at the end of text.
1550
    AnchoredEnd,
1551
    /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1552
    /// ExecReadOnly.
1553
    AhoCorasick,
1554
}
1555
1556
0
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1557
enum MatchNfaType {
1558
    /// Choose between Backtrack and PikeVM.
1559
    Auto,
1560
    /// NFA bounded backtracking.
1561
    ///
1562
    /// (This is only set by tests, since it never makes sense to always want
1563
    /// backtracking.)
1564
    Backtrack,
1565
    /// The Pike VM.
1566
    ///
1567
    /// (This is only set by tests, since it never makes sense to always want
1568
    /// the Pike VM.)
1569
    PikeVM,
1570
}
1571
1572
/// `ProgramCache` maintains reusable allocations for each matching engine
1573
/// available to a particular program.
1574
///
1575
/// We declare this as unwind safe since it's a cache that's only used for
1576
/// performance purposes. If a panic occurs, it is (or should be) always safe
1577
/// to continue using the same regex object.
1578
pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1579
1580
0
#[derive(Debug)]
1581
pub struct ProgramCacheInner {
1582
    pub pikevm: pikevm::Cache,
1583
    pub backtrack: backtrack::Cache,
1584
    #[cfg(feature = "perf-dfa")]
1585
    pub dfa: dfa::Cache,
1586
    #[cfg(feature = "perf-dfa")]
1587
    pub dfa_reverse: dfa::Cache,
1588
}
1589
1590
impl ProgramCacheInner {
1591
0
    fn new(ro: &ExecReadOnly) -> Self {
1592
0
        ProgramCacheInner {
1593
0
            pikevm: pikevm::Cache::new(&ro.nfa),
1594
0
            backtrack: backtrack::Cache::new(&ro.nfa),
1595
0
            #[cfg(feature = "perf-dfa")]
1596
0
            dfa: dfa::Cache::new(&ro.dfa),
1597
0
            #[cfg(feature = "perf-dfa")]
1598
0
            dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1599
0
        }
1600
0
    }
1601
}
1602
1603
/// Alternation literals checks if the given HIR is a simple alternation of
1604
/// literals, and if so, returns them. Otherwise, this returns None.
1605
#[cfg(feature = "perf-literal")]
1606
0
fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1607
0
    use regex_syntax::hir::{HirKind, Literal};
1608
0
1609
0
    // This is pretty hacky, but basically, if `is_alternation_literal` is
1610
0
    // true, then we can make several assumptions about the structure of our
1611
0
    // HIR. This is what justifies the `unreachable!` statements below.
1612
0
    //
1613
0
    // This code should be refactored once we overhaul this crate's
1614
0
    // optimization pipeline, because this is a terribly inflexible way to go
1615
0
    // about things.
1616
0
1617
0
    if !expr.properties().is_alternation_literal() {
1618
0
        return None;
1619
0
    }
1620
0
    let alts = match *expr.kind() {
1621
0
        HirKind::Alternation(ref alts) => alts,
1622
0
        _ => return None, // one literal isn't worth it
1623
    };
1624
1625
0
    let mut lits = vec![];
1626
0
    for alt in alts {
1627
0
        let mut lit = vec![];
1628
0
        match *alt.kind() {
1629
0
            HirKind::Literal(Literal(ref bytes)) => {
1630
0
                lit.extend_from_slice(bytes)
1631
            }
1632
0
            HirKind::Concat(ref exprs) => {
1633
0
                for e in exprs {
1634
0
                    match *e.kind() {
1635
0
                        HirKind::Literal(Literal(ref bytes)) => {
1636
0
                            lit.extend_from_slice(bytes);
1637
0
                        }
1638
0
                        _ => unreachable!("expected literal, got {:?}", e),
1639
                    }
1640
                }
1641
            }
1642
0
            _ => unreachable!("expected literal or concat, got {:?}", alt),
1643
        }
1644
0
        lits.push(lit);
1645
    }
1646
0
    Some(lits)
1647
0
}
1648
1649
#[cfg(not(feature = "perf-literal"))]
1650
fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
1651
    (literal::Seq::infinite(), literal::Seq::infinite())
1652
}
1653
1654
#[cfg(feature = "perf-literal")]
1655
0
fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
1656
0
    const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)];
1657
0
1658
0
    let mut prefixes = literal::Extractor::new()
1659
0
        .kind(literal::ExtractKind::Prefix)
1660
0
        .extract(expr);
1661
0
    for (keep, limit) in ATTEMPTS {
1662
0
        let len = match prefixes.len() {
1663
0
            None => break,
1664
0
            Some(len) => len,
1665
0
        };
1666
0
        if len <= limit {
1667
0
            break;
1668
0
        }
1669
0
        prefixes.keep_first_bytes(keep);
1670
0
        prefixes.minimize_by_preference();
1671
    }
1672
1673
0
    let mut suffixes = literal::Extractor::new()
1674
0
        .kind(literal::ExtractKind::Suffix)
1675
0
        .extract(expr);
1676
0
    for (keep, limit) in ATTEMPTS {
1677
0
        let len = match suffixes.len() {
1678
0
            None => break,
1679
0
            Some(len) => len,
1680
0
        };
1681
0
        if len <= limit {
1682
0
            break;
1683
0
        }
1684
0
        suffixes.keep_last_bytes(keep);
1685
0
        suffixes.minimize_by_preference();
1686
    }
1687
1688
0
    (prefixes, suffixes)
1689
0
}
1690
1691
#[cfg(test)]
1692
mod test {
1693
    #[test]
1694
    fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1695
        use crate::internal::ExecBuilder;
1696
1697
        let backtrack_bytes_re = ExecBuilder::new("^S")
1698
            .bounded_backtracking()
1699
            .only_utf8(false)
1700
            .build()
1701
            .map(|exec| exec.into_byte_regex())
1702
            .map_err(|err| format!("{}", err))
1703
            .unwrap();
1704
1705
        let default_bytes_re = ExecBuilder::new("^S")
1706
            .only_utf8(false)
1707
            .build()
1708
            .map(|exec| exec.into_byte_regex())
1709
            .map_err(|err| format!("{}", err))
1710
            .unwrap();
1711
1712
        let input = vec![83, 83];
1713
1714
        let s1 = backtrack_bytes_re.split(&input);
1715
        let s2 = default_bytes_re.split(&input);
1716
        for (chunk1, chunk2) in s1.zip(s2) {
1717
            assert_eq!(chunk1, chunk2);
1718
        }
1719
    }
1720
1721
    #[test]
1722
    fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1723
        use crate::internal::ExecBuilder;
1724
1725
        let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1726
            .bounded_backtracking()
1727
            .bytes(true)
1728
            .build()
1729
            .map(|exec| exec.into_regex())
1730
            .map_err(|err| format!("{}", err))
1731
            .unwrap();
1732
1733
        let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1734
            .bytes(true)
1735
            .build()
1736
            .map(|exec| exec.into_regex())
1737
            .map_err(|err| format!("{}", err))
1738
            .unwrap();
1739
1740
        let input = "**";
1741
1742
        let s1 = backtrack_bytes_re.split(input);
1743
        let s2 = default_bytes_re.split(input);
1744
        for (chunk1, chunk2) in s1.zip(s2) {
1745
            assert_eq!(chunk1, chunk2);
1746
        }
1747
    }
1748
}