Coverage Report

Created: 2025-11-16 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/nfa/thompson/backtrack.rs
Line
Count
Source
1
/*!
2
An NFA backed bounded backtracker for executing regex searches with capturing
3
groups.
4
5
This module provides a [`BoundedBacktracker`] that works by simulating an NFA
6
using the classical backtracking algorithm with a twist: it avoids redoing
7
work that it has done before and thereby avoids worst case exponential time.
8
In exchange, it can only be used on "short" haystacks. Its advantage is that
9
is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
10
because it does less book-keeping.
11
*/
12
13
use alloc::{vec, vec::Vec};
14
15
use crate::{
16
    nfa::thompson::{self, BuildError, State, NFA},
17
    util::{
18
        captures::Captures,
19
        empty, iter,
20
        prefilter::Prefilter,
21
        primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22
        search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
23
    },
24
};
25
26
/// Returns the minimum visited capacity for the given haystack.
27
///
28
/// This function can be used as the argument to [`Config::visited_capacity`]
29
/// in order to guarantee that a backtracking search for the given `input`
30
/// won't return an error when using a [`BoundedBacktracker`] built from the
31
/// given `NFA`.
32
///
33
/// This routine exists primarily as a way to test that the bounded backtracker
34
/// works correctly when its capacity is set to the smallest possible amount.
35
/// Still, it may be useful in cases where you know you want to use the bounded
36
/// backtracker for a specific input, and just need to know what visited
37
/// capacity to provide to make it work.
38
///
39
/// Be warned that this number could be quite large as it is multiplicative in
40
/// the size the given NFA and haystack.
41
0
pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
42
0
    div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8)
43
0
}
44
45
/// The configuration used for building a bounded backtracker.
46
///
47
/// A bounded backtracker configuration is a simple data object that is
48
/// typically used with [`Builder::configure`].
49
#[derive(Clone, Debug, Default)]
50
pub struct Config {
51
    pre: Option<Option<Prefilter>>,
52
    visited_capacity: Option<usize>,
53
}
54
55
impl Config {
56
    /// Return a new default regex configuration.
57
65.3k
    pub fn new() -> Config {
58
65.3k
        Config::default()
59
65.3k
    }
60
61
    /// Set a prefilter to be used whenever a start state is entered.
62
    ///
63
    /// A [`Prefilter`] in this context is meant to accelerate searches by
64
    /// looking for literal prefixes that every match for the corresponding
65
    /// pattern (or patterns) must start with. Once a prefilter produces a
66
    /// match, the underlying search routine continues on to try and confirm
67
    /// the match.
68
    ///
69
    /// Be warned that setting a prefilter does not guarantee that the search
70
    /// will be faster. While it's usually a good bet, if the prefilter
71
    /// produces a lot of false positive candidates (i.e., positions matched
72
    /// by the prefilter but not by the regex), then the overall result can
73
    /// be slower than if you had just executed the regex engine without any
74
    /// prefilters.
75
    ///
76
    /// By default no prefilter is set.
77
    ///
78
    /// # Example
79
    ///
80
    /// ```
81
    /// use regex_automata::{
82
    ///     nfa::thompson::backtrack::BoundedBacktracker,
83
    ///     util::prefilter::Prefilter,
84
    ///     Input, Match, MatchKind,
85
    /// };
86
    ///
87
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
88
    /// let re = BoundedBacktracker::builder()
89
    ///     .configure(BoundedBacktracker::config().prefilter(pre))
90
    ///     .build(r"(foo|bar)[a-z]+")?;
91
    /// let mut cache = re.create_cache();
92
    /// let input = Input::new("foo1 barfox bar");
93
    /// assert_eq!(
94
    ///     Some(Match::must(0, 5..11)),
95
    ///     re.try_find(&mut cache, input)?,
96
    /// );
97
    ///
98
    /// # Ok::<(), Box<dyn std::error::Error>>(())
99
    /// ```
100
    ///
101
    /// Be warned though that an incorrect prefilter can lead to incorrect
102
    /// results!
103
    ///
104
    /// ```
105
    /// use regex_automata::{
106
    ///     nfa::thompson::backtrack::BoundedBacktracker,
107
    ///     util::prefilter::Prefilter,
108
    ///     Input, HalfMatch, MatchKind,
109
    /// };
110
    ///
111
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
112
    /// let re = BoundedBacktracker::builder()
113
    ///     .configure(BoundedBacktracker::config().prefilter(pre))
114
    ///     .build(r"(foo|bar)[a-z]+")?;
115
    /// let mut cache = re.create_cache();
116
    /// let input = Input::new("foo1 barfox bar");
117
    /// // No match reported even though there clearly is one!
118
    /// assert_eq!(None, re.try_find(&mut cache, input)?);
119
    ///
120
    /// # Ok::<(), Box<dyn std::error::Error>>(())
121
    /// ```
122
65.3k
    pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
123
65.3k
        self.pre = Some(pre);
124
65.3k
        self
125
65.3k
    }
126
127
    /// Set the visited capacity used to bound backtracking.
128
    ///
129
    /// The visited capacity represents the amount of heap memory (in bytes) to
130
    /// allocate toward tracking which parts of the backtracking search have
131
    /// been done before. The heap memory needed for any particular search is
132
    /// proportional to `haystack.len() * nfa.states().len()`, which an be
133
    /// quite large. Therefore, the bounded backtracker is typically only able
134
    /// to run on shorter haystacks.
135
    ///
136
    /// For a given regex, increasing the visited capacity means that the
137
    /// maximum haystack length that can be searched is increased. The
138
    /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
139
    ///
140
    /// The default capacity is a reasonable but empirically chosen size.
141
    ///
142
    /// # Example
143
    ///
144
    /// As with other regex engines, Unicode is what tends to make the bounded
145
    /// backtracker less useful by making the maximum haystack length quite
146
    /// small. If necessary, increasing the visited capacity using this routine
147
    /// will increase the maximum haystack length at the cost of using more
148
    /// memory.
149
    ///
150
    /// Note though that the specific maximum values here are not an API
151
    /// guarantee. The default visited capacity is subject to change and not
152
    /// covered by semver.
153
    ///
154
    /// ```
155
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
157
    ///
158
    /// // Unicode inflates the size of the underlying NFA quite a bit, and
159
    /// // thus means that the backtracker can only handle smaller haystacks,
160
    /// // assuming that the visited capacity remains unchanged.
161
    /// let re = BoundedBacktracker::new(r"\w+")?;
162
    /// assert!(re.max_haystack_len() <= 7_000);
163
    /// // But we can increase the visited capacity to handle bigger haystacks!
164
    /// let re = BoundedBacktracker::builder()
165
    ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
166
    ///     .build(r"\w+")?;
167
    /// assert!(re.max_haystack_len() >= 25_000);
168
    /// assert!(re.max_haystack_len() <= 28_000);
169
    /// # Ok::<(), Box<dyn std::error::Error>>(())
170
    /// ```
171
0
    pub fn visited_capacity(mut self, capacity: usize) -> Config {
172
0
        self.visited_capacity = Some(capacity);
173
0
        self
174
0
    }
175
176
    /// Returns the prefilter set in this configuration, if one at all.
177
37.3k
    pub fn get_prefilter(&self) -> Option<&Prefilter> {
178
37.3k
        self.pre.as_ref().unwrap_or(&None).as_ref()
179
37.3k
    }
180
181
    /// Returns the configured visited capacity.
182
    ///
183
    /// Note that the actual capacity used may be slightly bigger than the
184
    /// configured capacity.
185
71.5k
    pub fn get_visited_capacity(&self) -> usize {
186
        const DEFAULT: usize = 256 * (1 << 10); // 256 KB
187
71.5k
        self.visited_capacity.unwrap_or(DEFAULT)
188
71.5k
    }
189
190
    /// Overwrite the default configuration such that the options in `o` are
191
    /// always used. If an option in `o` is not set, then the corresponding
192
    /// option in `self` is used. If it's not set in `self` either, then it
193
    /// remains not set.
194
65.3k
    pub(crate) fn overwrite(&self, o: Config) -> Config {
195
        Config {
196
65.3k
            pre: o.pre.or_else(|| self.pre.clone()),
197
65.3k
            visited_capacity: o.visited_capacity.or(self.visited_capacity),
198
        }
199
65.3k
    }
200
}
201
202
/// A builder for a bounded backtracker.
203
///
204
/// This builder permits configuring options for the syntax of a pattern, the
205
/// NFA construction and the `BoundedBacktracker` construction. This builder
206
/// is different from a general purpose regex builder in that it permits fine
207
/// grain configuration of the construction process. The trade off for this is
208
/// complexity, and the possibility of setting a configuration that might not
209
/// make sense. For example, there are two different UTF-8 modes:
210
///
211
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
212
/// whether the pattern itself can contain sub-expressions that match invalid
213
/// UTF-8.
214
/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
215
/// advance the starting position of the next search when a match with zero
216
/// length is found.
217
///
218
/// Generally speaking, callers will want to either enable all of these or
219
/// disable all of these.
220
///
221
/// # Example
222
///
223
/// This example shows how to disable UTF-8 mode in the syntax and the regex
224
/// itself. This is generally what you want for matching on arbitrary bytes.
225
///
226
/// ```
227
/// use regex_automata::{
228
///     nfa::thompson::{self, backtrack::BoundedBacktracker},
229
///     util::syntax,
230
///     Match,
231
/// };
232
///
233
/// let re = BoundedBacktracker::builder()
234
///     .syntax(syntax::Config::new().utf8(false))
235
///     .thompson(thompson::Config::new().utf8(false))
236
///     .build(r"foo(?-u:[^b])ar.*")?;
237
/// let mut cache = re.create_cache();
238
///
239
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
240
/// let expected = Some(Ok(Match::must(0, 1..9)));
241
/// let got = re.try_find_iter(&mut cache, haystack).next();
242
/// assert_eq!(expected, got);
243
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
244
/// // but the subsequent `.*` does not! Disabling UTF-8
245
/// // on the syntax permits this.
246
/// //
247
/// // N.B. This example does not show the impact of
248
/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
249
/// // only impacts regexes that can produce matches of
250
/// // length 0.
251
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]);
252
///
253
/// # Ok::<(), Box<dyn std::error::Error>>(())
254
/// ```
255
#[derive(Clone, Debug)]
256
pub struct Builder {
257
    config: Config,
258
    #[cfg(feature = "syntax")]
259
    thompson: thompson::Compiler,
260
}
261
262
impl Builder {
263
    /// Create a new BoundedBacktracker builder with its default configuration.
264
65.3k
    pub fn new() -> Builder {
265
65.3k
        Builder {
266
65.3k
            config: Config::default(),
267
65.3k
            #[cfg(feature = "syntax")]
268
65.3k
            thompson: thompson::Compiler::new(),
269
65.3k
        }
270
65.3k
    }
271
272
    /// Build a `BoundedBacktracker` from the given pattern.
273
    ///
274
    /// If there was a problem parsing or compiling the pattern, then an error
275
    /// is returned.
276
    #[cfg(feature = "syntax")]
277
0
    pub fn build(
278
0
        &self,
279
0
        pattern: &str,
280
0
    ) -> Result<BoundedBacktracker, BuildError> {
281
0
        self.build_many(&[pattern])
282
0
    }
283
284
    /// Build a `BoundedBacktracker` from the given patterns.
285
    #[cfg(feature = "syntax")]
286
0
    pub fn build_many<P: AsRef<str>>(
287
0
        &self,
288
0
        patterns: &[P],
289
0
    ) -> Result<BoundedBacktracker, BuildError> {
290
0
        let nfa = self.thompson.build_many(patterns)?;
291
0
        self.build_from_nfa(nfa)
292
0
    }
293
294
    /// Build a `BoundedBacktracker` directly from its NFA.
295
    ///
296
    /// Note that when using this method, any configuration that applies to the
297
    /// construction of the NFA itself will of course be ignored, since the NFA
298
    /// given here is already built.
299
65.3k
    pub fn build_from_nfa(
300
65.3k
        &self,
301
65.3k
        nfa: NFA,
302
65.3k
    ) -> Result<BoundedBacktracker, BuildError> {
303
65.3k
        nfa.look_set_any().available().map_err(BuildError::word)?;
304
65.3k
        Ok(BoundedBacktracker { config: self.config.clone(), nfa })
305
65.3k
    }
306
307
    /// Apply the given `BoundedBacktracker` configuration options to this
308
    /// builder.
309
65.3k
    pub fn configure(&mut self, config: Config) -> &mut Builder {
310
65.3k
        self.config = self.config.overwrite(config);
311
65.3k
        self
312
65.3k
    }
313
314
    /// Set the syntax configuration for this builder using
315
    /// [`syntax::Config`](crate::util::syntax::Config).
316
    ///
317
    /// This permits setting things like case insensitivity, Unicode and multi
318
    /// line mode.
319
    ///
320
    /// These settings only apply when constructing a `BoundedBacktracker`
321
    /// directly from a pattern.
322
    #[cfg(feature = "syntax")]
323
0
    pub fn syntax(
324
0
        &mut self,
325
0
        config: crate::util::syntax::Config,
326
0
    ) -> &mut Builder {
327
0
        self.thompson.syntax(config);
328
0
        self
329
0
    }
330
331
    /// Set the Thompson NFA configuration for this builder using
332
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
333
    ///
334
    /// This permits setting things like if additional time should be spent
335
    /// shrinking the size of the NFA.
336
    ///
337
    /// These settings only apply when constructing a `BoundedBacktracker`
338
    /// directly from a pattern.
339
    #[cfg(feature = "syntax")]
340
0
    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
341
0
        self.thompson.configure(config);
342
0
        self
343
0
    }
344
}
345
346
/// A backtracking regex engine that bounds its execution to avoid exponential
347
/// blow-up.
348
///
349
/// This regex engine only implements leftmost-first match semantics and
350
/// only supports leftmost searches. It effectively does the same thing as a
351
/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
352
/// it doesn't have to worry about copying capturing group spans for most NFA
353
/// states. Instead, the backtracker can maintain one set of captures (provided
354
/// by the caller) and never needs to copy them. In exchange, the backtracker
355
/// bounds itself to ensure it doesn't exhibit worst case exponential time.
356
/// This results in the backtracker only being able to handle short haystacks
357
/// given reasonable memory usage.
358
///
359
/// # Searches may return an error!
360
///
361
/// By design, this backtracking regex engine is bounded. This bound is
362
/// implemented by not visiting any combination of NFA state ID and position
363
/// in a haystack more than once. Thus, the total memory required to bound
364
/// backtracking is proportional to `haystack.len() * nfa.states().len()`.
365
/// This can obviously get quite large, since large haystacks aren't terribly
366
/// uncommon. To avoid using exorbitant memory, the capacity is bounded by
367
/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
368
/// capacity required for a particular regex and a haystack exceeds this
369
/// capacity, then the search routine will return an error.
370
///
371
/// Unlike other regex engines that may return an error at search time (like
372
/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
373
/// backtracker will work for every haystack. Therefore, this regex engine
374
/// _only_ exposes fallible search routines to avoid the footgun of panicking
375
/// when running a search on a haystack that is too big.
376
///
377
/// If one wants to use the fallible search APIs without handling the
378
/// error, the only way to guarantee an error won't occur from the
379
/// haystack length is to ensure the haystack length does not exceed
380
/// [`BoundedBacktracker::max_haystack_len`].
381
///
382
/// # Example: Unicode word boundaries
383
///
384
/// This example shows that the bounded backtracker implements Unicode word
385
/// boundaries correctly by default.
386
///
387
/// ```
388
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
389
/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
390
///
391
/// let re = BoundedBacktracker::new(r"\b\w+\b")?;
392
/// let mut cache = re.create_cache();
393
///
394
/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
395
/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
396
/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
397
/// assert_eq!(None, it.next());
398
/// # Ok::<(), Box<dyn std::error::Error>>(())
399
/// ```
400
///
401
/// # Example: multiple regex patterns
402
///
403
/// The bounded backtracker supports searching for multiple patterns
404
/// simultaneously, just like other regex engines. Note though that because it
405
/// uses a backtracking strategy, this regex engine is unlikely to scale well
406
/// as more patterns are added. But then again, as more patterns are added, the
407
/// maximum haystack length allowed will also shorten (assuming the visited
408
/// capacity remains invariant).
409
///
410
/// ```
411
/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
412
///
413
/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
414
/// let mut cache = re.create_cache();
415
///
416
/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
417
/// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
418
/// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
419
/// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
420
/// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
421
/// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
422
/// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
423
/// assert_eq!(None, it.next());
424
/// # Ok::<(), Box<dyn std::error::Error>>(())
425
/// ```
426
#[derive(Clone, Debug)]
427
pub struct BoundedBacktracker {
428
    config: Config,
429
    nfa: NFA,
430
}
431
432
impl BoundedBacktracker {
433
    /// Parse the given regular expression using the default configuration and
434
    /// return the corresponding `BoundedBacktracker`.
435
    ///
436
    /// If you want a non-default configuration, then use the [`Builder`] to
437
    /// set your own configuration.
438
    ///
439
    /// # Example
440
    ///
441
    /// ```
442
    /// use regex_automata::{
443
    ///     nfa::thompson::backtrack::BoundedBacktracker,
444
    ///     Match,
445
    /// };
446
    ///
447
    /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
448
    /// let mut cache = re.create_cache();
449
    /// assert_eq!(
450
    ///     Some(Ok(Match::must(0, 3..14))),
451
    ///     re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
452
    /// );
453
    /// # Ok::<(), Box<dyn std::error::Error>>(())
454
    /// ```
455
    #[cfg(feature = "syntax")]
456
0
    pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
457
0
        BoundedBacktracker::builder().build(pattern)
458
0
    }
459
460
    /// Like `new`, but parses multiple patterns into a single "multi regex."
461
    /// This similarly uses the default regex configuration.
462
    ///
463
    /// # Example
464
    ///
465
    /// ```
466
    /// use regex_automata::{
467
    ///     nfa::thompson::backtrack::BoundedBacktracker,
468
    ///     Match,
469
    /// };
470
    ///
471
    /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
472
    /// let mut cache = re.create_cache();
473
    ///
474
    /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
475
    /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
476
    /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
477
    /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
478
    /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
479
    /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
480
    /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
481
    /// assert_eq!(None, it.next());
482
    /// # Ok::<(), Box<dyn std::error::Error>>(())
483
    /// ```
484
    #[cfg(feature = "syntax")]
485
    pub fn new_many<P: AsRef<str>>(
486
        patterns: &[P],
487
    ) -> Result<BoundedBacktracker, BuildError> {
488
        BoundedBacktracker::builder().build_many(patterns)
489
    }
490
491
    /// # Example
492
    ///
493
    /// This shows how to hand assemble a regular expression via its HIR,
494
    /// compile an NFA from it and build a BoundedBacktracker from the NFA.
495
    ///
496
    /// ```
497
    /// use regex_automata::{
498
    ///     nfa::thompson::{NFA, backtrack::BoundedBacktracker},
499
    ///     Match,
500
    /// };
501
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
502
    ///
503
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
504
    ///     ClassBytesRange::new(b'0', b'9'),
505
    ///     ClassBytesRange::new(b'A', b'Z'),
506
    ///     ClassBytesRange::new(b'_', b'_'),
507
    ///     ClassBytesRange::new(b'a', b'z'),
508
    /// ])));
509
    ///
510
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
511
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
512
    ///
513
    /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
514
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
515
    /// let expected = Some(Match::must(0, 3..4));
516
    /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
517
    /// assert_eq!(expected, caps.get_match());
518
    ///
519
    /// # Ok::<(), Box<dyn std::error::Error>>(())
520
    /// ```
521
0
    pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
522
0
        BoundedBacktracker::builder().build_from_nfa(nfa)
523
0
    }
524
525
    /// Create a new `BoundedBacktracker` that matches every input.
526
    ///
527
    /// # Example
528
    ///
529
    /// ```
530
    /// use regex_automata::{
531
    ///     nfa::thompson::backtrack::BoundedBacktracker,
532
    ///     Match,
533
    /// };
534
    ///
535
    /// let re = BoundedBacktracker::always_match()?;
536
    /// let mut cache = re.create_cache();
537
    ///
538
    /// let expected = Some(Ok(Match::must(0, 0..0)));
539
    /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
540
    /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
541
    /// # Ok::<(), Box<dyn std::error::Error>>(())
542
    /// ```
543
0
    pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
544
0
        let nfa = thompson::NFA::always_match();
545
0
        BoundedBacktracker::new_from_nfa(nfa)
546
0
    }
547
548
    /// Create a new `BoundedBacktracker` that never matches any input.
549
    ///
550
    /// # Example
551
    ///
552
    /// ```
553
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
554
    ///
555
    /// let re = BoundedBacktracker::never_match()?;
556
    /// let mut cache = re.create_cache();
557
    ///
558
    /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
559
    /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
560
    /// # Ok::<(), Box<dyn std::error::Error>>(())
561
    /// ```
562
0
    pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
563
0
        let nfa = thompson::NFA::never_match();
564
0
        BoundedBacktracker::new_from_nfa(nfa)
565
0
    }
566
567
    /// Return a default configuration for a `BoundedBacktracker`.
568
    ///
569
    /// This is a convenience routine to avoid needing to import the `Config`
570
    /// type when customizing the construction of a `BoundedBacktracker`.
571
    ///
572
    /// # Example
573
    ///
574
    /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
575
    /// disabled, zero-width matches that split a codepoint are allowed.
576
    /// Otherwise they are never reported.
577
    ///
578
    /// In the code below, notice that `""` is permitted to match positions
579
    /// that split the encoding of a codepoint.
580
    ///
581
    /// ```
582
    /// use regex_automata::{
583
    ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
584
    ///     Match,
585
    /// };
586
    ///
587
    /// let re = BoundedBacktracker::builder()
588
    ///     .thompson(thompson::Config::new().utf8(false))
589
    ///     .build(r"")?;
590
    /// let mut cache = re.create_cache();
591
    ///
592
    /// let haystack = "a☃z";
593
    /// let mut it = re.try_find_iter(&mut cache, haystack);
594
    /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
595
    /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
596
    /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
597
    /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
598
    /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
599
    /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
600
    /// assert_eq!(None, it.next());
601
    ///
602
    /// # Ok::<(), Box<dyn std::error::Error>>(())
603
    /// ```
604
0
    pub fn config() -> Config {
605
0
        Config::new()
606
0
    }
607
608
    /// Return a builder for configuring the construction of a
609
    /// `BoundedBacktracker`.
610
    ///
611
    /// This is a convenience routine to avoid needing to import the
612
    /// [`Builder`] type in common cases.
613
    ///
614
    /// # Example
615
    ///
616
    /// This example shows how to use the builder to disable UTF-8 mode
617
    /// everywhere.
618
    ///
619
    /// ```
620
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
621
    /// use regex_automata::{
622
    ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
623
    ///     util::syntax,
624
    ///     Match,
625
    /// };
626
    ///
627
    /// let re = BoundedBacktracker::builder()
628
    ///     .syntax(syntax::Config::new().utf8(false))
629
    ///     .thompson(thompson::Config::new().utf8(false))
630
    ///     .build(r"foo(?-u:[^b])ar.*")?;
631
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
632
    ///
633
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
634
    /// let expected = Some(Match::must(0, 1..9));
635
    /// re.try_captures(&mut cache, haystack, &mut caps)?;
636
    /// assert_eq!(expected, caps.get_match());
637
    ///
638
    /// # Ok::<(), Box<dyn std::error::Error>>(())
639
    /// ```
640
0
    pub fn builder() -> Builder {
641
0
        Builder::new()
642
0
    }
643
644
    /// Create a new cache for this regex.
645
    ///
646
    /// The cache returned should only be used for searches for this
647
    /// regex. If you want to reuse the cache for another regex, then you
648
    /// must call [`Cache::reset`] with that regex (or, equivalently,
649
    /// [`BoundedBacktracker::reset_cache`]).
650
15.3k
    pub fn create_cache(&self) -> Cache {
651
15.3k
        Cache::new(self)
652
15.3k
    }
653
654
    /// Create a new empty set of capturing groups that is guaranteed to be
655
    /// valid for the search APIs on this `BoundedBacktracker`.
656
    ///
657
    /// A `Captures` value created for a specific `BoundedBacktracker` cannot
658
    /// be used with any other `BoundedBacktracker`.
659
    ///
660
    /// This is a convenience function for [`Captures::all`]. See the
661
    /// [`Captures`] documentation for an explanation of its alternative
662
    /// constructors that permit the `BoundedBacktracker` to do less work
663
    /// during a search, and thus might make it faster.
664
0
    pub fn create_captures(&self) -> Captures {
665
0
        Captures::all(self.get_nfa().group_info().clone())
666
0
    }
667
668
    /// Reset the given cache such that it can be used for searching with the
669
    /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
670
    ///
671
    /// A cache reset permits reusing memory already allocated in this cache
672
    /// with a different `BoundedBacktracker`.
673
    ///
674
    /// # Example
675
    ///
676
    /// This shows how to re-purpose a cache for use with a different
677
    /// `BoundedBacktracker`.
678
    ///
679
    /// ```
680
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
681
    /// use regex_automata::{
682
    ///     nfa::thompson::backtrack::BoundedBacktracker,
683
    ///     Match,
684
    /// };
685
    ///
686
    /// let re1 = BoundedBacktracker::new(r"\w")?;
687
    /// let re2 = BoundedBacktracker::new(r"\W")?;
688
    ///
689
    /// let mut cache = re1.create_cache();
690
    /// assert_eq!(
691
    ///     Some(Ok(Match::must(0, 0..2))),
692
    ///     re1.try_find_iter(&mut cache, "Δ").next(),
693
    /// );
694
    ///
695
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
696
    /// // incorrect results. In order to re-purpose the cache, we must reset
697
    /// // it with the BoundedBacktracker we'd like to use it with.
698
    /// //
699
    /// // Similarly, after this reset, using the cache with 're1' is also not
700
    /// // allowed.
701
    /// cache.reset(&re2);
702
    /// assert_eq!(
703
    ///     Some(Ok(Match::must(0, 0..3))),
704
    ///     re2.try_find_iter(&mut cache, "☃").next(),
705
    /// );
706
    ///
707
    /// # Ok::<(), Box<dyn std::error::Error>>(())
708
    /// ```
709
0
    pub fn reset_cache(&self, cache: &mut Cache) {
710
0
        cache.reset(self);
711
0
    }
712
713
    /// Returns the total number of patterns compiled into this
714
    /// `BoundedBacktracker`.
715
    ///
716
    /// In the case of a `BoundedBacktracker` that contains no patterns, this
717
    /// returns `0`.
718
    ///
719
    /// # Example
720
    ///
721
    /// This example shows the pattern length for a `BoundedBacktracker` that
722
    /// never matches:
723
    ///
724
    /// ```
725
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
726
    ///
727
    /// let re = BoundedBacktracker::never_match()?;
728
    /// assert_eq!(re.pattern_len(), 0);
729
    /// # Ok::<(), Box<dyn std::error::Error>>(())
730
    /// ```
731
    ///
732
    /// And another example for a `BoundedBacktracker` that matches at every
733
    /// position:
734
    ///
735
    /// ```
736
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
737
    ///
738
    /// let re = BoundedBacktracker::always_match()?;
739
    /// assert_eq!(re.pattern_len(), 1);
740
    /// # Ok::<(), Box<dyn std::error::Error>>(())
741
    /// ```
742
    ///
743
    /// And finally, a `BoundedBacktracker` that was constructed from multiple
744
    /// patterns:
745
    ///
746
    /// ```
747
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
748
    ///
749
    /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
750
    /// assert_eq!(re.pattern_len(), 3);
751
    /// # Ok::<(), Box<dyn std::error::Error>>(())
752
    /// ```
753
0
    pub fn pattern_len(&self) -> usize {
754
0
        self.nfa.pattern_len()
755
0
    }
756
757
    /// Return the config for this `BoundedBacktracker`.
758
    #[inline]
759
108k
    pub fn get_config(&self) -> &Config {
760
108k
        &self.config
761
108k
    }
762
763
    /// Returns a reference to the underlying NFA.
764
    #[inline]
765
125k
    pub fn get_nfa(&self) -> &NFA {
766
125k
        &self.nfa
767
125k
    }
768
769
    /// Returns the maximum haystack length supported by this backtracker.
770
    ///
771
    /// This routine is a function of both [`Config::visited_capacity`] and the
772
    /// internal size of the backtracker's NFA.
773
    ///
774
    /// # Example
775
    ///
776
    /// This example shows how the maximum haystack length can vary depending
777
    /// on the size of the regex itself. Note though that the specific maximum
778
    /// values here are not an API guarantee. The default visited capacity is
779
    /// subject to change and not covered by semver.
780
    ///
781
    /// ```
782
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
783
    /// use regex_automata::{
784
    ///     nfa::thompson::backtrack::BoundedBacktracker,
785
    ///     Match, MatchError,
786
    /// };
787
    ///
788
    /// // If you're only using ASCII, you get a big budget.
789
    /// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
790
    /// let mut cache = re.create_cache();
791
    /// assert_eq!(re.max_haystack_len(), 299_592);
792
    /// // Things work up to the max.
793
    /// let mut haystack = "a".repeat(299_592);
794
    /// let expected = Some(Ok(Match::must(0, 0..299_592)));
795
    /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
796
    /// // But you'll get an error if you provide a haystack that's too big.
797
    /// // Notice that we use the 'try_find_iter' routine instead, which
798
    /// // yields Result<Match, MatchError> instead of Match.
799
    /// haystack.push('a');
800
    /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
801
    /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
802
    ///
803
    /// // Unicode inflates the size of the underlying NFA quite a bit, and
804
    /// // thus means that the backtracker can only handle smaller haystacks,
805
    /// // assuming that the visited capacity remains unchanged.
806
    /// let re = BoundedBacktracker::new(r"\w+")?;
807
    /// assert!(re.max_haystack_len() <= 7_000);
808
    /// // But we can increase the visited capacity to handle bigger haystacks!
809
    /// let re = BoundedBacktracker::builder()
810
    ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
811
    ///     .build(r"\w+")?;
812
    /// assert!(re.max_haystack_len() >= 25_000);
813
    /// assert!(re.max_haystack_len() <= 28_000);
814
    /// # Ok::<(), Box<dyn std::error::Error>>(())
815
    /// ```
816
    #[inline]
817
31.0k
    pub fn max_haystack_len(&self) -> usize {
818
        // The capacity given in the config is "bytes of heap memory," but the
819
        // capacity we use here is "number of bits." So convert the capacity in
820
        // bytes to the capacity in bits.
821
31.0k
        let capacity = 8 * self.get_config().get_visited_capacity();
822
31.0k
        let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
823
31.0k
        let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
824
        // It's possible for `real_capacity` to be smaller than the number of
825
        // NFA states for particularly large regexes, so we saturate towards
826
        // zero.
827
31.0k
        (real_capacity / self.nfa.states().len()).saturating_sub(1)
828
31.0k
    }
829
}
830
831
impl BoundedBacktracker {
832
    /// Returns true if and only if this regex matches the given haystack.
833
    ///
834
    /// In the case of a backtracking regex engine, and unlike most other
835
    /// regex engines in this crate, short circuiting isn't practical. However,
836
    /// this routine may still be faster because it instructs backtracking to
837
    /// not keep track of any capturing groups.
838
    ///
839
    /// # Errors
840
    ///
841
    /// This routine only errors if the search could not complete. For this
842
    /// backtracking regex engine, this only occurs when the haystack length
843
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
844
    ///
845
    /// When a search cannot complete, callers cannot know whether a match
846
    /// exists or not.
847
    ///
848
    /// # Example
849
    ///
850
    /// ```
851
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
852
    ///
853
    /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
854
    /// let mut cache = re.create_cache();
855
    ///
856
    /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
857
    /// assert!(!re.try_is_match(&mut cache, "foobar")?);
858
    /// # Ok::<(), Box<dyn std::error::Error>>(())
859
    /// ```
860
    ///
861
    /// # Example: consistency with search APIs
862
    ///
863
    /// `is_match` is guaranteed to return `true` whenever `find` returns a
864
    /// match. This includes searches that are executed entirely within a
865
    /// codepoint:
866
    ///
867
    /// ```
868
    /// use regex_automata::{
869
    ///     nfa::thompson::backtrack::BoundedBacktracker,
870
    ///     Input,
871
    /// };
872
    ///
873
    /// let re = BoundedBacktracker::new("a*")?;
874
    /// let mut cache = re.create_cache();
875
    ///
876
    /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
877
    /// # Ok::<(), Box<dyn std::error::Error>>(())
878
    /// ```
879
    ///
880
    /// Notice that when UTF-8 mode is disabled, then the above reports a
881
    /// match because the restriction against zero-width matches that split a
882
    /// codepoint has been lifted:
883
    ///
884
    /// ```
885
    /// use regex_automata::{
886
    ///     nfa::thompson::{backtrack::BoundedBacktracker, NFA},
887
    ///     Input,
888
    /// };
889
    ///
890
    /// let re = BoundedBacktracker::builder()
891
    ///     .thompson(NFA::config().utf8(false))
892
    ///     .build("a*")?;
893
    /// let mut cache = re.create_cache();
894
    ///
895
    /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
896
    /// # Ok::<(), Box<dyn std::error::Error>>(())
897
    /// ```
898
    #[inline]
899
2.54k
    pub fn try_is_match<'h, I: Into<Input<'h>>>(
900
2.54k
        &self,
901
2.54k
        cache: &mut Cache,
902
2.54k
        input: I,
903
2.54k
    ) -> Result<bool, MatchError> {
904
2.54k
        let input = input.into().earliest(true);
905
2.54k
        self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some())
906
2.54k
    }
907
908
    /// Executes a leftmost forward search and returns a `Match` if one exists.
909
    ///
910
    /// This routine only includes the overall match span. To get
911
    /// access to the individual spans of each capturing group, use
912
    /// [`BoundedBacktracker::try_captures`].
913
    ///
914
    /// # Errors
915
    ///
916
    /// This routine only errors if the search could not complete. For this
917
    /// backtracking regex engine, this only occurs when the haystack length
918
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
919
    ///
920
    /// When a search cannot complete, callers cannot know whether a match
921
    /// exists or not.
922
    ///
923
    /// # Example
924
    ///
925
    /// ```
926
    /// use regex_automata::{
927
    ///     nfa::thompson::backtrack::BoundedBacktracker,
928
    ///     Match,
929
    /// };
930
    ///
931
    /// let re = BoundedBacktracker::new("foo[0-9]+")?;
932
    /// let mut cache = re.create_cache();
933
    /// let expected = Match::must(0, 0..8);
934
    /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
935
    ///
936
    /// # Ok::<(), Box<dyn std::error::Error>>(())
937
    /// ```
938
    #[inline]
939
    pub fn try_find<'h, I: Into<Input<'h>>>(
940
        &self,
941
        cache: &mut Cache,
942
        input: I,
943
    ) -> Result<Option<Match>, MatchError> {
944
        let input = input.into();
945
        if self.get_nfa().pattern_len() == 1 {
946
            let mut slots = [None, None];
947
            let pid = match self.try_search_slots(cache, &input, &mut slots)? {
948
                None => return Ok(None),
949
                Some(pid) => pid,
950
            };
951
            let start = match slots[0] {
952
                None => return Ok(None),
953
                Some(s) => s.get(),
954
            };
955
            let end = match slots[1] {
956
                None => return Ok(None),
957
                Some(s) => s.get(),
958
            };
959
            return Ok(Some(Match::new(pid, Span { start, end })));
960
        }
961
        let ginfo = self.get_nfa().group_info();
962
        let slots_len = ginfo.implicit_slot_len();
963
        let mut slots = vec![None; slots_len];
964
        let pid = match self.try_search_slots(cache, &input, &mut slots)? {
965
            None => return Ok(None),
966
            Some(pid) => pid,
967
        };
968
        let start = match slots[pid.as_usize() * 2] {
969
            None => return Ok(None),
970
            Some(s) => s.get(),
971
        };
972
        let end = match slots[pid.as_usize() * 2 + 1] {
973
            None => return Ok(None),
974
            Some(s) => s.get(),
975
        };
976
        Ok(Some(Match::new(pid, Span { start, end })))
977
    }
978
979
    /// Executes a leftmost forward search and writes the spans of capturing
980
    /// groups that participated in a match into the provided [`Captures`]
981
    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
982
    /// to return `false`.
983
    ///
984
    /// # Errors
985
    ///
986
    /// This routine only errors if the search could not complete. For this
987
    /// backtracking regex engine, this only occurs when the haystack length
988
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
989
    ///
990
    /// When a search cannot complete, callers cannot know whether a match
991
    /// exists or not.
992
    ///
993
    /// # Example
994
    ///
995
    /// ```
996
    /// use regex_automata::{
997
    ///     nfa::thompson::backtrack::BoundedBacktracker,
998
    ///     Span,
999
    /// };
1000
    ///
1001
    /// let re = BoundedBacktracker::new(
1002
    ///     r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
1003
    /// )?;
1004
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1005
    ///
1006
    /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
1007
    /// assert!(caps.is_match());
1008
    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
1009
    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
1010
    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
1011
    ///
1012
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1013
    /// ```
1014
    #[inline]
1015
    pub fn try_captures<'h, I: Into<Input<'h>>>(
1016
        &self,
1017
        cache: &mut Cache,
1018
        input: I,
1019
        caps: &mut Captures,
1020
    ) -> Result<(), MatchError> {
1021
        self.try_search(cache, &input.into(), caps)
1022
    }
1023
1024
    /// Returns an iterator over all non-overlapping leftmost matches in the
1025
    /// given bytes. If no match exists, then the iterator yields no elements.
1026
    ///
1027
    /// If the regex engine returns an error at any point, then the iterator
1028
    /// will yield that error.
1029
    ///
1030
    /// # Example
1031
    ///
1032
    /// ```
1033
    /// use regex_automata::{
1034
    ///     nfa::thompson::backtrack::BoundedBacktracker,
1035
    ///     Match, MatchError,
1036
    /// };
1037
    ///
1038
    /// let re = BoundedBacktracker::new("foo[0-9]+")?;
1039
    /// let mut cache = re.create_cache();
1040
    ///
1041
    /// let text = "foo1 foo12 foo123";
1042
    /// let result: Result<Vec<Match>, MatchError> = re
1043
    ///     .try_find_iter(&mut cache, text)
1044
    ///     .collect();
1045
    /// let matches = result?;
1046
    /// assert_eq!(matches, vec![
1047
    ///     Match::must(0, 0..4),
1048
    ///     Match::must(0, 5..10),
1049
    ///     Match::must(0, 11..17),
1050
    /// ]);
1051
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1052
    /// ```
1053
    #[inline]
1054
    pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1055
        &'r self,
1056
        cache: &'c mut Cache,
1057
        input: I,
1058
    ) -> TryFindMatches<'r, 'c, 'h> {
1059
        let caps = Captures::matches(self.get_nfa().group_info().clone());
1060
        let it = iter::Searcher::new(input.into());
1061
        TryFindMatches { re: self, cache, caps, it }
1062
    }
1063
1064
    /// Returns an iterator over all non-overlapping `Captures` values. If no
1065
    /// match exists, then the iterator yields no elements.
1066
    ///
1067
    /// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
1068
    /// but it includes the spans of all capturing groups that participate in
1069
    /// each match.
1070
    ///
1071
    /// If the regex engine returns an error at any point, then the iterator
1072
    /// will yield that error.
1073
    ///
1074
    /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
1075
    /// how to correctly iterate over all matches in a haystack while avoiding
1076
    /// the creation of a new `Captures` value for every match. (Which you are
1077
    /// forced to do with an `Iterator`.)
1078
    ///
1079
    /// # Example
1080
    ///
1081
    /// ```
1082
    /// use regex_automata::{
1083
    ///     nfa::thompson::backtrack::BoundedBacktracker,
1084
    ///     Span,
1085
    /// };
1086
    ///
1087
    /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
1088
    /// let mut cache = re.create_cache();
1089
    ///
1090
    /// let text = "foo1 foo12 foo123";
1091
    /// let mut spans = vec![];
1092
    /// for result in re.try_captures_iter(&mut cache, text) {
1093
    ///     let caps = result?;
1094
    ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
1095
    ///     spans.push(caps.get_group_by_name("numbers").unwrap());
1096
    /// }
1097
    /// assert_eq!(spans, vec![
1098
    ///     Span::from(3..4),
1099
    ///     Span::from(8..10),
1100
    ///     Span::from(14..17),
1101
    /// ]);
1102
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1103
    /// ```
1104
    #[inline]
1105
    pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1106
        &'r self,
1107
        cache: &'c mut Cache,
1108
        input: I,
1109
    ) -> TryCapturesMatches<'r, 'c, 'h> {
1110
        let caps = self.create_captures();
1111
        let it = iter::Searcher::new(input.into());
1112
        TryCapturesMatches { re: self, cache, caps, it }
1113
    }
1114
}
1115
1116
impl BoundedBacktracker {
1117
    /// Executes a leftmost forward search and writes the spans of capturing
1118
    /// groups that participated in a match into the provided [`Captures`]
1119
    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
1120
    /// to return `false`.
1121
    ///
1122
    /// This is like [`BoundedBacktracker::try_captures`], but it accepts a
1123
    /// concrete `&Input` instead of an `Into<Input>`.
1124
    ///
1125
    /// # Errors
1126
    ///
1127
    /// This routine only errors if the search could not complete. For this
1128
    /// backtracking regex engine, this only occurs when the haystack length
1129
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
1130
    ///
1131
    /// When a search cannot complete, callers cannot know whether a match
1132
    /// exists or not.
1133
    ///
1134
    /// # Example: specific pattern search
1135
    ///
1136
    /// This example shows how to build a multi bounded backtracker that
1137
    /// permits searching for specific patterns.
1138
    ///
1139
    /// ```
1140
    /// use regex_automata::{
1141
    ///     nfa::thompson::backtrack::BoundedBacktracker,
1142
    ///     Anchored, Input, Match, PatternID,
1143
    /// };
1144
    ///
1145
    /// let re = BoundedBacktracker::new_many(&[
1146
    ///     "[a-z0-9]{6}",
1147
    ///     "[a-z][a-z0-9]{5}",
1148
    /// ])?;
1149
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1150
    /// let haystack = "foo123";
1151
    ///
1152
    /// // Since we are using the default leftmost-first match and both
1153
    /// // patterns match at the same starting position, only the first pattern
1154
    /// // will be returned in this case when doing a search for any of the
1155
    /// // patterns.
1156
    /// let expected = Some(Match::must(0, 0..6));
1157
    /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
1158
    /// assert_eq!(expected, caps.get_match());
1159
    ///
1160
    /// // But if we want to check whether some other pattern matches, then we
1161
    /// // can provide its pattern ID.
1162
    /// let expected = Some(Match::must(1, 0..6));
1163
    /// let input = Input::new(haystack)
1164
    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
1165
    /// re.try_search(&mut cache, &input, &mut caps)?;
1166
    /// assert_eq!(expected, caps.get_match());
1167
    ///
1168
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1169
    /// ```
1170
    ///
1171
    /// # Example: specifying the bounds of a search
1172
    ///
1173
    /// This example shows how providing the bounds of a search can produce
1174
    /// different results than simply sub-slicing the haystack.
1175
    ///
1176
    /// ```
1177
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1178
    /// use regex_automata::{
1179
    ///     nfa::thompson::backtrack::BoundedBacktracker,
1180
    ///     Match, Input,
1181
    /// };
1182
    ///
1183
    /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
1184
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1185
    /// let haystack = "foo123bar";
1186
    ///
1187
    /// // Since we sub-slice the haystack, the search doesn't know about
1188
    /// // the larger context and assumes that `123` is surrounded by word
1189
    /// // boundaries. And of course, the match position is reported relative
1190
    /// // to the sub-slice as well, which means we get `0..3` instead of
1191
    /// // `3..6`.
1192
    /// let expected = Some(Match::must(0, 0..3));
1193
    /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?;
1194
    /// assert_eq!(expected, caps.get_match());
1195
    ///
1196
    /// // But if we provide the bounds of the search within the context of the
1197
    /// // entire haystack, then the search can take the surrounding context
1198
    /// // into account. (And if we did find a match, it would be reported
1199
    /// // as a valid offset into `haystack` instead of its sub-slice.)
1200
    /// let expected = None;
1201
    /// re.try_search(
1202
    ///     &mut cache, &Input::new(haystack).range(3..6), &mut caps,
1203
    /// )?;
1204
    /// assert_eq!(expected, caps.get_match());
1205
    ///
1206
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1207
    /// ```
1208
    #[inline]
1209
    pub fn try_search(
1210
        &self,
1211
        cache: &mut Cache,
1212
        input: &Input<'_>,
1213
        caps: &mut Captures,
1214
    ) -> Result<(), MatchError> {
1215
        caps.set_pattern(None);
1216
        let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1217
        caps.set_pattern(pid);
1218
        Ok(())
1219
    }
1220
1221
    /// Executes a leftmost forward search and writes the spans of capturing
1222
    /// groups that participated in a match into the provided `slots`, and
1223
    /// returns the matching pattern ID. The contents of the slots for patterns
1224
    /// other than the matching pattern are unspecified. If no match was found,
1225
    /// then `None` is returned and the contents of all `slots` is unspecified.
1226
    ///
1227
    /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
1228
    /// slots slice instead of a `Captures` value. This is useful in contexts
1229
    /// where you don't want or need to allocate a `Captures`.
1230
    ///
1231
    /// It is legal to pass _any_ number of slots to this routine. If the regex
1232
    /// engine would otherwise write a slot offset that doesn't fit in the
1233
    /// provided slice, then it is simply skipped. In general though, there are
1234
    /// usually three slice lengths you might want to use:
1235
    ///
1236
    /// * An empty slice, if you only care about which pattern matched.
1237
    /// * A slice with
1238
    /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1239
    /// slots, if you only care about the overall match spans for each matching
1240
    /// pattern.
1241
    /// * A slice with
1242
    /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1243
    /// permits recording match offsets for every capturing group in every
1244
    /// pattern.
1245
    ///
1246
    /// # Errors
1247
    ///
1248
    /// This routine only errors if the search could not complete. For this
1249
    /// backtracking regex engine, this only occurs when the haystack length
1250
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
1251
    ///
1252
    /// When a search cannot complete, callers cannot know whether a match
1253
    /// exists or not.
1254
    ///
1255
    /// # Example
1256
    ///
1257
    /// This example shows how to find the overall match offsets in a
1258
    /// multi-pattern search without allocating a `Captures` value. Indeed, we
1259
    /// can put our slots right on the stack.
1260
    ///
1261
    /// ```
1262
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1263
    /// use regex_automata::{
1264
    ///     nfa::thompson::backtrack::BoundedBacktracker,
1265
    ///     PatternID, Input,
1266
    /// };
1267
    ///
1268
    /// let re = BoundedBacktracker::new_many(&[
1269
    ///     r"\pL+",
1270
    ///     r"\d+",
1271
    /// ])?;
1272
    /// let mut cache = re.create_cache();
1273
    /// let input = Input::new("!@#123");
1274
    ///
1275
    /// // We only care about the overall match offsets here, so we just
1276
    /// // allocate two slots for each pattern. Each slot records the start
1277
    /// // and end of the match.
1278
    /// let mut slots = [None; 4];
1279
    /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1280
    /// assert_eq!(Some(PatternID::must(1)), pid);
1281
    ///
1282
    /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1283
    /// // See 'GroupInfo' for more details on the mapping between groups and
1284
    /// // slot indices.
1285
    /// let slot_start = pid.unwrap().as_usize() * 2;
1286
    /// let slot_end = slot_start + 1;
1287
    /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1288
    /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1289
    ///
1290
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1291
    /// ```
1292
    #[inline]
1293
30.0k
    pub fn try_search_slots(
1294
30.0k
        &self,
1295
30.0k
        cache: &mut Cache,
1296
30.0k
        input: &Input<'_>,
1297
30.0k
        slots: &mut [Option<NonMaxUsize>],
1298
30.0k
    ) -> Result<Option<PatternID>, MatchError> {
1299
30.0k
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1300
30.0k
        if !utf8empty {
1301
25.1k
            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1302
25.1k
            return Ok(maybe_hm.map(|hm| hm.pattern()));
1303
4.94k
        }
1304
        // See PikeVM::try_search_slots for why we do this.
1305
4.94k
        let min = self.get_nfa().group_info().implicit_slot_len();
1306
4.94k
        if slots.len() >= min {
1307
4.94k
            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1308
4.94k
            return Ok(maybe_hm.map(|hm| hm.pattern()));
1309
0
        }
1310
0
        if self.get_nfa().pattern_len() == 1 {
1311
0
            let mut enough = [None, None];
1312
0
            let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1313
            // This is OK because we know `enough_slots` is strictly bigger
1314
            // than `slots`, otherwise this special case isn't reached.
1315
0
            slots.copy_from_slice(&enough[..slots.len()]);
1316
0
            return Ok(got.map(|hm| hm.pattern()));
1317
0
        }
1318
0
        let mut enough = vec![None; min];
1319
0
        let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1320
        // This is OK because we know `enough_slots` is strictly bigger than
1321
        // `slots`, otherwise this special case isn't reached.
1322
0
        slots.copy_from_slice(&enough[..slots.len()]);
1323
0
        Ok(got.map(|hm| hm.pattern()))
1324
30.0k
    }
1325
1326
    /// This is the actual implementation of `try_search_slots_imp` that
1327
    /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1328
    /// enabled, 2) the NFA can match the empty string and 3) the caller has
1329
    /// provided an insufficient number of slots to record match offsets.
1330
    #[inline(never)]
1331
30.0k
    fn try_search_slots_imp(
1332
30.0k
        &self,
1333
30.0k
        cache: &mut Cache,
1334
30.0k
        input: &Input<'_>,
1335
30.0k
        slots: &mut [Option<NonMaxUsize>],
1336
30.0k
    ) -> Result<Option<HalfMatch>, MatchError> {
1337
30.0k
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1338
30.0k
        let hm = match self.search_imp(cache, input, slots)? {
1339
20.7k
            None => return Ok(None),
1340
6.07k
            Some(hm) if !utf8empty => return Ok(Some(hm)),
1341
3.19k
            Some(hm) => hm,
1342
        };
1343
10.4k
        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1344
10.4k
            Ok(self
1345
10.4k
                .search_imp(cache, input, slots)?
1346
10.4k
                .map(|hm| (hm, hm.offset())))
1347
10.4k
        })
1348
30.0k
    }
1349
1350
    /// The implementation of standard leftmost backtracking search.
1351
    ///
1352
    /// Capturing group spans are written to 'caps', but only if requested.
1353
    /// 'caps' can be one of three things: 1) totally empty, in which case, we
1354
    /// only report the pattern that matched or 2) only has slots for recording
1355
    /// the overall match offsets for any pattern or 3) has all slots available
1356
    /// for recording the spans of any groups participating in a match.
1357
40.5k
    fn search_imp(
1358
40.5k
        &self,
1359
40.5k
        cache: &mut Cache,
1360
40.5k
        input: &Input<'_>,
1361
40.5k
        slots: &mut [Option<NonMaxUsize>],
1362
40.5k
    ) -> Result<Option<HalfMatch>, MatchError> {
1363
        // Unlike in the PikeVM, we write our capturing group spans directly
1364
        // into the caller's captures groups. So we have to make sure we're
1365
        // starting with a blank slate first. In the PikeVM, we avoid this
1366
        // by construction: the spans that are copied to every slot in the
1367
        // 'Captures' value already account for presence/absence. In this
1368
        // backtracker, we write directly into the caller provided slots, where
1369
        // as in the PikeVM, we write into scratch space first and only copy
1370
        // them to the caller provided slots when a match is found.
1371
136k
        for slot in slots.iter_mut() {
1372
136k
            *slot = None;
1373
136k
        }
1374
40.5k
        cache.setup_search(&self, input)?;
1375
40.5k
        if input.is_done() {
1376
0
            return Ok(None);
1377
40.5k
        }
1378
40.5k
        let (anchored, start_id) = match input.get_anchored() {
1379
            // Only way we're unanchored is if both the caller asked for an
1380
            // unanchored search *and* the pattern is itself not anchored.
1381
38.4k
            Anchored::No => (
1382
38.4k
                self.nfa.is_always_start_anchored(),
1383
38.4k
                // We always use the anchored starting state here, even if
1384
38.4k
                // doing an unanchored search. The "unanchored" part of it is
1385
38.4k
                // implemented in the loop below, by simply trying the next
1386
38.4k
                // byte offset if the previous backtracking exploration failed.
1387
38.4k
                self.nfa.start_anchored(),
1388
38.4k
            ),
1389
0
            Anchored::Yes => (true, self.nfa.start_anchored()),
1390
2.15k
            Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
1391
0
                None => return Ok(None),
1392
2.15k
                Some(sid) => (true, sid),
1393
            },
1394
        };
1395
40.5k
        if anchored {
1396
3.19k
            let at = input.start();
1397
3.19k
            return Ok(self.backtrack(cache, input, at, start_id, slots));
1398
37.3k
        }
1399
37.3k
        let pre = self.get_config().get_prefilter();
1400
37.3k
        let mut at = input.start();
1401
2.73M
        while at <= input.end() {
1402
2.72M
            if let Some(ref pre) = pre {
1403
826k
                let span = Span::from(at..input.end());
1404
826k
                match pre.find(input.haystack(), span) {
1405
12.3k
                    None => break,
1406
814k
                    Some(ref span) => at = span.start,
1407
                }
1408
1.89M
            }
1409
2.71M
            if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
1410
            {
1411
17.3k
                return Ok(Some(hm));
1412
2.69M
            }
1413
2.69M
            at += 1;
1414
        }
1415
19.9k
        Ok(None)
1416
40.5k
    }
1417
1418
    /// Look for a match starting at `at` in `input` and write the matching
1419
    /// pattern ID and group spans to `caps`. The search uses `start_id` as its
1420
    /// starting state in the underlying NFA.
1421
    ///
1422
    /// If no match was found, then the caller should increment `at` and try
1423
    /// at the next position.
1424
    #[cfg_attr(feature = "perf-inline", inline(always))]
1425
2.71M
    fn backtrack(
1426
2.71M
        &self,
1427
2.71M
        cache: &mut Cache,
1428
2.71M
        input: &Input<'_>,
1429
2.71M
        at: usize,
1430
2.71M
        start_id: StateID,
1431
2.71M
        slots: &mut [Option<NonMaxUsize>],
1432
2.71M
    ) -> Option<HalfMatch> {
1433
2.71M
        cache.stack.push(Frame::Step { sid: start_id, at });
1434
121M
        while let Some(frame) = cache.stack.pop() {
1435
118M
            match frame {
1436
77.0M
                Frame::Step { sid, at } => {
1437
77.0M
                    if let Some(hm) = self.step(cache, input, sid, at, slots) {
1438
19.7k
                        return Some(hm);
1439
77.0M
                    }
1440
                }
1441
41.4M
                Frame::RestoreCapture { slot, offset } => {
1442
41.4M
                    slots[slot] = offset;
1443
41.4M
                }
1444
            }
1445
        }
1446
2.69M
        None
1447
2.71M
    }
1448
1449
    // LAMENTATION: The actual backtracking search is implemented in about
1450
    // 75 lines below. Yet this file is over 2,000 lines long. What have I
1451
    // done?
1452
1453
    /// Execute a "step" in the backtracing algorithm.
1454
    ///
1455
    /// A "step" is somewhat of a misnomer, because this routine keeps going
1456
    /// until it either runs out of things to try or fins a match. In the
1457
    /// former case, it may have pushed some things on to the backtracking
1458
    /// stack, in which case, those will be tried next as part of the
1459
    /// 'backtrack' routine above.
1460
    #[cfg_attr(feature = "perf-inline", inline(always))]
1461
77.0M
    fn step(
1462
77.0M
        &self,
1463
77.0M
        cache: &mut Cache,
1464
77.0M
        input: &Input<'_>,
1465
77.0M
        mut sid: StateID,
1466
77.0M
        mut at: usize,
1467
77.0M
        slots: &mut [Option<NonMaxUsize>],
1468
77.0M
    ) -> Option<HalfMatch> {
1469
        loop {
1470
183M
            if !cache.visited.insert(sid, at - input.start()) {
1471
23.5M
                return None;
1472
159M
            }
1473
159M
            match *self.nfa.state(sid) {
1474
6.14M
                State::ByteRange { ref trans } => {
1475
                    // Why do we need this? Unlike other regex engines in this
1476
                    // crate, the backtracker can steam roll ahead in the
1477
                    // haystack outside of the main loop over the bytes in the
1478
                    // haystack. While 'trans.matches()' below handles the case
1479
                    // of 'at' being out of bounds of 'input.haystack()', we
1480
                    // also need to handle the case of 'at' going out of bounds
1481
                    // of the span the caller asked to search.
1482
                    //
1483
                    // We should perhaps make the 'trans.matches()' API accept
1484
                    // an '&Input' instead of a '&[u8]'. Or at least, add a new
1485
                    // API that does it.
1486
6.14M
                    if at >= input.end() {
1487
142k
                        return None;
1488
6.00M
                    }
1489
6.00M
                    if !trans.matches(input.haystack(), at) {
1490
3.30M
                        return None;
1491
2.69M
                    }
1492
2.69M
                    sid = trans.next;
1493
2.69M
                    at += 1;
1494
                }
1495
21.4M
                State::Sparse(ref sparse) => {
1496
21.4M
                    if at >= input.end() {
1497
532k
                        return None;
1498
20.8M
                    }
1499
20.8M
                    sid = sparse.matches(input.haystack(), at)?;
1500
19.2M
                    at += 1;
1501
                }
1502
0
                State::Dense(ref dense) => {
1503
0
                    if at >= input.end() {
1504
0
                        return None;
1505
0
                    }
1506
0
                    sid = dense.matches(input.haystack(), at)?;
1507
0
                    at += 1;
1508
                }
1509
67.1M
                State::Look { look, next } => {
1510
                    // OK because we don't permit building a searcher with a
1511
                    // Unicode word boundary if the requisite Unicode data is
1512
                    // unavailable.
1513
67.1M
                    if !self.nfa.look_matcher().matches_inline(
1514
67.1M
                        look,
1515
67.1M
                        input.haystack(),
1516
67.1M
                        at,
1517
67.1M
                    ) {
1518
47.8M
                        return None;
1519
19.2M
                    }
1520
19.2M
                    sid = next;
1521
                }
1522
16.8M
                State::Union { ref alternates } => {
1523
16.8M
                    sid = match alternates.get(0) {
1524
0
                        None => return None,
1525
16.8M
                        Some(&sid) => sid,
1526
                    };
1527
16.8M
                    cache.stack.extend(
1528
16.8M
                        alternates[1..]
1529
16.8M
                            .iter()
1530
16.8M
                            .copied()
1531
16.8M
                            .rev()
1532
72.6M
                            .map(|sid| Frame::Step { sid, at }),
1533
                    );
1534
                }
1535
4.18M
                State::BinaryUnion { alt1, alt2 } => {
1536
4.18M
                    sid = alt1;
1537
4.18M
                    cache.stack.push(Frame::Step { sid: alt2, at });
1538
4.18M
                }
1539
43.8M
                State::Capture { next, slot, .. } => {
1540
43.8M
                    if slot.as_usize() < slots.len() {
1541
42.9M
                        cache.stack.push(Frame::RestoreCapture {
1542
42.9M
                            slot,
1543
42.9M
                            offset: slots[slot],
1544
42.9M
                        });
1545
42.9M
                        slots[slot] = NonMaxUsize::new(at);
1546
42.9M
                    }
1547
43.8M
                    sid = next;
1548
                }
1549
13.9k
                State::Fail => return None,
1550
19.7k
                State::Match { pattern_id } => {
1551
19.7k
                    return Some(HalfMatch::new(pattern_id, at));
1552
                }
1553
            }
1554
        }
1555
77.0M
    }
1556
}
1557
1558
/// An iterator over all non-overlapping matches for a fallible search.
1559
///
1560
/// The iterator yields a `Result<Match, MatchError` value until no more
1561
/// matches could be found.
1562
///
1563
/// The lifetime parameters are as follows:
1564
///
1565
/// * `'r` represents the lifetime of the BoundedBacktracker.
1566
/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1567
/// * `'h` represents the lifetime of the haystack being searched.
1568
///
1569
/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
1570
/// method.
1571
#[derive(Debug)]
1572
pub struct TryFindMatches<'r, 'c, 'h> {
1573
    re: &'r BoundedBacktracker,
1574
    cache: &'c mut Cache,
1575
    caps: Captures,
1576
    it: iter::Searcher<'h>,
1577
}
1578
1579
impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
1580
    type Item = Result<Match, MatchError>;
1581
1582
    #[inline]
1583
    fn next(&mut self) -> Option<Result<Match, MatchError>> {
1584
        // Splitting 'self' apart seems necessary to appease borrowck.
1585
        let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1586
            *self;
1587
        it.try_advance(|input| {
1588
            re.try_search(cache, input, caps)?;
1589
            Ok(caps.get_match())
1590
        })
1591
        .transpose()
1592
    }
1593
}
1594
1595
/// An iterator over all non-overlapping leftmost matches, with their capturing
1596
/// groups, for a fallible search.
1597
///
1598
/// The iterator yields a `Result<Captures, MatchError>` value until no more
1599
/// matches could be found.
1600
///
1601
/// The lifetime parameters are as follows:
1602
///
1603
/// * `'r` represents the lifetime of the BoundedBacktracker.
1604
/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1605
/// * `'h` represents the lifetime of the haystack being searched.
1606
///
1607
/// This iterator can be created with the
1608
/// [`BoundedBacktracker::try_captures_iter`] method.
1609
#[derive(Debug)]
1610
pub struct TryCapturesMatches<'r, 'c, 'h> {
1611
    re: &'r BoundedBacktracker,
1612
    cache: &'c mut Cache,
1613
    caps: Captures,
1614
    it: iter::Searcher<'h>,
1615
}
1616
1617
impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
1618
    type Item = Result<Captures, MatchError>;
1619
1620
    #[inline]
1621
    fn next(&mut self) -> Option<Result<Captures, MatchError>> {
1622
        // Splitting 'self' apart seems necessary to appease borrowck.
1623
        let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1624
            *self;
1625
        let _ = it
1626
            .try_advance(|input| {
1627
                re.try_search(cache, input, caps)?;
1628
                Ok(caps.get_match())
1629
            })
1630
            .transpose()?;
1631
        if caps.is_match() {
1632
            Some(Ok(caps.clone()))
1633
        } else {
1634
            None
1635
        }
1636
    }
1637
}
1638
1639
/// A cache represents mutable state that a [`BoundedBacktracker`] requires
1640
/// during a search.
1641
///
1642
/// For a given [`BoundedBacktracker`], its corresponding cache may be created
1643
/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
1644
/// They are equivalent in every way, except the former does not require
1645
/// explicitly importing `Cache`.
1646
///
1647
/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
1648
/// it was created. It may only be used with that `BoundedBacktracker`. A cache
1649
/// and its allocations may be re-purposed via [`Cache::reset`], in which case,
1650
/// it can only be used with the new `BoundedBacktracker` (and not the old
1651
/// one).
1652
#[derive(Clone, Debug)]
1653
pub struct Cache {
1654
    /// Stack used on the heap for doing backtracking instead of the
1655
    /// traditional recursive approach. We don't want recursion because then
1656
    /// we're likely to hit a stack overflow for bigger regexes.
1657
    stack: Vec<Frame>,
1658
    /// The set of (StateID, HaystackOffset) pairs that have been visited
1659
    /// by the backtracker within a single search. If such a pair has been
1660
    /// visited, then we avoid doing the work for that pair again. This is
1661
    /// what "bounds" the backtracking and prevents it from having worst case
1662
    /// exponential time.
1663
    visited: Visited,
1664
}
1665
1666
impl Cache {
1667
    /// Create a new [`BoundedBacktracker`] cache.
1668
    ///
1669
    /// A potentially more convenient routine to create a cache is
1670
    /// [`BoundedBacktracker::create_cache`], as it does not require also
1671
    /// importing the `Cache` type.
1672
    ///
1673
    /// If you want to reuse the returned `Cache` with some other
1674
    /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
1675
    /// desired `BoundedBacktracker`.
1676
15.3k
    pub fn new(re: &BoundedBacktracker) -> Cache {
1677
15.3k
        Cache { stack: vec![], visited: Visited::new(re) }
1678
15.3k
    }
1679
1680
    /// Reset this cache such that it can be used for searching with different
1681
    /// [`BoundedBacktracker`].
1682
    ///
1683
    /// A cache reset permits reusing memory already allocated in this cache
1684
    /// with a different `BoundedBacktracker`.
1685
    ///
1686
    /// # Example
1687
    ///
1688
    /// This shows how to re-purpose a cache for use with a different
1689
    /// `BoundedBacktracker`.
1690
    ///
1691
    /// ```
1692
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1693
    /// use regex_automata::{
1694
    ///     nfa::thompson::backtrack::BoundedBacktracker,
1695
    ///     Match,
1696
    /// };
1697
    ///
1698
    /// let re1 = BoundedBacktracker::new(r"\w")?;
1699
    /// let re2 = BoundedBacktracker::new(r"\W")?;
1700
    ///
1701
    /// let mut cache = re1.create_cache();
1702
    /// assert_eq!(
1703
    ///     Some(Ok(Match::must(0, 0..2))),
1704
    ///     re1.try_find_iter(&mut cache, "Δ").next(),
1705
    /// );
1706
    ///
1707
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
1708
    /// // incorrect results. In order to re-purpose the cache, we must reset
1709
    /// // it with the BoundedBacktracker we'd like to use it with.
1710
    /// //
1711
    /// // Similarly, after this reset, using the cache with 're1' is also not
1712
    /// // allowed.
1713
    /// cache.reset(&re2);
1714
    /// assert_eq!(
1715
    ///     Some(Ok(Match::must(0, 0..3))),
1716
    ///     re2.try_find_iter(&mut cache, "☃").next(),
1717
    /// );
1718
    ///
1719
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1720
    /// ```
1721
0
    pub fn reset(&mut self, re: &BoundedBacktracker) {
1722
0
        self.visited.reset(re);
1723
0
    }
1724
1725
    /// Returns the heap memory usage, in bytes, of this cache.
1726
    ///
1727
    /// This does **not** include the stack size used up by this cache. To
1728
    /// compute that, use `std::mem::size_of::<Cache>()`.
1729
0
    pub fn memory_usage(&self) -> usize {
1730
0
        self.stack.len() * core::mem::size_of::<Frame>()
1731
0
            + self.visited.memory_usage()
1732
0
    }
1733
1734
    /// Clears this cache. This should be called at the start of every search
1735
    /// to ensure we start with a clean slate.
1736
    ///
1737
    /// This also sets the length of the capturing groups used in the current
1738
    /// search. This permits an optimization where by 'SlotTable::for_state'
1739
    /// only returns the number of slots equivalent to the number of slots
1740
    /// given in the 'Captures' value. This may be less than the total number
1741
    /// of possible slots, e.g., when one only wants to track overall match
1742
    /// offsets. This in turn permits less copying of capturing group spans
1743
    /// in the BoundedBacktracker.
1744
40.5k
    fn setup_search(
1745
40.5k
        &mut self,
1746
40.5k
        re: &BoundedBacktracker,
1747
40.5k
        input: &Input<'_>,
1748
40.5k
    ) -> Result<(), MatchError> {
1749
40.5k
        self.stack.clear();
1750
40.5k
        self.visited.setup_search(re, input)?;
1751
40.5k
        Ok(())
1752
40.5k
    }
1753
}
1754
1755
/// Represents a stack frame on the heap while doing backtracking.
1756
///
1757
/// Instead of using explicit recursion for backtracking, we use a stack on
1758
/// the heap to keep track of things that we want to explore if the current
1759
/// backtracking branch turns out to not lead to a match.
1760
#[derive(Clone, Debug)]
1761
enum Frame {
1762
    /// Look for a match starting at `sid` and the given position in the
1763
    /// haystack.
1764
    Step { sid: StateID, at: usize },
1765
    /// Reset the given `slot` to the given `offset` (which might be `None`).
1766
    /// This effectively gives a "scope" to capturing groups, such that an
1767
    /// offset for a particular group only gets returned if the match goes
1768
    /// through that capturing group. If backtracking ends up going down a
1769
    /// different branch that results in a different offset (or perhaps none at
1770
    /// all), then this "restore capture" frame will cause the offset to get
1771
    /// reset.
1772
    RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
1773
}
1774
1775
/// A bitset that keeps track of whether a particular (StateID, offset) has
1776
/// been considered during backtracking. If it has already been visited, then
1777
/// backtracking skips it. This is what gives backtracking its "bound."
1778
#[derive(Clone, Debug)]
1779
struct Visited {
1780
    /// The actual underlying bitset. Each element in the bitset corresponds
1781
    /// to a particular (StateID, offset) pair. States correspond to the rows
1782
    /// and the offsets correspond to the columns.
1783
    ///
1784
    /// If our underlying NFA has N states and the haystack we're searching
1785
    /// has M bytes, then we have N*(M+1) entries in our bitset table. The
1786
    /// M+1 occurs because our matches are delayed by one byte (to support
1787
    /// look-around), and so we need to handle the end position itself rather
1788
    /// than stopping just before the end. (If there is no end position, then
1789
    /// it's treated as "end-of-input," which is matched by things like '$'.)
1790
    ///
1791
    /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
1792
    /// blocks.
1793
    ///
1794
    /// We use 'usize' to represent our blocks because it makes some of the
1795
    /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
1796
    /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
1797
    bitset: Vec<usize>,
1798
    /// The stride represents one plus length of the haystack we're searching
1799
    /// (as described above). The stride must be initialized for each search.
1800
    stride: usize,
1801
}
1802
1803
impl Visited {
1804
    /// The size of each block, in bits.
1805
    const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
1806
1807
    /// Create a new visited set for the given backtracker.
1808
    ///
1809
    /// The set is ready to use, but must be setup at the beginning of each
1810
    /// search by calling `setup_search`.
1811
15.3k
    fn new(re: &BoundedBacktracker) -> Visited {
1812
15.3k
        let mut visited = Visited { bitset: vec![], stride: 0 };
1813
15.3k
        visited.reset(re);
1814
15.3k
        visited
1815
15.3k
    }
1816
1817
    /// Insert the given (StateID, offset) pair into this set. If it already
1818
    /// exists, then this is a no-op and it returns false. Otherwise this
1819
    /// returns true.
1820
183M
    fn insert(&mut self, sid: StateID, at: usize) -> bool {
1821
183M
        let table_index = sid.as_usize() * self.stride + at;
1822
183M
        let block_index = table_index / Visited::BLOCK_SIZE;
1823
183M
        let bit = table_index % Visited::BLOCK_SIZE;
1824
183M
        let block_with_bit = 1 << bit;
1825
183M
        if self.bitset[block_index] & block_with_bit != 0 {
1826
23.5M
            return false;
1827
159M
        }
1828
159M
        self.bitset[block_index] |= block_with_bit;
1829
159M
        true
1830
183M
    }
1831
1832
    /// Reset this visited set to work with the given bounded backtracker.
1833
15.3k
    fn reset(&mut self, _: &BoundedBacktracker) {
1834
15.3k
        self.bitset.truncate(0);
1835
15.3k
    }
1836
1837
    /// Setup this visited set to work for a search using the given NFA
1838
    /// and input configuration. The NFA must be the same NFA used by the
1839
    /// BoundedBacktracker given to Visited::reset. Failing to call this might
1840
    /// result in panics or silently incorrect search behavior.
1841
40.5k
    fn setup_search(
1842
40.5k
        &mut self,
1843
40.5k
        re: &BoundedBacktracker,
1844
40.5k
        input: &Input<'_>,
1845
40.5k
    ) -> Result<(), MatchError> {
1846
        // Our haystack length is only the length of the span of the entire
1847
        // haystack that we'll be searching.
1848
40.5k
        let haylen = input.get_span().len();
1849
40.5k
        let err = || MatchError::haystack_too_long(haylen);
1850
        // Our stride is one more than the length of the input because our main
1851
        // search loop includes the position at input.end(). (And it does this
1852
        // because matches are delayed by one byte to account for look-around.)
1853
40.5k
        self.stride = haylen + 1;
1854
40.5k
        let needed_capacity =
1855
40.5k
            match re.get_nfa().states().len().checked_mul(self.stride) {
1856
0
                None => return Err(err()),
1857
40.5k
                Some(capacity) => capacity,
1858
            };
1859
40.5k
        let max_capacity = 8 * re.get_config().get_visited_capacity();
1860
40.5k
        if needed_capacity > max_capacity {
1861
0
            return Err(err());
1862
40.5k
        }
1863
40.5k
        let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
1864
40.5k
        self.bitset.truncate(needed_blocks);
1865
15.4M
        for block in self.bitset.iter_mut() {
1866
15.4M
            *block = 0;
1867
15.4M
        }
1868
40.5k
        if needed_blocks > self.bitset.len() {
1869
15.4k
            self.bitset.resize(needed_blocks, 0);
1870
25.0k
        }
1871
40.5k
        Ok(())
1872
40.5k
    }
1873
1874
    /// Return the heap memory usage, in bytes, of this visited set.
1875
0
    fn memory_usage(&self) -> usize {
1876
0
        self.bitset.len() * core::mem::size_of::<usize>()
1877
0
    }
1878
}
1879
1880
/// Integer division, but rounds up instead of down.
1881
71.5k
fn div_ceil(lhs: usize, rhs: usize) -> usize {
1882
71.5k
    if lhs % rhs == 0 {
1883
32.3k
        lhs / rhs
1884
    } else {
1885
39.2k
        (lhs / rhs) + 1
1886
    }
1887
71.5k
}
1888
1889
#[cfg(test)]
1890
mod tests {
1891
    use super::*;
1892
1893
    // This is a regression test for the maximum haystack length computation.
1894
    // Previously, it assumed that the total capacity of the backtracker's
1895
    // bitset would always be greater than the number of NFA states. But there
1896
    // is of course no guarantee that this is true. This regression test
1897
    // ensures that not only does `max_haystack_len` not panic, but that it
1898
    // should return `0`.
1899
    #[cfg(feature = "syntax")]
1900
    #[test]
1901
    fn max_haystack_len_overflow() {
1902
        let re = BoundedBacktracker::builder()
1903
            .configure(BoundedBacktracker::config().visited_capacity(10))
1904
            .build(r"[0-9A-Za-z]{100}")
1905
            .unwrap();
1906
        assert_eq!(0, re.max_haystack_len());
1907
    }
1908
}