Coverage Report

Created: 2026-01-09 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.4.13/src/hybrid/dfa.rs
Line
Count
Source
1
/*!
2
Types and routines specific to lazy DFAs.
3
4
This module is the home of [`hybrid::dfa::DFA`](DFA).
5
6
This module also contains a [`hybrid::dfa::Builder`](Builder) and a
7
[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
8
*/
9
10
use core::{iter, mem::size_of};
11
12
use alloc::vec::Vec;
13
14
use crate::{
15
    hybrid::{
16
        error::{BuildError, CacheError, StartError},
17
        id::{LazyStateID, LazyStateIDError},
18
        search,
19
    },
20
    nfa::thompson,
21
    util::{
22
        alphabet::{self, ByteClasses, ByteSet},
23
        determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
24
        empty,
25
        prefilter::Prefilter,
26
        primitives::{PatternID, StateID as NFAStateID},
27
        search::{
28
            Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
29
        },
30
        sparse_set::SparseSets,
31
        start::{self, Start, StartByteMap},
32
    },
33
};
34
35
/// The minimum number of states that a lazy DFA's cache size must support.
36
///
37
/// This is checked at time of construction to ensure that at least some small
38
/// number of states can fit in the given capacity allotment. If we can't fit
39
/// at least this number of states, then the thinking is that it's pretty
40
/// senseless to use the lazy DFA. More to the point, parts of the code do
41
/// assume that the cache can fit at least some small number of states.
42
const MIN_STATES: usize = SENTINEL_STATES + 2;
43
44
/// The number of "sentinel" states that get added to every lazy DFA.
45
///
46
/// These are special states indicating status conditions of a search: unknown,
47
/// dead and quit. These states in particular also use zero NFA states, so
48
/// their memory usage is quite small. This is relevant for computing the
49
/// minimum memory needed for a lazy DFA cache.
50
const SENTINEL_STATES: usize = 3;
51
52
/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
53
///
54
/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
55
/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
56
/// Indeed, both support precisely the same regex features with precisely the
57
/// same semantics.
58
///
59
/// Where as a `dense::DFA` must be completely built to handle any input before
60
/// it may be used for search, a lazy DFA starts off effectively empty. During
61
/// a search, a lazy DFA will build itself depending on whether it has already
62
/// computed the next transition or not. If it has, then it looks a lot like
63
/// a `dense::DFA` internally: it does a very fast table based access to find
64
/// the next transition. Otherwise, if the state hasn't been computed, then it
65
/// does determinization _for that specific transition_ to compute the next DFA
66
/// state.
67
///
68
/// The main selling point of a lazy DFA is that, in practice, it has
69
/// the performance profile of a `dense::DFA` without the weakness of it
70
/// taking worst case exponential time to build. Indeed, for each byte of
71
/// input, the lazy DFA will construct as most one new DFA state. Thus, a
72
/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
73
/// pattern.len()` and `n ~ haystack.len()`).
74
///
75
/// The main downsides of a lazy DFA are:
76
///
77
/// 1. It requires mutable "cache" space during search. This is where the
78
/// transition table, among other things, is stored.
79
/// 2. In pathological cases (e.g., if the cache is too small), it will run
80
/// out of room and either require a bigger cache capacity or will repeatedly
81
/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
82
/// will tend to be slower than a typical NFA simulation.
83
///
84
/// # Capabilities
85
///
86
/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
87
/// operations:
88
///
89
/// 1. Detection of a match.
90
/// 2. Location of the end of a match.
91
/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
92
/// is reported as well.
93
///
94
/// A notable absence from the above list of capabilities is the location of
95
/// the *start* of a match. In order to provide both the start and end of
96
/// a match, *two* lazy DFAs are required. This functionality is provided by a
97
/// [`Regex`](crate::hybrid::regex::Regex).
98
///
99
/// # Example
100
///
101
/// This shows how to build a lazy DFA with the default configuration and
102
/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
103
/// a cache and pass it to our search routine.
104
///
105
/// ```
106
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
107
///
108
/// let dfa = DFA::new("foo[0-9]+")?;
109
/// let mut cache = dfa.create_cache();
110
///
111
/// let expected = Some(HalfMatch::must(0, 8));
112
/// assert_eq!(expected, dfa.try_search_fwd(
113
///     &mut cache, &Input::new("foo12345"))?,
114
/// );
115
/// # Ok::<(), Box<dyn std::error::Error>>(())
116
/// ```
117
#[derive(Clone, Debug)]
118
pub struct DFA {
119
    config: Config,
120
    nfa: thompson::NFA,
121
    stride2: usize,
122
    start_map: StartByteMap,
123
    classes: ByteClasses,
124
    quitset: ByteSet,
125
    cache_capacity: usize,
126
}
127
128
impl DFA {
129
    /// Parse the given regular expression using a default configuration and
130
    /// return the corresponding lazy DFA.
131
    ///
132
    /// If you want a non-default configuration, then use the [`Builder`] to
133
    /// set your own configuration.
134
    ///
135
    /// # Example
136
    ///
137
    /// ```
138
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
139
    ///
140
    /// let dfa = DFA::new("foo[0-9]+bar")?;
141
    /// let mut cache = dfa.create_cache();
142
    ///
143
    /// let expected = HalfMatch::must(0, 11);
144
    /// assert_eq!(
145
    ///     Some(expected),
146
    ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
147
    /// );
148
    /// # Ok::<(), Box<dyn std::error::Error>>(())
149
    /// ```
150
    #[cfg(feature = "syntax")]
151
0
    pub fn new(pattern: &str) -> Result<DFA, BuildError> {
152
0
        DFA::builder().build(pattern)
153
0
    }
154
155
    /// Parse the given regular expressions using a default configuration and
156
    /// return the corresponding lazy multi-DFA.
157
    ///
158
    /// If you want a non-default configuration, then use the [`Builder`] to
159
    /// set your own configuration.
160
    ///
161
    /// # Example
162
    ///
163
    /// ```
164
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
165
    ///
166
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
167
    /// let mut cache = dfa.create_cache();
168
    ///
169
    /// let expected = HalfMatch::must(1, 3);
170
    /// assert_eq!(
171
    ///     Some(expected),
172
    ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
173
    /// );
174
    /// # Ok::<(), Box<dyn std::error::Error>>(())
175
    /// ```
176
    #[cfg(feature = "syntax")]
177
0
    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
178
0
        DFA::builder().build_many(patterns)
179
0
    }
180
181
    /// Create a new lazy DFA that matches every input.
182
    ///
183
    /// # Example
184
    ///
185
    /// ```
186
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
187
    ///
188
    /// let dfa = DFA::always_match()?;
189
    /// let mut cache = dfa.create_cache();
190
    ///
191
    /// let expected = HalfMatch::must(0, 0);
192
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
193
    ///     &mut cache, &Input::new(""))?,
194
    /// );
195
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
196
    ///     &mut cache, &Input::new("foo"))?,
197
    /// );
198
    /// # Ok::<(), Box<dyn std::error::Error>>(())
199
    /// ```
200
0
    pub fn always_match() -> Result<DFA, BuildError> {
201
0
        let nfa = thompson::NFA::always_match();
202
0
        Builder::new().build_from_nfa(nfa)
203
0
    }
204
205
    /// Create a new lazy DFA that never matches any input.
206
    ///
207
    /// # Example
208
    ///
209
    /// ```
210
    /// use regex_automata::{hybrid::dfa::DFA, Input};
211
    ///
212
    /// let dfa = DFA::never_match()?;
213
    /// let mut cache = dfa.create_cache();
214
    ///
215
    /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
216
    /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
217
    /// # Ok::<(), Box<dyn std::error::Error>>(())
218
    /// ```
219
0
    pub fn never_match() -> Result<DFA, BuildError> {
220
0
        let nfa = thompson::NFA::never_match();
221
0
        Builder::new().build_from_nfa(nfa)
222
0
    }
223
224
    /// Return a default configuration for a `DFA`.
225
    ///
226
    /// This is a convenience routine to avoid needing to import the [`Config`]
227
    /// type when customizing the construction of a lazy DFA.
228
    ///
229
    /// # Example
230
    ///
231
    /// This example shows how to build a lazy DFA that heuristically supports
232
    /// Unicode word boundaries.
233
    ///
234
    /// ```
235
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
236
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
237
    ///
238
    /// let re = DFA::builder()
239
    ///     .configure(DFA::config().unicode_word_boundary(true))
240
    ///     .build(r"\b\w+\b")?;
241
    /// let mut cache = re.create_cache();
242
    ///
243
    /// // Since our haystack is all ASCII, the DFA search sees then and knows
244
    /// // it is legal to interpret Unicode word boundaries as ASCII word
245
    /// // boundaries.
246
    /// let input = Input::new("!!foo!!");
247
    /// let expected = HalfMatch::must(0, 5);
248
    /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
249
    ///
250
    /// // But if our haystack contains non-ASCII, then the search will fail
251
    /// // with an error.
252
    /// let input = Input::new("!!βββ!!");
253
    /// let expected = MatchError::quit(b'\xCE', 2);
254
    /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
255
    ///
256
    /// # Ok::<(), Box<dyn std::error::Error>>(())
257
    /// ```
258
0
    pub fn config() -> Config {
259
0
        Config::new()
260
0
    }
261
262
    /// Return a builder for configuring the construction of a `Regex`.
263
    ///
264
    /// This is a convenience routine to avoid needing to import the
265
    /// [`Builder`] type in common cases.
266
    ///
267
    /// # Example
268
    ///
269
    /// This example shows how to use the builder to disable UTF-8 mode
270
    /// everywhere for lazy DFAs.
271
    ///
272
    /// ```
273
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
274
    /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
275
    ///
276
    /// let re = DFA::builder()
277
    ///     .syntax(syntax::Config::new().utf8(false))
278
    ///     .build(r"foo(?-u:[^b])ar.*")?;
279
    /// let mut cache = re.create_cache();
280
    ///
281
    /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
282
    /// let expected = Some(HalfMatch::must(0, 9));
283
    /// let got = re.try_search_fwd(&mut cache, &input)?;
284
    /// assert_eq!(expected, got);
285
    ///
286
    /// # Ok::<(), Box<dyn std::error::Error>>(())
287
    /// ```
288
0
    pub fn builder() -> Builder {
289
0
        Builder::new()
290
0
    }
291
292
    /// Create a new cache for this lazy DFA.
293
    ///
294
    /// The cache returned should only be used for searches for this
295
    /// lazy DFA. If you want to reuse the cache for another DFA, then
296
    /// you must call [`Cache::reset`] with that DFA (or, equivalently,
297
    /// [`DFA::reset_cache`]).
298
0
    pub fn create_cache(&self) -> Cache {
299
0
        Cache::new(self)
300
0
    }
301
302
    /// Reset the given cache such that it can be used for searching with the
303
    /// this lazy DFA (and only this DFA).
304
    ///
305
    /// A cache reset permits reusing memory already allocated in this cache
306
    /// with a different lazy DFA.
307
    ///
308
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
309
    /// lazy DFA has been configured to "give up" after it has cleared the
310
    /// cache a certain number of times.
311
    ///
312
    /// Any lazy state ID generated by the cache prior to resetting it is
313
    /// invalid after the reset.
314
    ///
315
    /// # Example
316
    ///
317
    /// This shows how to re-purpose a cache for use with a different DFA.
318
    ///
319
    /// ```
320
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
321
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
322
    ///
323
    /// let dfa1 = DFA::new(r"\w")?;
324
    /// let dfa2 = DFA::new(r"\W")?;
325
    ///
326
    /// let mut cache = dfa1.create_cache();
327
    /// assert_eq!(
328
    ///     Some(HalfMatch::must(0, 2)),
329
    ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
330
    /// );
331
    ///
332
    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
333
    /// // incorrect results. In order to re-purpose the cache, we must reset
334
    /// // it with the DFA we'd like to use it with.
335
    /// //
336
    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
337
    /// // allowed.
338
    /// dfa2.reset_cache(&mut cache);
339
    /// assert_eq!(
340
    ///     Some(HalfMatch::must(0, 3)),
341
    ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
342
    /// );
343
    ///
344
    /// # Ok::<(), Box<dyn std::error::Error>>(())
345
    /// ```
346
0
    pub fn reset_cache(&self, cache: &mut Cache) {
347
0
        Lazy::new(self, cache).reset_cache()
348
0
    }
349
350
    /// Returns the total number of patterns compiled into this lazy DFA.
351
    ///
352
    /// In the case of a DFA that contains no patterns, this returns `0`.
353
    ///
354
    /// # Example
355
    ///
356
    /// This example shows the pattern length for a DFA that never matches:
357
    ///
358
    /// ```
359
    /// use regex_automata::hybrid::dfa::DFA;
360
    ///
361
    /// let dfa = DFA::never_match()?;
362
    /// assert_eq!(dfa.pattern_len(), 0);
363
    /// # Ok::<(), Box<dyn std::error::Error>>(())
364
    /// ```
365
    ///
366
    /// And another example for a DFA that matches at every position:
367
    ///
368
    /// ```
369
    /// use regex_automata::hybrid::dfa::DFA;
370
    ///
371
    /// let dfa = DFA::always_match()?;
372
    /// assert_eq!(dfa.pattern_len(), 1);
373
    /// # Ok::<(), Box<dyn std::error::Error>>(())
374
    /// ```
375
    ///
376
    /// And finally, a DFA that was constructed from multiple patterns:
377
    ///
378
    /// ```
379
    /// use regex_automata::hybrid::dfa::DFA;
380
    ///
381
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
382
    /// assert_eq!(dfa.pattern_len(), 3);
383
    /// # Ok::<(), Box<dyn std::error::Error>>(())
384
    /// ```
385
0
    pub fn pattern_len(&self) -> usize {
386
0
        self.nfa.pattern_len()
387
0
    }
388
389
    /// Returns the equivalence classes that make up the alphabet for this DFA.
390
    ///
391
    /// Unless [`Config::byte_classes`] was disabled, it is possible that
392
    /// multiple distinct bytes are grouped into the same equivalence class
393
    /// if it is impossible for them to discriminate between a match and a
394
    /// non-match. This has the effect of reducing the overall alphabet size
395
    /// and in turn potentially substantially reducing the size of the DFA's
396
    /// transition table.
397
    ///
398
    /// The downside of using equivalence classes like this is that every state
399
    /// transition will automatically use this map to convert an arbitrary
400
    /// byte to its corresponding equivalence class. In practice this has a
401
    /// negligible impact on performance.
402
0
    pub fn byte_classes(&self) -> &ByteClasses {
403
0
        &self.classes
404
0
    }
405
406
    /// Returns this lazy DFA's configuration.
407
0
    pub fn get_config(&self) -> &Config {
408
0
        &self.config
409
0
    }
410
411
    /// Returns a reference to the underlying NFA.
412
0
    pub fn get_nfa(&self) -> &thompson::NFA {
413
0
        &self.nfa
414
0
    }
415
416
    /// Returns the stride, as a base-2 exponent, required for these
417
    /// equivalence classes.
418
    ///
419
    /// The stride is always the smallest power of 2 that is greater than or
420
    /// equal to the alphabet length. This is done so that converting between
421
    /// state IDs and indices can be done with shifts alone, which is much
422
    /// faster than integer division.
423
0
    fn stride2(&self) -> usize {
424
0
        self.stride2
425
0
    }
426
427
    /// Returns the total stride for every state in this lazy DFA. This
428
    /// corresponds to the total number of transitions used by each state in
429
    /// this DFA's transition table.
430
0
    fn stride(&self) -> usize {
431
0
        1 << self.stride2()
432
0
    }
433
434
    /// Returns the memory usage, in bytes, of this lazy DFA.
435
    ///
436
    /// This does **not** include the stack size used up by this lazy DFA. To
437
    /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
438
    /// include the size of the `Cache` used.
439
    ///
440
    /// This also does not include any heap memory used by the NFA inside of
441
    /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
442
    /// thus not owned by this hybrid NFA/DFA. More practically, several regex
443
    /// engines in this crate embed an NFA, and reporting the NFA's memory
444
    /// usage in all of them would likely result in reporting higher heap
445
    /// memory than is actually used.
446
0
    pub fn memory_usage(&self) -> usize {
447
        // The only thing that uses heap memory in a DFA is the NFA. But the
448
        // NFA has shared ownership, so reporting its memory as part of the
449
        // hybrid DFA is likely to lead to double-counting the NFA memory
450
        // somehow. In particular, this DFA does not really own an NFA, so
451
        // including it in the DFA's memory usage doesn't seem semantically
452
        // correct.
453
0
        0
454
0
    }
455
}
456
457
impl DFA {
458
    /// Executes a forward search and returns the end position of the leftmost
459
    /// match that is found. If no match exists, then `None` is returned.
460
    ///
461
    /// In particular, this method continues searching even after it enters
462
    /// a match state. The search only terminates once it has reached the
463
    /// end of the input or when it has entered a dead or quit state. Upon
464
    /// termination, the position of the last byte seen while still in a match
465
    /// state is returned.
466
    ///
467
    /// # Errors
468
    ///
469
    /// This routine errors if the search could not complete. This can occur
470
    /// in a number of circumstances:
471
    ///
472
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
473
    /// For example, setting quit bytes or enabling heuristic support for
474
    /// Unicode word boundaries. The default configuration does not enable any
475
    /// option that could result in the lazy DFA quitting.
476
    /// * The configuration of the lazy DFA may also permit it to "give up"
477
    /// on a search if it makes ineffective use of its transition table
478
    /// cache. The default configuration does not enable this by default,
479
    /// although it is typically a good idea to.
480
    /// * When the provided `Input` configuration is not supported. For
481
    /// example, by providing an unsupported anchor mode.
482
    ///
483
    /// When a search returns an error, callers cannot know whether a match
484
    /// exists or not.
485
    ///
486
    /// # Example
487
    ///
488
    /// This example shows how to run a basic search.
489
    ///
490
    /// ```
491
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
492
    ///
493
    /// let dfa = DFA::new("foo[0-9]+")?;
494
    /// let mut cache = dfa.create_cache();
495
    /// let expected = HalfMatch::must(0, 8);
496
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
497
    ///     &mut cache, &Input::new("foo12345"))?,
498
    /// );
499
    ///
500
    /// // Even though a match is found after reading the first byte (`a`),
501
    /// // the leftmost first match semantics demand that we find the earliest
502
    /// // match that prefers earlier parts of the pattern over later parts.
503
    /// let dfa = DFA::new("abc|a")?;
504
    /// let mut cache = dfa.create_cache();
505
    /// let expected = HalfMatch::must(0, 3);
506
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
507
    ///     &mut cache, &Input::new("abc"))?,
508
    /// );
509
    ///
510
    /// # Ok::<(), Box<dyn std::error::Error>>(())
511
    /// ```
512
    ///
513
    /// # Example: specific pattern search
514
    ///
515
    /// This example shows how to build a lazy multi-DFA that permits searching
516
    /// for specific patterns.
517
    ///
518
    /// ```
519
    /// use regex_automata::{
520
    ///     hybrid::dfa::DFA,
521
    ///     Anchored, HalfMatch, PatternID, Input,
522
    /// };
523
    ///
524
    /// let dfa = DFA::builder()
525
    ///     .configure(DFA::config().starts_for_each_pattern(true))
526
    ///     .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
527
    /// let mut cache = dfa.create_cache();
528
    /// let haystack = "foo123";
529
    ///
530
    /// // Since we are using the default leftmost-first match and both
531
    /// // patterns match at the same starting position, only the first pattern
532
    /// // will be returned in this case when doing a search for any of the
533
    /// // patterns.
534
    /// let expected = Some(HalfMatch::must(0, 6));
535
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
536
    /// assert_eq!(expected, got);
537
    ///
538
    /// // But if we want to check whether some other pattern matches, then we
539
    /// // can provide its pattern ID.
540
    /// let expected = Some(HalfMatch::must(1, 6));
541
    /// let input = Input::new(haystack)
542
    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
543
    /// let got = dfa.try_search_fwd(&mut cache, &input)?;
544
    /// assert_eq!(expected, got);
545
    ///
546
    /// # Ok::<(), Box<dyn std::error::Error>>(())
547
    /// ```
548
    ///
549
    /// # Example: specifying the bounds of a search
550
    ///
551
    /// This example shows how providing the bounds of a search can produce
552
    /// different results than simply sub-slicing the haystack.
553
    ///
554
    /// ```
555
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
556
    ///
557
    /// // N.B. We disable Unicode here so that we use a simple ASCII word
558
    /// // boundary. Alternatively, we could enable heuristic support for
559
    /// // Unicode word boundaries since our haystack is pure ASCII.
560
    /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
561
    /// let mut cache = dfa.create_cache();
562
    /// let haystack = "foo123bar";
563
    ///
564
    /// // Since we sub-slice the haystack, the search doesn't know about the
565
    /// // larger context and assumes that `123` is surrounded by word
566
    /// // boundaries. And of course, the match position is reported relative
567
    /// // to the sub-slice as well, which means we get `3` instead of `6`.
568
    /// let expected = Some(HalfMatch::must(0, 3));
569
    /// let got = dfa.try_search_fwd(
570
    ///     &mut cache,
571
    ///     &Input::new(&haystack[3..6]),
572
    /// )?;
573
    /// assert_eq!(expected, got);
574
    ///
575
    /// // But if we provide the bounds of the search within the context of the
576
    /// // entire haystack, then the search can take the surrounding context
577
    /// // into account. (And if we did find a match, it would be reported
578
    /// // as a valid offset into `haystack` instead of its sub-slice.)
579
    /// let expected = None;
580
    /// let got = dfa.try_search_fwd(
581
    ///     &mut cache,
582
    ///     &Input::new(haystack).range(3..6),
583
    /// )?;
584
    /// assert_eq!(expected, got);
585
    ///
586
    /// # Ok::<(), Box<dyn std::error::Error>>(())
587
    /// ```
588
    #[inline]
589
0
    pub fn try_search_fwd(
590
0
        &self,
591
0
        cache: &mut Cache,
592
0
        input: &Input<'_>,
593
0
    ) -> Result<Option<HalfMatch>, MatchError> {
594
0
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
595
0
        let hm = match search::find_fwd(self, cache, input)? {
596
0
            None => return Ok(None),
597
0
            Some(hm) if !utf8empty => return Ok(Some(hm)),
598
0
            Some(hm) => hm,
599
        };
600
        // We get to this point when we know our DFA can match the empty string
601
        // AND when UTF-8 mode is enabled. In this case, we skip any matches
602
        // whose offset splits a codepoint. Such a match is necessarily a
603
        // zero-width match, because UTF-8 mode requires the underlying NFA
604
        // to be built such that all non-empty matches span valid UTF-8.
605
        // Therefore, any match that ends in the middle of a codepoint cannot
606
        // be part of a span of valid UTF-8 and thus must be an empty match.
607
        // In such cases, we skip it, so as not to report matches that split a
608
        // codepoint.
609
        //
610
        // Note that this is not a checked assumption. Callers *can* provide an
611
        // NFA with UTF-8 mode enabled but produces non-empty matches that span
612
        // invalid UTF-8. But doing so is documented to result in unspecified
613
        // behavior.
614
0
        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
615
0
            let got = search::find_fwd(self, cache, input)?;
616
0
            Ok(got.map(|hm| (hm, hm.offset())))
617
0
        })
618
0
    }
619
620
    /// Executes a reverse search and returns the start of the position of the
621
    /// leftmost match that is found. If no match exists, then `None` is
622
    /// returned.
623
    ///
624
    /// # Errors
625
    ///
626
    /// This routine errors if the search could not complete. This can occur
627
    /// in a number of circumstances:
628
    ///
629
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
630
    /// For example, setting quit bytes or enabling heuristic support for
631
    /// Unicode word boundaries. The default configuration does not enable any
632
    /// option that could result in the lazy DFA quitting.
633
    /// * The configuration of the lazy DFA may also permit it to "give up"
634
    /// on a search if it makes ineffective use of its transition table
635
    /// cache. The default configuration does not enable this by default,
636
    /// although it is typically a good idea to.
637
    /// * When the provided `Input` configuration is not supported. For
638
    /// example, by providing an unsupported anchor mode.
639
    ///
640
    /// When a search returns an error, callers cannot know whether a match
641
    /// exists or not.
642
    ///
643
    /// # Example
644
    ///
645
    /// This routine is principally useful when used in
646
    /// conjunction with the
647
    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
648
    /// configuration. In general, it's unlikely to be correct to use both
649
    /// `try_search_fwd` and `try_search_rev` with the same DFA since any
650
    /// particular DFA will only support searching in one direction with
651
    /// respect to the pattern.
652
    ///
653
    /// ```
654
    /// use regex_automata::{
655
    ///     nfa::thompson,
656
    ///     hybrid::dfa::DFA,
657
    ///     HalfMatch, Input,
658
    /// };
659
    ///
660
    /// let dfa = DFA::builder()
661
    ///     .thompson(thompson::Config::new().reverse(true))
662
    ///     .build("foo[0-9]+")?;
663
    /// let mut cache = dfa.create_cache();
664
    /// let expected = HalfMatch::must(0, 0);
665
    /// assert_eq!(
666
    ///     Some(expected),
667
    ///     dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
668
    /// );
669
    ///
670
    /// // Even though a match is found after reading the last byte (`c`),
671
    /// // the leftmost first match semantics demand that we find the earliest
672
    /// // match that prefers earlier parts of the pattern over latter parts.
673
    /// let dfa = DFA::builder()
674
    ///     .thompson(thompson::Config::new().reverse(true))
675
    ///     .build("abc|c")?;
676
    /// let mut cache = dfa.create_cache();
677
    /// let expected = HalfMatch::must(0, 0);
678
    /// assert_eq!(Some(expected), dfa.try_search_rev(
679
    ///     &mut cache, &Input::new("abc"))?,
680
    /// );
681
    ///
682
    /// # Ok::<(), Box<dyn std::error::Error>>(())
683
    /// ```
684
    ///
685
    /// # Example: UTF-8 mode
686
    ///
687
    /// This examples demonstrates that UTF-8 mode applies to reverse
688
    /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
689
    /// matches reported must correspond to valid UTF-8 spans. This includes
690
    /// prohibiting zero-width matches that split a codepoint.
691
    ///
692
    /// UTF-8 mode is enabled by default. Notice below how the only zero-width
693
    /// matches reported are those at UTF-8 boundaries:
694
    ///
695
    /// ```
696
    /// use regex_automata::{
697
    ///     hybrid::dfa::DFA,
698
    ///     nfa::thompson,
699
    ///     HalfMatch, Input, MatchKind,
700
    /// };
701
    ///
702
    /// let dfa = DFA::builder()
703
    ///     .thompson(thompson::Config::new().reverse(true))
704
    ///     .build(r"")?;
705
    /// let mut cache = dfa.create_cache();
706
    ///
707
    /// // Run the reverse DFA to collect all matches.
708
    /// let mut input = Input::new("☃");
709
    /// let mut matches = vec![];
710
    /// loop {
711
    ///     match dfa.try_search_rev(&mut cache, &input)? {
712
    ///         None => break,
713
    ///         Some(hm) => {
714
    ///             matches.push(hm);
715
    ///             if hm.offset() == 0 || input.end() == 0 {
716
    ///                 break;
717
    ///             } else if hm.offset() < input.end() {
718
    ///                 input.set_end(hm.offset());
719
    ///             } else {
720
    ///                 // This is only necessary to handle zero-width
721
    ///                 // matches, which of course occur in this example.
722
    ///                 // Without this, the search would never advance
723
    ///                 // backwards beyond the initial match.
724
    ///                 input.set_end(input.end() - 1);
725
    ///             }
726
    ///         }
727
    ///     }
728
    /// }
729
    ///
730
    /// // No matches split a codepoint.
731
    /// let expected = vec![
732
    ///     HalfMatch::must(0, 3),
733
    ///     HalfMatch::must(0, 0),
734
    /// ];
735
    /// assert_eq!(expected, matches);
736
    ///
737
    /// # Ok::<(), Box<dyn std::error::Error>>(())
738
    /// ```
739
    ///
740
    /// Now let's look at the same example, but with UTF-8 mode on the
741
    /// underlying NFA disabled:
742
    ///
743
    /// ```
744
    /// use regex_automata::{
745
    ///     hybrid::dfa::DFA,
746
    ///     nfa::thompson,
747
    ///     HalfMatch, Input, MatchKind,
748
    /// };
749
    ///
750
    /// let dfa = DFA::builder()
751
    ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
752
    ///     .build(r"")?;
753
    /// let mut cache = dfa.create_cache();
754
    ///
755
    /// // Run the reverse DFA to collect all matches.
756
    /// let mut input = Input::new("☃");
757
    /// let mut matches = vec![];
758
    /// loop {
759
    ///     match dfa.try_search_rev(&mut cache, &input)? {
760
    ///         None => break,
761
    ///         Some(hm) => {
762
    ///             matches.push(hm);
763
    ///             if hm.offset() == 0 || input.end() == 0 {
764
    ///                 break;
765
    ///             } else if hm.offset() < input.end() {
766
    ///                 input.set_end(hm.offset());
767
    ///             } else {
768
    ///                 // This is only necessary to handle zero-width
769
    ///                 // matches, which of course occur in this example.
770
    ///                 // Without this, the search would never advance
771
    ///                 // backwards beyond the initial match.
772
    ///                 input.set_end(input.end() - 1);
773
    ///             }
774
    ///         }
775
    ///     }
776
    /// }
777
    ///
778
    /// // No matches split a codepoint.
779
    /// let expected = vec![
780
    ///     HalfMatch::must(0, 3),
781
    ///     HalfMatch::must(0, 2),
782
    ///     HalfMatch::must(0, 1),
783
    ///     HalfMatch::must(0, 0),
784
    /// ];
785
    /// assert_eq!(expected, matches);
786
    ///
787
    /// # Ok::<(), Box<dyn std::error::Error>>(())
788
    /// ```
789
    #[inline]
790
0
    pub fn try_search_rev(
791
0
        &self,
792
0
        cache: &mut Cache,
793
0
        input: &Input<'_>,
794
0
    ) -> Result<Option<HalfMatch>, MatchError> {
795
0
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
796
0
        let hm = match search::find_rev(self, cache, input)? {
797
0
            None => return Ok(None),
798
0
            Some(hm) if !utf8empty => return Ok(Some(hm)),
799
0
            Some(hm) => hm,
800
        };
801
0
        empty::skip_splits_rev(input, hm, hm.offset(), |input| {
802
0
            let got = search::find_rev(self, cache, input)?;
803
0
            Ok(got.map(|hm| (hm, hm.offset())))
804
0
        })
805
0
    }
806
807
    /// Executes an overlapping forward search and returns the end position of
808
    /// matches as they are found. If no match exists, then `None` is returned.
809
    ///
810
    /// This routine is principally only useful when searching for multiple
811
    /// patterns on inputs where multiple patterns may match the same regions
812
    /// of text. In particular, callers must preserve the automaton's search
813
    /// state from prior calls so that the implementation knows where the last
814
    /// match occurred.
815
    ///
816
    /// When using this routine to implement an iterator of overlapping
817
    /// matches, the `start` of the search should remain invariant throughout
818
    /// iteration. The `OverlappingState` given to the search will keep track
819
    /// of the current position of the search. (This is because multiple
820
    /// matches may be reported at the same position, so only the search
821
    /// implementation itself knows when to advance the position.)
822
    ///
823
    /// If for some reason you want the search to forget about its previous
824
    /// state and restart the search at a particular position, then setting the
825
    /// state to [`OverlappingState::start`] will accomplish that.
826
    ///
827
    /// # Errors
828
    ///
829
    /// This routine errors if the search could not complete. This can occur
830
    /// in a number of circumstances:
831
    ///
832
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
833
    /// For example, setting quit bytes or enabling heuristic support for
834
    /// Unicode word boundaries. The default configuration does not enable any
835
    /// option that could result in the lazy DFA quitting.
836
    /// * The configuration of the lazy DFA may also permit it to "give up"
837
    /// on a search if it makes ineffective use of its transition table
838
    /// cache. The default configuration does not enable this by default,
839
    /// although it is typically a good idea to.
840
    /// * When the provided `Input` configuration is not supported. For
841
    /// example, by providing an unsupported anchor mode.
842
    ///
843
    /// When a search returns an error, callers cannot know whether a match
844
    /// exists or not.
845
    ///
846
    /// # Example
847
    ///
848
    /// This example shows how to run a basic overlapping search. Notice
849
    /// that we build the automaton with a `MatchKind::All` configuration.
850
    /// Overlapping searches are unlikely to work as one would expect when
851
    /// using the default `MatchKind::LeftmostFirst` match semantics, since
852
    /// leftmost-first matching is fundamentally incompatible with overlapping
853
    /// searches. Namely, overlapping searches need to report matches as they
854
    /// are seen, where as leftmost-first searches will continue searching even
855
    /// after a match has been observed in order to find the conventional end
856
    /// position of the match. More concretely, leftmost-first searches use
857
    /// dead states to terminate a search after a specific match can no longer
858
    /// be extended. Overlapping searches instead do the opposite by continuing
859
    /// the search to find totally new matches (potentially of other patterns).
860
    ///
861
    /// ```
862
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
863
    /// use regex_automata::{
864
    ///     hybrid::dfa::{DFA, OverlappingState},
865
    ///     HalfMatch, Input, MatchKind,
866
    /// };
867
    ///
868
    /// let dfa = DFA::builder()
869
    ///     .configure(DFA::config().match_kind(MatchKind::All))
870
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
871
    /// let mut cache = dfa.create_cache();
872
    ///
873
    /// let haystack = "@foo";
874
    /// let mut state = OverlappingState::start();
875
    ///
876
    /// let expected = Some(HalfMatch::must(1, 4));
877
    /// dfa.try_search_overlapping_fwd(
878
    ///     &mut cache, &Input::new(haystack), &mut state,
879
    /// )?;
880
    /// assert_eq!(expected, state.get_match());
881
    ///
882
    /// // The first pattern also matches at the same position, so re-running
883
    /// // the search will yield another match. Notice also that the first
884
    /// // pattern is returned after the second. This is because the second
885
    /// // pattern begins its match before the first, is therefore an earlier
886
    /// // match and is thus reported first.
887
    /// let expected = Some(HalfMatch::must(0, 4));
888
    /// dfa.try_search_overlapping_fwd(
889
    ///     &mut cache, &Input::new(haystack), &mut state,
890
    /// )?;
891
    /// assert_eq!(expected, state.get_match());
892
    ///
893
    /// # Ok::<(), Box<dyn std::error::Error>>(())
894
    /// ```
895
    #[inline]
896
0
    pub fn try_search_overlapping_fwd(
897
0
        &self,
898
0
        cache: &mut Cache,
899
0
        input: &Input<'_>,
900
0
        state: &mut OverlappingState,
901
0
    ) -> Result<(), MatchError> {
902
0
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
903
0
        search::find_overlapping_fwd(self, cache, input, state)?;
904
0
        match state.get_match() {
905
0
            None => Ok(()),
906
0
            Some(_) if !utf8empty => Ok(()),
907
0
            Some(_) => skip_empty_utf8_splits_overlapping(
908
0
                input,
909
0
                state,
910
0
                |input, state| {
911
0
                    search::find_overlapping_fwd(self, cache, input, state)
912
0
                },
913
            ),
914
        }
915
0
    }
916
917
    /// Executes a reverse overlapping search and returns the start of the
918
    /// position of the leftmost match that is found. If no match exists, then
919
    /// `None` is returned.
920
    ///
921
    /// When using this routine to implement an iterator of overlapping
922
    /// matches, the `start` of the search should remain invariant throughout
923
    /// iteration. The `OverlappingState` given to the search will keep track
924
    /// of the current position of the search. (This is because multiple
925
    /// matches may be reported at the same position, so only the search
926
    /// implementation itself knows when to advance the position.)
927
    ///
928
    /// If for some reason you want the search to forget about its previous
929
    /// state and restart the search at a particular position, then setting the
930
    /// state to [`OverlappingState::start`] will accomplish that.
931
    ///
932
    /// # Errors
933
    ///
934
    /// This routine errors if the search could not complete. This can occur
935
    /// in a number of circumstances:
936
    ///
937
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
938
    /// For example, setting quit bytes or enabling heuristic support for
939
    /// Unicode word boundaries. The default configuration does not enable any
940
    /// option that could result in the lazy DFA quitting.
941
    /// * The configuration of the lazy DFA may also permit it to "give up"
942
    /// on a search if it makes ineffective use of its transition table
943
    /// cache. The default configuration does not enable this by default,
944
    /// although it is typically a good idea to.
945
    /// * When the provided `Input` configuration is not supported. For
946
    /// example, by providing an unsupported anchor mode.
947
    ///
948
    /// When a search returns an error, callers cannot know whether a match
949
    /// exists or not.
950
    ///
951
    /// # Example: UTF-8 mode
952
    ///
953
    /// This examples demonstrates that UTF-8 mode applies to reverse
954
    /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
955
    /// matches reported must correspond to valid UTF-8 spans. This includes
956
    /// prohibiting zero-width matches that split a codepoint.
957
    ///
958
    /// UTF-8 mode is enabled by default. Notice below how the only zero-width
959
    /// matches reported are those at UTF-8 boundaries:
960
    ///
961
    /// ```
962
    /// use regex_automata::{
963
    ///     hybrid::dfa::{DFA, OverlappingState},
964
    ///     nfa::thompson,
965
    ///     HalfMatch, Input, MatchKind,
966
    /// };
967
    ///
968
    /// let dfa = DFA::builder()
969
    ///     .configure(DFA::config().match_kind(MatchKind::All))
970
    ///     .thompson(thompson::Config::new().reverse(true))
971
    ///     .build_many(&[r"", r"☃"])?;
972
    /// let mut cache = dfa.create_cache();
973
    ///
974
    /// // Run the reverse DFA to collect all matches.
975
    /// let input = Input::new("☃");
976
    /// let mut state = OverlappingState::start();
977
    /// let mut matches = vec![];
978
    /// loop {
979
    ///     dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
980
    ///     match state.get_match() {
981
    ///         None => break,
982
    ///         Some(hm) => matches.push(hm),
983
    ///     }
984
    /// }
985
    ///
986
    /// // No matches split a codepoint.
987
    /// let expected = vec![
988
    ///     HalfMatch::must(0, 3),
989
    ///     HalfMatch::must(1, 0),
990
    ///     HalfMatch::must(0, 0),
991
    /// ];
992
    /// assert_eq!(expected, matches);
993
    ///
994
    /// # Ok::<(), Box<dyn std::error::Error>>(())
995
    /// ```
996
    ///
997
    /// Now let's look at the same example, but with UTF-8 mode on the
998
    /// underlying NFA disabled:
999
    ///
1000
    /// ```
1001
    /// use regex_automata::{
1002
    ///     hybrid::dfa::{DFA, OverlappingState},
1003
    ///     nfa::thompson,
1004
    ///     HalfMatch, Input, MatchKind,
1005
    /// };
1006
    ///
1007
    /// let dfa = DFA::builder()
1008
    ///     .configure(DFA::config().match_kind(MatchKind::All))
1009
    ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
1010
    ///     .build_many(&[r"", r"☃"])?;
1011
    /// let mut cache = dfa.create_cache();
1012
    ///
1013
    /// // Run the reverse DFA to collect all matches.
1014
    /// let input = Input::new("☃");
1015
    /// let mut state = OverlappingState::start();
1016
    /// let mut matches = vec![];
1017
    /// loop {
1018
    ///     dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
1019
    ///     match state.get_match() {
1020
    ///         None => break,
1021
    ///         Some(hm) => matches.push(hm),
1022
    ///     }
1023
    /// }
1024
    ///
1025
    /// // Now *all* positions match, even within a codepoint,
1026
    /// // because we lifted the requirement that matches
1027
    /// // correspond to valid UTF-8 spans.
1028
    /// let expected = vec![
1029
    ///     HalfMatch::must(0, 3),
1030
    ///     HalfMatch::must(0, 2),
1031
    ///     HalfMatch::must(0, 1),
1032
    ///     HalfMatch::must(1, 0),
1033
    ///     HalfMatch::must(0, 0),
1034
    /// ];
1035
    /// assert_eq!(expected, matches);
1036
    ///
1037
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1038
    /// ```
1039
    #[inline]
1040
0
    pub fn try_search_overlapping_rev(
1041
0
        &self,
1042
0
        cache: &mut Cache,
1043
0
        input: &Input<'_>,
1044
0
        state: &mut OverlappingState,
1045
0
    ) -> Result<(), MatchError> {
1046
0
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1047
0
        search::find_overlapping_rev(self, cache, input, state)?;
1048
0
        match state.get_match() {
1049
0
            None => Ok(()),
1050
0
            Some(_) if !utf8empty => Ok(()),
1051
0
            Some(_) => skip_empty_utf8_splits_overlapping(
1052
0
                input,
1053
0
                state,
1054
0
                |input, state| {
1055
0
                    search::find_overlapping_rev(self, cache, input, state)
1056
0
                },
1057
            ),
1058
        }
1059
0
    }
1060
1061
    /// Writes the set of patterns that match anywhere in the given search
1062
    /// configuration to `patset`. If multiple patterns match at the same
1063
    /// position and the underlying DFA supports overlapping matches, then all
1064
    /// matching patterns are written to the given set.
1065
    ///
1066
    /// Unless all of the patterns in this DFA are anchored, then generally
1067
    /// speaking, this will visit every byte in the haystack.
1068
    ///
1069
    /// This search routine *does not* clear the pattern set. This gives some
1070
    /// flexibility to the caller (e.g., running multiple searches with the
1071
    /// same pattern set), but does make the API bug-prone if you're reusing
1072
    /// the same pattern set for multiple searches but intended them to be
1073
    /// independent.
1074
    ///
1075
    /// If a pattern ID matched but the given `PatternSet` does not have
1076
    /// sufficient capacity to store it, then it is not inserted and silently
1077
    /// dropped.
1078
    ///
1079
    /// # Errors
1080
    ///
1081
    /// This routine errors if the search could not complete. This can occur
1082
    /// in a number of circumstances:
1083
    ///
1084
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
1085
    /// For example, setting quit bytes or enabling heuristic support for
1086
    /// Unicode word boundaries. The default configuration does not enable any
1087
    /// option that could result in the lazy DFA quitting.
1088
    /// * The configuration of the lazy DFA may also permit it to "give up"
1089
    /// on a search if it makes ineffective use of its transition table
1090
    /// cache. The default configuration does not enable this by default,
1091
    /// although it is typically a good idea to.
1092
    /// * When the provided `Input` configuration is not supported. For
1093
    /// example, by providing an unsupported anchor mode.
1094
    ///
1095
    /// When a search returns an error, callers cannot know whether a match
1096
    /// exists or not.
1097
    ///
1098
    /// # Example
1099
    ///
1100
    /// This example shows how to find all matching patterns in a haystack,
1101
    /// even when some patterns match at the same position as other patterns.
1102
    ///
1103
    /// ```
1104
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1105
    /// use regex_automata::{
1106
    ///     hybrid::dfa::DFA,
1107
    ///     Input, MatchKind, PatternSet,
1108
    /// };
1109
    ///
1110
    /// let patterns = &[
1111
    ///     r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1112
    /// ];
1113
    /// let dfa = DFA::builder()
1114
    ///     .configure(DFA::config().match_kind(MatchKind::All))
1115
    ///     .build_many(patterns)?;
1116
    /// let mut cache = dfa.create_cache();
1117
    ///
1118
    /// let input = Input::new("foobar");
1119
    /// let mut patset = PatternSet::new(dfa.pattern_len());
1120
    /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
1121
    /// let expected = vec![0, 2, 3, 4, 6];
1122
    /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1123
    /// assert_eq!(expected, got);
1124
    ///
1125
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1126
    /// ```
1127
    #[inline]
1128
0
    pub fn try_which_overlapping_matches(
1129
0
        &self,
1130
0
        cache: &mut Cache,
1131
0
        input: &Input<'_>,
1132
0
        patset: &mut PatternSet,
1133
0
    ) -> Result<(), MatchError> {
1134
0
        let mut state = OverlappingState::start();
1135
0
        while let Some(m) = {
1136
0
            self.try_search_overlapping_fwd(cache, input, &mut state)?;
1137
0
            state.get_match()
1138
        } {
1139
0
            let _ = patset.try_insert(m.pattern());
1140
            // There's nothing left to find, so we can stop. Or the caller
1141
            // asked us to.
1142
0
            if patset.is_full() || input.get_earliest() {
1143
0
                break;
1144
0
            }
1145
        }
1146
0
        Ok(())
1147
0
    }
1148
}
1149
1150
impl DFA {
1151
    /// Transitions from the current state to the next state, given the next
1152
    /// byte of input.
1153
    ///
1154
    /// The given cache is used to either reuse pre-computed state
1155
    /// transitions, or to store this newly computed transition for future
1156
    /// reuse. Thus, this routine guarantees that it will never return a state
1157
    /// ID that has an "unknown" tag.
1158
    ///
1159
    /// # State identifier validity
1160
    ///
1161
    /// The only valid value for `current` is the lazy state ID returned
1162
    /// by the most recent call to `next_state`, `next_state_untagged`,
1163
    /// `next_state_untagged_unchecked`, `start_state_forward` or
1164
    /// `state_state_reverse` for the given `cache`. Any state ID returned from
1165
    /// prior calls to these routines (with the same `cache`) is considered
1166
    /// invalid (even if it gives an appearance of working). State IDs returned
1167
    /// from _any_ prior call for different `cache` values are also always
1168
    /// invalid.
1169
    ///
1170
    /// The returned ID is always a valid ID when `current` refers to a valid
1171
    /// ID. Moreover, this routine is defined for all possible values of
1172
    /// `input`.
1173
    ///
1174
    /// These validity rules are not checked, even in debug mode. Callers are
1175
    /// required to uphold these rules themselves.
1176
    ///
1177
    /// Violating these state ID validity rules will not sacrifice memory
1178
    /// safety, but _may_ produce an incorrect result or a panic.
1179
    ///
1180
    /// # Panics
1181
    ///
1182
    /// If the given ID does not refer to a valid state, then this routine
1183
    /// may panic but it also may not panic and instead return an invalid or
1184
    /// incorrect ID.
1185
    ///
1186
    /// # Example
1187
    ///
1188
    /// This shows a simplistic example for walking a lazy DFA for a given
1189
    /// haystack by using the `next_state` method.
1190
    ///
1191
    /// ```
1192
    /// use regex_automata::{hybrid::dfa::DFA, Input};
1193
    ///
1194
    /// let dfa = DFA::new(r"[a-z]+r")?;
1195
    /// let mut cache = dfa.create_cache();
1196
    /// let haystack = "bar".as_bytes();
1197
    ///
1198
    /// // The start state is determined by inspecting the position and the
1199
    /// // initial bytes of the haystack.
1200
    /// let mut sid = dfa.start_state_forward(
1201
    ///     &mut cache, &Input::new(haystack),
1202
    /// )?;
1203
    /// // Walk all the bytes in the haystack.
1204
    /// for &b in haystack {
1205
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
1206
    /// }
1207
    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1208
    /// // special "EOI" transition at the end of the search.
1209
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1210
    /// assert!(sid.is_match());
1211
    ///
1212
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1213
    /// ```
1214
    #[inline]
1215
0
    pub fn next_state(
1216
0
        &self,
1217
0
        cache: &mut Cache,
1218
0
        current: LazyStateID,
1219
0
        input: u8,
1220
0
    ) -> Result<LazyStateID, CacheError> {
1221
0
        let class = usize::from(self.classes.get(input));
1222
0
        let offset = current.as_usize_untagged() + class;
1223
0
        let sid = cache.trans[offset];
1224
0
        if !sid.is_unknown() {
1225
0
            return Ok(sid);
1226
0
        }
1227
0
        let unit = alphabet::Unit::u8(input);
1228
0
        Lazy::new(self, cache).cache_next_state(current, unit)
1229
0
    }
1230
1231
    /// Transitions from the current state to the next state, given the next
1232
    /// byte of input and a state ID that is not tagged.
1233
    ///
1234
    /// The only reason to use this routine is performance. In particular, the
1235
    /// `next_state` method needs to do some additional checks, among them is
1236
    /// to account for identifiers to states that are not yet computed. In
1237
    /// such a case, the transition is computed on the fly. However, if it is
1238
    /// known that the `current` state ID is untagged, then these checks can be
1239
    /// omitted.
1240
    ///
1241
    /// Since this routine does not compute states on the fly, it does not
1242
    /// modify the cache and thus cannot return an error. Consequently, `cache`
1243
    /// does not need to be mutable and it is possible for this routine to
1244
    /// return a state ID corresponding to the special "unknown" state. In
1245
    /// this case, it is the caller's responsibility to use the prior state
1246
    /// ID and `input` with `next_state` in order to force the computation of
1247
    /// the unknown transition. Otherwise, trying to use the "unknown" state
1248
    /// ID will just result in transitioning back to itself, and thus never
1249
    /// terminating. (This is technically a special exemption to the state ID
1250
    /// validity rules, but is permissible since this routine is guaranteed to
1251
    /// never mutate the given `cache`, and thus the identifier is guaranteed
1252
    /// to remain valid.)
1253
    ///
1254
    /// See [`LazyStateID`] for more details on what it means for a state ID
1255
    /// to be tagged. Also, see
1256
    /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
1257
    /// for this same idea, but with bounds checks forcefully elided.
1258
    ///
1259
    /// # State identifier validity
1260
    ///
1261
    /// The only valid value for `current` is an **untagged** lazy
1262
    /// state ID returned by the most recent call to `next_state`,
1263
    /// `next_state_untagged`, `next_state_untagged_unchecked`,
1264
    /// `start_state_forward` or `state_state_reverse` for the given `cache`.
1265
    /// Any state ID returned from prior calls to these routines (with the
1266
    /// same `cache`) is considered invalid (even if it gives an appearance
1267
    /// of working). State IDs returned from _any_ prior call for different
1268
    /// `cache` values are also always invalid.
1269
    ///
1270
    /// The returned ID is always a valid ID when `current` refers to a valid
1271
    /// ID, although it may be tagged. Moreover, this routine is defined for
1272
    /// all possible values of `input`.
1273
    ///
1274
    /// Not all validity rules are checked, even in debug mode. Callers are
1275
    /// required to uphold these rules themselves.
1276
    ///
1277
    /// Violating these state ID validity rules will not sacrifice memory
1278
    /// safety, but _may_ produce an incorrect result or a panic.
1279
    ///
1280
    /// # Panics
1281
    ///
1282
    /// If the given ID does not refer to a valid state, then this routine
1283
    /// may panic but it also may not panic and instead return an invalid or
1284
    /// incorrect ID.
1285
    ///
1286
    /// # Example
1287
    ///
1288
    /// This shows a simplistic example for walking a lazy DFA for a given
1289
    /// haystack by using the `next_state_untagged` method where possible.
1290
    ///
1291
    /// ```
1292
    /// use regex_automata::{hybrid::dfa::DFA, Input};
1293
    ///
1294
    /// let dfa = DFA::new(r"[a-z]+r")?;
1295
    /// let mut cache = dfa.create_cache();
1296
    /// let haystack = "bar".as_bytes();
1297
    ///
1298
    /// // The start state is determined by inspecting the position and the
1299
    /// // initial bytes of the haystack.
1300
    /// let mut sid = dfa.start_state_forward(
1301
    ///     &mut cache, &Input::new(haystack),
1302
    /// )?;
1303
    /// // Walk all the bytes in the haystack.
1304
    /// let mut at = 0;
1305
    /// while at < haystack.len() {
1306
    ///     if sid.is_tagged() {
1307
    ///         sid = dfa.next_state(&mut cache, sid, haystack[at])?;
1308
    ///     } else {
1309
    ///         let mut prev_sid = sid;
1310
    ///         // We attempt to chew through as much as we can while moving
1311
    ///         // through untagged state IDs. Thus, the transition function
1312
    ///         // does less work on average per byte. (Unrolling this loop
1313
    ///         // may help even more.)
1314
    ///         while at < haystack.len() {
1315
    ///             prev_sid = sid;
1316
    ///             sid = dfa.next_state_untagged(
1317
    ///                 &mut cache, sid, haystack[at],
1318
    ///             );
1319
    ///             at += 1;
1320
    ///             if sid.is_tagged() {
1321
    ///                 break;
1322
    ///             }
1323
    ///         }
1324
    ///         // We must ensure that we never proceed to the next iteration
1325
    ///         // with an unknown state ID. If we don't account for this
1326
    ///         // case, then search isn't guaranteed to terminate since all
1327
    ///         // transitions on unknown states loop back to itself.
1328
    ///         if sid.is_unknown() {
1329
    ///             sid = dfa.next_state(
1330
    ///                 &mut cache, prev_sid, haystack[at - 1],
1331
    ///             )?;
1332
    ///         }
1333
    ///     }
1334
    /// }
1335
    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1336
    /// // special "EOI" transition at the end of the search.
1337
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1338
    /// assert!(sid.is_match());
1339
    ///
1340
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1341
    /// ```
1342
    #[inline]
1343
0
    pub fn next_state_untagged(
1344
0
        &self,
1345
0
        cache: &Cache,
1346
0
        current: LazyStateID,
1347
0
        input: u8,
1348
0
    ) -> LazyStateID {
1349
0
        debug_assert!(!current.is_tagged());
1350
0
        let class = usize::from(self.classes.get(input));
1351
0
        let offset = current.as_usize_unchecked() + class;
1352
0
        cache.trans[offset]
1353
0
    }
1354
1355
    /// Transitions from the current state to the next state, eliding bounds
1356
    /// checks, given the next byte of input and a state ID that is not tagged.
1357
    ///
1358
    /// The only reason to use this routine is performance. In particular, the
1359
    /// `next_state` method needs to do some additional checks, among them is
1360
    /// to account for identifiers to states that are not yet computed. In
1361
    /// such a case, the transition is computed on the fly. However, if it is
1362
    /// known that the `current` state ID is untagged, then these checks can be
1363
    /// omitted.
1364
    ///
1365
    /// Since this routine does not compute states on the fly, it does not
1366
    /// modify the cache and thus cannot return an error. Consequently, `cache`
1367
    /// does not need to be mutable and it is possible for this routine to
1368
    /// return a state ID corresponding to the special "unknown" state. In
1369
    /// this case, it is the caller's responsibility to use the prior state
1370
    /// ID and `input` with `next_state` in order to force the computation of
1371
    /// the unknown transition. Otherwise, trying to use the "unknown" state
1372
    /// ID will just result in transitioning back to itself, and thus never
1373
    /// terminating. (This is technically a special exemption to the state ID
1374
    /// validity rules, but is permissible since this routine is guaranteed to
1375
    /// never mutate the given `cache`, and thus the identifier is guaranteed
1376
    /// to remain valid.)
1377
    ///
1378
    /// See [`LazyStateID`] for more details on what it means for a state ID
1379
    /// to be tagged. Also, see
1380
    /// [`next_state_untagged`](DFA::next_state_untagged)
1381
    /// for this same idea, but with memory safety guaranteed by retaining
1382
    /// bounds checks.
1383
    ///
1384
    /// # State identifier validity
1385
    ///
1386
    /// The only valid value for `current` is an **untagged** lazy
1387
    /// state ID returned by the most recent call to `next_state`,
1388
    /// `next_state_untagged`, `next_state_untagged_unchecked`,
1389
    /// `start_state_forward` or `state_state_reverse` for the given `cache`.
1390
    /// Any state ID returned from prior calls to these routines (with the
1391
    /// same `cache`) is considered invalid (even if it gives an appearance
1392
    /// of working). State IDs returned from _any_ prior call for different
1393
    /// `cache` values are also always invalid.
1394
    ///
1395
    /// The returned ID is always a valid ID when `current` refers to a valid
1396
    /// ID, although it may be tagged. Moreover, this routine is defined for
1397
    /// all possible values of `input`.
1398
    ///
1399
    /// Not all validity rules are checked, even in debug mode. Callers are
1400
    /// required to uphold these rules themselves.
1401
    ///
1402
    /// Violating these state ID validity rules will not sacrifice memory
1403
    /// safety, but _may_ produce an incorrect result or a panic.
1404
    ///
1405
    /// # Safety
1406
    ///
1407
    /// Callers of this method must guarantee that `current` refers to a valid
1408
    /// state ID according to the rules described above. If `current` is not a
1409
    /// valid state ID for this automaton, then calling this routine may result
1410
    /// in undefined behavior.
1411
    ///
1412
    /// If `current` is valid, then the ID returned is valid for all possible
1413
    /// values of `input`.
1414
    #[inline]
1415
0
    pub unsafe fn next_state_untagged_unchecked(
1416
0
        &self,
1417
0
        cache: &Cache,
1418
0
        current: LazyStateID,
1419
0
        input: u8,
1420
0
    ) -> LazyStateID {
1421
0
        debug_assert!(!current.is_tagged());
1422
0
        let class = usize::from(self.classes.get(input));
1423
0
        let offset = current.as_usize_unchecked() + class;
1424
0
        *cache.trans.get_unchecked(offset)
1425
0
    }
1426
1427
    /// Transitions from the current state to the next state for the special
1428
    /// EOI symbol.
1429
    ///
1430
    /// The given cache is used to either reuse pre-computed state
1431
    /// transitions, or to store this newly computed transition for future
1432
    /// reuse. Thus, this routine guarantees that it will never return a state
1433
    /// ID that has an "unknown" tag.
1434
    ///
1435
    /// This routine must be called at the end of every search in a correct
1436
    /// implementation of search. Namely, lazy DFAs in this crate delay matches
1437
    /// by one byte in order to support look-around operators. Thus, after
1438
    /// reaching the end of a haystack, a search implementation must follow one
1439
    /// last EOI transition.
1440
    ///
1441
    /// It is best to think of EOI as an additional symbol in the alphabet of a
1442
    /// DFA that is distinct from every other symbol. That is, the alphabet of
1443
    /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
1444
    /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
1445
    /// physical alphabet size may be smaller because of alphabet compression
1446
    /// via equivalence classes, but EOI is always represented somehow in the
1447
    /// alphabet.)
1448
    ///
1449
    /// # State identifier validity
1450
    ///
1451
    /// The only valid value for `current` is the lazy state ID returned
1452
    /// by the most recent call to `next_state`, `next_state_untagged`,
1453
    /// `next_state_untagged_unchecked`, `start_state_forward` or
1454
    /// `state_state_reverse` for the given `cache`. Any state ID returned from
1455
    /// prior calls to these routines (with the same `cache`) is considered
1456
    /// invalid (even if it gives an appearance of working). State IDs returned
1457
    /// from _any_ prior call for different `cache` values are also always
1458
    /// invalid.
1459
    ///
1460
    /// The returned ID is always a valid ID when `current` refers to a valid
1461
    /// ID.
1462
    ///
1463
    /// These validity rules are not checked, even in debug mode. Callers are
1464
    /// required to uphold these rules themselves.
1465
    ///
1466
    /// Violating these state ID validity rules will not sacrifice memory
1467
    /// safety, but _may_ produce an incorrect result or a panic.
1468
    ///
1469
    /// # Panics
1470
    ///
1471
    /// If the given ID does not refer to a valid state, then this routine
1472
    /// may panic but it also may not panic and instead return an invalid or
1473
    /// incorrect ID.
1474
    ///
1475
    /// # Example
1476
    ///
1477
    /// This shows a simplistic example for walking a DFA for a given haystack,
1478
    /// and then finishing the search with the final EOI transition.
1479
    ///
1480
    /// ```
1481
    /// use regex_automata::{hybrid::dfa::DFA, Input};
1482
    ///
1483
    /// let dfa = DFA::new(r"[a-z]+r")?;
1484
    /// let mut cache = dfa.create_cache();
1485
    /// let haystack = "bar".as_bytes();
1486
    ///
1487
    /// // The start state is determined by inspecting the position and the
1488
    /// // initial bytes of the haystack.
1489
    /// let mut sid = dfa.start_state_forward(
1490
    ///     &mut cache, &Input::new(haystack),
1491
    /// )?;
1492
    /// // Walk all the bytes in the haystack.
1493
    /// for &b in haystack {
1494
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
1495
    /// }
1496
    /// // Matches are always delayed by 1 byte, so we must explicitly walk
1497
    /// // the special "EOI" transition at the end of the search. Without this
1498
    /// // final transition, the assert below will fail since the DFA will not
1499
    /// // have entered a match state yet!
1500
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1501
    /// assert!(sid.is_match());
1502
    ///
1503
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1504
    /// ```
1505
    #[inline]
1506
0
    pub fn next_eoi_state(
1507
0
        &self,
1508
0
        cache: &mut Cache,
1509
0
        current: LazyStateID,
1510
0
    ) -> Result<LazyStateID, CacheError> {
1511
0
        let eoi = self.classes.eoi().as_usize();
1512
0
        let offset = current.as_usize_untagged() + eoi;
1513
0
        let sid = cache.trans[offset];
1514
0
        if !sid.is_unknown() {
1515
0
            return Ok(sid);
1516
0
        }
1517
0
        let unit = self.classes.eoi();
1518
0
        Lazy::new(self, cache).cache_next_state(current, unit)
1519
0
    }
1520
1521
    /// Return the ID of the start state for this lazy DFA for the given
1522
    /// starting configuration.
1523
    ///
1524
    /// Unlike typical DFA implementations, the start state for DFAs in this
1525
    /// crate is dependent on a few different factors:
1526
    ///
1527
    /// * The [`Anchored`] mode of the search. Unanchored, anchored and
1528
    /// anchored searches for a specific [`PatternID`] all use different start
1529
    /// states.
1530
    /// * Whether a "look-behind" byte exists. For example, the `^` anchor
1531
    /// matches if and only if there is no look-behind byte.
1532
    /// * The specific value of that look-behind byte. For example, a `(?m:^)`
1533
    /// assertion only matches when there is either no look-behind byte, or
1534
    /// when the look-behind byte is a line terminator.
1535
    ///
1536
    /// The [starting configuration](start::Config) provides the above
1537
    /// information.
1538
    ///
1539
    /// This routine can be used for either forward or reverse searches.
1540
    /// Although, as a convenience, if you have an [`Input`], then it
1541
    /// may be more succinct to use [`DFA::start_state_forward`] or
1542
    /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
1543
    /// routines return a [`MatchError`] on failure where as this routine
1544
    /// returns a [`StartError`].
1545
    ///
1546
    /// # Errors
1547
    ///
1548
    /// This may return a [`StartError`] if the search needs to give up when
1549
    /// determining the start state (for example, if it sees a "quit" byte
1550
    /// or if the cache has become inefficient). This can also return an
1551
    /// error if the given configuration contains an unsupported [`Anchored`]
1552
    /// configuration.
1553
    #[cfg_attr(feature = "perf-inline", inline(always))]
1554
0
    pub fn start_state(
1555
0
        &self,
1556
0
        cache: &mut Cache,
1557
0
        config: &start::Config,
1558
0
    ) -> Result<LazyStateID, StartError> {
1559
0
        let lazy = LazyRef::new(self, cache);
1560
0
        let anchored = config.get_anchored();
1561
0
        let start = match config.get_look_behind() {
1562
0
            None => Start::Text,
1563
0
            Some(byte) => {
1564
0
                if !self.quitset.is_empty() && self.quitset.contains(byte) {
1565
0
                    return Err(StartError::quit(byte));
1566
0
                }
1567
0
                self.start_map.get(byte)
1568
            }
1569
        };
1570
0
        let start_id = lazy.get_cached_start_id(anchored, start)?;
1571
0
        if !start_id.is_unknown() {
1572
0
            return Ok(start_id);
1573
0
        }
1574
0
        Lazy::new(self, cache).cache_start_group(anchored, start)
1575
0
    }
1576
1577
    /// Return the ID of the start state for this lazy DFA when executing a
1578
    /// forward search.
1579
    ///
1580
    /// This is a convenience routine for calling [`DFA::start_state`] that
1581
    /// converts the given [`Input`] to a [start configuration](start::Config).
1582
    /// Additionally, if an error occurs, it is converted from a [`StartError`]
1583
    /// to a [`MatchError`] using the offset information in the given
1584
    /// [`Input`].
1585
    ///
1586
    /// # Errors
1587
    ///
1588
    /// This may return a [`MatchError`] if the search needs to give up when
1589
    /// determining the start state (for example, if it sees a "quit" byte or
1590
    /// if the cache has become inefficient). This can also return an error if
1591
    /// the given `Input` contains an unsupported [`Anchored`] configuration.
1592
    #[cfg_attr(feature = "perf-inline", inline(always))]
1593
0
    pub fn start_state_forward(
1594
0
        &self,
1595
0
        cache: &mut Cache,
1596
0
        input: &Input<'_>,
1597
0
    ) -> Result<LazyStateID, MatchError> {
1598
0
        let config = start::Config::from_input_forward(input);
1599
0
        self.start_state(cache, &config).map_err(|err| match err {
1600
0
            StartError::Cache { .. } => MatchError::gave_up(input.start()),
1601
0
            StartError::Quit { byte } => {
1602
0
                let offset = input
1603
0
                    .start()
1604
0
                    .checked_sub(1)
1605
0
                    .expect("no quit in start without look-behind");
1606
0
                MatchError::quit(byte, offset)
1607
            }
1608
0
            StartError::UnsupportedAnchored { mode } => {
1609
0
                MatchError::unsupported_anchored(mode)
1610
            }
1611
0
        })
1612
0
    }
1613
1614
    /// Return the ID of the start state for this lazy DFA when executing a
1615
    /// reverse search.
1616
    ///
1617
    /// This is a convenience routine for calling [`DFA::start_state`] that
1618
    /// converts the given [`Input`] to a [start configuration](start::Config).
1619
    /// Additionally, if an error occurs, it is converted from a [`StartError`]
1620
    /// to a [`MatchError`] using the offset information in the given
1621
    /// [`Input`].
1622
    ///
1623
    /// # Errors
1624
    ///
1625
    /// This may return a [`MatchError`] if the search needs to give up when
1626
    /// determining the start state (for example, if it sees a "quit" byte or
1627
    /// if the cache has become inefficient). This can also return an error if
1628
    /// the given `Input` contains an unsupported [`Anchored`] configuration.
1629
    #[cfg_attr(feature = "perf-inline", inline(always))]
1630
0
    pub fn start_state_reverse(
1631
0
        &self,
1632
0
        cache: &mut Cache,
1633
0
        input: &Input<'_>,
1634
0
    ) -> Result<LazyStateID, MatchError> {
1635
0
        let config = start::Config::from_input_reverse(input);
1636
0
        self.start_state(cache, &config).map_err(|err| match err {
1637
0
            StartError::Cache { .. } => MatchError::gave_up(input.end()),
1638
0
            StartError::Quit { byte } => {
1639
0
                let offset = input.end();
1640
0
                MatchError::quit(byte, offset)
1641
            }
1642
0
            StartError::UnsupportedAnchored { mode } => {
1643
0
                MatchError::unsupported_anchored(mode)
1644
            }
1645
0
        })
1646
0
    }
1647
1648
    /// Returns the total number of patterns that match in this state.
1649
    ///
1650
    /// If the lazy DFA was compiled with one pattern, then this must
1651
    /// necessarily always return `1` for all match states.
1652
    ///
1653
    /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
1654
    /// indices up to (but not including) the length returned by this routine
1655
    /// without panicking.
1656
    ///
1657
    /// # Panics
1658
    ///
1659
    /// If the given state is not a match state, then this may either panic
1660
    /// or return an incorrect result.
1661
    ///
1662
    /// # Example
1663
    ///
1664
    /// This example shows a simple instance of implementing overlapping
1665
    /// matches. In particular, it shows not only how to determine how many
1666
    /// patterns have matched in a particular state, but also how to access
1667
    /// which specific patterns have matched.
1668
    ///
1669
    /// Notice that we must use [`MatchKind::All`] when building the DFA. If we
1670
    /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be
1671
    /// constructed in a way that supports overlapping matches. (It would only
1672
    /// report a single pattern that matches at any particular point in time.)
1673
    ///
1674
    /// Another thing to take note of is the patterns used and the order in
1675
    /// which the pattern IDs are reported. In the example below, pattern `3`
1676
    /// is yielded first. Why? Because it corresponds to the match that
1677
    /// appears first. Namely, the `@` symbol is part of `\S+` but not part
1678
    /// of any of the other patterns. Since the `\S+` pattern has a match that
1679
    /// starts to the left of any other pattern, its ID is returned before any
1680
    /// other.
1681
    ///
1682
    /// ```
1683
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1684
    /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
1685
    ///
1686
    /// let dfa = DFA::builder()
1687
    ///     .configure(DFA::config().match_kind(MatchKind::All))
1688
    ///     .build_many(&[
1689
    ///         r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
1690
    ///     ])?;
1691
    /// let mut cache = dfa.create_cache();
1692
    /// let haystack = "@bar".as_bytes();
1693
    ///
1694
    /// // The start state is determined by inspecting the position and the
1695
    /// // initial bytes of the haystack.
1696
    /// let mut sid = dfa.start_state_forward(
1697
    ///     &mut cache, &Input::new(haystack),
1698
    /// )?;
1699
    /// // Walk all the bytes in the haystack.
1700
    /// for &b in haystack {
1701
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
1702
    /// }
1703
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1704
    ///
1705
    /// assert!(sid.is_match());
1706
    /// assert_eq!(dfa.match_len(&mut cache, sid), 3);
1707
    /// // The following calls are guaranteed to not panic since `match_len`
1708
    /// // returned `3` above.
1709
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
1710
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
1711
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
1712
    ///
1713
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1714
    /// ```
1715
    #[inline]
1716
0
    pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
1717
0
        assert!(id.is_match());
1718
0
        LazyRef::new(self, cache).get_cached_state(id).match_len()
1719
0
    }
1720
1721
    /// Returns the pattern ID corresponding to the given match index in the
1722
    /// given state.
1723
    ///
1724
    /// See [`DFA::match_len`] for an example of how to use this method
1725
    /// correctly. Note that if you know your lazy DFA is configured with a
1726
    /// single pattern, then this routine is never necessary since it will
1727
    /// always return a pattern ID of `0` for an index of `0` when `id`
1728
    /// corresponds to a match state.
1729
    ///
1730
    /// Typically, this routine is used when implementing an overlapping
1731
    /// search, as the example for `DFA::match_len` does.
1732
    ///
1733
    /// # Panics
1734
    ///
1735
    /// If the state ID is not a match state or if the match index is out
1736
    /// of bounds for the given state, then this routine may either panic
1737
    /// or produce an incorrect result. If the state ID is correct and the
1738
    /// match index is correct, then this routine always produces a valid
1739
    /// `PatternID`.
1740
    #[inline]
1741
0
    pub fn match_pattern(
1742
0
        &self,
1743
0
        cache: &Cache,
1744
0
        id: LazyStateID,
1745
0
        match_index: usize,
1746
0
    ) -> PatternID {
1747
        // This is an optimization for the very common case of a DFA with a
1748
        // single pattern. This conditional avoids a somewhat more costly path
1749
        // that finds the pattern ID from the corresponding `State`, which
1750
        // requires a bit of slicing/pointer-chasing. This optimization tends
1751
        // to only matter when matches are frequent.
1752
0
        if self.pattern_len() == 1 {
1753
0
            return PatternID::ZERO;
1754
0
        }
1755
0
        LazyRef::new(self, cache)
1756
0
            .get_cached_state(id)
1757
0
            .match_pattern(match_index)
1758
0
    }
1759
}
1760
1761
/// A cache represents a partially computed DFA.
1762
///
1763
/// A cache is the key component that differentiates a classical DFA and a
1764
/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
1765
/// complete transition table that can handle all possible inputs, a hybrid
1766
/// NFA/DFA starts with an empty transition table and builds only the parts
1767
/// required during search. The parts that are built are stored in a cache. For
1768
/// this reason, a cache is a required parameter for nearly every operation on
1769
/// a [`DFA`].
1770
///
1771
/// Caches can be created from their corresponding DFA via
1772
/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
1773
/// created it, or the DFA that was most recently used to reset it with
1774
/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
1775
/// or incorrect results.
1776
#[derive(Clone, Debug)]
1777
pub struct Cache {
1778
    // N.B. If you're looking to understand how determinization works, it
1779
    // is probably simpler to first grok src/dfa/determinize.rs, since that
1780
    // doesn't have the "laziness" component.
1781
    /// The transition table.
1782
    ///
1783
    /// Given a `current` LazyStateID and an `input` byte, the next state can
1784
    /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
1785
    /// that no multiplication is used. That's because state identifiers are
1786
    /// "premultiplied."
1787
    ///
1788
    /// Note that the next state may be the "unknown" state. In this case, the
1789
    /// next state is not known and determinization for `current` on `input`
1790
    /// must be performed.
1791
    trans: Vec<LazyStateID>,
1792
    /// The starting states for this DFA.
1793
    ///
1794
    /// These are computed lazily. Initially, these are all set to "unknown"
1795
    /// lazy state IDs.
1796
    ///
1797
    /// When 'starts_for_each_pattern' is disabled (the default), then the size
1798
    /// of this is constrained to the possible starting configurations based
1799
    /// on the search parameters. (At time of writing, that's 4.) However,
1800
    /// when starting states for each pattern is enabled, then there are N
1801
    /// additional groups of starting states, where each group reflects the
1802
    /// different possible configurations and N is the number of patterns.
1803
    starts: Vec<LazyStateID>,
1804
    /// A sequence of NFA/DFA powerset states that have been computed for this
1805
    /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
1806
    /// tagged LazyStateID can be used to index this sequence by converting it
1807
    /// to its untagged form.)
1808
    states: Vec<State>,
1809
    /// A map from states to their corresponding IDs. This map may be accessed
1810
    /// via the raw byte representation of a state, which means that a `State`
1811
    /// does not need to be allocated to determine whether it already exists
1812
    /// in this map. Indeed, the existence of such a state is what determines
1813
    /// whether we allocate a new `State` or not.
1814
    ///
1815
    /// The higher level idea here is that we do just enough determinization
1816
    /// for a state to check whether we've already computed it. If we have,
1817
    /// then we can save a little (albeit not much) work. The real savings is
1818
    /// in memory usage. If we never checked for trivially duplicate states,
1819
    /// then our memory usage would explode to unreasonable levels.
1820
    states_to_id: StateMap,
1821
    /// Sparse sets used to track which NFA states have been visited during
1822
    /// various traversals.
1823
    sparses: SparseSets,
1824
    /// Scratch space for traversing the NFA graph. (We use space on the heap
1825
    /// instead of the call stack.)
1826
    stack: Vec<NFAStateID>,
1827
    /// Scratch space for building a NFA/DFA powerset state. This is used to
1828
    /// help amortize allocation since not every powerset state generated is
1829
    /// added to the cache. In particular, if it already exists in the cache,
1830
    /// then there is no need to allocate a new `State` for it.
1831
    scratch_state_builder: StateBuilderEmpty,
1832
    /// A simple abstraction for handling the saving of at most a single state
1833
    /// across a cache clearing. This is required for correctness. Namely, if
1834
    /// adding a new state after clearing the cache fails, then the caller
1835
    /// must retain the ability to continue using the state ID given. The
1836
    /// state corresponding to the state ID is what we preserve across cache
1837
    /// clearings.
1838
    state_saver: StateSaver,
1839
    /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
1840
    /// track this as new states are added since states use a variable amount
1841
    /// of heap. Tracking this as we add states makes it possible to compute
1842
    /// the total amount of memory used by the determinizer in constant time.
1843
    memory_usage_state: usize,
1844
    /// The number of times the cache has been cleared. When a minimum cache
1845
    /// clear count is set, then the cache will return an error instead of
1846
    /// clearing the cache if the count has been exceeded.
1847
    clear_count: usize,
1848
    /// The total number of bytes searched since the last time this cache was
1849
    /// cleared, not including the current search.
1850
    ///
1851
    /// This can be added to the length of the current search to get the true
1852
    /// total number of bytes searched.
1853
    ///
1854
    /// This is generally only non-zero when the
1855
    /// `Cache::search_{start,update,finish}` APIs are used to track search
1856
    /// progress.
1857
    bytes_searched: usize,
1858
    /// The progress of the current search.
1859
    ///
1860
    /// This is only non-`None` when callers utilize the `Cache::search_start`,
1861
    /// `Cache::search_update` and `Cache::search_finish` APIs.
1862
    ///
1863
    /// The purpose of recording search progress is to be able to make a
1864
    /// determination about the efficiency of the cache. Namely, by keeping
1865
    /// track of the
1866
    progress: Option<SearchProgress>,
1867
}
1868
1869
impl Cache {
1870
    /// Create a new cache for the given lazy DFA.
1871
    ///
1872
    /// The cache returned should only be used for searches for the given DFA.
1873
    /// If you want to reuse the cache for another DFA, then you must call
1874
    /// [`Cache::reset`] with that DFA.
1875
0
    pub fn new(dfa: &DFA) -> Cache {
1876
0
        let mut cache = Cache {
1877
0
            trans: alloc::vec![],
1878
0
            starts: alloc::vec![],
1879
0
            states: alloc::vec![],
1880
0
            states_to_id: StateMap::new(),
1881
0
            sparses: SparseSets::new(dfa.get_nfa().states().len()),
1882
0
            stack: alloc::vec![],
1883
0
            scratch_state_builder: StateBuilderEmpty::new(),
1884
0
            state_saver: StateSaver::none(),
1885
0
            memory_usage_state: 0,
1886
0
            clear_count: 0,
1887
0
            bytes_searched: 0,
1888
0
            progress: None,
1889
0
        };
1890
        debug!("pre-init lazy DFA cache size: {}", cache.memory_usage());
1891
0
        Lazy { dfa, cache: &mut cache }.init_cache();
1892
        debug!("post-init lazy DFA cache size: {}", cache.memory_usage());
1893
0
        cache
1894
0
    }
1895
1896
    /// Reset this cache such that it can be used for searching with the given
1897
    /// lazy DFA (and only that DFA).
1898
    ///
1899
    /// A cache reset permits reusing memory already allocated in this cache
1900
    /// with a different lazy DFA.
1901
    ///
1902
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
1903
    /// lazy DFA has been configured to "give up" after it has cleared the
1904
    /// cache a certain number of times.
1905
    ///
1906
    /// Any lazy state ID generated by the cache prior to resetting it is
1907
    /// invalid after the reset.
1908
    ///
1909
    /// # Example
1910
    ///
1911
    /// This shows how to re-purpose a cache for use with a different DFA.
1912
    ///
1913
    /// ```
1914
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1915
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
1916
    ///
1917
    /// let dfa1 = DFA::new(r"\w")?;
1918
    /// let dfa2 = DFA::new(r"\W")?;
1919
    ///
1920
    /// let mut cache = dfa1.create_cache();
1921
    /// assert_eq!(
1922
    ///     Some(HalfMatch::must(0, 2)),
1923
    ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
1924
    /// );
1925
    ///
1926
    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
1927
    /// // incorrect results. In order to re-purpose the cache, we must reset
1928
    /// // it with the DFA we'd like to use it with.
1929
    /// //
1930
    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
1931
    /// // allowed.
1932
    /// cache.reset(&dfa2);
1933
    /// assert_eq!(
1934
    ///     Some(HalfMatch::must(0, 3)),
1935
    ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
1936
    /// );
1937
    ///
1938
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1939
    /// ```
1940
0
    pub fn reset(&mut self, dfa: &DFA) {
1941
0
        Lazy::new(dfa, self).reset_cache()
1942
0
    }
1943
1944
    /// Initializes a new search starting at the given position.
1945
    ///
1946
    /// If a previous search was unfinished, then it is finished automatically
1947
    /// and a new search is begun.
1948
    ///
1949
    /// Note that keeping track of search progress is _not necessary_
1950
    /// for correct implementations of search using a lazy DFA. Keeping
1951
    /// track of search progress is only necessary if you want the
1952
    /// [`Config::minimum_bytes_per_state`] configuration knob to work.
1953
    #[inline]
1954
0
    pub fn search_start(&mut self, at: usize) {
1955
        // If a previous search wasn't marked as finished, then finish it
1956
        // now automatically.
1957
0
        if let Some(p) = self.progress.take() {
1958
0
            self.bytes_searched += p.len();
1959
0
        }
1960
0
        self.progress = Some(SearchProgress { start: at, at });
1961
0
    }
1962
1963
    /// Updates the current search to indicate that it has search to the
1964
    /// current position.
1965
    ///
1966
    /// No special care needs to be taken for reverse searches. Namely, the
1967
    /// position given may be _less than_ the starting position of the search.
1968
    ///
1969
    /// # Panics
1970
    ///
1971
    /// This panics if no search has been started by [`Cache::search_start`].
1972
    #[inline]
1973
0
    pub fn search_update(&mut self, at: usize) {
1974
0
        let p =
1975
0
            self.progress.as_mut().expect("no in-progress search to update");
1976
0
        p.at = at;
1977
0
    }
1978
1979
    /// Indicates that a search has finished at the given position.
1980
    ///
1981
    /// # Panics
1982
    ///
1983
    /// This panics if no search has been started by [`Cache::search_start`].
1984
    #[inline]
1985
0
    pub fn search_finish(&mut self, at: usize) {
1986
0
        let mut p =
1987
0
            self.progress.take().expect("no in-progress search to finish");
1988
0
        p.at = at;
1989
0
        self.bytes_searched += p.len();
1990
0
    }
1991
1992
    /// Returns the total number of bytes that have been searched since this
1993
    /// cache was last cleared.
1994
    ///
1995
    /// This is useful for determining the efficiency of the cache. For
1996
    /// example, the lazy DFA uses this value in conjunction with the
1997
    /// [`Config::minimum_bytes_per_state`] knob to help determine whether it
1998
    /// should quit searching.
1999
    ///
2000
    /// This always returns `0` if search progress isn't being tracked. Note
2001
    /// that the lazy DFA search routines in this crate always track search
2002
    /// progress.
2003
0
    pub fn search_total_len(&self) -> usize {
2004
0
        self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len())
2005
0
    }
2006
2007
    /// Returns the total number of times this cache has been cleared since it
2008
    /// was either created or last reset.
2009
    ///
2010
    /// This is useful for informational purposes or if you want to change
2011
    /// search strategies based on the number of times the cache has been
2012
    /// cleared.
2013
0
    pub fn clear_count(&self) -> usize {
2014
0
        self.clear_count
2015
0
    }
2016
2017
    /// Returns the heap memory usage, in bytes, of this cache.
2018
    ///
2019
    /// This does **not** include the stack size used up by this cache. To
2020
    /// compute that, use `std::mem::size_of::<Cache>()`.
2021
0
    pub fn memory_usage(&self) -> usize {
2022
        const ID_SIZE: usize = size_of::<LazyStateID>();
2023
        const STATE_SIZE: usize = size_of::<State>();
2024
2025
        // NOTE: If you make changes to the below, then
2026
        // 'minimum_cache_capacity' should be updated correspondingly.
2027
2028
0
        self.trans.len() * ID_SIZE
2029
0
        + self.starts.len() * ID_SIZE
2030
0
        + self.states.len() * STATE_SIZE
2031
0
        // Maps likely use more memory than this, but it's probably close.
2032
0
        + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
2033
0
        + self.sparses.memory_usage()
2034
0
        + self.stack.capacity() * ID_SIZE
2035
0
        + self.scratch_state_builder.capacity()
2036
0
        // Heap memory used by 'State' in both 'states' and 'states_to_id'.
2037
0
        + self.memory_usage_state
2038
0
    }
2039
}
2040
2041
/// Keeps track of the progress of the current search.
2042
///
2043
/// This is updated via the `Cache::search_{start,update,finish}` APIs to
2044
/// record how many bytes have been searched. This permits computing a
2045
/// heuristic that represents the efficiency of a cache, and thus helps inform
2046
/// whether the lazy DFA should give up or not.
2047
#[derive(Clone, Debug)]
2048
struct SearchProgress {
2049
    start: usize,
2050
    at: usize,
2051
}
2052
2053
impl SearchProgress {
2054
    /// Returns the length, in bytes, of this search so far.
2055
    ///
2056
    /// This automatically handles the case of a reverse search, where `at`
2057
    /// is likely to be less than `start`.
2058
0
    fn len(&self) -> usize {
2059
0
        if self.start <= self.at {
2060
0
            self.at - self.start
2061
        } else {
2062
0
            self.start - self.at
2063
        }
2064
0
    }
2065
}
2066
2067
/// A map from states to state identifiers. When using std, we use a standard
2068
/// hashmap, since it's a bit faster for this use case. (Other maps, like
2069
/// one's based on FNV, have not yet been benchmarked.)
2070
///
2071
/// The main purpose of this map is to reuse states where possible. This won't
2072
/// fully minimize the DFA, but it works well in a lot of cases.
2073
#[cfg(feature = "std")]
2074
type StateMap = std::collections::HashMap<State, LazyStateID>;
2075
#[cfg(not(feature = "std"))]
2076
type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
2077
2078
/// A type that groups methods that require the base NFA/DFA and writable
2079
/// access to the cache.
2080
#[derive(Debug)]
2081
struct Lazy<'i, 'c> {
2082
    dfa: &'i DFA,
2083
    cache: &'c mut Cache,
2084
}
2085
2086
impl<'i, 'c> Lazy<'i, 'c> {
2087
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2088
0
    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
2089
0
        Lazy { dfa, cache }
2090
0
    }
2091
2092
    /// Return an immutable view by downgrading a writable cache to a read-only
2093
    /// cache.
2094
0
    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
2095
0
        LazyRef::new(self.dfa, self.cache)
2096
0
    }
2097
2098
    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
2099
    /// like 'next_state' and 'next_eoi_state' that are called in critical
2100
    /// areas. The idea is to let the optimizer focus on the other areas of
2101
    /// those methods as the hot path.
2102
    ///
2103
    /// Here's an example that justifies 'inline(never)'
2104
    ///
2105
    /// ```ignore
2106
    /// regex-cli find match hybrid \
2107
    ///   --cache-capacity 100000000 \
2108
    ///   -p '\pL{100}'
2109
    ///   all-codepoints-utf8-100x
2110
    /// ```
2111
    ///
2112
    /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
2113
    /// codepoint, in sequence, repeated 100 times.
2114
    ///
2115
    /// With 'inline(never)' hyperfine reports 1.1s per run. With
2116
    /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
2117
    #[cold]
2118
    #[inline(never)]
2119
0
    fn cache_next_state(
2120
0
        &mut self,
2121
0
        mut current: LazyStateID,
2122
0
        unit: alphabet::Unit,
2123
0
    ) -> Result<LazyStateID, CacheError> {
2124
0
        let stride2 = self.dfa.stride2();
2125
0
        let empty_builder = self.get_state_builder();
2126
0
        let builder = determinize::next(
2127
0
            self.dfa.get_nfa(),
2128
0
            self.dfa.get_config().get_match_kind(),
2129
0
            &mut self.cache.sparses,
2130
0
            &mut self.cache.stack,
2131
0
            &self.cache.states[current.as_usize_untagged() >> stride2],
2132
0
            unit,
2133
0
            empty_builder,
2134
        );
2135
        // This is subtle, but if we *might* clear the cache, then we should
2136
        // try to save the current state so that we can re-map its ID after
2137
        // cache clearing. We *might* clear the cache when either the new
2138
        // state can't fit in the cache or when the number of transitions has
2139
        // reached the maximum. Even if either of these conditions is true,
2140
        // the cache might not be cleared if we can reuse an existing state.
2141
        // But we don't know that at this point. Moreover, we don't save the
2142
        // current state every time because it is costly.
2143
        //
2144
        // TODO: We should try to find a way to make this less subtle and error
2145
        // prone. ---AG
2146
0
        let save_state = !self.as_ref().state_builder_fits_in_cache(&builder)
2147
0
            || self.cache.trans.len() >= LazyStateID::MAX;
2148
0
        if save_state {
2149
0
            self.save_state(current);
2150
0
        }
2151
0
        let next = self.add_builder_state(builder, |sid| sid)?;
2152
0
        if save_state {
2153
0
            current = self.saved_state_id();
2154
0
        }
2155
        // This is the payoff. The next time 'next_state' is called with this
2156
        // state and alphabet unit, it will find this transition and avoid
2157
        // having to re-determinize this transition.
2158
0
        self.set_transition(current, unit, next);
2159
0
        Ok(next)
2160
0
    }
2161
2162
    /// Compute and cache the starting state for the given pattern ID (if
2163
    /// present) and the starting configuration.
2164
    ///
2165
    /// This panics if a pattern ID is given and the DFA isn't configured to
2166
    /// build anchored start states for each pattern.
2167
    ///
2168
    /// This will never return an unknown lazy state ID.
2169
    ///
2170
    /// If caching this state would otherwise result in a cache that has been
2171
    /// cleared too many times, then an error is returned.
2172
    #[cold]
2173
    #[inline(never)]
2174
0
    fn cache_start_group(
2175
0
        &mut self,
2176
0
        anchored: Anchored,
2177
0
        start: Start,
2178
0
    ) -> Result<LazyStateID, StartError> {
2179
0
        let nfa_start_id = match anchored {
2180
0
            Anchored::No => self.dfa.get_nfa().start_unanchored(),
2181
0
            Anchored::Yes => self.dfa.get_nfa().start_anchored(),
2182
0
            Anchored::Pattern(pid) => {
2183
0
                if !self.dfa.get_config().get_starts_for_each_pattern() {
2184
0
                    return Err(StartError::unsupported_anchored(anchored));
2185
0
                }
2186
0
                match self.dfa.get_nfa().start_pattern(pid) {
2187
0
                    None => return Ok(self.as_ref().dead_id()),
2188
0
                    Some(sid) => sid,
2189
                }
2190
            }
2191
        };
2192
2193
0
        let id = self
2194
0
            .cache_start_one(nfa_start_id, start)
2195
0
            .map_err(StartError::cache)?;
2196
0
        self.set_start_state(anchored, start, id);
2197
0
        Ok(id)
2198
0
    }
2199
2200
    /// Compute and cache the starting state for the given NFA state ID and the
2201
    /// starting configuration. The NFA state ID might be one of the following:
2202
    ///
2203
    /// 1) An unanchored start state to match any pattern.
2204
    /// 2) An anchored start state to match any pattern.
2205
    /// 3) An anchored start state for a particular pattern.
2206
    ///
2207
    /// This will never return an unknown lazy state ID.
2208
    ///
2209
    /// If caching this state would otherwise result in a cache that has been
2210
    /// cleared too many times, then an error is returned.
2211
0
    fn cache_start_one(
2212
0
        &mut self,
2213
0
        nfa_start_id: NFAStateID,
2214
0
        start: Start,
2215
0
    ) -> Result<LazyStateID, CacheError> {
2216
0
        let mut builder_matches = self.get_state_builder().into_matches();
2217
0
        determinize::set_lookbehind_from_start(
2218
0
            self.dfa.get_nfa(),
2219
0
            &start,
2220
0
            &mut builder_matches,
2221
        );
2222
0
        self.cache.sparses.set1.clear();
2223
0
        determinize::epsilon_closure(
2224
0
            self.dfa.get_nfa(),
2225
0
            nfa_start_id,
2226
0
            builder_matches.look_have(),
2227
0
            &mut self.cache.stack,
2228
0
            &mut self.cache.sparses.set1,
2229
        );
2230
0
        let mut builder = builder_matches.into_nfa();
2231
0
        determinize::add_nfa_states(
2232
0
            &self.dfa.get_nfa(),
2233
0
            &self.cache.sparses.set1,
2234
0
            &mut builder,
2235
        );
2236
0
        let tag_starts = self.dfa.get_config().get_specialize_start_states();
2237
0
        self.add_builder_state(builder, |id| {
2238
0
            if tag_starts {
2239
0
                id.to_start()
2240
            } else {
2241
0
                id
2242
            }
2243
0
        })
2244
0
    }
2245
2246
    /// Either add the given builder state to this cache, or return an ID to an
2247
    /// equivalent state already in this cache.
2248
    ///
2249
    /// In the case where no equivalent state exists, the idmap function given
2250
    /// may be used to transform the identifier allocated. This is useful if
2251
    /// the caller needs to tag the ID with additional information.
2252
    ///
2253
    /// This will never return an unknown lazy state ID.
2254
    ///
2255
    /// If caching this state would otherwise result in a cache that has been
2256
    /// cleared too many times, then an error is returned.
2257
0
    fn add_builder_state(
2258
0
        &mut self,
2259
0
        builder: StateBuilderNFA,
2260
0
        idmap: impl Fn(LazyStateID) -> LazyStateID,
2261
0
    ) -> Result<LazyStateID, CacheError> {
2262
0
        if let Some(&cached_id) =
2263
0
            self.cache.states_to_id.get(builder.as_bytes())
2264
        {
2265
            // Since we have a cached state, put the constructed state's
2266
            // memory back into our scratch space, so that it can be reused.
2267
0
            self.put_state_builder(builder);
2268
0
            return Ok(cached_id);
2269
0
        }
2270
0
        let result = self.add_state(builder.to_state(), idmap);
2271
0
        self.put_state_builder(builder);
2272
0
        result
2273
0
    }
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_builder_state::<<regex_automata::hybrid::dfa::Lazy>::cache_start_one::{closure#0}>
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_builder_state::<<regex_automata::hybrid::dfa::Lazy>::cache_next_state::{closure#0}>
2274
2275
    /// Allocate a new state ID and add the given state to this cache.
2276
    ///
2277
    /// The idmap function given may be used to transform the identifier
2278
    /// allocated. This is useful if the caller needs to tag the ID with
2279
    /// additional information.
2280
    ///
2281
    /// This will never return an unknown lazy state ID.
2282
    ///
2283
    /// If caching this state would otherwise result in a cache that has been
2284
    /// cleared too many times, then an error is returned.
2285
0
    fn add_state(
2286
0
        &mut self,
2287
0
        state: State,
2288
0
        idmap: impl Fn(LazyStateID) -> LazyStateID,
2289
0
    ) -> Result<LazyStateID, CacheError> {
2290
0
        if !self.as_ref().state_fits_in_cache(&state) {
2291
0
            self.try_clear_cache()?;
2292
0
        }
2293
        // It's important for this to come second, since the above may clear
2294
        // the cache. If we clear the cache after ID generation, then the ID
2295
        // is likely bunk since it would have been generated based on a larger
2296
        // transition table.
2297
0
        let mut id = idmap(self.next_state_id()?);
2298
0
        if state.is_match() {
2299
0
            id = id.to_match();
2300
0
        }
2301
        // Add room in the transition table. Since this is a fresh state, all
2302
        // of its transitions are unknown.
2303
0
        self.cache.trans.extend(
2304
0
            iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
2305
        );
2306
        // When we add a sentinel state, we never want to set any quit
2307
        // transitions. Technically, this is harmless, since sentinel states
2308
        // have all of their transitions set to loop back to themselves. But
2309
        // when creating sentinel states before the quit sentinel state,
2310
        // this will try to call 'set_transition' on a state ID that doesn't
2311
        // actually exist yet, which isn't allowed. So we just skip doing so
2312
        // entirely.
2313
0
        if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
2314
0
            let quit_id = self.as_ref().quit_id();
2315
0
            for b in self.dfa.quitset.iter() {
2316
0
                self.set_transition(id, alphabet::Unit::u8(b), quit_id);
2317
0
            }
2318
0
        }
2319
0
        self.cache.memory_usage_state += state.memory_usage();
2320
0
        self.cache.states.push(state.clone());
2321
0
        self.cache.states_to_id.insert(state, id);
2322
0
        Ok(id)
2323
0
    }
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_state::<<regex_automata::hybrid::dfa::Lazy>::init_cache::{closure#0}>
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_state::<<regex_automata::hybrid::dfa::Lazy>::init_cache::{closure#2}>
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_state::<<regex_automata::hybrid::dfa::Lazy>::init_cache::{closure#1}>
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_state::<<regex_automata::hybrid::dfa::Lazy>::clear_cache::{closure#0}>
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_state::<<regex_automata::hybrid::dfa::Lazy>::cache_start_one::{closure#0}>
Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_state::<<regex_automata::hybrid::dfa::Lazy>::cache_next_state::{closure#0}>
2324
2325
    /// Allocate a new state ID.
2326
    ///
2327
    /// This will never return an unknown lazy state ID.
2328
    ///
2329
    /// If caching this state would otherwise result in a cache that has been
2330
    /// cleared too many times, then an error is returned.
2331
0
    fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
2332
0
        let sid = match LazyStateID::new(self.cache.trans.len()) {
2333
0
            Ok(sid) => sid,
2334
            Err(_) => {
2335
0
                self.try_clear_cache()?;
2336
                // This has to pass since we check that ID capacity at
2337
                // construction time can fit at least MIN_STATES states.
2338
0
                LazyStateID::new(self.cache.trans.len()).unwrap()
2339
            }
2340
        };
2341
0
        Ok(sid)
2342
0
    }
2343
2344
    /// Attempt to clear the cache used by this lazy DFA.
2345
    ///
2346
    /// If clearing the cache exceeds the minimum number of required cache
2347
    /// clearings, then this will return a cache error. In this case,
2348
    /// callers should bubble this up as the cache can't be used until it is
2349
    /// reset. Implementations of search should convert this error into a
2350
    /// [`MatchError::gave_up`].
2351
    ///
2352
    /// If 'self.state_saver' is set to save a state, then this state is
2353
    /// persisted through cache clearing. Otherwise, the cache is returned to
2354
    /// its state after initialization with two exceptions: its clear count
2355
    /// is incremented and some of its memory likely has additional capacity.
2356
    /// That is, clearing a cache does _not_ release memory.
2357
    ///
2358
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
2359
    /// it is invalid after the reset.
2360
0
    fn try_clear_cache(&mut self) -> Result<(), CacheError> {
2361
0
        let c = self.dfa.get_config();
2362
0
        if let Some(min_count) = c.get_minimum_cache_clear_count() {
2363
0
            if self.cache.clear_count >= min_count {
2364
0
                if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
2365
0
                    let len = self.cache.search_total_len();
2366
0
                    let min_bytes =
2367
0
                        min_bytes_per.saturating_mul(self.cache.states.len());
2368
                    // If we've searched 0 bytes then probably something has
2369
                    // gone wrong and the lazy DFA search implementation isn't
2370
                    // correctly updating the search progress state.
2371
0
                    if len == 0 {
2372
0
                        trace!(
2373
0
                            "number of bytes searched is 0, but \
2374
0
                             a minimum bytes per state searched ({}) is \
2375
0
                             enabled, maybe Cache::search_update \
2376
0
                             is not being used?",
2377
0
                            min_bytes_per,
2378
0
                        );
2379
0
                    }
2380
0
                    if len < min_bytes {
2381
                        trace!(
2382
                            "lazy DFA cache has been cleared {} times, \
2383
                             which exceeds the limit of {}, \
2384
                             AND its bytes searched per state is less \
2385
                             than the configured minimum of {}, \
2386
                             therefore lazy DFA is giving up \
2387
                             (bytes searched since cache clear = {}, \
2388
                              number of states = {})",
2389
                            self.cache.clear_count,
2390
                            min_count,
2391
                            min_bytes_per,
2392
                            len,
2393
                            self.cache.states.len(),
2394
                        );
2395
0
                        return Err(CacheError::bad_efficiency());
2396
0
                    } else {
2397
0
                        trace!(
2398
0
                            "lazy DFA cache has been cleared {} times, \
2399
0
                             which exceeds the limit of {}, \
2400
0
                             AND its bytes searched per state is greater \
2401
0
                             than the configured minimum of {}, \
2402
0
                             therefore lazy DFA is continuing! \
2403
0
                             (bytes searched since cache clear = {}, \
2404
0
                              number of states = {})",
2405
0
                            self.cache.clear_count,
2406
0
                            min_count,
2407
0
                            min_bytes_per,
2408
0
                            len,
2409
0
                            self.cache.states.len(),
2410
0
                        );
2411
0
                    }
2412
                } else {
2413
                    trace!(
2414
                        "lazy DFA cache has been cleared {} times, \
2415
                         which exceeds the limit of {}, \
2416
                         since there is no configured bytes per state \
2417
                         minimum, lazy DFA is giving up",
2418
                        self.cache.clear_count,
2419
                        min_count,
2420
                    );
2421
0
                    return Err(CacheError::too_many_cache_clears());
2422
                }
2423
0
            }
2424
0
        }
2425
0
        self.clear_cache();
2426
0
        Ok(())
2427
0
    }
2428
2429
    /// Clears _and_ resets the cache. Resetting the cache means that no
2430
    /// states are persisted and the clear count is reset to 0. No heap memory
2431
    /// is released.
2432
    ///
2433
    /// Note that the caller may reset a cache with a different DFA than what
2434
    /// it was created from. In which case, the cache can now be used with the
2435
    /// new DFA (and not the old DFA).
2436
0
    fn reset_cache(&mut self) {
2437
0
        self.cache.state_saver = StateSaver::none();
2438
0
        self.clear_cache();
2439
        // If a new DFA is used, it might have a different number of NFA
2440
        // states, so we need to make sure our sparse sets have the appropriate
2441
        // size.
2442
0
        self.cache.sparses.resize(self.dfa.get_nfa().states().len());
2443
0
        self.cache.clear_count = 0;
2444
0
        self.cache.progress = None;
2445
0
    }
2446
2447
    /// Clear the cache used by this lazy DFA.
2448
    ///
2449
    /// If 'self.state_saver' is set to save a state, then this state is
2450
    /// persisted through cache clearing. Otherwise, the cache is returned to
2451
    /// its state after initialization with two exceptions: its clear count
2452
    /// is incremented and some of its memory likely has additional capacity.
2453
    /// That is, clearing a cache does _not_ release memory.
2454
    ///
2455
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
2456
    /// it is invalid after the reset.
2457
0
    fn clear_cache(&mut self) {
2458
0
        self.cache.trans.clear();
2459
0
        self.cache.starts.clear();
2460
0
        self.cache.states.clear();
2461
0
        self.cache.states_to_id.clear();
2462
0
        self.cache.memory_usage_state = 0;
2463
0
        self.cache.clear_count += 1;
2464
0
        self.cache.bytes_searched = 0;
2465
0
        if let Some(ref mut progress) = self.cache.progress {
2466
0
            progress.start = progress.at;
2467
0
        }
2468
        trace!(
2469
            "lazy DFA cache has been cleared (count: {})",
2470
            self.cache.clear_count
2471
        );
2472
0
        self.init_cache();
2473
        // If the state we want to save is one of the sentinel
2474
        // (unknown/dead/quit) states, then 'init_cache' adds those back, and
2475
        // their identifier values remains invariant. So there's no need to add
2476
        // it again. (And indeed, doing so would be incorrect!)
2477
0
        if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
2478
            // If the state is one of the special sentinel states, then it is
2479
            // automatically added by cache initialization and its ID always
2480
            // remains the same. With that said, this should never occur since
2481
            // the sentinel states are all loop states back to themselves. So
2482
            // we should never be in a position where we're attempting to save
2483
            // a sentinel state since we never compute transitions out of a
2484
            // sentinel state.
2485
0
            assert!(
2486
0
                !self.as_ref().is_sentinel(old_id),
2487
0
                "cannot save sentinel state"
2488
            );
2489
0
            let new_id = self
2490
0
                .add_state(state, |id| {
2491
0
                    if old_id.is_start() {
2492
                        // We don't need to consult the
2493
                        // 'specialize_start_states' config knob here, because
2494
                        // if it's disabled, old_id.is_start() will never
2495
                        // return true.
2496
0
                        id.to_start()
2497
                    } else {
2498
0
                        id
2499
                    }
2500
0
                })
2501
                // The unwrap here is OK because lazy DFA creation ensures that
2502
                // we have room in the cache to add MIN_STATES states. Since
2503
                // 'init_cache' above adds 3, this adds a 4th.
2504
0
                .expect("adding one state after cache clear must work");
2505
0
            self.cache.state_saver = StateSaver::Saved(new_id);
2506
0
        }
2507
0
    }
2508
2509
    /// Initialize this cache from emptiness to a place where it can be used
2510
    /// for search.
2511
    ///
2512
    /// This is called both at cache creation time and after the cache has been
2513
    /// cleared.
2514
    ///
2515
    /// Primarily, this adds the three sentinel states and allocates some
2516
    /// initial memory.
2517
0
    fn init_cache(&mut self) {
2518
        // Why multiply by 2 here? Because we make room for both the unanchored
2519
        // and anchored start states. Unanchored is first and then anchored.
2520
0
        let mut starts_len = Start::len().checked_mul(2).unwrap();
2521
        // ... but if we also want start states for every pattern, we make room
2522
        // for that too.
2523
0
        if self.dfa.get_config().get_starts_for_each_pattern() {
2524
0
            starts_len += Start::len() * self.dfa.pattern_len();
2525
0
        }
2526
0
        self.cache
2527
0
            .starts
2528
0
            .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
2529
        // This is the set of NFA states that corresponds to each of our three
2530
        // sentinel states: the empty set.
2531
0
        let dead = State::dead();
2532
        // This sets up some states that we use as sentinels that are present
2533
        // in every DFA. While it would be technically possible to implement
2534
        // this DFA without explicitly putting these states in the transition
2535
        // table, this is convenient to do to make `next_state` correct for all
2536
        // valid state IDs without needing explicit conditionals to special
2537
        // case these sentinel states.
2538
        //
2539
        // All three of these states are "dead" states. That is, all of
2540
        // them transition only to themselves. So once you enter one of
2541
        // these states, it's impossible to leave them. Thus, any correct
2542
        // search routine must explicitly check for these state types. (Sans
2543
        // `unknown`, since that is only used internally to represent missing
2544
        // states.)
2545
0
        let unk_id =
2546
0
            self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
2547
0
        let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
2548
0
        let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
2549
0
        assert_eq!(unk_id, self.as_ref().unknown_id());
2550
0
        assert_eq!(dead_id, self.as_ref().dead_id());
2551
0
        assert_eq!(quit_id, self.as_ref().quit_id());
2552
        // The idea here is that if you start in an unknown/dead/quit state and
2553
        // try to transition on them, then you should end up where you started.
2554
0
        self.set_all_transitions(unk_id, unk_id);
2555
0
        self.set_all_transitions(dead_id, dead_id);
2556
0
        self.set_all_transitions(quit_id, quit_id);
2557
        // All of these states are technically equivalent from the FSM
2558
        // perspective, so putting all three of them in the cache isn't
2559
        // possible. (They are distinct merely because we use their
2560
        // identifiers as sentinels to mean something, as indicated by the
2561
        // names.) Moreover, we wouldn't want to do that. Unknown and quit
2562
        // states are special in that they are artificial constructions
2563
        // this implementation. But dead states are a natural part of
2564
        // determinization. When you reach a point in the NFA where you cannot
2565
        // go anywhere else, a dead state will naturally arise and we MUST
2566
        // reuse the canonical dead state that we've created here. Why? Because
2567
        // it is the state ID that tells the search routine whether a state is
2568
        // dead or not, and thus, whether to stop the search. Having a bunch of
2569
        // distinct dead states would be quite wasteful!
2570
0
        self.cache.states_to_id.insert(dead, dead_id);
2571
0
    }
2572
2573
    /// Save the state corresponding to the ID given such that the state
2574
    /// persists through a cache clearing.
2575
    ///
2576
    /// While the state may persist, the ID may not. In order to discover the
2577
    /// new state ID, one must call 'saved_state_id' after a cache clearing.
2578
0
    fn save_state(&mut self, id: LazyStateID) {
2579
0
        let state = self.as_ref().get_cached_state(id).clone();
2580
0
        self.cache.state_saver = StateSaver::ToSave { id, state };
2581
0
    }
2582
2583
    /// Returns the updated lazy state ID for a state that was persisted
2584
    /// through a cache clearing.
2585
    ///
2586
    /// It is only correct to call this routine when both a state has been
2587
    /// saved and the cache has just been cleared. Otherwise, this panics.
2588
0
    fn saved_state_id(&mut self) -> LazyStateID {
2589
0
        self.cache
2590
0
            .state_saver
2591
0
            .take_saved()
2592
0
            .expect("state saver does not have saved state ID")
2593
0
    }
2594
2595
    /// Set all transitions on the state 'from' to 'to'.
2596
0
    fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
2597
0
        for unit in self.dfa.classes.representatives(..) {
2598
0
            self.set_transition(from, unit, to);
2599
0
        }
2600
0
    }
2601
2602
    /// Set the transition on 'from' for 'unit' to 'to'.
2603
    ///
2604
    /// This panics if either 'from' or 'to' is invalid.
2605
    ///
2606
    /// All unit values are OK.
2607
0
    fn set_transition(
2608
0
        &mut self,
2609
0
        from: LazyStateID,
2610
0
        unit: alphabet::Unit,
2611
0
        to: LazyStateID,
2612
0
    ) {
2613
0
        assert!(self.as_ref().is_valid(from), "invalid 'from' id: {from:?}");
2614
0
        assert!(self.as_ref().is_valid(to), "invalid 'to' id: {to:?}");
2615
0
        let offset =
2616
0
            from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
2617
0
        self.cache.trans[offset] = to;
2618
0
    }
2619
2620
    /// Set the start ID for the given pattern ID (if given) and starting
2621
    /// configuration to the ID given.
2622
    ///
2623
    /// This panics if 'id' is not valid or if a pattern ID is given and
2624
    /// 'starts_for_each_pattern' is not enabled.
2625
0
    fn set_start_state(
2626
0
        &mut self,
2627
0
        anchored: Anchored,
2628
0
        start: Start,
2629
0
        id: LazyStateID,
2630
0
    ) {
2631
0
        assert!(self.as_ref().is_valid(id));
2632
0
        let start_index = start.as_usize();
2633
0
        let index = match anchored {
2634
0
            Anchored::No => start_index,
2635
0
            Anchored::Yes => Start::len() + start_index,
2636
0
            Anchored::Pattern(pid) => {
2637
0
                assert!(
2638
0
                    self.dfa.get_config().get_starts_for_each_pattern(),
2639
0
                    "attempted to search for a specific pattern \
2640
0
                     without enabling starts_for_each_pattern",
2641
                );
2642
0
                let pid = pid.as_usize();
2643
0
                (2 * Start::len()) + (Start::len() * pid) + start_index
2644
            }
2645
        };
2646
0
        self.cache.starts[index] = id;
2647
0
    }
2648
2649
    /// Returns a state builder from this DFA that might have existing
2650
    /// capacity. This helps avoid allocs in cases where a state is built that
2651
    /// turns out to already be cached.
2652
    ///
2653
    /// Callers must put the state builder back with 'put_state_builder',
2654
    /// otherwise the allocation reuse won't work.
2655
0
    fn get_state_builder(&mut self) -> StateBuilderEmpty {
2656
0
        core::mem::replace(
2657
0
            &mut self.cache.scratch_state_builder,
2658
0
            StateBuilderEmpty::new(),
2659
        )
2660
0
    }
2661
2662
    /// Puts the given state builder back into this DFA for reuse.
2663
    ///
2664
    /// Note that building a 'State' from a builder always creates a new alloc,
2665
    /// so callers should always put the builder back.
2666
0
    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
2667
0
        let _ = core::mem::replace(
2668
0
            &mut self.cache.scratch_state_builder,
2669
0
            builder.clear(),
2670
0
        );
2671
0
    }
2672
}
2673
2674
/// A type that groups methods that require the base NFA/DFA and read-only
2675
/// access to the cache.
2676
#[derive(Debug)]
2677
struct LazyRef<'i, 'c> {
2678
    dfa: &'i DFA,
2679
    cache: &'c Cache,
2680
}
2681
2682
impl<'i, 'c> LazyRef<'i, 'c> {
2683
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2684
0
    fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
2685
0
        LazyRef { dfa, cache }
2686
0
    }
2687
2688
    /// Return the ID of the start state for the given configuration.
2689
    ///
2690
    /// If the start state has not yet been computed, then this returns an
2691
    /// unknown lazy state ID.
2692
    #[cfg_attr(feature = "perf-inline", inline(always))]
2693
0
    fn get_cached_start_id(
2694
0
        &self,
2695
0
        anchored: Anchored,
2696
0
        start: Start,
2697
0
    ) -> Result<LazyStateID, StartError> {
2698
0
        let start_index = start.as_usize();
2699
0
        let index = match anchored {
2700
0
            Anchored::No => start_index,
2701
0
            Anchored::Yes => Start::len() + start_index,
2702
0
            Anchored::Pattern(pid) => {
2703
0
                if !self.dfa.get_config().get_starts_for_each_pattern() {
2704
0
                    return Err(StartError::unsupported_anchored(anchored));
2705
0
                }
2706
0
                if pid.as_usize() >= self.dfa.pattern_len() {
2707
0
                    return Ok(self.dead_id());
2708
0
                }
2709
0
                (2 * Start::len())
2710
0
                    + (Start::len() * pid.as_usize())
2711
0
                    + start_index
2712
            }
2713
        };
2714
0
        Ok(self.cache.starts[index])
2715
0
    }
2716
2717
    /// Return the cached NFA/DFA powerset state for the given ID.
2718
    ///
2719
    /// This panics if the given ID does not address a valid state.
2720
0
    fn get_cached_state(&self, sid: LazyStateID) -> &State {
2721
0
        let index = sid.as_usize_untagged() >> self.dfa.stride2();
2722
0
        &self.cache.states[index]
2723
0
    }
2724
2725
    /// Returns true if and only if the given ID corresponds to a "sentinel"
2726
    /// state.
2727
    ///
2728
    /// A sentinel state is a state that signifies a special condition of
2729
    /// search, and where every transition maps back to itself. See LazyStateID
2730
    /// for more details. Note that start and match states are _not_ sentinels
2731
    /// since they may otherwise be real states with non-trivial transitions.
2732
    /// The purposes of sentinel states is purely to indicate something. Their
2733
    /// transitions are not meant to be followed.
2734
0
    fn is_sentinel(&self, id: LazyStateID) -> bool {
2735
0
        id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
2736
0
    }
2737
2738
    /// Returns the ID of the unknown state for this lazy DFA.
2739
0
    fn unknown_id(&self) -> LazyStateID {
2740
        // This unwrap is OK since 0 is always a valid state ID.
2741
0
        LazyStateID::new(0).unwrap().to_unknown()
2742
0
    }
2743
2744
    /// Returns the ID of the dead state for this lazy DFA.
2745
0
    fn dead_id(&self) -> LazyStateID {
2746
        // This unwrap is OK since the maximum value here is 1 * 512 = 512,
2747
        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2748
        // 512 is the worst case for our equivalence classes (every byte is a
2749
        // distinct class).
2750
0
        LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
2751
0
    }
2752
2753
    /// Returns the ID of the quit state for this lazy DFA.
2754
0
    fn quit_id(&self) -> LazyStateID {
2755
        // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
2756
        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2757
        // 512 is the worst case for our equivalence classes (every byte is a
2758
        // distinct class).
2759
0
        LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
2760
0
    }
2761
2762
    /// Returns true if and only if the given ID is valid.
2763
    ///
2764
    /// An ID is valid if it is both a valid index into the transition table
2765
    /// and is a multiple of the DFA's stride.
2766
0
    fn is_valid(&self, id: LazyStateID) -> bool {
2767
0
        let id = id.as_usize_untagged();
2768
0
        id < self.cache.trans.len() && id % self.dfa.stride() == 0
2769
0
    }
2770
2771
    /// Returns true if adding the state given would fit in this cache.
2772
0
    fn state_fits_in_cache(&self, state: &State) -> bool {
2773
0
        let needed = self.cache.memory_usage()
2774
0
            + self.memory_usage_for_one_more_state(state.memory_usage());
2775
        trace!(
2776
            "lazy DFA cache capacity state check: {:?} ?<=? {:?}",
2777
            needed,
2778
            self.dfa.cache_capacity
2779
        );
2780
0
        needed <= self.dfa.cache_capacity
2781
0
    }
2782
2783
    /// Returns true if adding the state to be built by the given builder would
2784
    /// fit in this cache.
2785
0
    fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
2786
0
        let needed = self.cache.memory_usage()
2787
0
            + self.memory_usage_for_one_more_state(state.as_bytes().len());
2788
        trace!(
2789
            "lazy DFA cache capacity state builder check: {:?} ?<=? {:?}",
2790
            needed,
2791
            self.dfa.cache_capacity
2792
        );
2793
0
        needed <= self.dfa.cache_capacity
2794
0
    }
2795
2796
    /// Returns the additional memory usage, in bytes, required to add one more
2797
    /// state to this cache. The given size should be the heap size, in bytes,
2798
    /// that would be used by the new state being added.
2799
0
    fn memory_usage_for_one_more_state(
2800
0
        &self,
2801
0
        state_heap_size: usize,
2802
0
    ) -> usize {
2803
        const ID_SIZE: usize = size_of::<LazyStateID>();
2804
        const STATE_SIZE: usize = size_of::<State>();
2805
2806
0
        self.dfa.stride() * ID_SIZE // additional space needed in trans table
2807
0
        + STATE_SIZE // space in cache.states
2808
0
        + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
2809
0
        + state_heap_size // heap memory used by state itself
2810
0
    }
2811
}
2812
2813
/// A simple type that encapsulates the saving of a state ID through a cache
2814
/// clearing.
2815
///
2816
/// A state ID can be marked for saving with ToSave, while a state ID can be
2817
/// saved itself with Saved.
2818
#[derive(Clone, Debug)]
2819
enum StateSaver {
2820
    /// An empty state saver. In this case, no states (other than the special
2821
    /// sentinel states) are preserved after clearing the cache.
2822
    None,
2823
    /// An ID of a state (and the state itself) that should be preserved after
2824
    /// the lazy DFA's cache has been cleared. After clearing, the updated ID
2825
    /// is stored in 'Saved' since it may have changed.
2826
    ToSave { id: LazyStateID, state: State },
2827
    /// An ID that of a state that has been persisted through a lazy DFA
2828
    /// cache clearing. The ID recorded here corresponds to an ID that was
2829
    /// once marked as ToSave. The IDs are likely not equivalent even though
2830
    /// the states they point to are.
2831
    Saved(LazyStateID),
2832
}
2833
2834
impl StateSaver {
2835
    /// Create an empty state saver.
2836
0
    fn none() -> StateSaver {
2837
0
        StateSaver::None
2838
0
    }
2839
2840
    /// Replace this state saver with an empty saver, and if this saver is a
2841
    /// request to save a state, return that request.
2842
0
    fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
2843
0
        match core::mem::replace(self, StateSaver::None) {
2844
0
            StateSaver::None | StateSaver::Saved(_) => None,
2845
0
            StateSaver::ToSave { id, state } => Some((id, state)),
2846
        }
2847
0
    }
2848
2849
    /// Replace this state saver with an empty saver, and if this saver is a
2850
    /// saved state (or a request to save a state), return that state's ID.
2851
    ///
2852
    /// The idea here is that a request to save a state isn't necessarily
2853
    /// honored because it might not be needed. e.g., Some higher level code
2854
    /// might request a state to be saved on the off chance that the cache gets
2855
    /// cleared when a new state is added at a lower level. But if that new
2856
    /// state is never added, then the cache is never cleared and the state and
2857
    /// its ID remain unchanged.
2858
0
    fn take_saved(&mut self) -> Option<LazyStateID> {
2859
0
        match core::mem::replace(self, StateSaver::None) {
2860
0
            StateSaver::None => None,
2861
0
            StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
2862
        }
2863
0
    }
2864
}
2865
2866
/// The configuration used for building a lazy DFA.
2867
///
2868
/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
2869
/// advantage of the former is that it often lets you avoid importing the
2870
/// `Config` type directly.
2871
///
2872
/// A lazy DFA configuration is a simple data object that is typically used
2873
/// with [`Builder::configure`].
2874
///
2875
/// The default configuration guarantees that a search will never return a
2876
/// "gave up" or "quit" error, although it is possible for a search to fail
2877
/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
2878
/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
2879
#[derive(Clone, Debug, Default)]
2880
pub struct Config {
2881
    // As with other configuration types in this crate, we put all our knobs
2882
    // in options so that we can distinguish between "default" and "not set."
2883
    // This makes it possible to easily combine multiple configurations
2884
    // without default values overwriting explicitly specified values. See the
2885
    // 'overwrite' method.
2886
    //
2887
    // For docs on the fields below, see the corresponding method setters.
2888
    match_kind: Option<MatchKind>,
2889
    pre: Option<Option<Prefilter>>,
2890
    starts_for_each_pattern: Option<bool>,
2891
    byte_classes: Option<bool>,
2892
    unicode_word_boundary: Option<bool>,
2893
    quitset: Option<ByteSet>,
2894
    specialize_start_states: Option<bool>,
2895
    cache_capacity: Option<usize>,
2896
    skip_cache_capacity_check: Option<bool>,
2897
    minimum_cache_clear_count: Option<Option<usize>>,
2898
    minimum_bytes_per_state: Option<Option<usize>>,
2899
}
2900
2901
impl Config {
2902
    /// Return a new default lazy DFA builder configuration.
2903
0
    pub fn new() -> Config {
2904
0
        Config::default()
2905
0
    }
2906
2907
    /// Set the desired match semantics.
2908
    ///
2909
    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
2910
    /// match semantics of Perl-like regex engines. That is, when multiple
2911
    /// patterns would match at the same leftmost position, the pattern that
2912
    /// appears first in the concrete syntax is chosen.
2913
    ///
2914
    /// Currently, the only other kind of match semantics supported is
2915
    /// [`MatchKind::All`]. This corresponds to classical DFA construction
2916
    /// where all possible matches are added to the lazy DFA.
2917
    ///
2918
    /// Typically, `All` is used when one wants to execute an overlapping
2919
    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
2920
    /// sense to use `All` with the various "leftmost" find routines, since the
2921
    /// leftmost routines depend on the `LeftmostFirst` automata construction
2922
    /// strategy. Specifically, `LeftmostFirst` adds dead states to the
2923
    /// lazy DFA as a way to terminate the search and report a match.
2924
    /// `LeftmostFirst` also supports non-greedy matches using this strategy
2925
    /// where as `All` does not.
2926
    ///
2927
    /// # Example: overlapping search
2928
    ///
2929
    /// This example shows the typical use of `MatchKind::All`, which is to
2930
    /// report overlapping matches.
2931
    ///
2932
    /// ```
2933
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
2934
    /// use regex_automata::{
2935
    ///     hybrid::dfa::{DFA, OverlappingState},
2936
    ///     HalfMatch, Input, MatchKind,
2937
    /// };
2938
    ///
2939
    /// let dfa = DFA::builder()
2940
    ///     .configure(DFA::config().match_kind(MatchKind::All))
2941
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
2942
    /// let mut cache = dfa.create_cache();
2943
    /// let haystack = "@foo";
2944
    /// let mut state = OverlappingState::start();
2945
    ///
2946
    /// let expected = Some(HalfMatch::must(1, 4));
2947
    /// dfa.try_search_overlapping_fwd(
2948
    ///     &mut cache, &Input::new(haystack), &mut state,
2949
    /// )?;
2950
    /// assert_eq!(expected, state.get_match());
2951
    ///
2952
    /// // The first pattern also matches at the same position, so re-running
2953
    /// // the search will yield another match. Notice also that the first
2954
    /// // pattern is returned after the second. This is because the second
2955
    /// // pattern begins its match before the first, is therefore an earlier
2956
    /// // match and is thus reported first.
2957
    /// let expected = Some(HalfMatch::must(0, 4));
2958
    /// dfa.try_search_overlapping_fwd(
2959
    ///     &mut cache, &Input::new(haystack), &mut state,
2960
    /// )?;
2961
    /// assert_eq!(expected, state.get_match());
2962
    ///
2963
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2964
    /// ```
2965
    ///
2966
    /// # Example: reverse automaton to find start of match
2967
    ///
2968
    /// Another example for using `MatchKind::All` is for constructing a
2969
    /// reverse automaton to find the start of a match. `All` semantics are
2970
    /// used for this in order to find the longest possible match, which
2971
    /// corresponds to the leftmost starting position.
2972
    ///
2973
    /// Note that if you need the starting position then
2974
    /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
2975
    /// for you, so it's usually not necessary to do this yourself.
2976
    ///
2977
    /// ```
2978
    /// use regex_automata::{
2979
    ///     hybrid::dfa::DFA,
2980
    ///     nfa::thompson::NFA,
2981
    ///     Anchored, HalfMatch, Input, MatchKind,
2982
    /// };
2983
    ///
2984
    /// let input = Input::new("123foobar456");
2985
    /// let pattern = r"[a-z]+r";
2986
    ///
2987
    /// let dfa_fwd = DFA::new(pattern)?;
2988
    /// let dfa_rev = DFA::builder()
2989
    ///     .thompson(NFA::config().reverse(true))
2990
    ///     .configure(DFA::config().match_kind(MatchKind::All))
2991
    ///     .build(pattern)?;
2992
    /// let mut cache_fwd = dfa_fwd.create_cache();
2993
    /// let mut cache_rev = dfa_rev.create_cache();
2994
    ///
2995
    /// let expected_fwd = HalfMatch::must(0, 9);
2996
    /// let expected_rev = HalfMatch::must(0, 3);
2997
    /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap();
2998
    /// // Here we don't specify the pattern to search for since there's only
2999
    /// // one pattern and we're doing a leftmost search. But if this were an
3000
    /// // overlapping search, you'd need to specify the pattern that matched
3001
    /// // in the forward direction. (Otherwise, you might wind up finding the
3002
    /// // starting position of a match of some other pattern.) That in turn
3003
    /// // requires building the reverse automaton with starts_for_each_pattern
3004
    /// // enabled.
3005
    /// let input = input
3006
    ///     .clone()
3007
    ///     .range(..got_fwd.offset())
3008
    ///     .anchored(Anchored::Yes);
3009
    /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap();
3010
    /// assert_eq!(expected_fwd, got_fwd);
3011
    /// assert_eq!(expected_rev, got_rev);
3012
    ///
3013
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3014
    /// ```
3015
0
    pub fn match_kind(mut self, kind: MatchKind) -> Config {
3016
0
        self.match_kind = Some(kind);
3017
0
        self
3018
0
    }
3019
3020
    /// Set a prefilter to be used whenever a start state is entered.
3021
    ///
3022
    /// A [`Prefilter`] in this context is meant to accelerate searches by
3023
    /// looking for literal prefixes that every match for the corresponding
3024
    /// pattern (or patterns) must start with. Once a prefilter produces a
3025
    /// match, the underlying search routine continues on to try and confirm
3026
    /// the match.
3027
    ///
3028
    /// Be warned that setting a prefilter does not guarantee that the search
3029
    /// will be faster. While it's usually a good bet, if the prefilter
3030
    /// produces a lot of false positive candidates (i.e., positions matched
3031
    /// by the prefilter but not by the regex), then the overall result can
3032
    /// be slower than if you had just executed the regex engine without any
3033
    /// prefilters.
3034
    ///
3035
    /// Note that unless [`Config::specialize_start_states`] has been
3036
    /// explicitly set, then setting this will also enable (when `pre` is
3037
    /// `Some`) or disable (when `pre` is `None`) start state specialization.
3038
    /// This occurs because without start state specialization, a prefilter
3039
    /// is likely to be less effective. And without a prefilter, start state
3040
    /// specialization is usually pointless.
3041
    ///
3042
    /// By default no prefilter is set.
3043
    ///
3044
    /// # Example
3045
    ///
3046
    /// ```
3047
    /// use regex_automata::{
3048
    ///     hybrid::dfa::DFA,
3049
    ///     util::prefilter::Prefilter,
3050
    ///     Input, HalfMatch, MatchKind,
3051
    /// };
3052
    ///
3053
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
3054
    /// let re = DFA::builder()
3055
    ///     .configure(DFA::config().prefilter(pre))
3056
    ///     .build(r"(foo|bar)[a-z]+")?;
3057
    /// let mut cache = re.create_cache();
3058
    /// let input = Input::new("foo1 barfox bar");
3059
    /// assert_eq!(
3060
    ///     Some(HalfMatch::must(0, 11)),
3061
    ///     re.try_search_fwd(&mut cache, &input)?,
3062
    /// );
3063
    ///
3064
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3065
    /// ```
3066
    ///
3067
    /// Be warned though that an incorrect prefilter can lead to incorrect
3068
    /// results!
3069
    ///
3070
    /// ```
3071
    /// use regex_automata::{
3072
    ///     hybrid::dfa::DFA,
3073
    ///     util::prefilter::Prefilter,
3074
    ///     Input, HalfMatch, MatchKind,
3075
    /// };
3076
    ///
3077
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
3078
    /// let re = DFA::builder()
3079
    ///     .configure(DFA::config().prefilter(pre))
3080
    ///     .build(r"(foo|bar)[a-z]+")?;
3081
    /// let mut cache = re.create_cache();
3082
    /// let input = Input::new("foo1 barfox bar");
3083
    /// assert_eq!(
3084
    ///     // No match reported even though there clearly is one!
3085
    ///     None,
3086
    ///     re.try_search_fwd(&mut cache, &input)?,
3087
    /// );
3088
    ///
3089
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3090
    /// ```
3091
0
    pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
3092
0
        self.pre = Some(pre);
3093
0
        if self.specialize_start_states.is_none() {
3094
0
            self.specialize_start_states =
3095
0
                Some(self.get_prefilter().is_some());
3096
0
        }
3097
0
        self
3098
0
    }
3099
3100
    /// Whether to compile a separate start state for each pattern in the
3101
    /// lazy DFA.
3102
    ///
3103
    /// When enabled, a separate **anchored** start state is added for each
3104
    /// pattern in the lazy DFA. When this start state is used, then the DFA
3105
    /// will only search for matches for the pattern specified, even if there
3106
    /// are other patterns in the DFA.
3107
    ///
3108
    /// The main downside of this option is that it can potentially increase
3109
    /// the size of the DFA and/or increase the time it takes to build the
3110
    /// DFA at search time. However, since this is configuration for a lazy
3111
    /// DFA, these states aren't actually built unless they're used. Enabling
3112
    /// this isn't necessarily free, however, as it may result in higher cache
3113
    /// usage.
3114
    ///
3115
    /// There are a few reasons one might want to enable this (it's disabled
3116
    /// by default):
3117
    ///
3118
    /// 1. When looking for the start of an overlapping match (using a reverse
3119
    /// DFA), doing it correctly requires starting the reverse search using the
3120
    /// starting state of the pattern that matched in the forward direction.
3121
    /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
3122
    /// will automatically enable this option when building the reverse DFA
3123
    /// internally.
3124
    /// 2. When you want to use a DFA with multiple patterns to both search
3125
    /// for matches of any pattern or to search for anchored matches of one
3126
    /// particular pattern while using the same DFA. (Otherwise, you would need
3127
    /// to compile a new DFA for each pattern.)
3128
    ///
3129
    /// By default this is disabled.
3130
    ///
3131
    /// # Example
3132
    ///
3133
    /// This example shows how to use this option to permit the same lazy DFA
3134
    /// to run both general searches for any pattern and anchored searches for
3135
    /// a specific pattern.
3136
    ///
3137
    /// ```
3138
    /// use regex_automata::{
3139
    ///     hybrid::dfa::DFA,
3140
    ///     Anchored, HalfMatch, Input, PatternID,
3141
    /// };
3142
    ///
3143
    /// let dfa = DFA::builder()
3144
    ///     .configure(DFA::config().starts_for_each_pattern(true))
3145
    ///     .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?;
3146
    /// let mut cache = dfa.create_cache();
3147
    /// let haystack = "bar foo123";
3148
    ///
3149
    /// // Here's a normal unanchored search that looks for any pattern.
3150
    /// let expected = HalfMatch::must(0, 10);
3151
    /// let input = Input::new(haystack);
3152
    /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3153
    /// // We can also do a normal anchored search for any pattern. Since it's
3154
    /// // an anchored search, we position the start of the search where we
3155
    /// // know the match will begin.
3156
    /// let expected = HalfMatch::must(0, 10);
3157
    /// let input = Input::new(haystack).range(4..);
3158
    /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3159
    /// // Since we compiled anchored start states for each pattern, we can
3160
    /// // also look for matches of other patterns explicitly, even if a
3161
    /// // different pattern would have normally matched.
3162
    /// let expected = HalfMatch::must(1, 10);
3163
    /// let input = Input::new(haystack)
3164
    ///     .range(4..)
3165
    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
3166
    /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3167
    ///
3168
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3169
    /// ```
3170
0
    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
3171
0
        self.starts_for_each_pattern = Some(yes);
3172
0
        self
3173
0
    }
3174
3175
    /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
3176
    /// not.
3177
    ///
3178
    /// This option is enabled by default and should never be disabled unless
3179
    /// one is debugging the lazy DFA.
3180
    ///
3181
    /// When enabled, the lazy DFA will use a map from all possible bytes
3182
    /// to their corresponding equivalence class. Each equivalence class
3183
    /// represents a set of bytes that does not discriminate between a match
3184
    /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
3185
    /// least two equivalence classes: a set containing `a` and `b` and a set
3186
    /// containing every byte except for `a` and `b`. `a` and `b` are in the
3187
    /// same equivalence classes because they never discriminate between a
3188
    /// match and a non-match.
3189
    ///
3190
    /// The advantage of this map is that the size of the transition table
3191
    /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
3192
    /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
3193
    /// equivalence classes (rounded up to the nearest power of 2). As a
3194
    /// result, total space usage can decrease substantially. Moreover, since a
3195
    /// smaller alphabet is used, DFA compilation during search becomes faster
3196
    /// as well since it will potentially be able to reuse a single transition
3197
    /// for multiple bytes.
3198
    ///
3199
    /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
3200
    /// this does not yield any speed advantages. Namely, even when this is
3201
    /// disabled, a byte class map is still used while searching. The only
3202
    /// difference is that every byte will be forced into its own distinct
3203
    /// equivalence class. This is useful for debugging the actual generated
3204
    /// transitions because it lets one see the transitions defined on actual
3205
    /// bytes instead of the equivalence classes.
3206
0
    pub fn byte_classes(mut self, yes: bool) -> Config {
3207
0
        self.byte_classes = Some(yes);
3208
0
        self
3209
0
    }
3210
3211
    /// Heuristically enable Unicode word boundaries.
3212
    ///
3213
    /// When set, this will attempt to implement Unicode word boundaries as if
3214
    /// they were ASCII word boundaries. This only works when the search input
3215
    /// is ASCII only. If a non-ASCII byte is observed while searching, then a
3216
    /// [`MatchError::quit`] error is returned.
3217
    ///
3218
    /// A possible alternative to enabling this option is to simply use an
3219
    /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
3220
    /// option is if you absolutely need Unicode support. This option lets one
3221
    /// use a fast search implementation (a DFA) for some potentially very
3222
    /// common cases, while providing the option to fall back to some other
3223
    /// regex engine to handle the general case when an error is returned.
3224
    ///
3225
    /// If the pattern provided has no Unicode word boundary in it, then this
3226
    /// option has no effect. (That is, quitting on a non-ASCII byte only
3227
    /// occurs when this option is enabled _and_ a Unicode word boundary is
3228
    /// present in the pattern.)
3229
    ///
3230
    /// This is almost equivalent to setting all non-ASCII bytes to be quit
3231
    /// bytes. The only difference is that this will cause non-ASCII bytes to
3232
    /// be quit bytes _only_ when a Unicode word boundary is present in the
3233
    /// pattern.
3234
    ///
3235
    /// When enabling this option, callers _must_ be prepared to
3236
    /// handle a [`MatchError`] error during search. When using a
3237
    /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3238
    /// `try_` suite of methods. Alternatively, if callers can guarantee that
3239
    /// their input is ASCII only, then a [`MatchError::quit`] error will never
3240
    /// be returned while searching.
3241
    ///
3242
    /// This is disabled by default.
3243
    ///
3244
    /// # Example
3245
    ///
3246
    /// This example shows how to heuristically enable Unicode word boundaries
3247
    /// in a pattern. It also shows what happens when a search comes across a
3248
    /// non-ASCII byte.
3249
    ///
3250
    /// ```
3251
    /// use regex_automata::{
3252
    ///     hybrid::dfa::DFA,
3253
    ///     HalfMatch, Input, MatchError,
3254
    /// };
3255
    ///
3256
    /// let dfa = DFA::builder()
3257
    ///     .configure(DFA::config().unicode_word_boundary(true))
3258
    ///     .build(r"\b[0-9]+\b")?;
3259
    /// let mut cache = dfa.create_cache();
3260
    ///
3261
    /// // The match occurs before the search ever observes the snowman
3262
    /// // character, so no error occurs.
3263
    /// let haystack = "foo 123  ☃";
3264
    /// let expected = Some(HalfMatch::must(0, 7));
3265
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3266
    /// assert_eq!(expected, got);
3267
    ///
3268
    /// // Notice that this search fails, even though the snowman character
3269
    /// // occurs after the ending match offset. This is because search
3270
    /// // routines read one byte past the end of the search to account for
3271
    /// // look-around, and indeed, this is required here to determine whether
3272
    /// // the trailing \b matches.
3273
    /// let haystack = "foo 123 ☃";
3274
    /// let expected = MatchError::quit(0xE2, 8);
3275
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
3276
    /// assert_eq!(Err(expected), got);
3277
    ///
3278
    /// // Another example is executing a search where the span of the haystack
3279
    /// // we specify is all ASCII, but there is non-ASCII just before it. This
3280
    /// // correctly also reports an error.
3281
    /// let input = Input::new("β123").range(2..);
3282
    /// let expected = MatchError::quit(0xB2, 1);
3283
    /// let got = dfa.try_search_fwd(&mut cache, &input);
3284
    /// assert_eq!(Err(expected), got);
3285
    ///
3286
    /// // And similarly for the trailing word boundary.
3287
    /// let input = Input::new("123β").range(..3);
3288
    /// let expected = MatchError::quit(0xCE, 3);
3289
    /// let got = dfa.try_search_fwd(&mut cache, &input);
3290
    /// assert_eq!(Err(expected), got);
3291
    ///
3292
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3293
    /// ```
3294
0
    pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
3295
        // We have a separate option for this instead of just setting the
3296
        // appropriate quit bytes here because we don't want to set quit bytes
3297
        // for every regex. We only want to set them when the regex contains a
3298
        // Unicode word boundary.
3299
0
        self.unicode_word_boundary = Some(yes);
3300
0
        self
3301
0
    }
3302
3303
    /// Add a "quit" byte to the lazy DFA.
3304
    ///
3305
    /// When a quit byte is seen during search time, then search will return a
3306
    /// [`MatchError::quit`] error indicating the offset at which the search
3307
    /// stopped.
3308
    ///
3309
    /// A quit byte will always overrule any other aspects of a regex. For
3310
    /// example, if the `x` byte is added as a quit byte and the regex `\w` is
3311
    /// used, then observing `x` will cause the search to quit immediately
3312
    /// despite the fact that `x` is in the `\w` class.
3313
    ///
3314
    /// This mechanism is primarily useful for heuristically enabling certain
3315
    /// features like Unicode word boundaries in a DFA. Namely, if the input
3316
    /// to search is ASCII, then a Unicode word boundary can be implemented
3317
    /// via an ASCII word boundary with no change in semantics. Thus, a DFA
3318
    /// can attempt to match a Unicode word boundary but give up as soon as it
3319
    /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
3320
    /// to be quit bytes, then Unicode word boundaries will be permitted when
3321
    /// building lazy DFAs. Of course, callers should enable
3322
    /// [`Config::unicode_word_boundary`] if they want this behavior instead.
3323
    /// (The advantage being that non-ASCII quit bytes will only be added if a
3324
    /// Unicode word boundary is in the pattern.)
3325
    ///
3326
    /// When enabling this option, callers _must_ be prepared to
3327
    /// handle a [`MatchError`] error during search. When using a
3328
    /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3329
    /// `try_` suite of methods.
3330
    ///
3331
    /// By default, there are no quit bytes set.
3332
    ///
3333
    /// # Panics
3334
    ///
3335
    /// This panics if heuristic Unicode word boundaries are enabled and any
3336
    /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
3337
    /// Unicode word boundaries requires setting every non-ASCII byte to a quit
3338
    /// byte. So if the caller attempts to undo any of that, then this will
3339
    /// panic.
3340
    ///
3341
    /// # Example
3342
    ///
3343
    /// This example shows how to cause a search to terminate if it sees a
3344
    /// `\n` byte. This could be useful if, for example, you wanted to prevent
3345
    /// a user supplied pattern from matching across a line boundary.
3346
    ///
3347
    /// ```
3348
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3349
    /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3350
    ///
3351
    /// let dfa = DFA::builder()
3352
    ///     .configure(DFA::config().quit(b'\n', true))
3353
    ///     .build(r"foo\p{any}+bar")?;
3354
    /// let mut cache = dfa.create_cache();
3355
    ///
3356
    /// let haystack = "foo\nbar";
3357
    /// // Normally this would produce a match, since \p{any} contains '\n'.
3358
    /// // But since we instructed the automaton to enter a quit state if a
3359
    /// // '\n' is observed, this produces a match error instead.
3360
    /// let expected = MatchError::quit(b'\n', 3);
3361
    /// let got = dfa.try_search_fwd(
3362
    ///     &mut cache,
3363
    ///     &Input::new(haystack),
3364
    /// ).unwrap_err();
3365
    /// assert_eq!(expected, got);
3366
    ///
3367
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3368
    /// ```
3369
0
    pub fn quit(mut self, byte: u8, yes: bool) -> Config {
3370
0
        if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
3371
0
            panic!(
3372
0
                "cannot set non-ASCII byte to be non-quit when \
3373
0
                 Unicode word boundaries are enabled"
3374
            );
3375
0
        }
3376
0
        if self.quitset.is_none() {
3377
0
            self.quitset = Some(ByteSet::empty());
3378
0
        }
3379
0
        if yes {
3380
0
            self.quitset.as_mut().unwrap().add(byte);
3381
0
        } else {
3382
0
            self.quitset.as_mut().unwrap().remove(byte);
3383
0
        }
3384
0
        self
3385
0
    }
3386
3387
    /// Enable specializing start states in the lazy DFA.
3388
    ///
3389
    /// When start states are specialized, an implementor of a search routine
3390
    /// using a lazy DFA can tell when the search has entered a starting state.
3391
    /// When start states aren't specialized, then it is impossible to know
3392
    /// whether the search has entered a start state.
3393
    ///
3394
    /// Ideally, this option wouldn't need to exist and we could always
3395
    /// specialize start states. The problem is that start states can be quite
3396
    /// active. This in turn means that an efficient search routine is likely
3397
    /// to ping-pong between a heavily optimized hot loop that handles most
3398
    /// states and to a less optimized specialized handling of start states.
3399
    /// This causes branches to get heavily mispredicted and overall can
3400
    /// materially decrease throughput. Therefore, specializing start states
3401
    /// should only be enabled when it is needed.
3402
    ///
3403
    /// Knowing whether a search is in a start state is typically useful when a
3404
    /// prefilter is active for the search. A prefilter is typically only run
3405
    /// when in a start state and a prefilter can greatly accelerate a search.
3406
    /// Therefore, the possible cost of specializing start states is worth it
3407
    /// in this case. Otherwise, if you have no prefilter, there is likely no
3408
    /// reason to specialize start states.
3409
    ///
3410
    /// This is disabled by default, but note that it is automatically
3411
    /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
3412
    /// `specialize_start_states` has already been set, [`Config::prefilter`]
3413
    /// will automatically enable or disable it based on whether a prefilter
3414
    /// is present or not, respectively. This is done because a prefilter's
3415
    /// effectiveness is rooted in being executed whenever the DFA is in a
3416
    /// start state, and that's only possible to do when they are specialized.
3417
    ///
3418
    /// Note that it is plausibly reasonable to _disable_ this option
3419
    /// explicitly while _enabling_ a prefilter. In that case, a prefilter
3420
    /// will still be run at the beginning of a search, but never again. This
3421
    /// in theory could strike a good balance if you're in a situation where a
3422
    /// prefilter is likely to produce many false positive candidates.
3423
    ///
3424
    /// # Example
3425
    ///
3426
    /// This example shows how to enable start state specialization and then
3427
    /// shows how to check whether a state is a start state or not.
3428
    ///
3429
    /// ```
3430
    /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3431
    ///
3432
    /// let dfa = DFA::builder()
3433
    ///     .configure(DFA::config().specialize_start_states(true))
3434
    ///     .build(r"[a-z]+")?;
3435
    /// let mut cache = dfa.create_cache();
3436
    ///
3437
    /// let haystack = "123 foobar 4567".as_bytes();
3438
    /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3439
    /// // The ID returned by 'start_state_forward' will always be tagged as
3440
    /// // a start state when start state specialization is enabled.
3441
    /// assert!(sid.is_tagged());
3442
    /// assert!(sid.is_start());
3443
    ///
3444
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3445
    /// ```
3446
    ///
3447
    /// Compare the above with the default lazy DFA configuration where
3448
    /// start states are _not_ specialized. In this case, the start state
3449
    /// is not tagged and `sid.is_start()` returns false.
3450
    ///
3451
    /// ```
3452
    /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3453
    ///
3454
    /// let dfa = DFA::new(r"[a-z]+")?;
3455
    /// let mut cache = dfa.create_cache();
3456
    ///
3457
    /// let haystack = "123 foobar 4567".as_bytes();
3458
    /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3459
    /// // Start states are not tagged in the default configuration!
3460
    /// assert!(!sid.is_tagged());
3461
    /// assert!(!sid.is_start());
3462
    ///
3463
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3464
    /// ```
3465
0
    pub fn specialize_start_states(mut self, yes: bool) -> Config {
3466
0
        self.specialize_start_states = Some(yes);
3467
0
        self
3468
0
    }
3469
3470
    /// Sets the maximum amount of heap memory, in bytes, to allocate to the
3471
    /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
3472
    /// use more heap memory, then, depending on other configuration knobs,
3473
    /// either stop the search and return an error or clear the cache and
3474
    /// continue the search.
3475
    ///
3476
    /// The default cache capacity is some "reasonable" number that will
3477
    /// accommodate most regular expressions. You may find that if you need
3478
    /// to build a large DFA then it may be necessary to increase the cache
3479
    /// capacity.
3480
    ///
3481
    /// Note that while building a lazy DFA will do a "minimum" check to ensure
3482
    /// the capacity is big enough, this is more or less about correctness.
3483
    /// If the cache is bigger than the minimum but still "too small," then the
3484
    /// lazy DFA could wind up spending a lot of time clearing the cache and
3485
    /// recomputing transitions, thus negating the performance benefits of a
3486
    /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
3487
    /// endeavor. For most common patterns, however, the default should be
3488
    /// sufficient.
3489
    ///
3490
    /// For more details on how the lazy DFA's cache is used, see the
3491
    /// documentation for [`Cache`].
3492
    ///
3493
    /// # Example
3494
    ///
3495
    /// This example shows what happens if the configured cache capacity is
3496
    /// too small. In such cases, one can override the cache capacity to make
3497
    /// it bigger. Alternatively, one might want to use less memory by setting
3498
    /// a smaller cache capacity.
3499
    ///
3500
    /// ```
3501
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3502
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3503
    ///
3504
    /// let pattern = r"\p{L}{1000}";
3505
    ///
3506
    /// // The default cache capacity is likely too small to deal with regexes
3507
    /// // that are very large. Large repetitions of large Unicode character
3508
    /// // classes are a common way to make very large regexes.
3509
    /// let _ = DFA::new(pattern).unwrap_err();
3510
    /// // Bump up the capacity to something bigger.
3511
    /// let dfa = DFA::builder()
3512
    ///     .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
3513
    ///     .build(pattern)?;
3514
    /// let mut cache = dfa.create_cache();
3515
    ///
3516
    /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3517
    /// let expected = Some(HalfMatch::must(0, 2000));
3518
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3519
    /// assert_eq!(expected, got);
3520
    ///
3521
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3522
    /// ```
3523
0
    pub fn cache_capacity(mut self, bytes: usize) -> Config {
3524
0
        self.cache_capacity = Some(bytes);
3525
0
        self
3526
0
    }
3527
3528
    /// Configures construction of a lazy DFA to use the minimum cache capacity
3529
    /// if the configured capacity is otherwise too small for the provided NFA.
3530
    ///
3531
    /// This is useful if you never want lazy DFA construction to fail because
3532
    /// of a capacity that is too small.
3533
    ///
3534
    /// In general, this option is typically not a good idea. In particular,
3535
    /// while a minimum cache capacity does permit the lazy DFA to function
3536
    /// where it otherwise couldn't, it's plausible that it may not function
3537
    /// well if it's constantly running out of room. In that case, the speed
3538
    /// advantages of the lazy DFA may be negated. On the other hand, the
3539
    /// "minimum" cache capacity computed may not be completely accurate and
3540
    /// could actually be bigger than what is really necessary. Therefore, it
3541
    /// is plausible that using the minimum cache capacity could still result
3542
    /// in very good performance.
3543
    ///
3544
    /// This is disabled by default.
3545
    ///
3546
    /// # Example
3547
    ///
3548
    /// This example shows what happens if the configured cache capacity is
3549
    /// too small. In such cases, one could override the capacity explicitly.
3550
    /// An alternative, demonstrated here, let's us force construction to use
3551
    /// the minimum cache capacity if the configured capacity is otherwise
3552
    /// too small.
3553
    ///
3554
    /// ```
3555
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3556
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3557
    ///
3558
    /// let pattern = r"\p{L}{1000}";
3559
    ///
3560
    /// // The default cache capacity is likely too small to deal with regexes
3561
    /// // that are very large. Large repetitions of large Unicode character
3562
    /// // classes are a common way to make very large regexes.
3563
    /// let _ = DFA::new(pattern).unwrap_err();
3564
    /// // Configure construction such it automatically selects the minimum
3565
    /// // cache capacity if it would otherwise be too small.
3566
    /// let dfa = DFA::builder()
3567
    ///     .configure(DFA::config().skip_cache_capacity_check(true))
3568
    ///     .build(pattern)?;
3569
    /// let mut cache = dfa.create_cache();
3570
    ///
3571
    /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3572
    /// let expected = Some(HalfMatch::must(0, 2000));
3573
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3574
    /// assert_eq!(expected, got);
3575
    ///
3576
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3577
    /// ```
3578
0
    pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
3579
0
        self.skip_cache_capacity_check = Some(yes);
3580
0
        self
3581
0
    }
3582
3583
    /// Configure a lazy DFA search to quit after a certain number of cache
3584
    /// clearings.
3585
    ///
3586
    /// When a minimum is set, then a lazy DFA search will *possibly* "give
3587
    /// up" after the minimum number of cache clearings has occurred. This is
3588
    /// typically useful in scenarios where callers want to detect whether the
3589
    /// lazy DFA search is "efficient" or not. If the cache is cleared too many
3590
    /// times, this is a good indicator that it is not efficient, and thus, the
3591
    /// caller may wish to use some other regex engine.
3592
    ///
3593
    /// Note that the number of times a cache is cleared is a property of
3594
    /// the cache itself. Thus, if a cache is used in a subsequent search
3595
    /// with a similarly configured lazy DFA, then it could cause the
3596
    /// search to "give up" if the cache needed to be cleared, depending
3597
    /// on its internal count and configured minimum. The cache clear
3598
    /// count can only be reset to `0` via [`DFA::reset_cache`] (or
3599
    /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
3600
    /// you're using the `Regex` API).
3601
    ///
3602
    /// By default, no minimum is configured. Thus, a lazy DFA search will
3603
    /// never give up due to cache clearings. If you do set this option, you
3604
    /// might consider also setting [`Config::minimum_bytes_per_state`] in
3605
    /// order for the lazy DFA to take efficiency into account before giving
3606
    /// up.
3607
    ///
3608
    /// # Example
3609
    ///
3610
    /// This example uses a somewhat pathological configuration to demonstrate
3611
    /// the _possible_ behavior of cache clearing and how it might result
3612
    /// in a search that returns an error.
3613
    ///
3614
    /// It is important to note that the precise mechanics of how and when
3615
    /// a cache gets cleared is an implementation detail.
3616
    ///
3617
    /// ```
3618
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3619
    /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind};
3620
    ///
3621
    /// // This is a carefully chosen regex. The idea is to pick one
3622
    /// // that requires some decent number of states (hence the bounded
3623
    /// // repetition). But we specifically choose to create a class with an
3624
    /// // ASCII letter and a non-ASCII letter so that we can check that no new
3625
    /// // states are created once the cache is full. Namely, if we fill up the
3626
    /// // cache on a haystack of 'a's, then in order to match one 'β', a new
3627
    /// // state will need to be created since a 'β' is encoded with multiple
3628
    /// // bytes. Since there's no room for this state, the search should quit
3629
    /// // at the very first position.
3630
    /// let pattern = r"[aβ]{100}";
3631
    /// let dfa = DFA::builder()
3632
    ///     .configure(
3633
    ///         // Configure it so that we have the minimum cache capacity
3634
    ///         // possible. And that if any clearings occur, the search quits.
3635
    ///         DFA::config()
3636
    ///             .skip_cache_capacity_check(true)
3637
    ///             .cache_capacity(0)
3638
    ///             .minimum_cache_clear_count(Some(0)),
3639
    ///     )
3640
    ///     .build(pattern)?;
3641
    /// let mut cache = dfa.create_cache();
3642
    ///
3643
    /// // Our search will give up before reaching the end!
3644
    /// let haystack = "a".repeat(101).into_bytes();
3645
    /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3646
    /// assert!(matches!(
3647
    ///     *result.unwrap_err().kind(),
3648
    ///     MatchErrorKind::GaveUp { .. },
3649
    /// ));
3650
    ///
3651
    /// // Now that we know the cache is full, if we search a haystack that we
3652
    /// // know will require creating at least one new state, it should not
3653
    /// // be able to make much progress.
3654
    /// let haystack = "β".repeat(101).into_bytes();
3655
    /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3656
    /// assert!(matches!(
3657
    ///     *result.unwrap_err().kind(),
3658
    ///     MatchErrorKind::GaveUp { .. },
3659
    /// ));
3660
    ///
3661
    /// // If we reset the cache, then we should be able to create more states
3662
    /// // and make more progress with searching for betas.
3663
    /// cache.reset(&dfa);
3664
    /// let haystack = "β".repeat(101).into_bytes();
3665
    /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3666
    /// assert!(matches!(
3667
    ///     *result.unwrap_err().kind(),
3668
    ///     MatchErrorKind::GaveUp { .. },
3669
    /// ));
3670
    ///
3671
    /// // ... switching back to ASCII still makes progress since it just needs
3672
    /// // to set transitions on existing states!
3673
    /// let haystack = "a".repeat(101).into_bytes();
3674
    /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3675
    /// assert!(matches!(
3676
    ///     *result.unwrap_err().kind(),
3677
    ///     MatchErrorKind::GaveUp { .. },
3678
    /// ));
3679
    ///
3680
    /// # Ok::<(), Box<dyn std::error::Error>>(())
3681
    /// ```
3682
0
    pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
3683
0
        self.minimum_cache_clear_count = Some(min);
3684
0
        self
3685
0
    }
3686
3687
    /// Configure a lazy DFA search to quit only when its efficiency drops
3688
    /// below the given minimum.
3689
    ///
3690
    /// The efficiency of the cache is determined by the number of DFA states
3691
    /// compiled per byte of haystack searched. For example, if the efficiency
3692
    /// is 2, then it means the lazy DFA is creating a new DFA state after
3693
    /// searching approximately 2 bytes in a haystack. Generally speaking, 2
3694
    /// is quite bad and it's likely that even a slower regex engine like the
3695
    /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster.
3696
    ///
3697
    /// This has no effect if [`Config::minimum_cache_clear_count`] is not set.
3698
    /// Namely, this option only kicks in when the cache has been cleared more
3699
    /// than the minimum number. If no minimum is set, then the cache is simply
3700
    /// cleared whenever it fills up and it is impossible for the lazy DFA to
3701
    /// quit due to ineffective use of the cache.
3702
    ///
3703
    /// In general, if one is setting [`Config::minimum_cache_clear_count`],
3704
    /// then one should probably also set this knob as well. The reason is
3705
    /// that the absolute number of times the cache is cleared is generally
3706
    /// not a great predictor of efficiency. For example, if a new DFA state
3707
    /// is created for every 1,000 bytes searched, then it wouldn't be hard
3708
    /// for the cache to get cleared more than `N` times and then cause the
3709
    /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite
3710
    /// good from a performance perspective, and it's likely that the lazy
3711
    /// DFA should continue searching, even if it requires clearing the cache
3712
    /// occasionally.
3713
    ///
3714
    /// Finally, note that if you're implementing your own lazy DFA search
3715
    /// routine and also want this efficiency check to work correctly, then
3716
    /// you'll need to use the following routines to record search progress:
3717
    ///
3718
    /// * Call [`Cache::search_start`] at the beginning of every search.
3719
    /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is
3720
    /// called.
3721
    /// * Call [`Cache::search_finish`] before completing a search. (It is
3722
    /// not strictly necessary to call this when an error is returned, as
3723
    /// `Cache::search_start` will automatically finish the previous search
3724
    /// for you. But calling it where possible before returning helps improve
3725
    /// the accuracy of how many bytes have actually been searched.)
3726
0
    pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config {
3727
0
        self.minimum_bytes_per_state = Some(min);
3728
0
        self
3729
0
    }
3730
3731
    /// Returns the match semantics set in this configuration.
3732
0
    pub fn get_match_kind(&self) -> MatchKind {
3733
0
        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
3734
0
    }
3735
3736
    /// Returns the prefilter set in this configuration, if one at all.
3737
0
    pub fn get_prefilter(&self) -> Option<&Prefilter> {
3738
0
        self.pre.as_ref().unwrap_or(&None).as_ref()
3739
0
    }
3740
3741
    /// Returns whether this configuration has enabled anchored starting states
3742
    /// for every pattern in the DFA.
3743
0
    pub fn get_starts_for_each_pattern(&self) -> bool {
3744
0
        self.starts_for_each_pattern.unwrap_or(false)
3745
0
    }
3746
3747
    /// Returns whether this configuration has enabled byte classes or not.
3748
    /// This is typically a debugging oriented option, as disabling it confers
3749
    /// no speed benefit.
3750
0
    pub fn get_byte_classes(&self) -> bool {
3751
0
        self.byte_classes.unwrap_or(true)
3752
0
    }
3753
3754
    /// Returns whether this configuration has enabled heuristic Unicode word
3755
    /// boundary support. When enabled, it is possible for a search to return
3756
    /// an error.
3757
0
    pub fn get_unicode_word_boundary(&self) -> bool {
3758
0
        self.unicode_word_boundary.unwrap_or(false)
3759
0
    }
3760
3761
    /// Returns whether this configuration will instruct the lazy DFA to enter
3762
    /// a quit state whenever the given byte is seen during a search. When at
3763
    /// least one byte has this enabled, it is possible for a search to return
3764
    /// an error.
3765
0
    pub fn get_quit(&self, byte: u8) -> bool {
3766
0
        self.quitset.map_or(false, |q| q.contains(byte))
3767
0
    }
3768
3769
    /// Returns whether this configuration will instruct the lazy DFA to
3770
    /// "specialize" start states. When enabled, the lazy DFA will tag start
3771
    /// states so that search routines using the lazy DFA can detect when
3772
    /// it's in a start state and do some kind of optimization (like run a
3773
    /// prefilter).
3774
0
    pub fn get_specialize_start_states(&self) -> bool {
3775
0
        self.specialize_start_states.unwrap_or(false)
3776
0
    }
3777
3778
    /// Returns the cache capacity set on this configuration.
3779
0
    pub fn get_cache_capacity(&self) -> usize {
3780
0
        self.cache_capacity.unwrap_or(2 * (1 << 20))
3781
0
    }
3782
3783
    /// Returns whether the cache capacity check should be skipped.
3784
0
    pub fn get_skip_cache_capacity_check(&self) -> bool {
3785
0
        self.skip_cache_capacity_check.unwrap_or(false)
3786
0
    }
3787
3788
    /// Returns, if set, the minimum number of times the cache must be cleared
3789
    /// before a lazy DFA search can give up. When no minimum is set, then a
3790
    /// search will never quit and will always clear the cache whenever it
3791
    /// fills up.
3792
0
    pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
3793
0
        self.minimum_cache_clear_count.unwrap_or(None)
3794
0
    }
3795
3796
    /// Returns, if set, the minimum number of bytes per state that need to be
3797
    /// processed in order for the lazy DFA to keep going. If the minimum falls
3798
    /// below this number (and the cache has been cleared a minimum number of
3799
    /// times), then the lazy DFA will return a "gave up" error.
3800
0
    pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
3801
0
        self.minimum_bytes_per_state.unwrap_or(None)
3802
0
    }
3803
3804
    /// Returns the minimum lazy DFA cache capacity required for the given NFA.
3805
    ///
3806
    /// The cache capacity required for a particular NFA may change without
3807
    /// notice. Callers should not rely on it being stable.
3808
    ///
3809
    /// This is useful for informational purposes, but can also be useful for
3810
    /// other reasons. For example, if one wants to check the minimum cache
3811
    /// capacity themselves or if one wants to set the capacity based on the
3812
    /// minimum.
3813
    ///
3814
    /// This may return an error if this configuration does not support all of
3815
    /// the instructions used in the given NFA. For example, if the NFA has a
3816
    /// Unicode word boundary but this configuration does not enable heuristic
3817
    /// support for Unicode word boundaries.
3818
0
    pub fn get_minimum_cache_capacity(
3819
0
        &self,
3820
0
        nfa: &thompson::NFA,
3821
0
    ) -> Result<usize, BuildError> {
3822
0
        let quitset = self.quit_set_from_nfa(nfa)?;
3823
0
        let classes = self.byte_classes_from_nfa(nfa, &quitset);
3824
0
        let starts = self.get_starts_for_each_pattern();
3825
0
        Ok(minimum_cache_capacity(nfa, &classes, starts))
3826
0
    }
3827
3828
    /// Returns the byte class map used during search from the given NFA.
3829
    ///
3830
    /// If byte classes are disabled on this configuration, then a map is
3831
    /// returned that puts each byte in its own equivalent class.
3832
0
    fn byte_classes_from_nfa(
3833
0
        &self,
3834
0
        nfa: &thompson::NFA,
3835
0
        quit: &ByteSet,
3836
0
    ) -> ByteClasses {
3837
0
        if !self.get_byte_classes() {
3838
            // The lazy DFA will always use the equivalence class map, but
3839
            // enabling this option is useful for debugging. Namely, this will
3840
            // cause all transitions to be defined over their actual bytes
3841
            // instead of an opaque equivalence class identifier. The former is
3842
            // much easier to grok as a human.
3843
0
            ByteClasses::singletons()
3844
        } else {
3845
0
            let mut set = nfa.byte_class_set().clone();
3846
            // It is important to distinguish any "quit" bytes from all other
3847
            // bytes. Otherwise, a non-quit byte may end up in the same class
3848
            // as a quit byte, and thus cause the DFA stop when it shouldn't.
3849
            //
3850
            // Test case:
3851
            //
3852
            //   regex-cli find match hybrid --unicode-word-boundary \
3853
            //     -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
3854
0
            if !quit.is_empty() {
3855
0
                set.add_set(&quit);
3856
0
            }
3857
0
            set.byte_classes()
3858
        }
3859
0
    }
3860
3861
    /// Return the quit set for this configuration and the given NFA.
3862
    ///
3863
    /// This may return an error if the NFA is incompatible with this
3864
    /// configuration's quit set. For example, if the NFA has a Unicode word
3865
    /// boundary and the quit set doesn't include non-ASCII bytes.
3866
0
    fn quit_set_from_nfa(
3867
0
        &self,
3868
0
        nfa: &thompson::NFA,
3869
0
    ) -> Result<ByteSet, BuildError> {
3870
0
        let mut quit = self.quitset.unwrap_or(ByteSet::empty());
3871
0
        if nfa.look_set_any().contains_word_unicode() {
3872
0
            if self.get_unicode_word_boundary() {
3873
0
                for b in 0x80..=0xFF {
3874
0
                    quit.add(b);
3875
0
                }
3876
            } else {
3877
                // If heuristic support for Unicode word boundaries wasn't
3878
                // enabled, then we can still check if our quit set is correct.
3879
                // If the caller set their quit bytes in a way that causes the
3880
                // DFA to quit on at least all non-ASCII bytes, then that's all
3881
                // we need for heuristic support to work.
3882
0
                if !quit.contains_range(0x80, 0xFF) {
3883
0
                    return Err(
3884
0
                        BuildError::unsupported_dfa_word_boundary_unicode(),
3885
0
                    );
3886
0
                }
3887
            }
3888
0
        }
3889
0
        Ok(quit)
3890
0
    }
3891
3892
    /// Overwrite the default configuration such that the options in `o` are
3893
    /// always used. If an option in `o` is not set, then the corresponding
3894
    /// option in `self` is used. If it's not set in `self` either, then it
3895
    /// remains not set.
3896
0
    fn overwrite(&self, o: Config) -> Config {
3897
        Config {
3898
0
            match_kind: o.match_kind.or(self.match_kind),
3899
0
            pre: o.pre.or_else(|| self.pre.clone()),
3900
0
            starts_for_each_pattern: o
3901
0
                .starts_for_each_pattern
3902
0
                .or(self.starts_for_each_pattern),
3903
0
            byte_classes: o.byte_classes.or(self.byte_classes),
3904
0
            unicode_word_boundary: o
3905
0
                .unicode_word_boundary
3906
0
                .or(self.unicode_word_boundary),
3907
0
            quitset: o.quitset.or(self.quitset),
3908
0
            specialize_start_states: o
3909
0
                .specialize_start_states
3910
0
                .or(self.specialize_start_states),
3911
0
            cache_capacity: o.cache_capacity.or(self.cache_capacity),
3912
0
            skip_cache_capacity_check: o
3913
0
                .skip_cache_capacity_check
3914
0
                .or(self.skip_cache_capacity_check),
3915
0
            minimum_cache_clear_count: o
3916
0
                .minimum_cache_clear_count
3917
0
                .or(self.minimum_cache_clear_count),
3918
0
            minimum_bytes_per_state: o
3919
0
                .minimum_bytes_per_state
3920
0
                .or(self.minimum_bytes_per_state),
3921
        }
3922
0
    }
3923
}
3924
3925
/// A builder for constructing a lazy deterministic finite automaton from
3926
/// regular expressions.
3927
///
3928
/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
3929
/// advantage of the former is that it often lets you avoid importing the
3930
/// `Builder` type directly.
3931
///
3932
/// This builder provides two main things:
3933
///
3934
/// 1. It provides a few different `build` routines for actually constructing
3935
/// a DFA from different kinds of inputs. The most convenient is
3936
/// [`Builder::build`], which builds a DFA directly from a pattern string. The
3937
/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
3938
/// from an NFA.
3939
/// 2. The builder permits configuring a number of things.
3940
/// [`Builder::configure`] is used with [`Config`] to configure aspects of
3941
/// the DFA and the construction process itself. [`Builder::syntax`] and
3942
/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
3943
/// construction, respectively. The syntax and thompson configurations only
3944
/// apply when building from a pattern string.
3945
///
3946
/// This builder always constructs a *single* lazy DFA. As such, this builder
3947
/// can only be used to construct regexes that either detect the presence
3948
/// of a match or find the end location of a match. A single DFA cannot
3949
/// produce both the start and end of a match. For that information, use a
3950
/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
3951
/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
3952
/// to use a DFA directly is if the end location of a match is enough for your
3953
/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
3954
/// since a second reverse DFA is needed to find the start of a match.
3955
///
3956
/// # Example
3957
///
3958
/// This example shows how to build a lazy DFA that uses a tiny cache capacity
3959
/// and completely disables Unicode. That is:
3960
///
3961
/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
3962
///   and `\b` are ASCII-only while `.` matches any byte except for `\n`
3963
///   (instead of any UTF-8 encoding of a Unicode scalar value except for
3964
///   `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
3965
/// * The pattern itself is permitted to match invalid UTF-8. For example,
3966
///   things like `[^a]` that match any byte except for `a` are permitted.
3967
///
3968
/// ```
3969
/// use regex_automata::{
3970
///     hybrid::dfa::DFA,
3971
///     nfa::thompson,
3972
///     util::syntax,
3973
///     HalfMatch, Input,
3974
/// };
3975
///
3976
/// let dfa = DFA::builder()
3977
///     .configure(DFA::config().cache_capacity(5_000))
3978
///     .thompson(thompson::Config::new().utf8(false))
3979
///     .syntax(syntax::Config::new().unicode(false).utf8(false))
3980
///     .build(r"foo[^b]ar.*")?;
3981
/// let mut cache = dfa.create_cache();
3982
///
3983
/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
3984
/// let expected = Some(HalfMatch::must(0, 10));
3985
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3986
/// assert_eq!(expected, got);
3987
///
3988
/// # Ok::<(), Box<dyn std::error::Error>>(())
3989
/// ```
3990
#[derive(Clone, Debug)]
3991
pub struct Builder {
3992
    config: Config,
3993
    #[cfg(feature = "syntax")]
3994
    thompson: thompson::Compiler,
3995
}
3996
3997
impl Builder {
3998
    /// Create a new lazy DFA builder with the default configuration.
3999
0
    pub fn new() -> Builder {
4000
0
        Builder {
4001
0
            config: Config::default(),
4002
0
            #[cfg(feature = "syntax")]
4003
0
            thompson: thompson::Compiler::new(),
4004
0
        }
4005
0
    }
4006
4007
    /// Build a lazy DFA from the given pattern.
4008
    ///
4009
    /// If there was a problem parsing or compiling the pattern, then an error
4010
    /// is returned.
4011
    #[cfg(feature = "syntax")]
4012
0
    pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
4013
0
        self.build_many(&[pattern])
4014
0
    }
4015
4016
    /// Build a lazy DFA from the given patterns.
4017
    ///
4018
    /// When matches are returned, the pattern ID corresponds to the index of
4019
    /// the pattern in the slice given.
4020
    #[cfg(feature = "syntax")]
4021
0
    pub fn build_many<P: AsRef<str>>(
4022
0
        &self,
4023
0
        patterns: &[P],
4024
0
    ) -> Result<DFA, BuildError> {
4025
0
        let nfa = self
4026
0
            .thompson
4027
0
            .clone()
4028
0
            // We can always forcefully disable captures because DFAs do not
4029
0
            // support them.
4030
0
            .configure(
4031
0
                thompson::Config::new()
4032
0
                    .which_captures(thompson::WhichCaptures::None),
4033
0
            )
4034
0
            .build_many(patterns)
4035
0
            .map_err(BuildError::nfa)?;
4036
0
        self.build_from_nfa(nfa)
4037
0
    }
4038
4039
    /// Build a DFA from the given NFA.
4040
    ///
4041
    /// Note that this requires owning a `thompson::NFA`. While this may force
4042
    /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs
4043
    /// are defined internally to support shared ownership such that cloning is
4044
    /// very cheap.
4045
    ///
4046
    /// # Example
4047
    ///
4048
    /// This example shows how to build a lazy DFA if you already have an NFA
4049
    /// in hand.
4050
    ///
4051
    /// ```
4052
    /// use regex_automata::{
4053
    ///     hybrid::dfa::DFA,
4054
    ///     nfa::thompson,
4055
    ///     HalfMatch, Input,
4056
    /// };
4057
    ///
4058
    /// let haystack = "foo123bar";
4059
    ///
4060
    /// // This shows how to set non-default options for building an NFA.
4061
    /// let nfa = thompson::Compiler::new()
4062
    ///     .configure(thompson::Config::new().shrink(true))
4063
    ///     .build(r"[0-9]+")?;
4064
    /// let dfa = DFA::builder().build_from_nfa(nfa)?;
4065
    /// let mut cache = dfa.create_cache();
4066
    /// let expected = Some(HalfMatch::must(0, 6));
4067
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
4068
    /// assert_eq!(expected, got);
4069
    ///
4070
    /// # Ok::<(), Box<dyn std::error::Error>>(())
4071
    /// ```
4072
0
    pub fn build_from_nfa(
4073
0
        &self,
4074
0
        nfa: thompson::NFA,
4075
0
    ) -> Result<DFA, BuildError> {
4076
0
        let quitset = self.config.quit_set_from_nfa(&nfa)?;
4077
0
        let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
4078
        // Check that we can fit at least a few states into our cache,
4079
        // otherwise it's pretty senseless to use the lazy DFA. This does have
4080
        // a possible failure mode though. This assumes the maximum size of a
4081
        // state in powerset space (so, the total number of NFA states), which
4082
        // may never actually materialize, and could be quite a bit larger
4083
        // than the actual biggest state. If this turns out to be a problem,
4084
        // we could expose a knob that disables this check. But if so, we have
4085
        // to be careful not to panic in other areas of the code (the cache
4086
        // clearing and init code) that tend to assume some minimum useful
4087
        // cache capacity.
4088
0
        let min_cache = minimum_cache_capacity(
4089
0
            &nfa,
4090
0
            &classes,
4091
0
            self.config.get_starts_for_each_pattern(),
4092
        );
4093
0
        let mut cache_capacity = self.config.get_cache_capacity();
4094
0
        if cache_capacity < min_cache {
4095
            // When the caller has asked us to skip the cache capacity check,
4096
            // then we simply force the cache capacity to its minimum amount
4097
            // and mush on.
4098
0
            if self.config.get_skip_cache_capacity_check() {
4099
0
                debug!(
4100
0
                    "given capacity ({cache_capacity}) is too small, \
4101
0
                     since skip_cache_capacity_check is enabled, \
4102
0
                     setting cache capacity to minimum ({min_cache})",
4103
0
                );
4104
0
                cache_capacity = min_cache;
4105
0
            } else {
4106
0
                return Err(BuildError::insufficient_cache_capacity(
4107
0
                    min_cache,
4108
0
                    cache_capacity,
4109
0
                ));
4110
            }
4111
0
        }
4112
        // We also need to check that we can fit at least some small number
4113
        // of states in our state ID space. This is unlikely to trigger in
4114
        // >=32-bit systems, but 16-bit systems have a pretty small state ID
4115
        // space since a number of bits are used up as sentinels.
4116
0
        if let Err(err) = minimum_lazy_state_id(&classes) {
4117
0
            return Err(BuildError::insufficient_state_id_capacity(err));
4118
0
        }
4119
0
        let stride2 = classes.stride2();
4120
0
        let start_map = StartByteMap::new(nfa.look_matcher());
4121
0
        Ok(DFA {
4122
0
            config: self.config.clone(),
4123
0
            nfa,
4124
0
            stride2,
4125
0
            start_map,
4126
0
            classes,
4127
0
            quitset,
4128
0
            cache_capacity,
4129
0
        })
4130
0
    }
4131
4132
    /// Apply the given lazy DFA configuration options to this builder.
4133
0
    pub fn configure(&mut self, config: Config) -> &mut Builder {
4134
0
        self.config = self.config.overwrite(config);
4135
0
        self
4136
0
    }
4137
4138
    /// Set the syntax configuration for this builder using
4139
    /// [`syntax::Config`](crate::util::syntax::Config).
4140
    ///
4141
    /// This permits setting things like case insensitivity, Unicode and multi
4142
    /// line mode.
4143
    ///
4144
    /// These settings only apply when constructing a lazy DFA directly from a
4145
    /// pattern.
4146
    #[cfg(feature = "syntax")]
4147
0
    pub fn syntax(
4148
0
        &mut self,
4149
0
        config: crate::util::syntax::Config,
4150
0
    ) -> &mut Builder {
4151
0
        self.thompson.syntax(config);
4152
0
        self
4153
0
    }
4154
4155
    /// Set the Thompson NFA configuration for this builder using
4156
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
4157
    ///
4158
    /// This permits setting things like whether the DFA should match the regex
4159
    /// in reverse or if additional time should be spent shrinking the size of
4160
    /// the NFA.
4161
    ///
4162
    /// These settings only apply when constructing a DFA directly from a
4163
    /// pattern.
4164
    #[cfg(feature = "syntax")]
4165
0
    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
4166
0
        self.thompson.configure(config);
4167
0
        self
4168
0
    }
4169
}
4170
4171
/// Represents the current state of an overlapping search.
4172
///
4173
/// This is used for overlapping searches since they need to know something
4174
/// about the previous search. For example, when multiple patterns match at the
4175
/// same position, this state tracks the last reported pattern so that the next
4176
/// search knows whether to report another matching pattern or continue with
4177
/// the search at the next position. Additionally, it also tracks which state
4178
/// the last search call terminated in.
4179
///
4180
/// This type provides little introspection capabilities. The only thing a
4181
/// caller can do is construct it and pass it around to permit search routines
4182
/// to use it to track state, and also ask whether a match has been found.
4183
///
4184
/// Callers should always provide a fresh state constructed via
4185
/// [`OverlappingState::start`] when starting a new search. Reusing state from
4186
/// a previous search may result in incorrect results.
4187
#[derive(Clone, Debug, Eq, PartialEq)]
4188
pub struct OverlappingState {
4189
    /// The match reported by the most recent overlapping search to use this
4190
    /// state.
4191
    ///
4192
    /// If a search does not find any matches, then it is expected to clear
4193
    /// this value.
4194
    pub(crate) mat: Option<HalfMatch>,
4195
    /// The state ID of the state at which the search was in when the call
4196
    /// terminated. When this is a match state, `last_match` must be set to a
4197
    /// non-None value.
4198
    ///
4199
    /// A `None` value indicates the start state of the corresponding
4200
    /// automaton. We cannot use the actual ID, since any one automaton may
4201
    /// have many start states, and which one is in use depends on several
4202
    /// search-time factors.
4203
    pub(crate) id: Option<LazyStateID>,
4204
    /// The position of the search.
4205
    ///
4206
    /// When `id` is None (i.e., we are starting a search), this is set to
4207
    /// the beginning of the search as given by the caller regardless of its
4208
    /// current value. Subsequent calls to an overlapping search pick up at
4209
    /// this offset.
4210
    pub(crate) at: usize,
4211
    /// The index into the matching patterns of the next match to report if the
4212
    /// current state is a match state. Note that this may be 1 greater than
4213
    /// the total number of matches to report for the current match state. (In
4214
    /// which case, no more matches should be reported at the current position
4215
    /// and the search should advance to the next position.)
4216
    pub(crate) next_match_index: Option<usize>,
4217
    /// This is set to true when a reverse overlapping search has entered its
4218
    /// EOI transitions.
4219
    ///
4220
    /// This isn't used in a forward search because it knows to stop once the
4221
    /// position exceeds the end of the search range. In a reverse search,
4222
    /// since we use unsigned offsets, we don't "know" once we've gone past
4223
    /// `0`. So the only way to detect it is with this extra flag. The reverse
4224
    /// overlapping search knows to terminate specifically after it has
4225
    /// reported all matches after following the EOI transition.
4226
    pub(crate) rev_eoi: bool,
4227
}
4228
4229
impl OverlappingState {
4230
    /// Create a new overlapping state that begins at the start state of any
4231
    /// automaton.
4232
0
    pub fn start() -> OverlappingState {
4233
0
        OverlappingState {
4234
0
            mat: None,
4235
0
            id: None,
4236
0
            at: 0,
4237
0
            next_match_index: None,
4238
0
            rev_eoi: false,
4239
0
        }
4240
0
    }
4241
4242
    /// Return the match result of the most recent search to execute with this
4243
    /// state.
4244
    ///
4245
    /// A searches will clear this result automatically, such that if no
4246
    /// match is found, this will correctly report `None`.
4247
0
    pub fn get_match(&self) -> Option<HalfMatch> {
4248
0
        self.mat
4249
0
    }
4250
}
4251
4252
/// Runs the given overlapping `search` function (forwards or backwards) until
4253
/// a match is found whose offset does not split a codepoint.
4254
///
4255
/// This is *not* always correct to call. It should only be called when the
4256
/// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width
4257
/// matches. Calling this when both of those things aren't true might result
4258
/// in legitimate matches getting skipped.
4259
#[cold]
4260
#[inline(never)]
4261
0
fn skip_empty_utf8_splits_overlapping<F>(
4262
0
    input: &Input<'_>,
4263
0
    state: &mut OverlappingState,
4264
0
    mut search: F,
4265
0
) -> Result<(), MatchError>
4266
0
where
4267
0
    F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
4268
{
4269
    // Note that this routine works for forwards and reverse searches
4270
    // even though there's no code here to handle those cases. That's
4271
    // because overlapping searches drive themselves to completion via
4272
    // `OverlappingState`. So all we have to do is push it until no matches are
4273
    // found.
4274
4275
0
    let mut hm = match state.get_match() {
4276
0
        None => return Ok(()),
4277
0
        Some(hm) => hm,
4278
    };
4279
0
    if input.get_anchored().is_anchored() {
4280
0
        if !input.is_char_boundary(hm.offset()) {
4281
0
            state.mat = None;
4282
0
        }
4283
0
        return Ok(());
4284
0
    }
4285
0
    while !input.is_char_boundary(hm.offset()) {
4286
0
        search(input, state)?;
4287
0
        hm = match state.get_match() {
4288
0
            None => return Ok(()),
4289
0
            Some(hm) => hm,
4290
        };
4291
    }
4292
0
    Ok(())
4293
0
}
4294
4295
/// Based on the minimum number of states required for a useful lazy DFA cache,
4296
/// this returns the minimum lazy state ID that must be representable.
4297
///
4298
/// It's not likely for this to have any impact 32-bit systems (or higher), but
4299
/// on 16-bit systems, the lazy state ID space is quite constrained and thus
4300
/// may be insufficient if our MIN_STATES value is (for some reason) too high.
4301
0
fn minimum_lazy_state_id(
4302
0
    classes: &ByteClasses,
4303
0
) -> Result<LazyStateID, LazyStateIDError> {
4304
0
    let stride = 1 << classes.stride2();
4305
0
    let min_state_index = MIN_STATES.checked_sub(1).unwrap();
4306
0
    LazyStateID::new(min_state_index * stride)
4307
0
}
4308
4309
/// Based on the minimum number of states required for a useful lazy DFA cache,
4310
/// this returns a heuristic minimum number of bytes of heap space required.
4311
///
4312
/// This is a "heuristic" because the minimum it returns is likely bigger than
4313
/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
4314
/// the maximum number of NFA states (all of them). This is likely bigger
4315
/// than what is required in practice. Computing the true minimum effectively
4316
/// requires determinization, which is probably too much work to do for a
4317
/// simple check like this.
4318
///
4319
/// One of the issues with this approach IMO is that it requires that this
4320
/// be in sync with the calculation above for computing how much heap memory
4321
/// the DFA cache uses. If we get it wrong, it's possible for example for the
4322
/// minimum to be smaller than the computed heap memory, and thus, it may be
4323
/// the case that we can't add the required minimum number of states. That in
4324
/// turn will make lazy DFA panic because we assume that we can add at least a
4325
/// minimum number of states.
4326
///
4327
/// Another approach would be to always allow the minimum number of states to
4328
/// be added to the lazy DFA cache, even if it exceeds the configured cache
4329
/// limit. This does mean that the limit isn't really a limit in all cases,
4330
/// which is unfortunate. But it does at least guarantee that the lazy DFA can
4331
/// always make progress, even if it is slow. (This approach is very similar to
4332
/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't
4333
/// rely on cache size calculation. Instead, it would just always permit a
4334
/// minimum number of states to be added.)
4335
0
fn minimum_cache_capacity(
4336
0
    nfa: &thompson::NFA,
4337
0
    classes: &ByteClasses,
4338
0
    starts_for_each_pattern: bool,
4339
0
) -> usize {
4340
    const ID_SIZE: usize = size_of::<LazyStateID>();
4341
    const STATE_SIZE: usize = size_of::<State>();
4342
4343
0
    let stride = 1 << classes.stride2();
4344
0
    let states_len = nfa.states().len();
4345
0
    let sparses = 2 * states_len * NFAStateID::SIZE;
4346
0
    let trans = MIN_STATES * stride * ID_SIZE;
4347
4348
0
    let mut starts = Start::len() * ID_SIZE;
4349
0
    if starts_for_each_pattern {
4350
0
        starts += (Start::len() * nfa.pattern_len()) * ID_SIZE;
4351
0
    }
4352
4353
    // The min number of states HAS to be at least 4: we have 3 sentinel states
4354
    // and then we need space for one more when we save a state after clearing
4355
    // the cache. We also need space for one more, otherwise we get stuck in a
4356
    // loop where we try to add a 5th state, which gets rejected, which clears
4357
    // the cache, which adds back a saved state (4th total state) which then
4358
    // tries to add the 5th state again.
4359
0
    assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5");
4360
    // The minimum number of non-sentinel states. We consider this separately
4361
    // because sentinel states are much smaller in that they contain no NFA
4362
    // states. Given our aggressive calculation here, it's worth being more
4363
    // precise with the number of states we need.
4364
0
    let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap();
4365
4366
    // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of
4367
    // patterns, followed by 32-bit encodings of patterns and then delta
4368
    // varint encodings of NFA state IDs. We use the worst case (which isn't
4369
    // technically possible) of 5 bytes for each NFA state ID.
4370
    //
4371
    // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
4372
    // unknown, dead and quit states. Those states have a known size and it is
4373
    // small.
4374
0
    let dead_state_size = State::dead().memory_usage();
4375
0
    let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5);
4376
0
    let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size))
4377
0
        + (non_sentinel * (STATE_SIZE + max_state_size));
4378
    // NOTE: We don't double count heap memory used by State for this map since
4379
    // we use reference counting to avoid doubling memory usage. (This tends to
4380
    // be where most memory is allocated in the cache.)
4381
0
    let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE);
4382
0
    let stack = states_len * NFAStateID::SIZE;
4383
0
    let scratch_state_builder = max_state_size;
4384
4385
0
    trans
4386
0
        + starts
4387
0
        + states
4388
0
        + states_to_sid
4389
0
        + sparses
4390
0
        + stack
4391
0
        + scratch_state_builder
4392
0
}
4393
4394
#[cfg(all(test, feature = "syntax"))]
4395
mod tests {
4396
    use super::*;
4397
4398
    // Tests that we handle heuristic Unicode word boundary support in reverse
4399
    // DFAs in the specific case of contextual searches.
4400
    //
4401
    // I wrote this test when I discovered a bug in how heuristic word
4402
    // boundaries were handled. Namely, that the starting state selection
4403
    // didn't consider the DFA's quit byte set when looking at the byte
4404
    // immediately before the start of the search (or immediately after the
4405
    // end of the search in the case of a reverse search). As a result, it was
4406
    // possible for '\bfoo\b' to match 'β123' because the trailing \xB2 byte
4407
    // in the 'β' codepoint would be treated as a non-word character. But of
4408
    // course, this search should trigger the DFA to quit, since there is a
4409
    // non-ASCII byte in consideration.
4410
    //
4411
    // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
4412
    // if it wasn't empty. The forward case is tested in the doc test for the
4413
    // Config::unicode_word_boundary API. We test the reverse case here, which
4414
    // is sufficiently niche that it doesn't really belong in a doc test.
4415
    #[test]
4416
    fn heuristic_unicode_reverse() {
4417
        let dfa = DFA::builder()
4418
            .configure(DFA::config().unicode_word_boundary(true))
4419
            .thompson(thompson::Config::new().reverse(true))
4420
            .build(r"\b[0-9]+\b")
4421
            .unwrap();
4422
        let mut cache = dfa.create_cache();
4423
4424
        let input = Input::new("β123").range(2..);
4425
        let expected = MatchError::quit(0xB2, 1);
4426
        let got = dfa.try_search_rev(&mut cache, &input);
4427
        assert_eq!(Err(expected), got);
4428
4429
        let input = Input::new("123β").range(..3);
4430
        let expected = MatchError::quit(0xCE, 3);
4431
        let got = dfa.try_search_rev(&mut cache, &input);
4432
        assert_eq!(Err(expected), got);
4433
    }
4434
}