Coverage Report

Created: 2025-07-18 06:52

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/dfa/onepass.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
A DFA that can return spans for matching capturing groups.
3
4
This module is the home of a [one-pass DFA](DFA).
5
6
This module also contains a [`Builder`] and a [`Config`] for building and
7
configuring a one-pass DFA.
8
*/
9
10
// A note on naming and credit:
11
//
12
// As far as I know, Russ Cox came up with the practical vision and
13
// implementation of a "one-pass regex engine." He mentions and describes it
14
// briefly in the third article of his regexp article series:
15
// https://swtch.com/~rsc/regexp/regexp3.html
16
//
17
// Cox's implementation is in RE2, and the implementation below is most
18
// heavily inspired by RE2's. The key thing they have in common is that
19
// their transitions are defined over an alphabet of bytes. In contrast,
20
// Go's regex engine also has a one-pass engine, but its transitions are
21
// more firmly rooted on Unicode codepoints. The ideas are the same, but the
22
// implementations are different.
23
//
24
// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA."
25
// They're both true in their own ways:
26
//
27
// * The "one-pass" criterion is generally a property of the NFA itself. In
28
// particular, it is said that an NFA is one-pass if, after each byte of input
29
// during a search, there is at most one "VM thread" remaining to take for the
30
// next byte of input. That is, there is never any ambiguity as to the path to
31
// take through the NFA during a search.
32
//
33
// * On the other hand, once a one-pass NFA has its representation converted
34
// to something where a constant number of instructions is used for each byte
35
// of input, the implementation looks a lot more like a DFA. It's technically
36
// more powerful than a DFA since it has side effects (storing offsets inside
37
// of slots activated by a transition), but it is far closer to a DFA than an
38
// NFA simulation.
39
//
40
// Thus, in this crate, we call it a one-pass DFA.
41
42
use alloc::{vec, vec::Vec};
43
44
use crate::{
45
    dfa::{remapper::Remapper, DEAD},
46
    nfa::thompson::{self, NFA},
47
    util::{
48
        alphabet::ByteClasses,
49
        captures::Captures,
50
        escape::DebugByte,
51
        int::{Usize, U32, U64, U8},
52
        look::{Look, LookSet, UnicodeWordBoundaryError},
53
        primitives::{NonMaxUsize, PatternID, StateID},
54
        search::{Anchored, Input, Match, MatchError, MatchKind, Span},
55
        sparse_set::SparseSet,
56
    },
57
};
58
59
/// The configuration used for building a [one-pass DFA](DFA).
60
///
61
/// A one-pass DFA configuration is a simple data object that is typically used
62
/// with [`Builder::configure`]. It can be cheaply cloned.
63
///
64
/// A default configuration can be created either with `Config::new`, or
65
/// perhaps more conveniently, with [`DFA::config`].
66
#[derive(Clone, Debug, Default)]
67
pub struct Config {
68
    match_kind: Option<MatchKind>,
69
    starts_for_each_pattern: Option<bool>,
70
    byte_classes: Option<bool>,
71
    size_limit: Option<Option<usize>>,
72
}
73
74
impl Config {
75
    /// Return a new default one-pass DFA configuration.
76
0
    pub fn new() -> Config {
77
0
        Config::default()
78
0
    }
79
80
    /// Set the desired match semantics.
81
    ///
82
    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
83
    /// match semantics of Perl-like regex engines. That is, when multiple
84
    /// patterns would match at the same leftmost position, the pattern that
85
    /// appears first in the concrete syntax is chosen.
86
    ///
87
    /// Currently, the only other kind of match semantics supported is
88
    /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
89
    /// where all possible matches are visited.
90
    ///
91
    /// When it comes to the one-pass DFA, it is rarer for preference order and
92
    /// "longest match" to actually disagree. Since if they did disagree, then
93
    /// the regex typically isn't one-pass. For example, searching `Samwise`
94
    /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and
95
    /// `Samwise` for "longest match" or "all" matching. However, this regex is
96
    /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)`
97
    /// is one-pass and `Sam|Samwise` may be optimized to it.
98
    ///
99
    /// The other main difference is that "all" match semantics don't support
100
    /// non-greedy matches. "All" match semantics always try to match as much
101
    /// as possible.
102
0
    pub fn match_kind(mut self, kind: MatchKind) -> Config {
103
0
        self.match_kind = Some(kind);
104
0
        self
105
0
    }
106
107
    /// Whether to compile a separate start state for each pattern in the
108
    /// one-pass DFA.
109
    ///
110
    /// When enabled, a separate **anchored** start state is added for each
111
    /// pattern in the DFA. When this start state is used, then the DFA will
112
    /// only search for matches for the pattern specified, even if there are
113
    /// other patterns in the DFA.
114
    ///
115
    /// The main downside of this option is that it can potentially increase
116
    /// the size of the DFA and/or increase the time it takes to build the DFA.
117
    ///
118
    /// You might want to enable this option when you want to both search for
119
    /// anchored matches of any pattern or to search for anchored matches of
120
    /// one particular pattern while using the same DFA. (Otherwise, you would
121
    /// need to compile a new DFA for each pattern.)
122
    ///
123
    /// By default this is disabled.
124
    ///
125
    /// # Example
126
    ///
127
    /// This example shows how to build a multi-regex and then search for
128
    /// matches for a any of the patterns or matches for a specific pattern.
129
    ///
130
    /// ```
131
    /// use regex_automata::{
132
    ///     dfa::onepass::DFA, Anchored, Input, Match, PatternID,
133
    /// };
134
    ///
135
    /// let re = DFA::builder()
136
    ///     .configure(DFA::config().starts_for_each_pattern(true))
137
    ///     .build_many(&["[a-z]+", "[0-9]+"])?;
138
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
139
    /// let haystack = "123abc";
140
    /// let input = Input::new(haystack).anchored(Anchored::Yes);
141
    ///
142
    /// // A normal multi-pattern search will show pattern 1 matches.
143
    /// re.try_search(&mut cache, &input, &mut caps)?;
144
    /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
145
    ///
146
    /// // If we only want to report pattern 0 matches, then we'll get no
147
    /// // match here.
148
    /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
149
    /// re.try_search(&mut cache, &input, &mut caps)?;
150
    /// assert_eq!(None, caps.get_match());
151
    ///
152
    /// # Ok::<(), Box<dyn std::error::Error>>(())
153
    /// ```
154
0
    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
155
0
        self.starts_for_each_pattern = Some(yes);
156
0
        self
157
0
    }
158
159
    /// Whether to attempt to shrink the size of the DFA's alphabet or not.
160
    ///
161
    /// This option is enabled by default and should never be disabled unless
162
    /// one is debugging a one-pass DFA.
163
    ///
164
    /// When enabled, the DFA will use a map from all possible bytes to their
165
    /// corresponding equivalence class. Each equivalence class represents a
166
    /// set of bytes that does not discriminate between a match and a non-match
167
    /// in the DFA. For example, the pattern `[ab]+` has at least two
168
    /// equivalence classes: a set containing `a` and `b` and a set containing
169
    /// every byte except for `a` and `b`. `a` and `b` are in the same
170
    /// equivalence class because they never discriminate between a match and a
171
    /// non-match.
172
    ///
173
    /// The advantage of this map is that the size of the transition table
174
    /// can be reduced drastically from (approximately) `#states * 256 *
175
    /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the
176
    /// number of equivalence classes (rounded up to the nearest power of 2).
177
    /// As a result, total space usage can decrease substantially. Moreover,
178
    /// since a smaller alphabet is used, DFA compilation becomes faster as
179
    /// well.
180
    ///
181
    /// **WARNING:** This is only useful for debugging DFAs. Disabling this
182
    /// does not yield any speed advantages. Namely, even when this is
183
    /// disabled, a byte class map is still used while searching. The only
184
    /// difference is that every byte will be forced into its own distinct
185
    /// equivalence class. This is useful for debugging the actual generated
186
    /// transitions because it lets one see the transitions defined on actual
187
    /// bytes instead of the equivalence classes.
188
0
    pub fn byte_classes(mut self, yes: bool) -> Config {
189
0
        self.byte_classes = Some(yes);
190
0
        self
191
0
    }
192
193
    /// Set a size limit on the total heap used by a one-pass DFA.
194
    ///
195
    /// This size limit is expressed in bytes and is applied during
196
    /// construction of a one-pass DFA. If the DFA's heap usage exceeds
197
    /// this configured limit, then construction is stopped and an error is
198
    /// returned.
199
    ///
200
    /// The default is no limit.
201
    ///
202
    /// # Example
203
    ///
204
    /// This example shows a one-pass DFA that fails to build because of
205
    /// a configured size limit. This particular example also serves as a
206
    /// cautionary tale demonstrating just how big DFAs with large Unicode
207
    /// character classes can get.
208
    ///
209
    /// ```
210
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
211
    /// use regex_automata::{dfa::onepass::DFA, Match};
212
    ///
213
    /// // 6MB isn't enough!
214
    /// DFA::builder()
215
    ///     .configure(DFA::config().size_limit(Some(6_000_000)))
216
    ///     .build(r"\w{20}")
217
    ///     .unwrap_err();
218
    ///
219
    /// // ... but 7MB probably is!
220
    /// // (Note that DFA sizes aren't necessarily stable between releases.)
221
    /// let re = DFA::builder()
222
    ///     .configure(DFA::config().size_limit(Some(7_000_000)))
223
    ///     .build(r"\w{20}")?;
224
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
225
    /// let haystack = "A".repeat(20);
226
    /// re.captures(&mut cache, &haystack, &mut caps);
227
    /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match());
228
    ///
229
    /// # Ok::<(), Box<dyn std::error::Error>>(())
230
    /// ```
231
    ///
232
    /// While one needs a little more than 3MB to represent `\w{20}`, it
233
    /// turns out that you only need a little more than 4KB to represent
234
    /// `(?-u:\w{20})`. So only use Unicode if you need it!
235
0
    pub fn size_limit(mut self, limit: Option<usize>) -> Config {
236
0
        self.size_limit = Some(limit);
237
0
        self
238
0
    }
239
240
    /// Returns the match semantics set in this configuration.
241
0
    pub fn get_match_kind(&self) -> MatchKind {
242
0
        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
243
0
    }
244
245
    /// Returns whether this configuration has enabled anchored starting states
246
    /// for every pattern in the DFA.
247
0
    pub fn get_starts_for_each_pattern(&self) -> bool {
248
0
        self.starts_for_each_pattern.unwrap_or(false)
249
0
    }
250
251
    /// Returns whether this configuration has enabled byte classes or not.
252
    /// This is typically a debugging oriented option, as disabling it confers
253
    /// no speed benefit.
254
0
    pub fn get_byte_classes(&self) -> bool {
255
0
        self.byte_classes.unwrap_or(true)
256
0
    }
257
258
    /// Returns the DFA size limit of this configuration if one was set.
259
    /// The size limit is total number of bytes on the heap that a DFA is
260
    /// permitted to use. If the DFA exceeds this limit during construction,
261
    /// then construction is stopped and an error is returned.
262
0
    pub fn get_size_limit(&self) -> Option<usize> {
263
0
        self.size_limit.unwrap_or(None)
264
0
    }
265
266
    /// Overwrite the default configuration such that the options in `o` are
267
    /// always used. If an option in `o` is not set, then the corresponding
268
    /// option in `self` is used. If it's not set in `self` either, then it
269
    /// remains not set.
270
0
    pub(crate) fn overwrite(&self, o: Config) -> Config {
271
0
        Config {
272
0
            match_kind: o.match_kind.or(self.match_kind),
273
0
            starts_for_each_pattern: o
274
0
                .starts_for_each_pattern
275
0
                .or(self.starts_for_each_pattern),
276
0
            byte_classes: o.byte_classes.or(self.byte_classes),
277
0
            size_limit: o.size_limit.or(self.size_limit),
278
0
        }
279
0
    }
280
}
281
282
/// A builder for a [one-pass DFA](DFA).
283
///
284
/// This builder permits configuring options for the syntax of a pattern, the
285
/// NFA construction and the DFA construction. This builder is different from a
286
/// general purpose regex builder in that it permits fine grain configuration
287
/// of the construction process. The trade off for this is complexity, and
288
/// the possibility of setting a configuration that might not make sense. For
289
/// example, there are two different UTF-8 modes:
290
///
291
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
292
/// whether the pattern itself can contain sub-expressions that match invalid
293
/// UTF-8.
294
/// * [`thompson::Config::utf8`] controls whether empty matches that split a
295
/// Unicode codepoint are reported or not.
296
///
297
/// Generally speaking, callers will want to either enable all of these or
298
/// disable all of these.
299
///
300
/// # Example
301
///
302
/// This example shows how to disable UTF-8 mode in the syntax and the NFA.
303
/// This is generally what you want for matching on arbitrary bytes.
304
///
305
/// ```
306
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
307
/// use regex_automata::{
308
///     dfa::onepass::DFA,
309
///     nfa::thompson,
310
///     util::syntax,
311
///     Match,
312
/// };
313
///
314
/// let re = DFA::builder()
315
///     .syntax(syntax::Config::new().utf8(false))
316
///     .thompson(thompson::Config::new().utf8(false))
317
///     .build(r"foo(?-u:[^b])ar.*")?;
318
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
319
///
320
/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
321
/// re.captures(&mut cache, haystack, &mut caps);
322
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
323
/// // but the subsequent `.*` does not! Disabling UTF-8
324
/// // on the syntax permits this.
325
/// //
326
/// // N.B. This example does not show the impact of
327
/// // disabling UTF-8 mode on a one-pass DFA Config,
328
/// //  since that only impacts regexes that can
329
/// // produce matches of length 0.
330
/// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
331
///
332
/// # Ok::<(), Box<dyn std::error::Error>>(())
333
/// ```
334
#[derive(Clone, Debug)]
335
pub struct Builder {
336
    config: Config,
337
    #[cfg(feature = "syntax")]
338
    thompson: thompson::Compiler,
339
}
340
341
impl Builder {
342
    /// Create a new one-pass DFA builder with the default configuration.
343
0
    pub fn new() -> Builder {
344
0
        Builder {
345
0
            config: Config::default(),
346
0
            #[cfg(feature = "syntax")]
347
0
            thompson: thompson::Compiler::new(),
348
0
        }
349
0
    }
350
351
    /// Build a one-pass DFA from the given pattern.
352
    ///
353
    /// If there was a problem parsing or compiling the pattern, then an error
354
    /// is returned.
355
    #[cfg(feature = "syntax")]
356
0
    pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
357
0
        self.build_many(&[pattern])
358
0
    }
359
360
    /// Build a one-pass DFA from the given patterns.
361
    ///
362
    /// When matches are returned, the pattern ID corresponds to the index of
363
    /// the pattern in the slice given.
364
    #[cfg(feature = "syntax")]
365
0
    pub fn build_many<P: AsRef<str>>(
366
0
        &self,
367
0
        patterns: &[P],
368
0
    ) -> Result<DFA, BuildError> {
369
0
        let nfa =
370
0
            self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
371
0
        self.build_from_nfa(nfa)
372
0
    }
373
374
    /// Build a DFA from the given NFA.
375
    ///
376
    /// # Example
377
    ///
378
    /// This example shows how to build a DFA if you already have an NFA in
379
    /// hand.
380
    ///
381
    /// ```
382
    /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match};
383
    ///
384
    /// // This shows how to set non-default options for building an NFA.
385
    /// let nfa = NFA::compiler()
386
    ///     .configure(NFA::config().shrink(true))
387
    ///     .build(r"[a-z0-9]+")?;
388
    /// let re = DFA::builder().build_from_nfa(nfa)?;
389
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
390
    /// re.captures(&mut cache, "foo123bar", &mut caps);
391
    /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match());
392
    ///
393
    /// # Ok::<(), Box<dyn std::error::Error>>(())
394
    /// ```
395
0
    pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> {
396
0
        // Why take ownership if we're just going to pass a reference to the
397
0
        // NFA to our internal builder? Well, the first thing to note is that
398
0
        // an NFA uses reference counting internally, so either choice is going
399
0
        // to be cheap. So there isn't much cost either way.
400
0
        //
401
0
        // The real reason is that a one-pass DFA, semantically, shares
402
0
        // ownership of an NFA. This is unlike other DFAs that don't share
403
0
        // ownership of an NFA at all, primarily because they want to be
404
0
        // self-contained in order to support cheap (de)serialization.
405
0
        //
406
0
        // But then why pass a '&nfa' below if we want to share ownership?
407
0
        // Well, it turns out that using a '&NFA' in our internal builder
408
0
        // separates its lifetime from the DFA we're building, and this turns
409
0
        // out to make code a bit more composable. e.g., We can iterate over
410
0
        // things inside the NFA while borrowing the builder as mutable because
411
0
        // we know the NFA cannot be mutated. So TL;DR --- this weirdness is
412
0
        // "because borrow checker."
413
0
        InternalBuilder::new(self.config.clone(), &nfa).build()
414
0
    }
415
416
    /// Apply the given one-pass DFA configuration options to this builder.
417
0
    pub fn configure(&mut self, config: Config) -> &mut Builder {
418
0
        self.config = self.config.overwrite(config);
419
0
        self
420
0
    }
421
422
    /// Set the syntax configuration for this builder using
423
    /// [`syntax::Config`](crate::util::syntax::Config).
424
    ///
425
    /// This permits setting things like case insensitivity, Unicode and multi
426
    /// line mode.
427
    ///
428
    /// These settings only apply when constructing a one-pass DFA directly
429
    /// from a pattern.
430
    #[cfg(feature = "syntax")]
431
0
    pub fn syntax(
432
0
        &mut self,
433
0
        config: crate::util::syntax::Config,
434
0
    ) -> &mut Builder {
435
0
        self.thompson.syntax(config);
436
0
        self
437
0
    }
438
439
    /// Set the Thompson NFA configuration for this builder using
440
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
441
    ///
442
    /// This permits setting things like whether additional time should be
443
    /// spent shrinking the size of the NFA.
444
    ///
445
    /// These settings only apply when constructing a DFA directly from a
446
    /// pattern.
447
    #[cfg(feature = "syntax")]
448
0
    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
449
0
        self.thompson.configure(config);
450
0
        self
451
0
    }
452
}
453
454
/// An internal builder for encapsulating the state necessary to build a
455
/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`.
456
///
457
/// There is no separate pass for determining whether the NFA is one-pass or
458
/// not. We just try to build the DFA. If during construction we discover that
459
/// it is not one-pass, we bail out. This is likely to lead to some undesirable
460
/// expense in some cases, so it might make sense to try an identify common
461
/// patterns in the NFA that make it definitively not one-pass. That way, we
462
/// can avoid ever trying to build a one-pass DFA in the first place. For
463
/// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by
464
/// default, it's probably not a trivial cost to try and build a one-pass DFA
465
/// for it and then fail.
466
///
467
/// Note that some (immutable) fields are duplicated here. For example, the
468
/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing,
469
/// but we duplicate them because it makes composition easier below. Otherwise,
470
/// since the borrow checker can't see through method calls, the mutable borrow
471
/// we use to mutate the DFA winds up preventing borrowing from any other part
472
/// of the DFA, even though we aren't mutating those parts. We only do this
473
/// because the duplication is cheap.
474
#[derive(Debug)]
475
struct InternalBuilder<'a> {
476
    /// The DFA we're building.
477
    dfa: DFA,
478
    /// An unordered collection of NFA state IDs that we haven't yet tried to
479
    /// build into a DFA state yet.
480
    ///
481
    /// This collection does not ultimately wind up including every NFA state
482
    /// ID. Instead, each ID represents a "start" state for a sub-graph of the
483
    /// NFA. The set of NFA states we then use to build a DFA state consists
484
    /// of that "start" state and all states reachable from it via epsilon
485
    /// transitions.
486
    uncompiled_nfa_ids: Vec<StateID>,
487
    /// A map from NFA state ID to DFA state ID. This is useful for easily
488
    /// determining whether an NFA state has been used as a "starting" point
489
    /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD,
490
    /// and since DEAD is specially added and never corresponds to any NFA
491
    /// state, it follows that a mapping to DEAD implies the NFA state has
492
    /// no corresponding DFA state yet.
493
    nfa_to_dfa_id: Vec<StateID>,
494
    /// A stack used to traverse the NFA states that make up a single DFA
495
    /// state. Traversal occurs until the stack is empty, and we only push to
496
    /// the stack when the state ID isn't in 'seen'. Actually, even more than
497
    /// that, if we try to push something on to this stack that is already in
498
    /// 'seen', then we bail out on construction completely, since it implies
499
    /// that the NFA is not one-pass.
500
    stack: Vec<(StateID, Epsilons)>,
501
    /// The set of NFA states that we've visited via 'stack'.
502
    seen: SparseSet,
503
    /// Whether a match NFA state has been observed while constructing a
504
    /// one-pass DFA state. Once a match state is seen, assuming we are using
505
    /// leftmost-first match semantics, then we don't add any more transitions
506
    /// to the DFA state we're building.
507
    matched: bool,
508
    /// The config passed to the builder.
509
    ///
510
    /// This is duplicated in dfa.config.
511
    config: Config,
512
    /// The NFA we're building a one-pass DFA from.
513
    ///
514
    /// This is duplicated in dfa.nfa.
515
    nfa: &'a NFA,
516
    /// The equivalence classes that make up the alphabet for this DFA>
517
    ///
518
    /// This is duplicated in dfa.classes.
519
    classes: ByteClasses,
520
}
521
522
impl<'a> InternalBuilder<'a> {
523
    /// Create a new builder with an initial empty DFA.
524
0
    fn new(config: Config, nfa: &'a NFA) -> InternalBuilder<'a> {
525
0
        let classes = if !config.get_byte_classes() {
526
            // A one-pass DFA will always use the equivalence class map, but
527
            // enabling this option is useful for debugging. Namely, this will
528
            // cause all transitions to be defined over their actual bytes
529
            // instead of an opaque equivalence class identifier. The former is
530
            // much easier to grok as a human.
531
0
            ByteClasses::singletons()
532
        } else {
533
0
            nfa.byte_classes().clone()
534
        };
535
        // Normally a DFA alphabet includes the EOI symbol, but we don't need
536
        // that in the one-pass DFA since we handle look-around explicitly
537
        // without encoding it into the DFA. Thus, we don't need to delay
538
        // matches by 1 byte. However, we reuse the space that *would* be used
539
        // by the EOI transition by putting match information there (like which
540
        // pattern matches and which look-around assertions need to hold). So
541
        // this means our real alphabet length is 1 fewer than what the byte
542
        // classes report, since we don't use EOI.
543
0
        let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap();
544
0
        let stride2 = classes.stride2();
545
0
        let dfa = DFA {
546
0
            config: config.clone(),
547
0
            nfa: nfa.clone(),
548
0
            table: vec![],
549
0
            starts: vec![],
550
0
            // Since one-pass DFAs have a smaller state ID max than
551
0
            // StateID::MAX, it follows that StateID::MAX is a valid initial
552
0
            // value for min_match_id since no state ID can ever be greater
553
0
            // than it. In the case of a one-pass DFA with no match states, the
554
0
            // min_match_id will keep this sentinel value.
555
0
            min_match_id: StateID::MAX,
556
0
            classes: classes.clone(),
557
0
            alphabet_len,
558
0
            stride2,
559
0
            pateps_offset: alphabet_len,
560
0
            // OK because PatternID::MAX*2 is guaranteed not to overflow.
561
0
            explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(),
562
0
        };
563
0
        InternalBuilder {
564
0
            dfa,
565
0
            uncompiled_nfa_ids: vec![],
566
0
            nfa_to_dfa_id: vec![DEAD; nfa.states().len()],
567
0
            stack: vec![],
568
0
            seen: SparseSet::new(nfa.states().len()),
569
0
            matched: false,
570
0
            config,
571
0
            nfa,
572
0
            classes,
573
0
        }
574
0
    }
575
576
    /// Build the DFA from the NFA given to this builder. If the NFA is not
577
    /// one-pass, then return an error. An error may also be returned if a
578
    /// particular limit is exceeded. (Some limits, like the total heap memory
579
    /// used, are configurable. Others, like the total patterns or slots, are
580
    /// hard-coded based on representational limitations.)
581
0
    fn build(mut self) -> Result<DFA, BuildError> {
582
0
        self.nfa.look_set_any().available().map_err(BuildError::word)?;
583
0
        for look in self.nfa.look_set_any().iter() {
584
            // This is a future incompatibility check where if we add any
585
            // more look-around assertions, then the one-pass DFA either
586
            // needs to reject them (what we do here) or it needs to have its
587
            // Transition representation modified to be capable of storing the
588
            // new assertions.
589
0
            if look.as_repr() > Look::WordUnicodeNegate.as_repr() {
590
0
                return Err(BuildError::unsupported_look(look));
591
0
            }
592
        }
593
0
        if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT
594
        {
595
0
            return Err(BuildError::too_many_patterns(
596
0
                PatternEpsilons::PATTERN_ID_LIMIT,
597
0
            ));
598
0
        }
599
0
        if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT {
600
0
            return Err(BuildError::not_one_pass(
601
0
                "too many explicit capturing groups (max is 16)",
602
0
            ));
603
0
        }
604
0
        assert_eq!(DEAD, self.add_empty_state()?);
605
606
        // This is where the explicit slots start. We care about this because
607
        // we only need to track explicit slots. The implicit slots---two for
608
        // each pattern---are tracked as part of the search routine itself.
609
0
        let explicit_slot_start = self.nfa.pattern_len() * 2;
610
0
        self.add_start_state(None, self.nfa.start_anchored())?;
611
0
        if self.config.get_starts_for_each_pattern() {
612
0
            for pid in self.nfa.patterns() {
613
0
                self.add_start_state(
614
0
                    Some(pid),
615
0
                    self.nfa.start_pattern(pid).unwrap(),
616
0
                )?;
617
            }
618
0
        }
619
        // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids'
620
        // as a stack are. It is really an unordered *set* of NFA state IDs.
621
        // If it, for example, in practice led to discovering whether a regex
622
        // was or wasn't one-pass later than if we processed NFA state IDs in
623
        // ascending order, then that would make this routine more costly in
624
        // the somewhat common case of a regex that isn't one-pass.
625
0
        while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() {
626
0
            let dfa_id = self.nfa_to_dfa_id[nfa_id];
627
0
            // Once we see a match, we keep going, but don't add any new
628
0
            // transitions. Normally we'd just stop, but we have to keep
629
0
            // going in order to verify that our regex is actually one-pass.
630
0
            self.matched = false;
631
0
            // The NFA states we've already explored for this DFA state.
632
0
            self.seen.clear();
633
0
            // The NFA states to explore via epsilon transitions. If we ever
634
0
            // try to push an NFA state that we've already seen, then the NFA
635
0
            // is not one-pass because it implies there are multiple epsilon
636
0
            // transition paths that lead to the same NFA state. In other
637
0
            // words, there is ambiguity.
638
0
            self.stack_push(nfa_id, Epsilons::empty())?;
639
0
            while let Some((id, epsilons)) = self.stack.pop() {
640
0
                match *self.nfa.state(id) {
641
0
                    thompson::State::ByteRange { ref trans } => {
642
0
                        self.compile_transition(dfa_id, trans, epsilons)?;
643
                    }
644
0
                    thompson::State::Sparse(ref sparse) => {
645
0
                        for trans in sparse.transitions.iter() {
646
0
                            self.compile_transition(dfa_id, trans, epsilons)?;
647
                        }
648
                    }
649
0
                    thompson::State::Dense(ref dense) => {
650
0
                        for trans in dense.iter() {
651
0
                            self.compile_transition(dfa_id, &trans, epsilons)?;
652
                        }
653
                    }
654
0
                    thompson::State::Look { look, next } => {
655
0
                        let looks = epsilons.looks().insert(look);
656
0
                        self.stack_push(next, epsilons.set_looks(looks))?;
657
                    }
658
0
                    thompson::State::Union { ref alternates } => {
659
0
                        for &sid in alternates.iter().rev() {
660
0
                            self.stack_push(sid, epsilons)?;
661
                        }
662
                    }
663
0
                    thompson::State::BinaryUnion { alt1, alt2 } => {
664
0
                        self.stack_push(alt2, epsilons)?;
665
0
                        self.stack_push(alt1, epsilons)?;
666
                    }
667
0
                    thompson::State::Capture { next, slot, .. } => {
668
0
                        let slot = slot.as_usize();
669
0
                        let epsilons = if slot < explicit_slot_start {
670
                            // If this is an implicit slot, we don't care
671
                            // about it, since we handle implicit slots in
672
                            // the search routine. We can get away with that
673
                            // because there are 2 implicit slots for every
674
                            // pattern.
675
0
                            epsilons
676
                        } else {
677
                            // Offset our explicit slots so that they start
678
                            // at index 0.
679
0
                            let offset = slot - explicit_slot_start;
680
0
                            epsilons.set_slots(epsilons.slots().insert(offset))
681
                        };
682
0
                        self.stack_push(next, epsilons)?;
683
                    }
684
                    thompson::State::Fail => {
685
0
                        continue;
686
                    }
687
0
                    thompson::State::Match { pattern_id } => {
688
0
                        // If we found two different paths to a match state
689
0
                        // for the same DFA state, then we have ambiguity.
690
0
                        // Thus, it's not one-pass.
691
0
                        if self.matched {
692
0
                            return Err(BuildError::not_one_pass(
693
0
                                "multiple epsilon transitions to match state",
694
0
                            ));
695
0
                        }
696
0
                        self.matched = true;
697
0
                        // Shove the matching pattern ID and the 'epsilons'
698
0
                        // into the current DFA state's pattern epsilons. The
699
0
                        // 'epsilons' includes the slots we need to capture
700
0
                        // before reporting the match and also the conditional
701
0
                        // epsilon transitions we need to check before we can
702
0
                        // report a match.
703
0
                        self.dfa.set_pattern_epsilons(
704
0
                            dfa_id,
705
0
                            PatternEpsilons::empty()
706
0
                                .set_pattern_id(pattern_id)
707
0
                                .set_epsilons(epsilons),
708
0
                        );
709
                        // N.B. It is tempting to just bail out here when
710
                        // compiling a leftmost-first DFA, since we will never
711
                        // compile any more transitions in that case. But we
712
                        // actually need to keep going in order to verify that
713
                        // we actually have a one-pass regex. e.g., We might
714
                        // see more Match states (e.g., for other patterns)
715
                        // that imply that we don't have a one-pass regex.
716
                        // So instead, we mark that we've found a match and
717
                        // continue on. When we go to compile a new DFA state,
718
                        // we just skip that part. But otherwise check that the
719
                        // one-pass property is upheld.
720
                    }
721
                }
722
            }
723
        }
724
0
        self.shuffle_states();
725
0
        Ok(self.dfa)
726
0
    }
727
728
    /// Shuffle all match states to the end of the transition table and set
729
    /// 'min_match_id' to the ID of the first such match state.
730
    ///
731
    /// The point of this is to make it extremely cheap to determine whether
732
    /// a state is a match state or not. We need to check on this on every
733
    /// transition during a search, so it being cheap is important. This
734
    /// permits us to check it by simply comparing two state identifiers, as
735
    /// opposed to looking for the pattern ID in the state's `PatternEpsilons`.
736
    /// (Which requires a memory load and some light arithmetic.)
737
0
    fn shuffle_states(&mut self) {
738
0
        let mut remapper = Remapper::new(&self.dfa);
739
0
        let mut next_dest = self.dfa.last_state_id();
740
0
        for i in (0..self.dfa.state_len()).rev() {
741
0
            let id = StateID::must(i);
742
0
            let is_match =
743
0
                self.dfa.pattern_epsilons(id).pattern_id().is_some();
744
0
            if !is_match {
745
0
                continue;
746
0
            }
747
0
            remapper.swap(&mut self.dfa, next_dest, id);
748
0
            self.dfa.min_match_id = next_dest;
749
0
            next_dest = self.dfa.prev_state_id(next_dest).expect(
750
0
                "match states should be a proper subset of all states",
751
0
            );
752
        }
753
0
        remapper.remap(&mut self.dfa);
754
0
    }
755
756
    /// Compile the given NFA transition into the DFA state given.
757
    ///
758
    /// 'Epsilons' corresponds to any conditional epsilon transitions that need
759
    /// to be satisfied to follow this transition, and any slots that need to
760
    /// be saved if the transition is followed.
761
    ///
762
    /// If this transition indicates that the NFA is not one-pass, then
763
    /// this returns an error. (This occurs, for example, if the DFA state
764
    /// already has a transition defined for the same input symbols as the
765
    /// given transition, *and* the result of the old and new transitions is
766
    /// different.)
767
0
    fn compile_transition(
768
0
        &mut self,
769
0
        dfa_id: StateID,
770
0
        trans: &thompson::Transition,
771
0
        epsilons: Epsilons,
772
0
    ) -> Result<(), BuildError> {
773
0
        let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?;
774
0
        for byte in self
775
0
            .classes
776
0
            .representatives(trans.start..=trans.end)
777
0
            .filter_map(|r| r.as_u8())
778
        {
779
0
            let oldtrans = self.dfa.transition(dfa_id, byte);
780
0
            let newtrans =
781
0
                Transition::new(self.matched, next_dfa_id, epsilons);
782
0
            // If the old transition points to the DEAD state, then we know
783
0
            // 'byte' has not been mapped to any transition for this DFA state
784
0
            // yet. So set it unconditionally. Otherwise, we require that the
785
0
            // old and new transitions are equivalent. Otherwise, there is
786
0
            // ambiguity and thus the regex is not one-pass.
787
0
            if oldtrans.state_id() == DEAD {
788
0
                self.dfa.set_transition(dfa_id, byte, newtrans);
789
0
            } else if oldtrans != newtrans {
790
0
                return Err(BuildError::not_one_pass(
791
0
                    "conflicting transition",
792
0
                ));
793
0
            }
794
        }
795
0
        Ok(())
796
0
    }
797
798
    /// Add a start state to the DFA corresponding to the given NFA starting
799
    /// state ID.
800
    ///
801
    /// If adding a state would blow any limits (configured or hard-coded),
802
    /// then an error is returned.
803
    ///
804
    /// If the starting state is an anchored state for a particular pattern,
805
    /// then callers must provide the pattern ID for that starting state.
806
    /// Callers must also ensure that the first starting state added is the
807
    /// start state for all patterns, and then each anchored starting state for
808
    /// each pattern (if necessary) added in order. Otherwise, this panics.
809
0
    fn add_start_state(
810
0
        &mut self,
811
0
        pid: Option<PatternID>,
812
0
        nfa_id: StateID,
813
0
    ) -> Result<StateID, BuildError> {
814
0
        match pid {
815
            // With no pid, this should be the start state for all patterns
816
            // and thus be the first one.
817
0
            None => assert!(self.dfa.starts.is_empty()),
818
            // With a pid, we want it to be at self.dfa.starts[pid+1].
819
0
            Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()),
820
        }
821
0
        let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?;
822
0
        self.dfa.starts.push(dfa_id);
823
0
        Ok(dfa_id)
824
0
    }
825
826
    /// Add a new DFA state corresponding to the given NFA state. If adding a
827
    /// state would blow any limits (configured or hard-coded), then an error
828
    /// is returned. If a DFA state already exists for the given NFA state,
829
    /// then that DFA state's ID is returned and no new states are added.
830
    ///
831
    /// It is not expected that this routine is called for every NFA state.
832
    /// Instead, an NFA state ID will usually correspond to the "start" state
833
    /// for a sub-graph of the NFA, where all states in the sub-graph are
834
    /// reachable via epsilon transitions (conditional or unconditional). That
835
    /// sub-graph of NFA states is ultimately what produces a single DFA state.
836
0
    fn add_dfa_state_for_nfa_state(
837
0
        &mut self,
838
0
        nfa_id: StateID,
839
0
    ) -> Result<StateID, BuildError> {
840
0
        // If we've already built a DFA state for the given NFA state, then
841
0
        // just return that. We definitely do not want to have more than one
842
0
        // DFA state in existence for the same NFA state, since all but one of
843
0
        // them will likely become unreachable. And at least some of them are
844
0
        // likely to wind up being incomplete.
845
0
        let existing_dfa_id = self.nfa_to_dfa_id[nfa_id];
846
0
        if existing_dfa_id != DEAD {
847
0
            return Ok(existing_dfa_id);
848
0
        }
849
        // If we don't have any DFA state yet, add it and then add the given
850
        // NFA state to the list of states to explore.
851
0
        let dfa_id = self.add_empty_state()?;
852
0
        self.nfa_to_dfa_id[nfa_id] = dfa_id;
853
0
        self.uncompiled_nfa_ids.push(nfa_id);
854
0
        Ok(dfa_id)
855
0
    }
856
857
    /// Unconditionally add a new empty DFA state. If adding it would exceed
858
    /// any limits (configured or hard-coded), then an error is returned. The
859
    /// ID of the new state is returned on success.
860
    ///
861
    /// The added state is *not* a match state.
862
0
    fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
863
0
        let state_limit = Transition::STATE_ID_LIMIT;
864
0
        // Note that unlike dense and lazy DFAs, we specifically do NOT
865
0
        // premultiply our state IDs here. The reason is that we want to pack
866
0
        // our state IDs into 64-bit transitions with other info, so the fewer
867
0
        // the bits we use for state IDs the better. If we premultiply, then
868
0
        // our state ID space shrinks. We justify this by the assumption that
869
0
        // a one-pass DFA is just already doing a fair bit more work than a
870
0
        // normal DFA anyway, so an extra multiplication to compute a state
871
0
        // transition doesn't seem like a huge deal.
872
0
        let next_id = self.dfa.table.len() >> self.dfa.stride2();
873
0
        let id = StateID::new(next_id)
874
0
            .map_err(|_| BuildError::too_many_states(state_limit))?;
875
0
        if id.as_u64() > Transition::STATE_ID_LIMIT {
876
0
            return Err(BuildError::too_many_states(state_limit));
877
0
        }
878
0
        self.dfa
879
0
            .table
880
0
            .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride()));
881
0
        // The default empty value for 'PatternEpsilons' is sadly not all
882
0
        // zeroes. Instead, a special sentinel is used to indicate that there
883
0
        // is no pattern. So we need to explicitly set the pattern epsilons to
884
0
        // the correct "empty" PatternEpsilons.
885
0
        self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty());
886
0
        if let Some(size_limit) = self.config.get_size_limit() {
887
0
            if self.dfa.memory_usage() > size_limit {
888
0
                return Err(BuildError::exceeded_size_limit(size_limit));
889
0
            }
890
0
        }
891
0
        Ok(id)
892
0
    }
893
894
    /// Push the given NFA state ID and its corresponding epsilons (slots and
895
    /// conditional epsilon transitions) on to a stack for use in a depth first
896
    /// traversal of a sub-graph of the NFA.
897
    ///
898
    /// If the given NFA state ID has already been pushed on to the stack, then
899
    /// it indicates the regex is not one-pass and this correspondingly returns
900
    /// an error.
901
0
    fn stack_push(
902
0
        &mut self,
903
0
        nfa_id: StateID,
904
0
        epsilons: Epsilons,
905
0
    ) -> Result<(), BuildError> {
906
0
        // If we already have seen a match and we are compiling a leftmost
907
0
        // first DFA, then we shouldn't add any more states to look at. This is
908
0
        // effectively how preference order and non-greediness is implemented.
909
0
        // if !self.config.get_match_kind().continue_past_first_match()
910
0
        // && self.matched
911
0
        // {
912
0
        // return Ok(());
913
0
        // }
914
0
        if !self.seen.insert(nfa_id) {
915
0
            return Err(BuildError::not_one_pass(
916
0
                "multiple epsilon transitions to same state",
917
0
            ));
918
0
        }
919
0
        self.stack.push((nfa_id, epsilons));
920
0
        Ok(())
921
0
    }
922
}
923
924
/// A one-pass DFA for executing a subset of anchored regex searches while
925
/// resolving capturing groups.
926
///
927
/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is
928
/// one-pass when there is never any ambiguity about how to continue a search.
929
/// For example, `a*a` is not one-pass becuase during a search, it's not
930
/// possible to know whether to continue matching the `a*` or to move on to
931
/// the single `a`. However, `a*b` is one-pass, because for every byte in the
932
/// input, it's always clear when to move on from `a*` to `b`.
933
///
934
/// # Only anchored searches are supported
935
///
936
/// In this crate, especially for DFAs, unanchored searches are implemented by
937
/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix
938
/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will
939
/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches
940
/// any byte, and there is therefore ambiguity as to when the prefix should
941
/// stop matching and something else should start matching.
942
///
943
/// Therefore, one-pass DFAs do not support unanchored searches. In addition
944
/// to many regexes simply not being one-pass, it implies that one-pass DFAs
945
/// have limited utility. With that said, when a one-pass DFA can be used, it
946
/// can potentially provide a dramatic speed up over alternatives like the
947
/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
948
/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular,
949
/// a one-pass DFA is the only DFA capable of reporting the spans of matching
950
/// capturing groups.
951
///
952
/// To clarify, when we say that unanchored searches are not supported, what
953
/// that actually means is:
954
///
955
/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always
956
/// do anchored searches.
957
/// * Since iterators are most useful in the context of unanchored searches,
958
/// there is no `DFA::captures_iter` method.
959
/// * For lower level routines like [`DFA::try_search`], an error will be
960
/// returned if the given [`Input`] is configured to do an unanchored search or
961
/// search for an invalid pattern ID. (Note that an [`Input`] is configured to
962
/// do an unanchored search by default, so just giving a `Input::new` is
963
/// guaranteed to return an error.)
964
///
965
/// # Other limitations
966
///
967
/// In addition to the [configurable heap limit](Config::size_limit) and
968
/// the requirement that a regex pattern be one-pass, there are some other
969
/// limitations:
970
///
971
/// * There is an internal limit on the total number of explicit capturing
972
/// groups that appear across all patterns. It is somewhat small and there is
973
/// no way to configure it. If your pattern(s) exceed this limit, then building
974
/// a one-pass DFA will fail.
975
/// * If the number of patterns exceeds an internal unconfigurable limit, then
976
/// building a one-pass DFA will fail. This limit is quite large and you're
977
/// unlikely to hit it.
978
/// * If the total number of states exceeds an internal unconfigurable limit,
979
/// then building a one-pass DFA will fail. This limit is quite large and
980
/// you're unlikely to hit it.
981
///
982
/// # Other examples of regexes that aren't one-pass
983
///
984
/// One particularly unfortunate example is that enabling Unicode can cause
985
/// regexes that were one-pass to no longer be one-pass. Consider the regex
986
/// `(?-u)\w*\s` for example. It is one-pass because there is exactly no
987
/// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s`
988
/// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get
989
/// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s`
990
/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the
991
/// overlap between UTF-8 automata, the use of Unicode character classes will
992
/// tend to vastly increase the likelihood of a regex not being one-pass.
993
///
994
/// # How does one know if a regex is one-pass or not?
995
///
996
/// At the time of writing, the only way to know is to try and build a one-pass
997
/// DFA. The one-pass property is checked while constructing the DFA.
998
///
999
/// This does mean that you might potentially waste some CPU cycles and memory
1000
/// by optimistically trying to build a one-pass DFA. But this is currently the
1001
/// only way. In the future, building a one-pass DFA might be able to use some
1002
/// heuristics to detect common violations of the one-pass property and bail
1003
/// more quickly.
1004
///
1005
/// # Resource usage
1006
///
1007
/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource
1008
/// usage. Namely, construction of a one-pass DFA has a time and space
1009
/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time
1010
/// and space complexity is `O(2^n)`.) This smaller time bound is achieved
1011
/// because there is at most one DFA state created for each NFA state. If
1012
/// additional DFA states would be required, then the pattern is not one-pass
1013
/// and construction will fail.
1014
///
1015
/// Note though that currently, this DFA uses a fully dense representation.
1016
/// This means that while its space complexity is no worse than an NFA, it may
1017
/// in practice use more memory because of higher constant factors. The reason
1018
/// for this trade off is two-fold. Firstly, a dense representation makes the
1019
/// search faster. Secondly, the bigger an NFA, the more unlikely it is to be
1020
/// one-pass. Therefore, most one-pass DFAs are usually pretty small.
1021
///
1022
/// # Example
1023
///
1024
/// This example shows that the one-pass DFA implements Unicode word boundaries
1025
/// correctly while simultaneously reporting spans for capturing groups that
1026
/// participate in a match. (This is the only DFA that implements full support
1027
/// for Unicode word boundaries.)
1028
///
1029
/// ```
1030
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1031
/// use regex_automata::{dfa::onepass::DFA, Match, Span};
1032
///
1033
/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?;
1034
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1035
///
1036
/// re.captures(&mut cache, "Шерлок Холмс", &mut caps);
1037
/// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match());
1038
/// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first"));
1039
/// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last"));
1040
/// # Ok::<(), Box<dyn std::error::Error>>(())
1041
/// ```
1042
///
1043
/// # Example: iteration
1044
///
1045
/// Unlike other regex engines in this crate, this one does not provide
1046
/// iterator search functions. This is because a one-pass DFA only supports
1047
/// anchored searches, and so iterator functions are generally not applicable.
1048
///
1049
/// However, if you know that all of your matches are
1050
/// directly adjacent, then an iterator can be used. The
1051
/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for
1052
/// this purpose:
1053
///
1054
/// ```
1055
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1056
/// use regex_automata::{
1057
///     dfa::onepass::DFA,
1058
///     util::iter::Searcher,
1059
///     Anchored, Input, Span,
1060
/// };
1061
///
1062
/// let re = DFA::new(r"\w(\d)\w")?;
1063
/// let (mut cache, caps) = (re.create_cache(), re.create_captures());
1064
/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes);
1065
///
1066
/// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| {
1067
///     Ok(re.try_search(&mut cache, input, caps)?)
1068
/// }).infallible();
1069
/// let caps0 = it.next().unwrap();
1070
/// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1));
1071
///
1072
/// # Ok::<(), Box<dyn std::error::Error>>(())
1073
/// ```
1074
#[derive(Clone)]
1075
pub struct DFA {
1076
    /// The configuration provided by the caller.
1077
    config: Config,
1078
    /// The NFA used to build this DFA.
1079
    ///
1080
    /// NOTE: We probably don't need to store the NFA here, but we use enough
1081
    /// bits from it that it's convenient to do so. And there really isn't much
1082
    /// cost to doing so either, since an NFA is reference counted internally.
1083
    nfa: NFA,
1084
    /// The transition table. Given a state ID 's' and a byte of haystack 'b',
1085
    /// the next state is `table[sid + classes[byte]]`.
1086
    ///
1087
    /// The stride of this table (i.e., the number of columns) is always
1088
    /// a power of 2, even if the alphabet length is smaller. This makes
1089
    /// converting between state IDs and state indices very cheap.
1090
    ///
1091
    /// Note that the stride always includes room for one extra "transition"
1092
    /// that isn't actually a transition. It is a 'PatternEpsilons' that is
1093
    /// used for match states only. Because of this, the maximum number of
1094
    /// active columns in the transition table is 257, which means the maximum
1095
    /// stride is 512 (the next power of 2 greater than or equal to 257).
1096
    table: Vec<Transition>,
1097
    /// The DFA state IDs of the starting states.
1098
    ///
1099
    /// `starts[0]` is always present and corresponds to the starting state
1100
    /// when searching for matches of any pattern in the DFA.
1101
    ///
1102
    /// `starts[i]` where i>0 corresponds to the starting state for the pattern
1103
    /// ID 'i-1'. These starting states are optional.
1104
    starts: Vec<StateID>,
1105
    /// Every state ID >= this value corresponds to a match state.
1106
    ///
1107
    /// This is what a search uses to detect whether a state is a match state
1108
    /// or not. It requires only a simple comparison instead of bit-unpacking
1109
    /// the PatternEpsilons from every state.
1110
    min_match_id: StateID,
1111
    /// The alphabet of this DFA, split into equivalence classes. Bytes in the
1112
    /// same equivalence class can never discriminate between a match and a
1113
    /// non-match.
1114
    classes: ByteClasses,
1115
    /// The number of elements in each state in the transition table. This may
1116
    /// be less than the stride, since the stride is always a power of 2 and
1117
    /// the alphabet length can be anything up to and including 256.
1118
    alphabet_len: usize,
1119
    /// The number of columns in the transition table, expressed as a power of
1120
    /// 2.
1121
    stride2: usize,
1122
    /// The offset at which the PatternEpsilons for a match state is stored in
1123
    /// the transition table.
1124
    ///
1125
    /// PERF: One wonders whether it would be better to put this in a separate
1126
    /// allocation, since only match states have a non-empty PatternEpsilons
1127
    /// and the number of match states tends be dwarfed by the number of
1128
    /// non-match states. So this would save '8*len(non_match_states)' for each
1129
    /// DFA. The question is whether moving this to a different allocation will
1130
    /// lead to a perf hit during searches. You might think dealing with match
1131
    /// states is rare, but some regexes spend a lot of time in match states
1132
    /// gobbling up input. But... match state handling is already somewhat
1133
    /// expensive, so maybe this wouldn't do much? Either way, it's worth
1134
    /// experimenting.
1135
    pateps_offset: usize,
1136
    /// The first explicit slot index. This refers to the first slot appearing
1137
    /// immediately after the last implicit slot. It is always 'patterns.len()
1138
    /// * 2'.
1139
    ///
1140
    /// We record this because we only store the explicit slots in our DFA
1141
    /// transition table that need to be saved. Implicit slots are handled
1142
    /// automatically as part of the search.
1143
    explicit_slot_start: usize,
1144
}
1145
1146
impl DFA {
1147
    /// Parse the given regular expression using the default configuration and
1148
    /// return the corresponding one-pass DFA.
1149
    ///
1150
    /// If you want a non-default configuration, then use the [`Builder`] to
1151
    /// set your own configuration.
1152
    ///
1153
    /// # Example
1154
    ///
1155
    /// ```
1156
    /// use regex_automata::{dfa::onepass::DFA, Match};
1157
    ///
1158
    /// let re = DFA::new("foo[0-9]+bar")?;
1159
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1160
    ///
1161
    /// re.captures(&mut cache, "foo12345barzzz", &mut caps);
1162
    /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match());
1163
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1164
    /// ```
1165
    #[cfg(feature = "syntax")]
1166
    #[inline]
1167
0
    pub fn new(pattern: &str) -> Result<DFA, BuildError> {
1168
0
        DFA::builder().build(pattern)
1169
0
    }
1170
1171
    /// Like `new`, but parses multiple patterns into a single "multi regex."
1172
    /// This similarly uses the default regex configuration.
1173
    ///
1174
    /// # Example
1175
    ///
1176
    /// ```
1177
    /// use regex_automata::{dfa::onepass::DFA, Match};
1178
    ///
1179
    /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?;
1180
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1181
    ///
1182
    /// re.captures(&mut cache, "abc123", &mut caps);
1183
    /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
1184
    ///
1185
    /// re.captures(&mut cache, "123abc", &mut caps);
1186
    /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
1187
    ///
1188
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1189
    /// ```
1190
    #[cfg(feature = "syntax")]
1191
    #[inline]
1192
0
    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
1193
0
        DFA::builder().build_many(patterns)
1194
0
    }
1195
1196
    /// Like `new`, but builds a one-pass DFA directly from an NFA. This is
1197
    /// useful if you already have an NFA, or even if you hand-assembled the
1198
    /// NFA.
1199
    ///
1200
    /// # Example
1201
    ///
1202
    /// This shows how to hand assemble a regular expression via its HIR,
1203
    /// compile an NFA from it and build a one-pass DFA from the NFA.
1204
    ///
1205
    /// ```
1206
    /// use regex_automata::{
1207
    ///     dfa::onepass::DFA,
1208
    ///     nfa::thompson::NFA,
1209
    ///     Match,
1210
    /// };
1211
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
1212
    ///
1213
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
1214
    ///     ClassBytesRange::new(b'0', b'9'),
1215
    ///     ClassBytesRange::new(b'A', b'Z'),
1216
    ///     ClassBytesRange::new(b'_', b'_'),
1217
    ///     ClassBytesRange::new(b'a', b'z'),
1218
    /// ])));
1219
    ///
1220
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
1221
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
1222
    ///
1223
    /// let re = DFA::new_from_nfa(nfa)?;
1224
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1225
    /// let expected = Some(Match::must(0, 0..1));
1226
    /// re.captures(&mut cache, "A", &mut caps);
1227
    /// assert_eq!(expected, caps.get_match());
1228
    ///
1229
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1230
    /// ```
1231
0
    pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> {
1232
0
        DFA::builder().build_from_nfa(nfa)
1233
0
    }
1234
1235
    /// Create a new one-pass DFA that matches every input.
1236
    ///
1237
    /// # Example
1238
    ///
1239
    /// ```
1240
    /// use regex_automata::{dfa::onepass::DFA, Match};
1241
    ///
1242
    /// let dfa = DFA::always_match()?;
1243
    /// let mut cache = dfa.create_cache();
1244
    /// let mut caps = dfa.create_captures();
1245
    ///
1246
    /// let expected = Match::must(0, 0..0);
1247
    /// dfa.captures(&mut cache, "", &mut caps);
1248
    /// assert_eq!(Some(expected), caps.get_match());
1249
    /// dfa.captures(&mut cache, "foo", &mut caps);
1250
    /// assert_eq!(Some(expected), caps.get_match());
1251
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1252
    /// ```
1253
0
    pub fn always_match() -> Result<DFA, BuildError> {
1254
0
        let nfa = thompson::NFA::always_match();
1255
0
        Builder::new().build_from_nfa(nfa)
1256
0
    }
1257
1258
    /// Create a new one-pass DFA that never matches any input.
1259
    ///
1260
    /// # Example
1261
    ///
1262
    /// ```
1263
    /// use regex_automata::dfa::onepass::DFA;
1264
    ///
1265
    /// let dfa = DFA::never_match()?;
1266
    /// let mut cache = dfa.create_cache();
1267
    /// let mut caps = dfa.create_captures();
1268
    ///
1269
    /// dfa.captures(&mut cache, "", &mut caps);
1270
    /// assert_eq!(None, caps.get_match());
1271
    /// dfa.captures(&mut cache, "foo", &mut caps);
1272
    /// assert_eq!(None, caps.get_match());
1273
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1274
    /// ```
1275
0
    pub fn never_match() -> Result<DFA, BuildError> {
1276
0
        let nfa = thompson::NFA::never_match();
1277
0
        Builder::new().build_from_nfa(nfa)
1278
0
    }
1279
1280
    /// Return a default configuration for a DFA.
1281
    ///
1282
    /// This is a convenience routine to avoid needing to import the `Config`
1283
    /// type when customizing the construction of a DFA.
1284
    ///
1285
    /// # Example
1286
    ///
1287
    /// This example shows how to change the match semantics of this DFA from
1288
    /// its default "leftmost first" to "all." When using "all," non-greediness
1289
    /// doesn't apply and neither does preference order matching. Instead, the
1290
    /// longest match possible is always returned. (Although, by construction,
1291
    /// it's impossible for a one-pass DFA to have a different answer for
1292
    /// "preference order" vs "longest match.")
1293
    ///
1294
    /// ```
1295
    /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind};
1296
    ///
1297
    /// let re = DFA::builder()
1298
    ///     .configure(DFA::config().match_kind(MatchKind::All))
1299
    ///     .build(r"(abc)+?")?;
1300
    /// let mut cache = re.create_cache();
1301
    /// let mut caps = re.create_captures();
1302
    ///
1303
    /// re.captures(&mut cache, "abcabc", &mut caps);
1304
    /// // Normally, the non-greedy repetition would give us a 0..3 match.
1305
    /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
1306
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1307
    /// ```
1308
    #[inline]
1309
0
    pub fn config() -> Config {
1310
0
        Config::new()
1311
0
    }
1312
1313
    /// Return a builder for configuring the construction of a DFA.
1314
    ///
1315
    /// This is a convenience routine to avoid needing to import the
1316
    /// [`Builder`] type in common cases.
1317
    ///
1318
    /// # Example
1319
    ///
1320
    /// This example shows how to use the builder to disable UTF-8 mode.
1321
    ///
1322
    /// ```
1323
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1324
    /// use regex_automata::{
1325
    ///     dfa::onepass::DFA,
1326
    ///     nfa::thompson,
1327
    ///     util::syntax,
1328
    ///     Match,
1329
    /// };
1330
    ///
1331
    /// let re = DFA::builder()
1332
    ///     .syntax(syntax::Config::new().utf8(false))
1333
    ///     .thompson(thompson::Config::new().utf8(false))
1334
    ///     .build(r"foo(?-u:[^b])ar.*")?;
1335
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1336
    ///
1337
    /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
1338
    /// let expected = Some(Match::must(0, 0..8));
1339
    /// re.captures(&mut cache, haystack, &mut caps);
1340
    /// assert_eq!(expected, caps.get_match());
1341
    ///
1342
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1343
    /// ```
1344
    #[inline]
1345
0
    pub fn builder() -> Builder {
1346
0
        Builder::new()
1347
0
    }
1348
1349
    /// Create a new empty set of capturing groups that is guaranteed to be
1350
    /// valid for the search APIs on this DFA.
1351
    ///
1352
    /// A `Captures` value created for a specific DFA cannot be used with any
1353
    /// other DFA.
1354
    ///
1355
    /// This is a convenience function for [`Captures::all`]. See the
1356
    /// [`Captures`] documentation for an explanation of its alternative
1357
    /// constructors that permit the DFA to do less work during a search, and
1358
    /// thus might make it faster.
1359
    #[inline]
1360
0
    pub fn create_captures(&self) -> Captures {
1361
0
        Captures::all(self.nfa.group_info().clone())
1362
0
    }
1363
1364
    /// Create a new cache for this DFA.
1365
    ///
1366
    /// The cache returned should only be used for searches for this
1367
    /// DFA. If you want to reuse the cache for another DFA, then you
1368
    /// must call [`Cache::reset`] with that DFA (or, equivalently,
1369
    /// [`DFA::reset_cache`]).
1370
    #[inline]
1371
0
    pub fn create_cache(&self) -> Cache {
1372
0
        Cache::new(self)
1373
0
    }
1374
1375
    /// Reset the given cache such that it can be used for searching with the
1376
    /// this DFA (and only this DFA).
1377
    ///
1378
    /// A cache reset permits reusing memory already allocated in this cache
1379
    /// with a different DFA.
1380
    ///
1381
    /// # Example
1382
    ///
1383
    /// This shows how to re-purpose a cache for use with a different DFA.
1384
    ///
1385
    /// ```
1386
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1387
    /// use regex_automata::{dfa::onepass::DFA, Match};
1388
    ///
1389
    /// let re1 = DFA::new(r"\w")?;
1390
    /// let re2 = DFA::new(r"\W")?;
1391
    /// let mut caps1 = re1.create_captures();
1392
    /// let mut caps2 = re2.create_captures();
1393
    ///
1394
    /// let mut cache = re1.create_cache();
1395
    /// assert_eq!(
1396
    ///     Some(Match::must(0, 0..2)),
1397
    ///     { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
1398
    /// );
1399
    ///
1400
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
1401
    /// // incorrect results. In order to re-purpose the cache, we must reset
1402
    /// // it with the one-pass DFA we'd like to use it with.
1403
    /// //
1404
    /// // Similarly, after this reset, using the cache with 're1' is also not
1405
    /// // allowed.
1406
    /// re2.reset_cache(&mut cache);
1407
    /// assert_eq!(
1408
    ///     Some(Match::must(0, 0..3)),
1409
    ///     { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
1410
    /// );
1411
    ///
1412
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1413
    /// ```
1414
    #[inline]
1415
0
    pub fn reset_cache(&self, cache: &mut Cache) {
1416
0
        cache.reset(self);
1417
0
    }
1418
1419
    /// Return the config for this one-pass DFA.
1420
    #[inline]
1421
0
    pub fn get_config(&self) -> &Config {
1422
0
        &self.config
1423
0
    }
1424
1425
    /// Returns a reference to the underlying NFA.
1426
    #[inline]
1427
0
    pub fn get_nfa(&self) -> &NFA {
1428
0
        &self.nfa
1429
0
    }
1430
1431
    /// Returns the total number of patterns compiled into this DFA.
1432
    ///
1433
    /// In the case of a DFA that contains no patterns, this returns `0`.
1434
    #[inline]
1435
0
    pub fn pattern_len(&self) -> usize {
1436
0
        self.get_nfa().pattern_len()
1437
0
    }
1438
1439
    /// Returns the total number of states in this one-pass DFA.
1440
    ///
1441
    /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1442
    /// a low level DFA API. Therefore, this routine has little use other than
1443
    /// being informational.
1444
    #[inline]
1445
0
    pub fn state_len(&self) -> usize {
1446
0
        self.table.len() >> self.stride2()
1447
0
    }
1448
1449
    /// Returns the total number of elements in the alphabet for this DFA.
1450
    ///
1451
    /// That is, this returns the total number of transitions that each
1452
    /// state in this DFA must have. The maximum alphabet size is 256, which
1453
    /// corresponds to each possible byte value.
1454
    ///
1455
    /// The alphabet size may be less than 256 though, and unless
1456
    /// [`Config::byte_classes`] is disabled, it is typically must less than
1457
    /// 256. Namely, bytes are grouped into equivalence classes such that no
1458
    /// two bytes in the same class can distinguish a match from a non-match.
1459
    /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could
1460
    /// all be in the same equivalence class. This leads to a massive space
1461
    /// savings.
1462
    ///
1463
    /// Note though that the alphabet length does _not_ necessarily equal the
1464
    /// total stride space taken up by a single DFA state in the transition
1465
    /// table. Namely, for performance reasons, the stride is always the
1466
    /// smallest power of two that is greater than or equal to the alphabet
1467
    /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
1468
    /// often more useful. The alphabet length is typically useful only for
1469
    /// informational purposes.
1470
    ///
1471
    /// Note also that unlike dense or sparse DFAs, a one-pass DFA does
1472
    /// not have a special end-of-input (EOI) transition. This is because
1473
    /// a one-pass DFA handles look-around assertions explicitly (like the
1474
    /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build
1475
    /// them into the transitions of the DFA.
1476
    #[inline]
1477
0
    pub fn alphabet_len(&self) -> usize {
1478
0
        self.alphabet_len
1479
0
    }
1480
1481
    /// Returns the total stride for every state in this DFA, expressed as the
1482
    /// exponent of a power of 2. The stride is the amount of space each state
1483
    /// takes up in the transition table, expressed as a number of transitions.
1484
    /// (Unused transitions map to dead states.)
1485
    ///
1486
    /// The stride of a DFA is always equivalent to the smallest power of
1487
    /// 2 that is greater than or equal to the DFA's alphabet length. This
1488
    /// definition uses extra space, but possibly permits faster translation
1489
    /// between state identifiers and their corresponding offsets in this DFA's
1490
    /// transition table.
1491
    ///
1492
    /// For example, if the DFA's stride is 16 transitions, then its `stride2`
1493
    /// is `4` since `2^4 = 16`.
1494
    ///
1495
    /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
1496
    /// while the maximum `stride2` value is `9` (corresponding to a stride
1497
    /// of `512`). The maximum in theory should be `8`, but because of some
1498
    /// implementation quirks that may be relaxed in the future, it is one more
1499
    /// than `8`. (Do note that a maximal stride is incredibly rare, as it
1500
    /// would imply that there is almost no redundant in the regex pattern.)
1501
    ///
1502
    /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1503
    /// a low level DFA API. Therefore, this routine has little use other than
1504
    /// being informational.
1505
    #[inline]
1506
0
    pub fn stride2(&self) -> usize {
1507
0
        self.stride2
1508
0
    }
1509
1510
    /// Returns the total stride for every state in this DFA. This corresponds
1511
    /// to the total number of transitions used by each state in this DFA's
1512
    /// transition table.
1513
    ///
1514
    /// Please see [`DFA::stride2`] for more information. In particular, this
1515
    /// returns the stride as the number of transitions, where as `stride2`
1516
    /// returns it as the exponent of a power of 2.
1517
    ///
1518
    /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1519
    /// a low level DFA API. Therefore, this routine has little use other than
1520
    /// being informational.
1521
    #[inline]
1522
0
    pub fn stride(&self) -> usize {
1523
0
        1 << self.stride2()
1524
0
    }
1525
1526
    /// Returns the memory usage, in bytes, of this DFA.
1527
    ///
1528
    /// The memory usage is computed based on the number of bytes used to
1529
    /// represent this DFA.
1530
    ///
1531
    /// This does **not** include the stack size used up by this DFA. To
1532
    /// compute that, use `std::mem::size_of::<onepass::DFA>()`.
1533
    #[inline]
1534
0
    pub fn memory_usage(&self) -> usize {
1535
        use core::mem::size_of;
1536
1537
0
        self.table.len() * size_of::<Transition>()
1538
0
            + self.starts.len() * size_of::<StateID>()
1539
0
    }
1540
}
1541
1542
impl DFA {
1543
    /// Executes an anchored leftmost forward search, and returns true if and
1544
    /// only if this one-pass DFA matches the given haystack.
1545
    ///
1546
    /// This routine may short circuit if it knows that scanning future
1547
    /// input will never lead to a different result. In particular, if the
1548
    /// underlying DFA enters a match state, then this routine will return
1549
    /// `true` immediately without inspecting any future input. (Consider how
1550
    /// this might make a difference given the regex `a+` on the haystack
1551
    /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
1552
    /// but routines like `find` need to continue searching because `+` is
1553
    /// greedy by default.)
1554
    ///
1555
    /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1556
    /// given configuration was [`Anchored::No`] (which is the default).
1557
    ///
1558
    /// # Panics
1559
    ///
1560
    /// This routine panics if the search could not complete. This can occur
1561
    /// in the following circumstances:
1562
    ///
1563
    /// * When the provided `Input` configuration is not supported. For
1564
    /// example, by providing an unsupported anchor mode. Concretely,
1565
    /// this occurs when using [`Anchored::Pattern`] without enabling
1566
    /// [`Config::starts_for_each_pattern`].
1567
    ///
1568
    /// When a search panics, callers cannot know whether a match exists or
1569
    /// not.
1570
    ///
1571
    /// Use [`DFA::try_search`] if you want to handle these panics as error
1572
    /// values instead.
1573
    ///
1574
    /// # Example
1575
    ///
1576
    /// This shows basic usage:
1577
    ///
1578
    /// ```
1579
    /// use regex_automata::dfa::onepass::DFA;
1580
    ///
1581
    /// let re = DFA::new("foo[0-9]+bar")?;
1582
    /// let mut cache = re.create_cache();
1583
    ///
1584
    /// assert!(re.is_match(&mut cache, "foo12345bar"));
1585
    /// assert!(!re.is_match(&mut cache, "foobar"));
1586
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1587
    /// ```
1588
    ///
1589
    /// # Example: consistency with search APIs
1590
    ///
1591
    /// `is_match` is guaranteed to return `true` whenever `captures` returns
1592
    /// a match. This includes searches that are executed entirely within a
1593
    /// codepoint:
1594
    ///
1595
    /// ```
1596
    /// use regex_automata::{dfa::onepass::DFA, Input};
1597
    ///
1598
    /// let re = DFA::new("a*")?;
1599
    /// let mut cache = re.create_cache();
1600
    ///
1601
    /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
1602
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1603
    /// ```
1604
    ///
1605
    /// Notice that when UTF-8 mode is disabled, then the above reports a
1606
    /// match because the restriction against zero-width matches that split a
1607
    /// codepoint has been lifted:
1608
    ///
1609
    /// ```
1610
    /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input};
1611
    ///
1612
    /// let re = DFA::builder()
1613
    ///     .thompson(NFA::config().utf8(false))
1614
    ///     .build("a*")?;
1615
    /// let mut cache = re.create_cache();
1616
    ///
1617
    /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
1618
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1619
    /// ```
1620
    #[inline]
1621
0
    pub fn is_match<'h, I: Into<Input<'h>>>(
1622
0
        &self,
1623
0
        cache: &mut Cache,
1624
0
        input: I,
1625
0
    ) -> bool {
1626
0
        let mut input = input.into().earliest(true);
1627
0
        if matches!(input.get_anchored(), Anchored::No) {
1628
0
            input.set_anchored(Anchored::Yes);
1629
0
        }
1630
0
        self.try_search_slots(cache, &input, &mut []).unwrap().is_some()
1631
0
    }
1632
1633
    /// Executes an anchored leftmost forward search, and returns a `Match` if
1634
    /// and only if this one-pass DFA matches the given haystack.
1635
    ///
1636
    /// This routine only includes the overall match span. To get access to the
1637
    /// individual spans of each capturing group, use [`DFA::captures`].
1638
    ///
1639
    /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1640
    /// given configuration was [`Anchored::No`] (which is the default).
1641
    ///
1642
    /// # Panics
1643
    ///
1644
    /// This routine panics if the search could not complete. This can occur
1645
    /// in the following circumstances:
1646
    ///
1647
    /// * When the provided `Input` configuration is not supported. For
1648
    /// example, by providing an unsupported anchor mode. Concretely,
1649
    /// this occurs when using [`Anchored::Pattern`] without enabling
1650
    /// [`Config::starts_for_each_pattern`].
1651
    ///
1652
    /// When a search panics, callers cannot know whether a match exists or
1653
    /// not.
1654
    ///
1655
    /// Use [`DFA::try_search`] if you want to handle these panics as error
1656
    /// values instead.
1657
    ///
1658
    /// # Example
1659
    ///
1660
    /// Leftmost first match semantics corresponds to the match with the
1661
    /// smallest starting offset, but where the end offset is determined by
1662
    /// preferring earlier branches in the original regular expression. For
1663
    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
1664
    /// will match `Samwise` in `Samwise`.
1665
    ///
1666
    /// Generally speaking, the "leftmost first" match is how most backtracking
1667
    /// regular expressions tend to work. This is in contrast to POSIX-style
1668
    /// regular expressions that yield "leftmost longest" matches. Namely,
1669
    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
1670
    /// leftmost longest semantics. (This crate does not currently support
1671
    /// leftmost longest semantics.)
1672
    ///
1673
    /// ```
1674
    /// use regex_automata::{dfa::onepass::DFA, Match};
1675
    ///
1676
    /// let re = DFA::new("foo[0-9]+")?;
1677
    /// let mut cache = re.create_cache();
1678
    /// let expected = Match::must(0, 0..8);
1679
    /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
1680
    ///
1681
    /// // Even though a match is found after reading the first byte (`a`),
1682
    /// // the leftmost first match semantics demand that we find the earliest
1683
    /// // match that prefers earlier parts of the pattern over later parts.
1684
    /// let re = DFA::new("abc|a")?;
1685
    /// let mut cache = re.create_cache();
1686
    /// let expected = Match::must(0, 0..3);
1687
    /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
1688
    ///
1689
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1690
    /// ```
1691
    #[inline]
1692
0
    pub fn find<'h, I: Into<Input<'h>>>(
1693
0
        &self,
1694
0
        cache: &mut Cache,
1695
0
        input: I,
1696
0
    ) -> Option<Match> {
1697
0
        let mut input = input.into();
1698
0
        if matches!(input.get_anchored(), Anchored::No) {
1699
0
            input.set_anchored(Anchored::Yes);
1700
0
        }
1701
0
        if self.get_nfa().pattern_len() == 1 {
1702
0
            let mut slots = [None, None];
1703
0
            let pid =
1704
0
                self.try_search_slots(cache, &input, &mut slots).unwrap()?;
1705
0
            let start = slots[0].unwrap().get();
1706
0
            let end = slots[1].unwrap().get();
1707
0
            return Some(Match::new(pid, Span { start, end }));
1708
0
        }
1709
0
        let ginfo = self.get_nfa().group_info();
1710
0
        let slots_len = ginfo.implicit_slot_len();
1711
0
        let mut slots = vec![None; slots_len];
1712
0
        let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?;
1713
0
        let start = slots[pid.as_usize() * 2].unwrap().get();
1714
0
        let end = slots[pid.as_usize() * 2 + 1].unwrap().get();
1715
0
        Some(Match::new(pid, Span { start, end }))
1716
0
    }
1717
1718
    /// Executes an anchored leftmost forward search and writes the spans
1719
    /// of capturing groups that participated in a match into the provided
1720
    /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
1721
    /// is guaranteed to return `false`.
1722
    ///
1723
    /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1724
    /// given configuration was [`Anchored::No`] (which is the default).
1725
    ///
1726
    /// # Panics
1727
    ///
1728
    /// This routine panics if the search could not complete. This can occur
1729
    /// in the following circumstances:
1730
    ///
1731
    /// * When the provided `Input` configuration is not supported. For
1732
    /// example, by providing an unsupported anchor mode. Concretely,
1733
    /// this occurs when using [`Anchored::Pattern`] without enabling
1734
    /// [`Config::starts_for_each_pattern`].
1735
    ///
1736
    /// When a search panics, callers cannot know whether a match exists or
1737
    /// not.
1738
    ///
1739
    /// Use [`DFA::try_search`] if you want to handle these panics as error
1740
    /// values instead.
1741
    ///
1742
    /// # Example
1743
    ///
1744
    /// This shows a simple example of a one-pass regex that extracts
1745
    /// capturing group spans.
1746
    ///
1747
    /// ```
1748
    /// use regex_automata::{dfa::onepass::DFA, Match, Span};
1749
    ///
1750
    /// let re = DFA::new(
1751
    ///     // Notice that we use ASCII here. The corresponding Unicode regex
1752
    ///     // is sadly not one-pass.
1753
    ///     "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)",
1754
    /// )?;
1755
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1756
    ///
1757
    /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
1758
    /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
1759
    /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
1760
    /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
1761
    ///
1762
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1763
    /// ```
1764
    #[inline]
1765
0
    pub fn captures<'h, I: Into<Input<'h>>>(
1766
0
        &self,
1767
0
        cache: &mut Cache,
1768
0
        input: I,
1769
0
        caps: &mut Captures,
1770
0
    ) {
1771
0
        let mut input = input.into();
1772
0
        if matches!(input.get_anchored(), Anchored::No) {
1773
0
            input.set_anchored(Anchored::Yes);
1774
0
        }
1775
0
        self.try_search(cache, &input, caps).unwrap();
1776
0
    }
1777
1778
    /// Executes an anchored leftmost forward search and writes the spans
1779
    /// of capturing groups that participated in a match into the provided
1780
    /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
1781
    /// is guaranteed to return `false`.
1782
    ///
1783
    /// The differences with [`DFA::captures`] are:
1784
    ///
1785
    /// 1. This returns an error instead of panicking if the search fails.
1786
    /// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing
1787
    /// the same input for multiple searches, which _may_ be important for
1788
    /// latency.
1789
    /// 3. This does not automatically change the [`Anchored`] mode from `No`
1790
    /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an
1791
    /// error is returned.
1792
    ///
1793
    /// # Errors
1794
    ///
1795
    /// This routine errors if the search could not complete. This can occur
1796
    /// in the following circumstances:
1797
    ///
1798
    /// * When the provided `Input` configuration is not supported. For
1799
    /// example, by providing an unsupported anchor mode. Concretely,
1800
    /// this occurs when using [`Anchored::Pattern`] without enabling
1801
    /// [`Config::starts_for_each_pattern`].
1802
    ///
1803
    /// When a search returns an error, callers cannot know whether a match
1804
    /// exists or not.
1805
    ///
1806
    /// # Example: specific pattern search
1807
    ///
1808
    /// This example shows how to build a multi-regex that permits searching
1809
    /// for specific patterns. Note that this is somewhat less useful than
1810
    /// in other regex engines, since a one-pass DFA by definition has no
1811
    /// ambiguity about which pattern can match at a position. That is, if it
1812
    /// were possible for two different patterns to match at the same starting
1813
    /// position, then the multi-regex would not be one-pass and construction
1814
    /// would have failed.
1815
    ///
1816
    /// Nevertheless, this can still be useful if you only care about matches
1817
    /// for a specific pattern, and want the DFA to report "no match" even if
1818
    /// some other pattern would have matched.
1819
    ///
1820
    /// Note that in order to make use of this functionality,
1821
    /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled
1822
    /// by default since it may result in higher memory usage.
1823
    ///
1824
    /// ```
1825
    /// use regex_automata::{
1826
    ///     dfa::onepass::DFA, Anchored, Input, Match, PatternID,
1827
    /// };
1828
    ///
1829
    /// let re = DFA::builder()
1830
    ///     .configure(DFA::config().starts_for_each_pattern(true))
1831
    ///     .build_many(&["[a-z]+", "[0-9]+"])?;
1832
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1833
    /// let haystack = "123abc";
1834
    /// let input = Input::new(haystack).anchored(Anchored::Yes);
1835
    ///
1836
    /// // A normal multi-pattern search will show pattern 1 matches.
1837
    /// re.try_search(&mut cache, &input, &mut caps)?;
1838
    /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
1839
    ///
1840
    /// // If we only want to report pattern 0 matches, then we'll get no
1841
    /// // match here.
1842
    /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
1843
    /// re.try_search(&mut cache, &input, &mut caps)?;
1844
    /// assert_eq!(None, caps.get_match());
1845
    ///
1846
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1847
    /// ```
1848
    ///
1849
    /// # Example: specifying the bounds of a search
1850
    ///
1851
    /// This example shows how providing the bounds of a search can produce
1852
    /// different results than simply sub-slicing the haystack.
1853
    ///
1854
    /// ```
1855
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1856
    /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match};
1857
    ///
1858
    /// // one-pass DFAs fully support Unicode word boundaries!
1859
    /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass.
1860
    /// // :-(
1861
    /// let re = DFA::new(r"\b[0-9]{3}\b")?;
1862
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1863
    /// let haystack = "foo123bar";
1864
    ///
1865
    /// // Since we sub-slice the haystack, the search doesn't know about
1866
    /// // the larger context and assumes that `123` is surrounded by word
1867
    /// // boundaries. And of course, the match position is reported relative
1868
    /// // to the sub-slice as well, which means we get `0..3` instead of
1869
    /// // `3..6`.
1870
    /// let expected = Some(Match::must(0, 0..3));
1871
    /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes);
1872
    /// re.try_search(&mut cache, &input, &mut caps)?;
1873
    /// assert_eq!(expected, caps.get_match());
1874
    ///
1875
    /// // But if we provide the bounds of the search within the context of the
1876
    /// // entire haystack, then the search can take the surrounding context
1877
    /// // into account. (And if we did find a match, it would be reported
1878
    /// // as a valid offset into `haystack` instead of its sub-slice.)
1879
    /// let expected = None;
1880
    /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes);
1881
    /// re.try_search(&mut cache, &input, &mut caps)?;
1882
    /// assert_eq!(expected, caps.get_match());
1883
    ///
1884
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1885
    /// ```
1886
    #[inline]
1887
0
    pub fn try_search(
1888
0
        &self,
1889
0
        cache: &mut Cache,
1890
0
        input: &Input<'_>,
1891
0
        caps: &mut Captures,
1892
0
    ) -> Result<(), MatchError> {
1893
0
        let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1894
0
        caps.set_pattern(pid);
1895
0
        Ok(())
1896
0
    }
1897
1898
    /// Executes an anchored leftmost forward search and writes the spans
1899
    /// of capturing groups that participated in a match into the provided
1900
    /// `slots`, and returns the matching pattern ID. The contents of the
1901
    /// slots for patterns other than the matching pattern are unspecified. If
1902
    /// no match was found, then `None` is returned and the contents of all
1903
    /// `slots` is unspecified.
1904
    ///
1905
    /// This is like [`DFA::try_search`], but it accepts a raw slots slice
1906
    /// instead of a `Captures` value. This is useful in contexts where you
1907
    /// don't want or need to allocate a `Captures`.
1908
    ///
1909
    /// It is legal to pass _any_ number of slots to this routine. If the regex
1910
    /// engine would otherwise write a slot offset that doesn't fit in the
1911
    /// provided slice, then it is simply skipped. In general though, there are
1912
    /// usually three slice lengths you might want to use:
1913
    ///
1914
    /// * An empty slice, if you only care about which pattern matched.
1915
    /// * A slice with
1916
    /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len)
1917
    /// slots, if you only care about the overall match spans for each matching
1918
    /// pattern.
1919
    /// * A slice with
1920
    /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1921
    /// permits recording match offsets for every capturing group in every
1922
    /// pattern.
1923
    ///
1924
    /// # Errors
1925
    ///
1926
    /// This routine errors if the search could not complete. This can occur
1927
    /// in the following circumstances:
1928
    ///
1929
    /// * When the provided `Input` configuration is not supported. For
1930
    /// example, by providing an unsupported anchor mode. Concretely,
1931
    /// this occurs when using [`Anchored::Pattern`] without enabling
1932
    /// [`Config::starts_for_each_pattern`].
1933
    ///
1934
    /// When a search returns an error, callers cannot know whether a match
1935
    /// exists or not.
1936
    ///
1937
    /// # Example
1938
    ///
1939
    /// This example shows how to find the overall match offsets in a
1940
    /// multi-pattern search without allocating a `Captures` value. Indeed, we
1941
    /// can put our slots right on the stack.
1942
    ///
1943
    /// ```
1944
    /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID};
1945
    ///
1946
    /// let re = DFA::new_many(&[
1947
    ///     r"[a-zA-Z]+",
1948
    ///     r"[0-9]+",
1949
    /// ])?;
1950
    /// let mut cache = re.create_cache();
1951
    /// let input = Input::new("123").anchored(Anchored::Yes);
1952
    ///
1953
    /// // We only care about the overall match offsets here, so we just
1954
    /// // allocate two slots for each pattern. Each slot records the start
1955
    /// // and end of the match.
1956
    /// let mut slots = [None; 4];
1957
    /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1958
    /// assert_eq!(Some(PatternID::must(1)), pid);
1959
    ///
1960
    /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1961
    /// // See 'GroupInfo' for more details on the mapping between groups and
1962
    /// // slot indices.
1963
    /// let slot_start = pid.unwrap().as_usize() * 2;
1964
    /// let slot_end = slot_start + 1;
1965
    /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get()));
1966
    /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get()));
1967
    ///
1968
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1969
    /// ```
1970
    #[inline]
1971
0
    pub fn try_search_slots(
1972
0
        &self,
1973
0
        cache: &mut Cache,
1974
0
        input: &Input<'_>,
1975
0
        slots: &mut [Option<NonMaxUsize>],
1976
0
    ) -> Result<Option<PatternID>, MatchError> {
1977
0
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1978
0
        if !utf8empty {
1979
0
            return self.try_search_slots_imp(cache, input, slots);
1980
0
        }
1981
0
        // See PikeVM::try_search_slots for why we do this.
1982
0
        let min = self.get_nfa().group_info().implicit_slot_len();
1983
0
        if slots.len() >= min {
1984
0
            return self.try_search_slots_imp(cache, input, slots);
1985
0
        }
1986
0
        if self.get_nfa().pattern_len() == 1 {
1987
0
            let mut enough = [None, None];
1988
0
            let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1989
            // This is OK because we know `enough_slots` is strictly bigger
1990
            // than `slots`, otherwise this special case isn't reached.
1991
0
            slots.copy_from_slice(&enough[..slots.len()]);
1992
0
            return Ok(got);
1993
0
        }
1994
0
        let mut enough = vec![None; min];
1995
0
        let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1996
        // This is OK because we know `enough_slots` is strictly bigger than
1997
        // `slots`, otherwise this special case isn't reached.
1998
0
        slots.copy_from_slice(&enough[..slots.len()]);
1999
0
        Ok(got)
2000
0
    }
2001
2002
    #[inline(never)]
2003
0
    fn try_search_slots_imp(
2004
0
        &self,
2005
0
        cache: &mut Cache,
2006
0
        input: &Input<'_>,
2007
0
        slots: &mut [Option<NonMaxUsize>],
2008
0
    ) -> Result<Option<PatternID>, MatchError> {
2009
0
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
2010
0
        match self.search_imp(cache, input, slots)? {
2011
0
            None => return Ok(None),
2012
0
            Some(pid) if !utf8empty => return Ok(Some(pid)),
2013
0
            Some(pid) => {
2014
0
                // These slot indices are always correct because we know our
2015
0
                // 'pid' is valid and thus we know that the slot indices for it
2016
0
                // are valid.
2017
0
                let slot_start = pid.as_usize().wrapping_mul(2);
2018
0
                let slot_end = slot_start.wrapping_add(1);
2019
0
                // OK because we know we have a match and we know our caller
2020
0
                // provided slots are big enough (which we make true above if
2021
0
                // the caller didn't). Namely, we're only here when 'utf8empty'
2022
0
                // is true, and when that's true, we require slots for every
2023
0
                // pattern.
2024
0
                let start = slots[slot_start].unwrap().get();
2025
0
                let end = slots[slot_end].unwrap().get();
2026
0
                // If our match splits a codepoint, then we cannot report is
2027
0
                // as a match. And since one-pass DFAs only support anchored
2028
0
                // searches, we don't try to skip ahead to find the next match.
2029
0
                // We can just quit with nothing.
2030
0
                if start == end && !input.is_char_boundary(start) {
2031
0
                    return Ok(None);
2032
0
                }
2033
0
                Ok(Some(pid))
2034
            }
2035
        }
2036
0
    }
2037
}
2038
2039
impl DFA {
2040
0
    fn search_imp(
2041
0
        &self,
2042
0
        cache: &mut Cache,
2043
0
        input: &Input<'_>,
2044
0
        slots: &mut [Option<NonMaxUsize>],
2045
0
    ) -> Result<Option<PatternID>, MatchError> {
2046
0
        // PERF: Some ideas. I ran out of steam after my initial impl to try
2047
0
        // many of these.
2048
0
        //
2049
0
        // 1) Try doing more state shuffling. Right now, all we do is push
2050
0
        // match states to the end of the transition table so that we can do
2051
0
        // 'if sid >= self.min_match_id' to know whether we're in a match
2052
0
        // state or not. But what about doing something like dense DFAs and
2053
0
        // pushing dead, match and states with captures/looks all toward the
2054
0
        // beginning of the transition table. Then we could do 'if sid <=
2055
0
        // self.max_special_id', in which case, we need to do some special
2056
0
        // handling of some sort. Otherwise, we get the happy path, just
2057
0
        // like in a DFA search. The main argument against this is that the
2058
0
        // one-pass DFA is likely to be used most often with capturing groups
2059
0
        // and if capturing groups are common, then this might wind up being a
2060
0
        // pessimization.
2061
0
        //
2062
0
        // 2) Consider moving 'PatternEpsilons' out of the transition table.
2063
0
        // It is only needed for match states and usually a small minority of
2064
0
        // states are match states. Therefore, we're using an extra 'u64' for
2065
0
        // most states.
2066
0
        //
2067
0
        // 3) I played around with the match state handling and it seems like
2068
0
        // there is probably a lot left on the table for improvement. The
2069
0
        // key tension is that the 'find_match' routine is a giant mess, but
2070
0
        // splitting it out into a non-inlineable function is a non-starter
2071
0
        // because the match state might consume input, so 'find_match' COULD
2072
0
        // be called quite a lot, and a function call at that point would trash
2073
0
        // perf. In theory, we could detect whether a match state consumes
2074
0
        // input and then specialize our search routine based on that. In that
2075
0
        // case, maybe an extra function call is OK, but even then, it might be
2076
0
        // too much of a latency hit. Another idea is to just try and figure
2077
0
        // out how to reduce the code size of 'find_match'. RE2 has a trick
2078
0
        // here where the match handling isn't done if we know the next byte of
2079
0
        // input yields a match too. Maybe we adopt that?
2080
0
        //
2081
0
        // This just might be a tricky DFA to optimize.
2082
0
2083
0
        if input.is_done() {
2084
0
            return Ok(None);
2085
0
        }
2086
0
        // We unfortunately have a bit of book-keeping to do to set things
2087
0
        // up. We do have to setup our cache and clear all of our slots. In
2088
0
        // particular, clearing the slots is necessary for the case where we
2089
0
        // report a match, but one of the capturing groups didn't participate
2090
0
        // in the match but had a span set from a previous search. That would
2091
0
        // be bad. In theory, we could avoid all this slot clearing if we knew
2092
0
        // that every slot was always activated for every match. Then we would
2093
0
        // know they would always be overwritten when a match is found.
2094
0
        let explicit_slots_len = core::cmp::min(
2095
0
            Slots::LIMIT,
2096
0
            slots.len().saturating_sub(self.explicit_slot_start),
2097
0
        );
2098
0
        cache.setup_search(explicit_slots_len);
2099
0
        for slot in cache.explicit_slots() {
2100
0
            *slot = None;
2101
0
        }
2102
0
        for slot in slots.iter_mut() {
2103
0
            *slot = None;
2104
0
        }
2105
        // We set the starting slots for every pattern up front. This does
2106
        // increase our latency somewhat, but it avoids having to do it every
2107
        // time we see a match state (which could be many times in a single
2108
        // search if the match state consumes input).
2109
0
        for pid in self.nfa.patterns() {
2110
0
            let i = pid.as_usize() * 2;
2111
0
            if i >= slots.len() {
2112
0
                break;
2113
0
            }
2114
0
            slots[i] = NonMaxUsize::new(input.start());
2115
        }
2116
0
        let mut pid = None;
2117
0
        let mut next_sid = match input.get_anchored() {
2118
0
            Anchored::Yes => self.start(),
2119
0
            Anchored::Pattern(pid) => self.start_pattern(pid)?,
2120
            Anchored::No => {
2121
                // If the regex is itself always anchored, then we're fine,
2122
                // even if the search is configured to be unanchored.
2123
0
                if !self.nfa.is_always_start_anchored() {
2124
0
                    return Err(MatchError::unsupported_anchored(
2125
0
                        Anchored::No,
2126
0
                    ));
2127
0
                }
2128
0
                self.start()
2129
            }
2130
        };
2131
0
        let leftmost_first =
2132
0
            matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst);
2133
0
        for at in input.start()..input.end() {
2134
0
            let sid = next_sid;
2135
0
            let trans = self.transition(sid, input.haystack()[at]);
2136
0
            next_sid = trans.state_id();
2137
0
            let epsilons = trans.epsilons();
2138
0
            if sid >= self.min_match_id {
2139
0
                if self.find_match(cache, input, at, sid, slots, &mut pid) {
2140
0
                    if input.get_earliest()
2141
0
                        || (leftmost_first && trans.match_wins())
2142
                    {
2143
0
                        return Ok(pid);
2144
0
                    }
2145
0
                }
2146
0
            }
2147
0
            if sid == DEAD
2148
0
                || (!epsilons.looks().is_empty()
2149
0
                    && !self.nfa.look_matcher().matches_set_inline(
2150
0
                        epsilons.looks(),
2151
0
                        input.haystack(),
2152
0
                        at,
2153
0
                    ))
2154
            {
2155
0
                return Ok(pid);
2156
0
            }
2157
0
            epsilons.slots().apply(at, cache.explicit_slots());
2158
        }
2159
0
        if next_sid >= self.min_match_id {
2160
0
            self.find_match(
2161
0
                cache,
2162
0
                input,
2163
0
                input.end(),
2164
0
                next_sid,
2165
0
                slots,
2166
0
                &mut pid,
2167
0
            );
2168
0
        }
2169
0
        Ok(pid)
2170
0
    }
2171
2172
    /// Assumes 'sid' is a match state and looks for whether a match can
2173
    /// be reported. If so, appropriate offsets are written to 'slots' and
2174
    /// 'matched_pid' is set to the matching pattern ID.
2175
    ///
2176
    /// Even when 'sid' is a match state, it's possible that a match won't
2177
    /// be reported. For example, when the conditional epsilon transitions
2178
    /// leading to the match state aren't satisfied at the given position in
2179
    /// the haystack.
2180
    #[cfg_attr(feature = "perf-inline", inline(always))]
2181
0
    fn find_match(
2182
0
        &self,
2183
0
        cache: &mut Cache,
2184
0
        input: &Input<'_>,
2185
0
        at: usize,
2186
0
        sid: StateID,
2187
0
        slots: &mut [Option<NonMaxUsize>],
2188
0
        matched_pid: &mut Option<PatternID>,
2189
0
    ) -> bool {
2190
0
        debug_assert!(sid >= self.min_match_id);
2191
0
        let pateps = self.pattern_epsilons(sid);
2192
0
        let epsilons = pateps.epsilons();
2193
0
        if !epsilons.looks().is_empty()
2194
0
            && !self.nfa.look_matcher().matches_set_inline(
2195
0
                epsilons.looks(),
2196
0
                input.haystack(),
2197
0
                at,
2198
0
            )
2199
        {
2200
0
            return false;
2201
0
        }
2202
0
        let pid = pateps.pattern_id_unchecked();
2203
0
        // This calculation is always correct because we know our 'pid' is
2204
0
        // valid and thus we know that the slot indices for it are valid.
2205
0
        let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1);
2206
0
        // Set the implicit 'end' slot for the matching pattern. (The 'start'
2207
0
        // slot was set at the beginning of the search.)
2208
0
        if slot_end < slots.len() {
2209
0
            slots[slot_end] = NonMaxUsize::new(at);
2210
0
        }
2211
        // If the caller provided enough room, copy the previously recorded
2212
        // explicit slots from our scratch space to the caller provided slots.
2213
        // We *also* need to set any explicit slots that are active as part of
2214
        // the path to the match state.
2215
0
        if self.explicit_slot_start < slots.len() {
2216
0
            // NOTE: The 'cache.explicit_slots()' slice is setup at the
2217
0
            // beginning of every search such that it is guaranteed to return a
2218
0
            // slice of length equivalent to 'slots[explicit_slot_start..]'.
2219
0
            slots[self.explicit_slot_start..]
2220
0
                .copy_from_slice(cache.explicit_slots());
2221
0
            epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]);
2222
0
        }
2223
0
        *matched_pid = Some(pid);
2224
0
        true
2225
0
    }
2226
}
2227
2228
impl DFA {
2229
    /// Returns the anchored start state for matching any pattern in this DFA.
2230
0
    fn start(&self) -> StateID {
2231
0
        self.starts[0]
2232
0
    }
2233
2234
    /// Returns the anchored start state for matching the given pattern. If
2235
    /// 'starts_for_each_pattern'
2236
    /// was not enabled, then this returns an error. If the given pattern is
2237
    /// not in this DFA, then `Ok(None)` is returned.
2238
0
    fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> {
2239
0
        if !self.config.get_starts_for_each_pattern() {
2240
0
            return Err(MatchError::unsupported_anchored(Anchored::Pattern(
2241
0
                pid,
2242
0
            )));
2243
0
        }
2244
0
        // 'starts' always has non-zero length. The first entry is always the
2245
0
        // anchored starting state for all patterns, and the following entries
2246
0
        // are optional and correspond to the anchored starting states for
2247
0
        // patterns at pid+1. Thus, starts.len()-1 corresponds to the total
2248
0
        // number of patterns that one can explicitly search for. (And it may
2249
0
        // be zero.)
2250
0
        Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD))
2251
0
    }
2252
2253
    /// Returns the transition from the given state ID and byte of input. The
2254
    /// transition includes the next state ID, the slots that should be saved
2255
    /// and any conditional epsilon transitions that must be satisfied in order
2256
    /// to take this transition.
2257
0
    fn transition(&self, sid: StateID, byte: u8) -> Transition {
2258
0
        let offset = sid.as_usize() << self.stride2();
2259
0
        let class = self.classes.get(byte).as_usize();
2260
0
        self.table[offset + class]
2261
0
    }
2262
2263
    /// Set the transition from the given state ID and byte of input to the
2264
    /// transition given.
2265
0
    fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) {
2266
0
        let offset = sid.as_usize() << self.stride2();
2267
0
        let class = self.classes.get(byte).as_usize();
2268
0
        self.table[offset + class] = to;
2269
0
    }
2270
2271
    /// Return an iterator of "sparse" transitions for the given state ID.
2272
    /// "sparse" in this context means that consecutive transitions that are
2273
    /// equivalent are returned as one group, and transitions to the DEAD state
2274
    /// are ignored.
2275
    ///
2276
    /// This winds up being useful for debug printing, since it's much terser
2277
    /// to display runs of equivalent transitions than the transition for every
2278
    /// possible byte value. Indeed, in practice, it's very common for runs
2279
    /// of equivalent transitions to appear.
2280
0
    fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> {
2281
0
        let start = sid.as_usize() << self.stride2();
2282
0
        let end = start + self.alphabet_len();
2283
0
        SparseTransitionIter {
2284
0
            it: self.table[start..end].iter().enumerate(),
2285
0
            cur: None,
2286
0
        }
2287
0
    }
2288
2289
    /// Return the pattern epsilons for the given state ID.
2290
    ///
2291
    /// If the given state ID does not correspond to a match state ID, then the
2292
    /// pattern epsilons returned is empty.
2293
0
    fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons {
2294
0
        let offset = sid.as_usize() << self.stride2();
2295
0
        PatternEpsilons(self.table[offset + self.pateps_offset].0)
2296
0
    }
2297
2298
    /// Set the pattern epsilons for the given state ID.
2299
0
    fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) {
2300
0
        let offset = sid.as_usize() << self.stride2();
2301
0
        self.table[offset + self.pateps_offset] = Transition(pateps.0);
2302
0
    }
2303
2304
    /// Returns the state ID prior to the one given. This returns None if the
2305
    /// given ID is the first DFA state.
2306
0
    fn prev_state_id(&self, id: StateID) -> Option<StateID> {
2307
0
        if id == DEAD {
2308
0
            None
2309
        } else {
2310
            // CORRECTNESS: Since 'id' is not the first state, subtracting 1
2311
            // is always valid.
2312
0
            Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap()))
2313
        }
2314
0
    }
2315
2316
    /// Returns the state ID of the last state in this DFA's transition table.
2317
    /// "last" in this context means the last state to appear in memory, i.e.,
2318
    /// the one with the greatest ID.
2319
0
    fn last_state_id(&self) -> StateID {
2320
0
        // CORRECTNESS: A DFA table is always non-empty since it always at
2321
0
        // least contains a DEAD state. Since every state has the same stride,
2322
0
        // we can just compute what the "next" state ID would have been and
2323
0
        // then subtract 1 from it.
2324
0
        StateID::new_unchecked(
2325
0
            (self.table.len() >> self.stride2()).checked_sub(1).unwrap(),
2326
0
        )
2327
0
    }
2328
2329
    /// Move the transitions from 'id1' to 'id2' and vice versa.
2330
    ///
2331
    /// WARNING: This does not update the rest of the transition table to have
2332
    /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves
2333
    /// the states in memory.
2334
0
    pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) {
2335
0
        let o1 = id1.as_usize() << self.stride2();
2336
0
        let o2 = id2.as_usize() << self.stride2();
2337
0
        for b in 0..self.stride() {
2338
0
            self.table.swap(o1 + b, o2 + b);
2339
0
        }
2340
0
    }
2341
2342
    /// Map all state IDs in this DFA (transition table + start states)
2343
    /// according to the closure given.
2344
0
    pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
2345
0
        for i in 0..self.state_len() {
2346
0
            let offset = i << self.stride2();
2347
0
            for b in 0..self.alphabet_len() {
2348
0
                let next = self.table[offset + b].state_id();
2349
0
                self.table[offset + b].set_state_id(map(next));
2350
0
            }
2351
        }
2352
0
        for i in 0..self.starts.len() {
2353
0
            self.starts[i] = map(self.starts[i]);
2354
0
        }
2355
0
    }
2356
}
2357
2358
impl core::fmt::Debug for DFA {
2359
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2360
0
        fn debug_state_transitions(
2361
0
            f: &mut core::fmt::Formatter,
2362
0
            dfa: &DFA,
2363
0
            sid: StateID,
2364
0
        ) -> core::fmt::Result {
2365
0
            for (i, (start, end, trans)) in
2366
0
                dfa.sparse_transitions(sid).enumerate()
2367
            {
2368
0
                let next = trans.state_id();
2369
0
                if i > 0 {
2370
0
                    write!(f, ", ")?;
2371
0
                }
2372
0
                if start == end {
2373
0
                    write!(
2374
0
                        f,
2375
0
                        "{:?} => {:?}",
2376
0
                        DebugByte(start),
2377
0
                        next.as_usize(),
2378
0
                    )?;
2379
                } else {
2380
0
                    write!(
2381
0
                        f,
2382
0
                        "{:?}-{:?} => {:?}",
2383
0
                        DebugByte(start),
2384
0
                        DebugByte(end),
2385
0
                        next.as_usize(),
2386
0
                    )?;
2387
                }
2388
0
                if trans.match_wins() {
2389
0
                    write!(f, " (MW)")?;
2390
0
                }
2391
0
                if !trans.epsilons().is_empty() {
2392
0
                    write!(f, " ({:?})", trans.epsilons())?;
2393
0
                }
2394
            }
2395
0
            Ok(())
2396
0
        }
2397
2398
0
        writeln!(f, "onepass::DFA(")?;
2399
0
        for index in 0..self.state_len() {
2400
0
            let sid = StateID::must(index);
2401
0
            let pateps = self.pattern_epsilons(sid);
2402
0
            if sid == DEAD {
2403
0
                write!(f, "D ")?;
2404
0
            } else if pateps.pattern_id().is_some() {
2405
0
                write!(f, "* ")?;
2406
            } else {
2407
0
                write!(f, "  ")?;
2408
            }
2409
0
            write!(f, "{:06?}", sid.as_usize())?;
2410
0
            if !pateps.is_empty() {
2411
0
                write!(f, " ({:?})", pateps)?;
2412
0
            }
2413
0
            write!(f, ": ")?;
2414
0
            debug_state_transitions(f, self, sid)?;
2415
0
            write!(f, "\n")?;
2416
        }
2417
0
        writeln!(f, "")?;
2418
0
        for (i, &sid) in self.starts.iter().enumerate() {
2419
0
            if i == 0 {
2420
0
                writeln!(f, "START(ALL): {:?}", sid.as_usize())?;
2421
            } else {
2422
0
                writeln!(
2423
0
                    f,
2424
0
                    "START(pattern: {:?}): {:?}",
2425
0
                    i - 1,
2426
0
                    sid.as_usize(),
2427
0
                )?;
2428
            }
2429
        }
2430
0
        writeln!(f, "state length: {:?}", self.state_len())?;
2431
0
        writeln!(f, "pattern length: {:?}", self.pattern_len())?;
2432
0
        writeln!(f, ")")?;
2433
0
        Ok(())
2434
0
    }
2435
}
2436
2437
/// An iterator over groups of consecutive equivalent transitions in a single
2438
/// state.
2439
#[derive(Debug)]
2440
struct SparseTransitionIter<'a> {
2441
    it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>,
2442
    cur: Option<(u8, u8, Transition)>,
2443
}
2444
2445
impl<'a> Iterator for SparseTransitionIter<'a> {
2446
    type Item = (u8, u8, Transition);
2447
2448
0
    fn next(&mut self) -> Option<(u8, u8, Transition)> {
2449
0
        while let Some((b, &trans)) = self.it.next() {
2450
            // Fine because we'll never have more than u8::MAX transitions in
2451
            // one state.
2452
0
            let b = b.as_u8();
2453
0
            let (prev_start, prev_end, prev_trans) = match self.cur {
2454
0
                Some(t) => t,
2455
                None => {
2456
0
                    self.cur = Some((b, b, trans));
2457
0
                    continue;
2458
                }
2459
            };
2460
0
            if prev_trans == trans {
2461
0
                self.cur = Some((prev_start, b, prev_trans));
2462
0
            } else {
2463
0
                self.cur = Some((b, b, trans));
2464
0
                if prev_trans.state_id() != DEAD {
2465
0
                    return Some((prev_start, prev_end, prev_trans));
2466
0
                }
2467
            }
2468
        }
2469
0
        if let Some((start, end, trans)) = self.cur.take() {
2470
0
            if trans.state_id() != DEAD {
2471
0
                return Some((start, end, trans));
2472
0
            }
2473
0
        }
2474
0
        None
2475
0
    }
2476
}
2477
2478
/// A cache represents mutable state that a one-pass [`DFA`] requires during a
2479
/// search.
2480
///
2481
/// For a given one-pass DFA, its corresponding cache may be created either via
2482
/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every
2483
/// way, except the former does not require explicitly importing `Cache`.
2484
///
2485
/// A particular `Cache` is coupled with the one-pass DFA from which it was
2486
/// created. It may only be used with that one-pass DFA. A cache and its
2487
/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
2488
/// only be used with the new one-pass DFA (and not the old one).
2489
#[derive(Clone, Debug)]
2490
pub struct Cache {
2491
    /// Scratch space used to store slots during a search. Basically, we use
2492
    /// the caller provided slots to store slots known when a match occurs.
2493
    /// But after a match occurs, we might continue a search but ultimately
2494
    /// fail to extend the match. When continuing the search, we need some
2495
    /// place to store candidate capture offsets without overwriting the slot
2496
    /// offsets recorded for the most recently seen match.
2497
    explicit_slots: Vec<Option<NonMaxUsize>>,
2498
    /// The number of slots in the caller-provided 'Captures' value for the
2499
    /// current search. This is always at most 'explicit_slots.len()', but
2500
    /// might be less than it, if the caller provided fewer slots to fill.
2501
    explicit_slot_len: usize,
2502
}
2503
2504
impl Cache {
2505
    /// Create a new [`onepass::DFA`](DFA) cache.
2506
    ///
2507
    /// A potentially more convenient routine to create a cache is
2508
    /// [`DFA::create_cache`], as it does not require also importing the
2509
    /// `Cache` type.
2510
    ///
2511
    /// If you want to reuse the returned `Cache` with some other one-pass DFA,
2512
    /// then you must call [`Cache::reset`] with the desired one-pass DFA.
2513
0
    pub fn new(re: &DFA) -> Cache {
2514
0
        let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 };
2515
0
        cache.reset(re);
2516
0
        cache
2517
0
    }
2518
2519
    /// Reset this cache such that it can be used for searching with a
2520
    /// different [`onepass::DFA`](DFA).
2521
    ///
2522
    /// A cache reset permits reusing memory already allocated in this cache
2523
    /// with a different one-pass DFA.
2524
    ///
2525
    /// # Example
2526
    ///
2527
    /// This shows how to re-purpose a cache for use with a different one-pass
2528
    /// DFA.
2529
    ///
2530
    /// ```
2531
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
2532
    /// use regex_automata::{dfa::onepass::DFA, Match};
2533
    ///
2534
    /// let re1 = DFA::new(r"\w")?;
2535
    /// let re2 = DFA::new(r"\W")?;
2536
    /// let mut caps1 = re1.create_captures();
2537
    /// let mut caps2 = re2.create_captures();
2538
    ///
2539
    /// let mut cache = re1.create_cache();
2540
    /// assert_eq!(
2541
    ///     Some(Match::must(0, 0..2)),
2542
    ///     { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
2543
    /// );
2544
    ///
2545
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
2546
    /// // incorrect results. In order to re-purpose the cache, we must reset
2547
    /// // it with the one-pass DFA we'd like to use it with.
2548
    /// //
2549
    /// // Similarly, after this reset, using the cache with 're1' is also not
2550
    /// // allowed.
2551
    /// re2.reset_cache(&mut cache);
2552
    /// assert_eq!(
2553
    ///     Some(Match::must(0, 0..3)),
2554
    ///     { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
2555
    /// );
2556
    ///
2557
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2558
    /// ```
2559
0
    pub fn reset(&mut self, re: &DFA) {
2560
0
        let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len();
2561
0
        self.explicit_slots.resize(explicit_slot_len, None);
2562
0
        self.explicit_slot_len = explicit_slot_len;
2563
0
    }
2564
2565
    /// Returns the heap memory usage, in bytes, of this cache.
2566
    ///
2567
    /// This does **not** include the stack size used up by this cache. To
2568
    /// compute that, use `std::mem::size_of::<Cache>()`.
2569
0
    pub fn memory_usage(&self) -> usize {
2570
0
        self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>()
2571
0
    }
2572
2573
0
    fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] {
2574
0
        &mut self.explicit_slots[..self.explicit_slot_len]
2575
0
    }
2576
2577
0
    fn setup_search(&mut self, explicit_slot_len: usize) {
2578
0
        self.explicit_slot_len = explicit_slot_len;
2579
0
    }
2580
}
2581
2582
/// Represents a single transition in a one-pass DFA.
2583
///
2584
/// The high 21 bits corresponds to the state ID. The bit following corresponds
2585
/// to the special "match wins" flag. The remaining low 42 bits corresponds to
2586
/// the transition epsilons, which contains the slots that should be saved when
2587
/// this transition is followed and the conditional epsilon transitions that
2588
/// must be satisfied in order to follow this transition.
2589
#[derive(Clone, Copy, Eq, PartialEq)]
2590
struct Transition(u64);
2591
2592
impl Transition {
2593
    const STATE_ID_BITS: u64 = 21;
2594
    const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS;
2595
    const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS;
2596
    const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1);
2597
    const INFO_MASK: u64 = 0x000003FF_FFFFFFFF;
2598
2599
    /// Return a new transition to the given state ID with the given epsilons.
2600
0
    fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition {
2601
0
        let match_wins =
2602
0
            if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 };
2603
0
        let sid = sid.as_u64() << Transition::STATE_ID_SHIFT;
2604
0
        Transition(sid | match_wins | epsilons.0)
2605
0
    }
2606
2607
    /// Returns true if and only if this transition points to the DEAD state.
2608
0
    fn is_dead(self) -> bool {
2609
0
        self.state_id() == DEAD
2610
0
    }
2611
2612
    /// Return whether this transition has a "match wins" property.
2613
    ///
2614
    /// When a transition has this property, it means that if a match has been
2615
    /// found and the search uses leftmost-first semantics, then that match
2616
    /// should be returned immediately instead of continuing on.
2617
    ///
2618
    /// The "match wins" name comes from RE2, which uses a pretty much
2619
    /// identical mechanism for implementing leftmost-first semantics.
2620
0
    fn match_wins(&self) -> bool {
2621
0
        (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1
2622
0
    }
2623
2624
    /// Return the "next" state ID that this transition points to.
2625
0
    fn state_id(&self) -> StateID {
2626
0
        // OK because a Transition has a valid StateID in its upper bits by
2627
0
        // construction. The cast to usize is also correct, even on 16-bit
2628
0
        // targets because, again, we know the upper bits is a valid StateID,
2629
0
        // which can never overflow usize on any supported target.
2630
0
        StateID::new_unchecked(
2631
0
            (self.0 >> Transition::STATE_ID_SHIFT).as_usize(),
2632
0
        )
2633
0
    }
2634
2635
    /// Set the "next" state ID in this transition.
2636
0
    fn set_state_id(&mut self, sid: StateID) {
2637
0
        *self = Transition::new(self.match_wins(), sid, self.epsilons());
2638
0
    }
2639
2640
    /// Return the epsilons embedded in this transition.
2641
0
    fn epsilons(&self) -> Epsilons {
2642
0
        Epsilons(self.0 & Transition::INFO_MASK)
2643
0
    }
2644
}
2645
2646
impl core::fmt::Debug for Transition {
2647
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2648
0
        if self.is_dead() {
2649
0
            return write!(f, "0");
2650
0
        }
2651
0
        write!(f, "{}", self.state_id().as_usize())?;
2652
0
        if self.match_wins() {
2653
0
            write!(f, "-MW")?;
2654
0
        }
2655
0
        if !self.epsilons().is_empty() {
2656
0
            write!(f, "-{:?}", self.epsilons())?;
2657
0
        }
2658
0
        Ok(())
2659
0
    }
2660
}
2661
2662
/// A representation of a match state's pattern ID along with the epsilons for
2663
/// when a match occurs.
2664
///
2665
/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly
2666
/// one pattern ID. If it had more, then the original NFA would not have been
2667
/// one-pass.
2668
///
2669
/// The "epsilons" part of this corresponds to what was found in the epsilon
2670
/// transitions between the transition taken in the last byte of input and the
2671
/// ultimate match state. This might include saving slots and/or conditional
2672
/// epsilon transitions that must be satisfied before one can report the match.
2673
///
2674
/// Technically, every state has room for a 'PatternEpsilons', but it is only
2675
/// ever non-empty for match states.
2676
#[derive(Clone, Copy)]
2677
struct PatternEpsilons(u64);
2678
2679
impl PatternEpsilons {
2680
    const PATTERN_ID_BITS: u64 = 22;
2681
    const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS;
2682
    // A sentinel value indicating that this is not a match state. We don't
2683
    // use 0 since 0 is a valid pattern ID.
2684
    const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF;
2685
    const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE;
2686
    const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000;
2687
    const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF;
2688
2689
    /// Return a new empty pattern epsilons that has no pattern ID and has no
2690
    /// epsilons. This is suitable for non-match states.
2691
0
    fn empty() -> PatternEpsilons {
2692
0
        PatternEpsilons(
2693
0
            PatternEpsilons::PATTERN_ID_NONE
2694
0
                << PatternEpsilons::PATTERN_ID_SHIFT,
2695
0
        )
2696
0
    }
2697
2698
    /// Whether this pattern epsilons is empty or not. It's empty when it has
2699
    /// no pattern ID and an empty epsilons.
2700
0
    fn is_empty(self) -> bool {
2701
0
        self.pattern_id().is_none() && self.epsilons().is_empty()
2702
0
    }
2703
2704
    /// Return the pattern ID in this pattern epsilons if one exists.
2705
0
    fn pattern_id(self) -> Option<PatternID> {
2706
0
        let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
2707
0
        if pid == PatternEpsilons::PATTERN_ID_LIMIT {
2708
0
            None
2709
        } else {
2710
0
            Some(PatternID::new_unchecked(pid.as_usize()))
2711
        }
2712
0
    }
2713
2714
    /// Returns the pattern ID without checking whether it's valid. If this is
2715
    /// called and there is no pattern ID in this `PatternEpsilons`, then this
2716
    /// will likely produce an incorrect result or possibly even a panic or
2717
    /// an overflow. But safety will not be violated.
2718
    ///
2719
    /// This is useful when you know a particular state is a match state. If
2720
    /// it's a match state, then it must have a pattern ID.
2721
0
    fn pattern_id_unchecked(self) -> PatternID {
2722
0
        let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
2723
0
        PatternID::new_unchecked(pid.as_usize())
2724
0
    }
2725
2726
    /// Return a new pattern epsilons with the given pattern ID, but the same
2727
    /// epsilons.
2728
0
    fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons {
2729
0
        PatternEpsilons(
2730
0
            (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT)
2731
0
                | (self.0 & PatternEpsilons::EPSILONS_MASK),
2732
0
        )
2733
0
    }
2734
2735
    /// Return the epsilons part of this pattern epsilons.
2736
0
    fn epsilons(self) -> Epsilons {
2737
0
        Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK)
2738
0
    }
2739
2740
    /// Return a new pattern epsilons with the given epsilons, but the same
2741
    /// pattern ID.
2742
0
    fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
2743
0
        PatternEpsilons(
2744
0
            (self.0 & PatternEpsilons::PATTERN_ID_MASK)
2745
0
                | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK),
2746
0
        )
2747
0
    }
2748
}
2749
2750
impl core::fmt::Debug for PatternEpsilons {
2751
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2752
0
        if self.is_empty() {
2753
0
            return write!(f, "N/A");
2754
0
        }
2755
0
        if let Some(pid) = self.pattern_id() {
2756
0
            write!(f, "{}", pid.as_usize())?;
2757
0
        }
2758
0
        if !self.epsilons().is_empty() {
2759
0
            if self.pattern_id().is_some() {
2760
0
                write!(f, "/")?;
2761
0
            }
2762
0
            write!(f, "{:?}", self.epsilons())?;
2763
0
        }
2764
0
        Ok(())
2765
0
    }
2766
}
2767
2768
/// Epsilons represents all of the NFA epsilons transitions that went into a
2769
/// single transition in a single DFA state. In this case, it only represents
2770
/// the epsilon transitions that have some kind of non-consuming side effect:
2771
/// either the transition requires storing the current position of the search
2772
/// into a slot, or the transition is conditional and requires the current
2773
/// position in the input to satisfy an assertion before the transition may be
2774
/// taken.
2775
///
2776
/// This folds the cumulative effect of a group of NFA states (all connected
2777
/// by epsilon transitions) down into a single set of bits. While these bits
2778
/// can represent all possible conditional epsilon transitions, it only permits
2779
/// storing up to a somewhat small number of slots.
2780
///
2781
/// Epsilons is represented as a 42-bit integer. For example, it is packed into
2782
/// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a
2783
/// `StateID` and a special "match wins" property.)
2784
#[derive(Clone, Copy)]
2785
struct Epsilons(u64);
2786
2787
impl Epsilons {
2788
    const SLOT_MASK: u64 = 0x000003FF_FFFFFC00;
2789
    const SLOT_SHIFT: u64 = 10;
2790
    const LOOK_MASK: u64 = 0x00000000_000003FF;
2791
2792
    /// Create a new empty epsilons. It has no slots and no assertions that
2793
    /// need to be satisfied.
2794
0
    fn empty() -> Epsilons {
2795
0
        Epsilons(0)
2796
0
    }
2797
2798
    /// Returns true if this epsilons contains no slots and no assertions.
2799
0
    fn is_empty(self) -> bool {
2800
0
        self.0 == 0
2801
0
    }
2802
2803
    /// Returns the slot epsilon transitions.
2804
0
    fn slots(self) -> Slots {
2805
0
        Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32())
2806
0
    }
2807
2808
    /// Set the slot epsilon transitions.
2809
0
    fn set_slots(self, slots: Slots) -> Epsilons {
2810
0
        Epsilons(
2811
0
            (u64::from(slots.0) << Epsilons::SLOT_SHIFT)
2812
0
                | (self.0 & Epsilons::LOOK_MASK),
2813
0
        )
2814
0
    }
2815
2816
    /// Return the set of look-around assertions in these epsilon transitions.
2817
0
    fn looks(self) -> LookSet {
2818
0
        LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() }
2819
0
    }
2820
2821
    /// Set the look-around assertions on these epsilon transitions.
2822
0
    fn set_looks(self, look_set: LookSet) -> Epsilons {
2823
0
        Epsilons(
2824
0
            (self.0 & Epsilons::SLOT_MASK)
2825
0
                | (u64::from(look_set.bits) & Epsilons::LOOK_MASK),
2826
0
        )
2827
0
    }
2828
}
2829
2830
impl core::fmt::Debug for Epsilons {
2831
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2832
0
        let mut wrote = false;
2833
0
        if !self.slots().is_empty() {
2834
0
            write!(f, "{:?}", self.slots())?;
2835
0
            wrote = true;
2836
0
        }
2837
0
        if !self.looks().is_empty() {
2838
0
            if wrote {
2839
0
                write!(f, "/")?;
2840
0
            }
2841
0
            write!(f, "{:?}", self.looks())?;
2842
0
            wrote = true;
2843
0
        }
2844
0
        if !wrote {
2845
0
            write!(f, "N/A")?;
2846
0
        }
2847
0
        Ok(())
2848
0
    }
2849
}
2850
2851
/// The set of epsilon transitions indicating that the current position in a
2852
/// search should be saved to a slot.
2853
///
2854
/// This *only* represents explicit slots. So for example, the pattern
2855
/// `[a-z]+([0-9]+)([a-z]+)` has:
2856
///
2857
/// * 3 capturing groups, thus 6 slots.
2858
/// * 1 implicit capturing group, thus 2 implicit slots.
2859
/// * 2 explicit capturing groups, thus 4 explicit slots.
2860
///
2861
/// While implicit slots are represented by epsilon transitions in an NFA, we
2862
/// do not explicitly represent them here. Instead, implicit slots are assumed
2863
/// to be present and handled automatically in the search code. Therefore,
2864
/// that means we only need to represent explicit slots in our epsilon
2865
/// transitions.
2866
///
2867
/// Its representation is a bit set. The bit 'i' is set if and only if there
2868
/// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That
2869
/// is, the bit 'i' corresponds to the first explicit slot and the first
2870
/// explicit slot appears immediately following the last implicit slot. (If
2871
/// this is confusing, see `GroupInfo` for more details on how slots works.)
2872
///
2873
/// A single `Slots` represents all the active slots in a sub-graph of an NFA,
2874
/// where all the states are connected by epsilon transitions. In effect, when
2875
/// traversing the one-pass DFA during a search, all slots set in a particular
2876
/// transition must be captured by recording the current search position.
2877
///
2878
/// The API of `Slots` requires the caller to handle the explicit slot offset.
2879
/// That is, a `Slots` doesn't know where the explicit slots start for a
2880
/// particular NFA. Thus, if the callers see's the bit 'i' is set, then they
2881
/// need to do the arithmetic above to find 'c', which is the real actual slot
2882
/// index in the corresponding NFA.
2883
#[derive(Clone, Copy)]
2884
struct Slots(u32);
2885
2886
impl Slots {
2887
    const LIMIT: usize = 32;
2888
2889
    /// Insert the slot at the given bit index.
2890
0
    fn insert(self, slot: usize) -> Slots {
2891
0
        debug_assert!(slot < Slots::LIMIT);
2892
0
        Slots(self.0 | (1 << slot.as_u32()))
2893
0
    }
2894
2895
    /// Remove the slot at the given bit index.
2896
0
    fn remove(self, slot: usize) -> Slots {
2897
0
        debug_assert!(slot < Slots::LIMIT);
2898
0
        Slots(self.0 & !(1 << slot.as_u32()))
2899
0
    }
2900
2901
    /// Returns true if and only if this set contains no slots.
2902
0
    fn is_empty(self) -> bool {
2903
0
        self.0 == 0
2904
0
    }
2905
2906
    /// Returns an iterator over all of the set bits in this set.
2907
0
    fn iter(self) -> SlotsIter {
2908
0
        SlotsIter { slots: self }
2909
0
    }
2910
2911
    /// For the position `at` in the current haystack, copy it to
2912
    /// `caller_explicit_slots` for all slots that are in this set.
2913
    ///
2914
    /// Callers may pass a slice of any length. Slots in this set bigger than
2915
    /// the length of the given explicit slots are simply skipped.
2916
    ///
2917
    /// The slice *must* correspond only to the explicit slots and the first
2918
    /// element of the slice must always correspond to the first explicit slot
2919
    /// in the corresponding NFA.
2920
0
    fn apply(
2921
0
        self,
2922
0
        at: usize,
2923
0
        caller_explicit_slots: &mut [Option<NonMaxUsize>],
2924
0
    ) {
2925
0
        if self.is_empty() {
2926
0
            return;
2927
0
        }
2928
0
        let at = NonMaxUsize::new(at);
2929
0
        for slot in self.iter() {
2930
0
            if slot >= caller_explicit_slots.len() {
2931
0
                break;
2932
0
            }
2933
0
            caller_explicit_slots[slot] = at;
2934
        }
2935
0
    }
2936
}
2937
2938
impl core::fmt::Debug for Slots {
2939
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2940
0
        write!(f, "S")?;
2941
0
        for slot in self.iter() {
2942
0
            write!(f, "-{:?}", slot)?;
2943
        }
2944
0
        Ok(())
2945
0
    }
2946
}
2947
2948
/// An iterator over all of the bits set in a slot set.
2949
///
2950
/// This returns the bit index that is set, so callers may need to offset it
2951
/// to get the actual NFA slot index.
2952
#[derive(Debug)]
2953
struct SlotsIter {
2954
    slots: Slots,
2955
}
2956
2957
impl Iterator for SlotsIter {
2958
    type Item = usize;
2959
2960
0
    fn next(&mut self) -> Option<usize> {
2961
0
        // Number of zeroes here is always <= u8::MAX, and so fits in a usize.
2962
0
        let slot = self.slots.0.trailing_zeros().as_usize();
2963
0
        if slot >= Slots::LIMIT {
2964
0
            return None;
2965
0
        }
2966
0
        self.slots = self.slots.remove(slot);
2967
0
        Some(slot)
2968
0
    }
2969
}
2970
2971
/// An error that occurred during the construction of a one-pass DFA.
2972
///
2973
/// This error does not provide many introspection capabilities. There are
2974
/// generally only two things you can do with it:
2975
///
2976
/// * Obtain a human readable message via its `std::fmt::Display` impl.
2977
/// * Access an underlying [`thompson::BuildError`] type from its `source`
2978
/// method via the `std::error::Error` trait. This error only occurs when using
2979
/// convenience routines for building a one-pass DFA directly from a pattern
2980
/// string.
2981
///
2982
/// When the `std` feature is enabled, this implements the `std::error::Error`
2983
/// trait.
2984
#[derive(Clone, Debug)]
2985
pub struct BuildError {
2986
    kind: BuildErrorKind,
2987
}
2988
2989
/// The kind of error that occurred during the construction of a one-pass DFA.
2990
#[derive(Clone, Debug)]
2991
enum BuildErrorKind {
2992
    NFA(crate::nfa::thompson::BuildError),
2993
    Word(UnicodeWordBoundaryError),
2994
    TooManyStates { limit: u64 },
2995
    TooManyPatterns { limit: u64 },
2996
    UnsupportedLook { look: Look },
2997
    ExceededSizeLimit { limit: usize },
2998
    NotOnePass { msg: &'static str },
2999
}
3000
3001
impl BuildError {
3002
0
    fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError {
3003
0
        BuildError { kind: BuildErrorKind::NFA(err) }
3004
0
    }
3005
3006
0
    fn word(err: UnicodeWordBoundaryError) -> BuildError {
3007
0
        BuildError { kind: BuildErrorKind::Word(err) }
3008
0
    }
3009
3010
0
    fn too_many_states(limit: u64) -> BuildError {
3011
0
        BuildError { kind: BuildErrorKind::TooManyStates { limit } }
3012
0
    }
3013
3014
0
    fn too_many_patterns(limit: u64) -> BuildError {
3015
0
        BuildError { kind: BuildErrorKind::TooManyPatterns { limit } }
3016
0
    }
3017
3018
0
    fn unsupported_look(look: Look) -> BuildError {
3019
0
        BuildError { kind: BuildErrorKind::UnsupportedLook { look } }
3020
0
    }
3021
3022
0
    fn exceeded_size_limit(limit: usize) -> BuildError {
3023
0
        BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
3024
0
    }
3025
3026
0
    fn not_one_pass(msg: &'static str) -> BuildError {
3027
0
        BuildError { kind: BuildErrorKind::NotOnePass { msg } }
3028
0
    }
3029
}
3030
3031
#[cfg(feature = "std")]
3032
impl std::error::Error for BuildError {
3033
0
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
3034
        use self::BuildErrorKind::*;
3035
3036
0
        match self.kind {
3037
0
            NFA(ref err) => Some(err),
3038
0
            Word(ref err) => Some(err),
3039
0
            _ => None,
3040
        }
3041
0
    }
3042
}
3043
3044
impl core::fmt::Display for BuildError {
3045
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
3046
        use self::BuildErrorKind::*;
3047
3048
0
        match self.kind {
3049
0
            NFA(_) => write!(f, "error building NFA"),
3050
0
            Word(_) => write!(f, "NFA contains Unicode word boundary"),
3051
0
            TooManyStates { limit } => write!(
3052
0
                f,
3053
0
                "one-pass DFA exceeded a limit of {:?} for number of states",
3054
0
                limit,
3055
0
            ),
3056
0
            TooManyPatterns { limit } => write!(
3057
0
                f,
3058
0
                "one-pass DFA exceeded a limit of {:?} for number of patterns",
3059
0
                limit,
3060
0
            ),
3061
0
            UnsupportedLook { look } => write!(
3062
0
                f,
3063
0
                "one-pass DFA does not support the {:?} assertion",
3064
0
                look,
3065
0
            ),
3066
0
            ExceededSizeLimit { limit } => write!(
3067
0
                f,
3068
0
                "one-pass DFA exceeded size limit of {:?} during building",
3069
0
                limit,
3070
0
            ),
3071
0
            NotOnePass { msg } => write!(
3072
0
                f,
3073
0
                "one-pass DFA could not be built because \
3074
0
                 pattern is not one-pass: {}",
3075
0
                msg,
3076
0
            ),
3077
        }
3078
0
    }
3079
}
3080
3081
#[cfg(all(test, feature = "syntax"))]
3082
mod tests {
3083
    use alloc::string::ToString;
3084
3085
    use super::*;
3086
3087
    #[test]
3088
    fn fail_conflicting_transition() {
3089
        let predicate = |err: &str| err.contains("conflicting transition");
3090
3091
        let err = DFA::new(r"a*[ab]").unwrap_err().to_string();
3092
        assert!(predicate(&err), "{}", err);
3093
    }
3094
3095
    #[test]
3096
    fn fail_multiple_epsilon() {
3097
        let predicate = |err: &str| {
3098
            err.contains("multiple epsilon transitions to same state")
3099
        };
3100
3101
        let err = DFA::new(r"(^|$)a").unwrap_err().to_string();
3102
        assert!(predicate(&err), "{}", err);
3103
    }
3104
3105
    #[test]
3106
    fn fail_multiple_match() {
3107
        let predicate = |err: &str| {
3108
            err.contains("multiple epsilon transitions to match state")
3109
        };
3110
3111
        let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string();
3112
        assert!(predicate(&err), "{}", err);
3113
    }
3114
3115
    // This test is meant to build a one-pass regex with the maximum number of
3116
    // possible slots.
3117
    //
3118
    // NOTE: Remember that the slot limit only applies to explicit capturing
3119
    // groups. Any number of implicit capturing groups is supported (up to the
3120
    // maximum number of supported patterns), since implicit groups are handled
3121
    // by the search loop itself.
3122
    #[test]
3123
    fn max_slots() {
3124
        // One too many...
3125
        let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)";
3126
        assert!(DFA::new(pat).is_err());
3127
        // Just right.
3128
        let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)";
3129
        assert!(DFA::new(pat).is_ok());
3130
    }
3131
3132
    // This test ensures that the one-pass DFA works with all look-around
3133
    // assertions that we expect it to work with.
3134
    //
3135
    // The utility of this test is that each one-pass transition has a small
3136
    // amount of space to store look-around assertions. Currently, there is
3137
    // logic in the one-pass constructor to ensure there aren't more than ten
3138
    // possible assertions. And indeed, there are only ten possible assertions
3139
    // (at time of writing), so this is okay. But conceivably, more assertions
3140
    // could be added. So we check that things at least work with what we
3141
    // expect them to work with.
3142
    #[test]
3143
    fn assertions() {
3144
        // haystack anchors
3145
        assert!(DFA::new(r"^").is_ok());
3146
        assert!(DFA::new(r"$").is_ok());
3147
3148
        // line anchors
3149
        assert!(DFA::new(r"(?m)^").is_ok());
3150
        assert!(DFA::new(r"(?m)$").is_ok());
3151
        assert!(DFA::new(r"(?Rm)^").is_ok());
3152
        assert!(DFA::new(r"(?Rm)$").is_ok());
3153
3154
        // word boundaries
3155
        if cfg!(feature = "unicode-word-boundary") {
3156
            assert!(DFA::new(r"\b").is_ok());
3157
            assert!(DFA::new(r"\B").is_ok());
3158
        }
3159
        assert!(DFA::new(r"(?-u)\b").is_ok());
3160
        assert!(DFA::new(r"(?-u)\B").is_ok());
3161
    }
3162
3163
    #[cfg(not(miri))] // takes too long on miri
3164
    #[test]
3165
    fn is_one_pass() {
3166
        use crate::util::syntax;
3167
3168
        assert!(DFA::new(r"a*b").is_ok());
3169
        if cfg!(feature = "unicode-perl") {
3170
            assert!(DFA::new(r"\w").is_ok());
3171
        }
3172
        assert!(DFA::new(r"(?-u)\w*\s").is_ok());
3173
        assert!(DFA::new(r"(?s:.)*?").is_ok());
3174
        assert!(DFA::builder()
3175
            .syntax(syntax::Config::new().utf8(false))
3176
            .build(r"(?s-u:.)*?")
3177
            .is_ok());
3178
    }
3179
3180
    #[test]
3181
    fn is_not_one_pass() {
3182
        assert!(DFA::new(r"a*a").is_err());
3183
        assert!(DFA::new(r"(?s-u:.)*?").is_err());
3184
        assert!(DFA::new(r"(?s:.)*?a").is_err());
3185
    }
3186
3187
    #[cfg(not(miri))]
3188
    #[test]
3189
    fn is_not_one_pass_bigger() {
3190
        assert!(DFA::new(r"\w*\s").is_err());
3191
    }
3192
}