Coverage Report

Created: 2025-07-23 07:16

/src/regex/regex-automata/src/dfa/regex.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
A DFA-backed `Regex`.
3
4
This module provides [`Regex`], which is defined generically over the
5
[`Automaton`] trait. A `Regex` implements convenience routines you might have
6
come to expect, such as finding the start/end of a match and iterating over
7
all non-overlapping matches. This `Regex` type is limited in its capabilities
8
to what a DFA can provide. Therefore, APIs involving capturing groups, for
9
example, are not provided.
10
11
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12
finds the end offset of a match, where as the other is a "reverse" DFA that
13
find the start offset of a match.
14
15
See the [parent module](crate::dfa) for examples.
16
*/
17
18
#[cfg(feature = "alloc")]
19
use alloc::vec::Vec;
20
21
#[cfg(feature = "dfa-build")]
22
use crate::dfa::dense::BuildError;
23
use crate::{
24
    dfa::{automaton::Automaton, dense},
25
    util::{iter, search::Input},
26
    Anchored, Match, MatchError,
27
};
28
#[cfg(feature = "alloc")]
29
use crate::{
30
    dfa::{sparse, StartKind},
31
    util::search::MatchKind,
32
};
33
34
// When the alloc feature is enabled, the regex type sets its A type parameter
35
// to default to an owned dense DFA. But without alloc, we set no default. This
36
// makes things a lot more convenient in the common case, since writing out the
37
// DFA types is pretty annoying.
38
//
39
// Since we have two different definitions but only want to write one doc
40
// string, we use a macro to capture the doc and other attributes once and then
41
// repeat them for each definition.
42
macro_rules! define_regex_type {
43
    ($(#[$doc:meta])*) => {
44
        #[cfg(feature = "alloc")]
45
        $(#[$doc])*
46
        pub struct Regex<A = dense::OwnedDFA> {
47
            forward: A,
48
            reverse: A,
49
        }
50
51
        #[cfg(not(feature = "alloc"))]
52
        $(#[$doc])*
53
        pub struct Regex<A> {
54
            forward: A,
55
            reverse: A,
56
        }
57
    };
58
}
59
60
define_regex_type!(
61
    /// A regular expression that uses deterministic finite automata for fast
62
    /// searching.
63
    ///
64
    /// A regular expression is comprised of two DFAs, a "forward" DFA and a
65
    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
66
    /// a match while the reverse DFA is responsible for detecting the start
67
    /// of a match. Thus, in order to find the bounds of any given match, a
68
    /// forward search must first be run followed by a reverse search. A match
69
    /// found by the forward DFA guarantees that the reverse DFA will also find
70
    /// a match.
71
    ///
72
    /// The type of the DFA used by a `Regex` corresponds to the `A` type
73
    /// parameter, which must satisfy the [`Automaton`] trait. Typically,
74
    /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
75
    /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
76
    /// memory but search faster, while sparse DFAs use less memory but search
77
    /// more slowly.
78
    ///
79
    /// # Crate features
80
    ///
81
    /// Note that despite what the documentation auto-generates, the _only_
82
    /// crate feature needed to use this type is `dfa-search`. You do _not_
83
    /// need to enable the `alloc` feature.
84
    ///
85
    /// By default, a regex's automaton type parameter is set to
86
    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
87
    /// in-memory work loads, this is the most convenient type that gives the
88
    /// best search performance. When the `alloc` feature is disabled, no
89
    /// default type is used.
90
    ///
91
    /// # When should I use this?
92
    ///
93
    /// Generally speaking, if you can afford the overhead of building a full
94
    /// DFA for your regex, and you don't need things like capturing groups,
95
    /// then this is a good choice if you're looking to optimize for matching
96
    /// speed. Note however that its speed may be worse than a general purpose
97
    /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
98
    /// underlying DFA.
99
    ///
100
    /// # Sparse DFAs
101
    ///
102
    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
103
    /// used with any kind of DFA. While this crate constructs dense DFAs by
104
    /// default, it is easy enough to build corresponding sparse DFAs, and then
105
    /// build a regex from them:
106
    ///
107
    /// ```
108
    /// use regex_automata::dfa::regex::Regex;
109
    ///
110
    /// // First, build a regex that uses dense DFAs.
111
    /// let dense_re = Regex::new("foo[0-9]+")?;
112
    ///
113
    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
114
    /// let fwd = dense_re.forward().to_sparse()?;
115
    /// let rev = dense_re.reverse().to_sparse()?;
116
    ///
117
    /// // Third, build a new regex from the constituent sparse DFAs.
118
    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
119
    ///
120
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
121
    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
122
    ///
123
    /// # Ok::<(), Box<dyn std::error::Error>>(())
124
    /// ```
125
    ///
126
    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
127
    /// more succinctly. (Note though that dense DFAs are still constructed
128
    /// first internally, and then converted to sparse DFAs, as in the example
129
    /// above.)
130
    ///
131
    /// ```
132
    /// use regex_automata::dfa::regex::Regex;
133
    ///
134
    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
135
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
136
    /// assert!(sparse_re.is_match(b"foo123"));
137
    ///
138
    /// # Ok::<(), Box<dyn std::error::Error>>(())
139
    /// ```
140
    ///
141
    /// # Fallibility
142
    ///
143
    /// Most of the search routines defined on this type will _panic_ when the
144
    /// underlying search fails. This might be because the DFA gave up because
145
    /// it saw a quit byte, whether configured explicitly or via heuristic
146
    /// Unicode word boundary support, although neither are enabled by default.
147
    /// Or it might fail because an invalid `Input` configuration is given,
148
    /// for example, with an unsupported [`Anchored`] mode.
149
    ///
150
    /// If you need to handle these error cases instead of allowing them to
151
    /// trigger a panic, then the lower level [`Regex::try_search`] provides
152
    /// a fallible API that never panics.
153
    ///
154
    /// # Example
155
    ///
156
    /// This example shows how to cause a search to terminate if it sees a
157
    /// `\n` byte, and handle the error returned. This could be useful if, for
158
    /// example, you wanted to prevent a user supplied pattern from matching
159
    /// across a line boundary.
160
    ///
161
    /// ```
162
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
163
    /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
164
    ///
165
    /// let re = Regex::builder()
166
    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
167
    ///     .build(r"foo\p{any}+bar")?;
168
    ///
169
    /// let input = Input::new("foo\nbar");
170
    /// // Normally this would produce a match, since \p{any} contains '\n'.
171
    /// // But since we instructed the automaton to enter a quit state if a
172
    /// // '\n' is observed, this produces a match error instead.
173
    /// let expected = MatchError::quit(b'\n', 3);
174
    /// let got = re.try_search(&input).unwrap_err();
175
    /// assert_eq!(expected, got);
176
    ///
177
    /// # Ok::<(), Box<dyn std::error::Error>>(())
178
    /// ```
179
    #[derive(Clone, Debug)]
180
);
181
182
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
183
impl Regex {
184
    /// Parse the given regular expression using the default configuration and
185
    /// return the corresponding regex.
186
    ///
187
    /// If you want a non-default configuration, then use the [`Builder`] to
188
    /// set your own configuration.
189
    ///
190
    /// # Example
191
    ///
192
    /// ```
193
    /// use regex_automata::{Match, dfa::regex::Regex};
194
    ///
195
    /// let re = Regex::new("foo[0-9]+bar")?;
196
    /// assert_eq!(
197
    ///     Some(Match::must(0, 3..14)),
198
    ///     re.find(b"zzzfoo12345barzzz"),
199
    /// );
200
    /// # Ok::<(), Box<dyn std::error::Error>>(())
201
    /// ```
202
0
    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
203
0
        Builder::new().build(pattern)
204
0
    }
205
206
    /// Like `new`, but parses multiple patterns into a single "regex set."
207
    /// This similarly uses the default regex configuration.
208
    ///
209
    /// # Example
210
    ///
211
    /// ```
212
    /// use regex_automata::{Match, dfa::regex::Regex};
213
    ///
214
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
215
    ///
216
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
217
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
218
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
219
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
220
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
221
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
222
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
223
    /// assert_eq!(None, it.next());
224
    /// # Ok::<(), Box<dyn std::error::Error>>(())
225
    /// ```
226
    pub fn new_many<P: AsRef<str>>(
227
        patterns: &[P],
228
    ) -> Result<Regex, BuildError> {
229
        Builder::new().build_many(patterns)
230
    }
231
}
232
233
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
234
impl Regex<sparse::DFA<Vec<u8>>> {
235
    /// Parse the given regular expression using the default configuration,
236
    /// except using sparse DFAs, and return the corresponding regex.
237
    ///
238
    /// If you want a non-default configuration, then use the [`Builder`] to
239
    /// set your own configuration.
240
    ///
241
    /// # Example
242
    ///
243
    /// ```
244
    /// use regex_automata::{Match, dfa::regex::Regex};
245
    ///
246
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
247
    /// assert_eq!(
248
    ///     Some(Match::must(0, 3..14)),
249
    ///     re.find(b"zzzfoo12345barzzz"),
250
    /// );
251
    /// # Ok::<(), Box<dyn std::error::Error>>(())
252
    /// ```
253
0
    pub fn new_sparse(
254
0
        pattern: &str,
255
0
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
256
0
        Builder::new().build_sparse(pattern)
257
0
    }
258
259
    /// Like `new`, but parses multiple patterns into a single "regex set"
260
    /// using sparse DFAs. This otherwise similarly uses the default regex
261
    /// configuration.
262
    ///
263
    /// # Example
264
    ///
265
    /// ```
266
    /// use regex_automata::{Match, dfa::regex::Regex};
267
    ///
268
    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
269
    ///
270
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
271
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
272
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
273
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
274
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
275
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
276
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
277
    /// assert_eq!(None, it.next());
278
    /// # Ok::<(), Box<dyn std::error::Error>>(())
279
    /// ```
280
    pub fn new_many_sparse<P: AsRef<str>>(
281
        patterns: &[P],
282
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
283
        Builder::new().build_many_sparse(patterns)
284
    }
285
}
286
287
/// Convenience routines for regex construction.
288
impl Regex<dense::DFA<&'static [u32]>> {
289
    /// Return a builder for configuring the construction of a `Regex`.
290
    ///
291
    /// This is a convenience routine to avoid needing to import the
292
    /// [`Builder`] type in common cases.
293
    ///
294
    /// # Example
295
    ///
296
    /// This example shows how to use the builder to disable UTF-8 mode
297
    /// everywhere.
298
    ///
299
    /// ```
300
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
301
    /// use regex_automata::{
302
    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
303
    /// };
304
    ///
305
    /// let re = Regex::builder()
306
    ///     .syntax(syntax::Config::new().utf8(false))
307
    ///     .thompson(thompson::Config::new().utf8(false))
308
    ///     .build(r"foo(?-u:[^b])ar.*")?;
309
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
310
    /// let expected = Some(Match::must(0, 1..9));
311
    /// let got = re.find(haystack);
312
    /// assert_eq!(expected, got);
313
    ///
314
    /// # Ok::<(), Box<dyn std::error::Error>>(())
315
    /// ```
316
0
    pub fn builder() -> Builder {
317
0
        Builder::new()
318
0
    }
319
}
320
321
/// Standard search routines for finding and iterating over matches.
322
impl<A: Automaton> Regex<A> {
323
    /// Returns true if and only if this regex matches the given haystack.
324
    ///
325
    /// This routine may short circuit if it knows that scanning future input
326
    /// will never lead to a different result. In particular, if the underlying
327
    /// DFA enters a match state or a dead state, then this routine will return
328
    /// `true` or `false`, respectively, without inspecting any future input.
329
    ///
330
    /// # Panics
331
    ///
332
    /// This routine panics if the search could not complete. This can occur
333
    /// in a number of circumstances:
334
    ///
335
    /// * The configuration of the DFA may permit it to "quit" the search.
336
    /// For example, setting quit bytes or enabling heuristic support for
337
    /// Unicode word boundaries. The default configuration does not enable any
338
    /// option that could result in the DFA quitting.
339
    /// * When the provided `Input` configuration is not supported. For
340
    /// example, by providing an unsupported anchor mode.
341
    ///
342
    /// When a search panics, callers cannot know whether a match exists or
343
    /// not.
344
    ///
345
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
346
    ///
347
    /// # Example
348
    ///
349
    /// ```
350
    /// use regex_automata::dfa::regex::Regex;
351
    ///
352
    /// let re = Regex::new("foo[0-9]+bar")?;
353
    /// assert_eq!(true, re.is_match("foo12345bar"));
354
    /// assert_eq!(false, re.is_match("foobar"));
355
    /// # Ok::<(), Box<dyn std::error::Error>>(())
356
    /// ```
357
    #[inline]
358
    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
359
        // Not only can we do an "earliest" search, but we can avoid doing a
360
        // reverse scan too.
361
        let input = input.into().earliest(true);
362
        self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
363
    }
364
365
    /// Returns the start and end offset of the leftmost match. If no match
366
    /// exists, then `None` is returned.
367
    ///
368
    /// # Panics
369
    ///
370
    /// This routine panics if the search could not complete. This can occur
371
    /// in a number of circumstances:
372
    ///
373
    /// * The configuration of the DFA may permit it to "quit" the search.
374
    /// For example, setting quit bytes or enabling heuristic support for
375
    /// Unicode word boundaries. The default configuration does not enable any
376
    /// option that could result in the DFA quitting.
377
    /// * When the provided `Input` configuration is not supported. For
378
    /// example, by providing an unsupported anchor mode.
379
    ///
380
    /// When a search panics, callers cannot know whether a match exists or
381
    /// not.
382
    ///
383
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
384
    ///
385
    /// # Example
386
    ///
387
    /// ```
388
    /// use regex_automata::{Match, dfa::regex::Regex};
389
    ///
390
    /// // Greediness is applied appropriately.
391
    /// let re = Regex::new("foo[0-9]+")?;
392
    /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
393
    ///
394
    /// // Even though a match is found after reading the first byte (`a`),
395
    /// // the default leftmost-first match semantics demand that we find the
396
    /// // earliest match that prefers earlier parts of the pattern over latter
397
    /// // parts.
398
    /// let re = Regex::new("abc|a")?;
399
    /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
400
    /// # Ok::<(), Box<dyn std::error::Error>>(())
401
    /// ```
402
    #[inline]
403
    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
404
        self.try_search(&input.into()).unwrap()
405
    }
406
407
    /// Returns an iterator over all non-overlapping leftmost matches in the
408
    /// given bytes. If no match exists, then the iterator yields no elements.
409
    ///
410
    /// This corresponds to the "standard" regex search iterator.
411
    ///
412
    /// # Panics
413
    ///
414
    /// If the search returns an error during iteration, then iteration
415
    /// panics. See [`Regex::find`] for the panic conditions.
416
    ///
417
    /// Use [`Regex::try_search`] with
418
    /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
419
    /// handle these error conditions.
420
    ///
421
    /// # Example
422
    ///
423
    /// ```
424
    /// use regex_automata::{Match, dfa::regex::Regex};
425
    ///
426
    /// let re = Regex::new("foo[0-9]+")?;
427
    /// let text = "foo1 foo12 foo123";
428
    /// let matches: Vec<Match> = re.find_iter(text).collect();
429
    /// assert_eq!(matches, vec![
430
    ///     Match::must(0, 0..4),
431
    ///     Match::must(0, 5..10),
432
    ///     Match::must(0, 11..17),
433
    /// ]);
434
    /// # Ok::<(), Box<dyn std::error::Error>>(())
435
    /// ```
436
    #[inline]
437
    pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
438
        &'r self,
439
        input: I,
440
    ) -> FindMatches<'r, 'h, A> {
441
        let it = iter::Searcher::new(input.into());
442
        FindMatches { re: self, it }
443
    }
444
}
445
446
/// Lower level fallible search routines that permit controlling where the
447
/// search starts and ends in a particular sequence.
448
impl<A: Automaton> Regex<A> {
449
    /// Returns the start and end offset of the leftmost match. If no match
450
    /// exists, then `None` is returned.
451
    ///
452
    /// This is like [`Regex::find`] but with two differences:
453
    ///
454
    /// 1. It is not generic over `Into<Input>` and instead accepts a
455
    /// `&Input`. This permits reusing the same `Input` for multiple searches
456
    /// without needing to create a new one. This _may_ help with latency.
457
    /// 2. It returns an error if the search could not complete where as
458
    /// [`Regex::find`] will panic.
459
    ///
460
    /// # Errors
461
    ///
462
    /// This routine errors if the search could not complete. This can occur
463
    /// in the following circumstances:
464
    ///
465
    /// * The configuration of the DFA may permit it to "quit" the search.
466
    /// For example, setting quit bytes or enabling heuristic support for
467
    /// Unicode word boundaries. The default configuration does not enable any
468
    /// option that could result in the DFA quitting.
469
    /// * When the provided `Input` configuration is not supported. For
470
    /// example, by providing an unsupported anchor mode.
471
    ///
472
    /// When a search returns an error, callers cannot know whether a match
473
    /// exists or not.
474
    #[inline]
475
34.2k
    pub fn try_search(
476
34.2k
        &self,
477
34.2k
        input: &Input<'_>,
478
34.2k
    ) -> Result<Option<Match>, MatchError> {
479
34.2k
        let (fwd, rev) = (self.forward(), self.reverse());
480
34.2k
        let end = match fwd.try_search_fwd(input)? {
481
11.9k
            None => return Ok(None),
482
8.94k
            Some(end) => end,
483
8.94k
        };
484
8.94k
        // This special cases an empty match at the beginning of the search. If
485
8.94k
        // our end matches our start, then since a reverse DFA can't match past
486
8.94k
        // the start, it must follow that our starting position is also our end
487
8.94k
        // position. So short circuit and skip the reverse search.
488
8.94k
        if input.start() == end.offset() {
489
5.65k
            return Ok(Some(Match::new(
490
5.65k
                end.pattern(),
491
5.65k
                end.offset()..end.offset(),
492
5.65k
            )));
493
3.29k
        }
494
3.29k
        // We can also skip the reverse search if we know our search was
495
3.29k
        // anchored. This occurs either when the input config is anchored or
496
3.29k
        // when we know the regex itself is anchored. In this case, we know the
497
3.29k
        // start of the match, if one is found, must be the start of the
498
3.29k
        // search.
499
3.29k
        if self.is_anchored(input) {
500
42
            return Ok(Some(Match::new(
501
42
                end.pattern(),
502
42
                input.start()..end.offset(),
503
42
            )));
504
3.25k
        }
505
3.25k
        // N.B. I have tentatively convinced myself that it isn't necessary
506
3.25k
        // to specify the specific pattern for the reverse search since the
507
3.25k
        // reverse search will always find the same pattern to match as the
508
3.25k
        // forward search. But I lack a rigorous proof. Why not just provide
509
3.25k
        // the pattern anyway? Well, if it is needed, then leaving it out
510
3.25k
        // gives us a chance to find a witness. (Also, if we don't need to
511
3.25k
        // specify the pattern, then we don't need to build the reverse DFA
512
3.25k
        // with 'starts_for_each_pattern' enabled.)
513
3.25k
        //
514
3.25k
        // We also need to be careful to disable 'earliest' for the reverse
515
3.25k
        // search, since it could be enabled for the forward search. In the
516
3.25k
        // reverse case, to satisfy "leftmost" criteria, we need to match
517
3.25k
        // as much as we can. We also need to be careful to make the search
518
3.25k
        // anchored. We don't want the reverse search to report any matches
519
3.25k
        // other than the one beginning at the end of our forward search.
520
3.25k
        let revsearch = input
521
3.25k
            .clone()
522
3.25k
            .span(input.start()..end.offset())
523
3.25k
            .anchored(Anchored::Yes)
524
3.25k
            .earliest(false);
525
3.25k
        let start = rev
526
3.25k
            .try_search_rev(&revsearch)?
527
3.09k
            .expect("reverse search must match if forward search does");
528
3.09k
        assert_eq!(
529
3.09k
            start.pattern(),
530
3.09k
            end.pattern(),
531
0
            "forward and reverse search must match same pattern",
532
        );
533
3.09k
        assert!(start.offset() <= end.offset());
534
3.09k
        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
535
34.2k
    }
536
537
    /// Returns true if either the given input specifies an anchored search
538
    /// or if the underlying DFA is always anchored.
539
3.29k
    fn is_anchored(&self, input: &Input<'_>) -> bool {
540
3.29k
        match input.get_anchored() {
541
3.29k
            Anchored::No => self.forward().is_always_start_anchored(),
542
0
            Anchored::Yes | Anchored::Pattern(_) => true,
543
        }
544
3.29k
    }
545
}
546
547
/// Non-search APIs for querying information about the regex and setting a
548
/// prefilter.
549
impl<A: Automaton> Regex<A> {
550
    /// Return the underlying DFA responsible for forward matching.
551
    ///
552
    /// This is useful for accessing the underlying DFA and converting it to
553
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
554
    /// for an example of where this might be useful.
555
66.5k
    pub fn forward(&self) -> &A {
556
66.5k
        &self.forward
557
66.5k
    }
558
559
    /// Return the underlying DFA responsible for reverse matching.
560
    ///
561
    /// This is useful for accessing the underlying DFA and converting it to
562
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
563
    /// for an example of where this might be useful.
564
275k
    pub fn reverse(&self) -> &A {
565
275k
        &self.reverse
566
275k
    }
567
568
    /// Returns the total number of patterns matched by this regex.
569
    ///
570
    /// # Example
571
    ///
572
    /// ```
573
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
574
    /// use regex_automata::dfa::regex::Regex;
575
    ///
576
    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
577
    /// assert_eq!(3, re.pattern_len());
578
    /// # Ok::<(), Box<dyn std::error::Error>>(())
579
    /// ```
580
    pub fn pattern_len(&self) -> usize {
581
        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
582
        self.forward().pattern_len()
583
    }
584
}
585
586
/// An iterator over all non-overlapping matches for an infallible search.
587
///
588
/// The iterator yields a [`Match`] value until no more matches could be found.
589
/// If the underlying regex engine returns an error, then a panic occurs.
590
///
591
/// The type parameters are as follows:
592
///
593
/// * `A` represents the type of the underlying DFA that implements the
594
/// [`Automaton`] trait.
595
///
596
/// The lifetime parameters are as follows:
597
///
598
/// * `'h` represents the lifetime of the haystack being searched.
599
/// * `'r` represents the lifetime of the regex object itself.
600
///
601
/// This iterator can be created with the [`Regex::find_iter`] method.
602
#[derive(Debug)]
603
pub struct FindMatches<'r, 'h, A> {
604
    re: &'r Regex<A>,
605
    it: iter::Searcher<'h>,
606
}
607
608
impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
609
    type Item = Match;
610
611
    #[inline]
612
    fn next(&mut self) -> Option<Match> {
613
        let FindMatches { re, ref mut it } = *self;
614
        it.advance(|input| re.try_search(input))
615
    }
616
}
617
618
/// A builder for a regex based on deterministic finite automatons.
619
///
620
/// This builder permits configuring options for the syntax of a pattern, the
621
/// NFA construction, the DFA construction and finally the regex searching
622
/// itself. This builder is different from a general purpose regex builder in
623
/// that it permits fine grain configuration of the construction process. The
624
/// trade off for this is complexity, and the possibility of setting a
625
/// configuration that might not make sense. For example, there are two
626
/// different UTF-8 modes:
627
///
628
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
629
/// whether the pattern itself can contain sub-expressions that match invalid
630
/// UTF-8.
631
/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
632
/// how the regex iterators themselves advance the starting position of the
633
/// next search when a match with zero length is found.
634
///
635
/// Generally speaking, callers will want to either enable all of these or
636
/// disable all of these.
637
///
638
/// Internally, building a regex requires building two DFAs, where one is
639
/// responsible for finding the end of a match and the other is responsible
640
/// for finding the start of a match. If you only need to detect whether
641
/// something matched, or only the end of a match, then you should use a
642
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
643
/// building two DFAs.
644
///
645
/// # Build methods
646
///
647
/// This builder has a few "build" methods. In general, it's the result of
648
/// combining the following parameters:
649
///
650
/// * Building one or many regexes.
651
/// * Building a regex with dense or sparse DFAs.
652
///
653
/// The simplest "build" method is [`Builder::build`]. It accepts a single
654
/// pattern and builds a dense DFA using `usize` for the state identifier
655
/// representation.
656
///
657
/// The most general "build" method is [`Builder::build_many`], which permits
658
/// building a regex that searches for multiple patterns simultaneously while
659
/// using a specific state identifier representation.
660
///
661
/// The most flexible "build" method, but hardest to use, is
662
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
663
/// just a pair of DFAs, and this method allows you to specify those DFAs
664
/// exactly.
665
///
666
/// # Example
667
///
668
/// This example shows how to disable UTF-8 mode in the syntax and the regex
669
/// itself. This is generally what you want for matching on arbitrary bytes.
670
///
671
/// ```
672
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
673
/// use regex_automata::{
674
///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
675
/// };
676
///
677
/// let re = Regex::builder()
678
///     .syntax(syntax::Config::new().utf8(false))
679
///     .thompson(thompson::Config::new().utf8(false))
680
///     .build(r"foo(?-u:[^b])ar.*")?;
681
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
682
/// let expected = Some(Match::must(0, 1..9));
683
/// let got = re.find(haystack);
684
/// assert_eq!(expected, got);
685
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
686
/// // but the subsequent `.*` does not! Disabling UTF-8
687
/// // on the syntax permits this.
688
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
689
///
690
/// # Ok::<(), Box<dyn std::error::Error>>(())
691
/// ```
692
#[derive(Clone, Debug)]
693
pub struct Builder {
694
    #[cfg(feature = "dfa-build")]
695
    dfa: dense::Builder,
696
}
697
698
impl Builder {
699
    /// Create a new regex builder with the default configuration.
700
38.5k
    pub fn new() -> Builder {
701
38.5k
        Builder {
702
38.5k
            #[cfg(feature = "dfa-build")]
703
38.5k
            dfa: dense::Builder::new(),
704
38.5k
        }
705
38.5k
    }
706
707
    /// Build a regex from the given pattern.
708
    ///
709
    /// If there was a problem parsing or compiling the pattern, then an error
710
    /// is returned.
711
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
712
0
    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
713
0
        self.build_many(&[pattern])
714
0
    }
715
716
    /// Build a regex from the given pattern using sparse DFAs.
717
    ///
718
    /// If there was a problem parsing or compiling the pattern, then an error
719
    /// is returned.
720
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
721
0
    pub fn build_sparse(
722
0
        &self,
723
0
        pattern: &str,
724
0
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
725
0
        self.build_many_sparse(&[pattern])
726
0
    }
727
728
    /// Build a regex from the given patterns.
729
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
730
0
    pub fn build_many<P: AsRef<str>>(
731
0
        &self,
732
0
        patterns: &[P],
733
0
    ) -> Result<Regex, BuildError> {
734
0
        let forward = self.dfa.build_many(patterns)?;
735
0
        let reverse = self
736
0
            .dfa
737
0
            .clone()
738
0
            .configure(
739
0
                dense::Config::new()
740
0
                    .prefilter(None)
741
0
                    .specialize_start_states(false)
742
0
                    .start_kind(StartKind::Anchored)
743
0
                    .match_kind(MatchKind::All),
744
0
            )
745
0
            .thompson(crate::nfa::thompson::Config::new().reverse(true))
746
0
            .build_many(patterns)?;
747
0
        Ok(self.build_from_dfas(forward, reverse))
748
0
    }
749
750
    /// Build a sparse regex from the given patterns.
751
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
752
0
    pub fn build_many_sparse<P: AsRef<str>>(
753
0
        &self,
754
0
        patterns: &[P],
755
0
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
756
0
        let re = self.build_many(patterns)?;
757
0
        let forward = re.forward().to_sparse()?;
758
0
        let reverse = re.reverse().to_sparse()?;
759
0
        Ok(self.build_from_dfas(forward, reverse))
760
0
    }
761
762
    /// Build a regex from its component forward and reverse DFAs.
763
    ///
764
    /// This is useful when deserializing a regex from some arbitrary
765
    /// memory region. This is also useful for building regexes from other
766
    /// types of DFAs.
767
    ///
768
    /// If you're building the DFAs from scratch instead of building new DFAs
769
    /// from other DFAs, then you'll need to make sure that the reverse DFA is
770
    /// configured correctly to match the intended semantics. Namely:
771
    ///
772
    /// * It should be anchored.
773
    /// * It should use [`MatchKind::All`] semantics.
774
    /// * It should match in reverse.
775
    /// * Otherwise, its configuration should match the forward DFA.
776
    ///
777
    /// If these conditions aren't satisfied, then the behavior of searches is
778
    /// unspecified.
779
    ///
780
    /// Note that when using this constructor, no configuration is applied.
781
    /// Since this routine provides the DFAs to the builder, there is no
782
    /// opportunity to apply other configuration options.
783
    ///
784
    /// # Example
785
    ///
786
    /// This example is a bit a contrived. The usual use of these methods
787
    /// would involve serializing `initial_re` somewhere and then deserializing
788
    /// it later to build a regex. But in this case, we do everything in
789
    /// memory.
790
    ///
791
    /// ```
792
    /// use regex_automata::dfa::regex::Regex;
793
    ///
794
    /// let initial_re = Regex::new("foo[0-9]+")?;
795
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
796
    ///
797
    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
798
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
799
    /// assert_eq!(true, re.is_match(b"foo123"));
800
    /// # Ok::<(), Box<dyn std::error::Error>>(())
801
    /// ```
802
    ///
803
    /// This example shows how to build a `Regex` that uses sparse DFAs instead
804
    /// of dense DFAs without using one of the convenience `build_sparse`
805
    /// routines:
806
    ///
807
    /// ```
808
    /// use regex_automata::dfa::regex::Regex;
809
    ///
810
    /// let initial_re = Regex::new("foo[0-9]+")?;
811
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
812
    ///
813
    /// let fwd = initial_re.forward().to_sparse()?;
814
    /// let rev = initial_re.reverse().to_sparse()?;
815
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
816
    /// assert_eq!(true, re.is_match(b"foo123"));
817
    /// # Ok::<(), Box<dyn std::error::Error>>(())
818
    /// ```
819
38.5k
    pub fn build_from_dfas<A: Automaton>(
820
38.5k
        &self,
821
38.5k
        forward: A,
822
38.5k
        reverse: A,
823
38.5k
    ) -> Regex<A> {
824
38.5k
        Regex { forward, reverse }
825
38.5k
    }
<regex_automata::dfa::regex::Builder>::build_from_dfas::<regex_automata::dfa::dense::DFA<alloc::vec::Vec<u32>>>
Line
Count
Source
819
38.5k
    pub fn build_from_dfas<A: Automaton>(
820
38.5k
        &self,
821
38.5k
        forward: A,
822
38.5k
        reverse: A,
823
38.5k
    ) -> Regex<A> {
824
38.5k
        Regex { forward, reverse }
825
38.5k
    }
Unexecuted instantiation: <regex_automata::dfa::regex::Builder>::build_from_dfas::<regex_automata::dfa::sparse::DFA<alloc::vec::Vec<u8>>>
826
827
    /// Set the syntax configuration for this builder using
828
    /// [`syntax::Config`](crate::util::syntax::Config).
829
    ///
830
    /// This permits setting things like case insensitivity, Unicode and multi
831
    /// line mode.
832
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
833
0
    pub fn syntax(
834
0
        &mut self,
835
0
        config: crate::util::syntax::Config,
836
0
    ) -> &mut Builder {
837
0
        self.dfa.syntax(config);
838
0
        self
839
0
    }
840
841
    /// Set the Thompson NFA configuration for this builder using
842
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
843
    ///
844
    /// This permits setting things like whether additional time should be
845
    /// spent shrinking the size of the NFA.
846
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
847
0
    pub fn thompson(
848
0
        &mut self,
849
0
        config: crate::nfa::thompson::Config,
850
0
    ) -> &mut Builder {
851
0
        self.dfa.thompson(config);
852
0
        self
853
0
    }
854
855
    /// Set the dense DFA compilation configuration for this builder using
856
    /// [`dense::Config`].
857
    ///
858
    /// This permits setting things like whether the underlying DFAs should
859
    /// be minimized.
860
    #[cfg(feature = "dfa-build")]
861
0
    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
862
0
        self.dfa.configure(config);
863
0
        self
864
0
    }
865
}
866
867
impl Default for Builder {
868
0
    fn default() -> Builder {
869
0
        Builder::new()
870
0
    }
871
}