Coverage Report

Created: 2025-11-16 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/dfa/regex.rs
Line
Count
Source
1
/*!
2
A DFA-backed `Regex`.
3
4
This module provides [`Regex`], which is defined generically over the
5
[`Automaton`] trait. A `Regex` implements convenience routines you might have
6
come to expect, such as finding the start/end of a match and iterating over
7
all non-overlapping matches. This `Regex` type is limited in its capabilities
8
to what a DFA can provide. Therefore, APIs involving capturing groups, for
9
example, are not provided.
10
11
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12
finds the end offset of a match, whereas the other is a "reverse" DFA that
13
finds the start offset of a match.
14
15
See the [parent module](crate::dfa) for examples.
16
*/
17
18
#[cfg(feature = "alloc")]
19
use alloc::vec::Vec;
20
21
#[cfg(feature = "dfa-build")]
22
use crate::dfa::dense::BuildError;
23
use crate::{
24
    dfa::{automaton::Automaton, dense},
25
    util::{iter, search::Input},
26
    Anchored, Match, MatchError,
27
};
28
#[cfg(feature = "alloc")]
29
use crate::{
30
    dfa::{sparse, StartKind},
31
    util::search::MatchKind,
32
};
33
34
// When the alloc feature is enabled, the regex type sets its A type parameter
35
// to default to an owned dense DFA. But without alloc, we set no default. This
36
// makes things a lot more convenient in the common case, since writing out the
37
// DFA types is pretty annoying.
38
//
39
// Since we have two different definitions but only want to write one doc
40
// string, we use a macro to capture the doc and other attributes once and then
41
// repeat them for each definition.
42
macro_rules! define_regex_type {
43
    ($(#[$doc:meta])*) => {
44
        #[cfg(feature = "alloc")]
45
        $(#[$doc])*
46
        pub struct Regex<A = dense::OwnedDFA> {
47
            forward: A,
48
            reverse: A,
49
        }
50
51
        #[cfg(not(feature = "alloc"))]
52
        $(#[$doc])*
53
        pub struct Regex<A> {
54
            forward: A,
55
            reverse: A,
56
        }
57
    };
58
}
59
60
define_regex_type!(
61
    /// A regular expression that uses deterministic finite automata for fast
62
    /// searching.
63
    ///
64
    /// A regular expression is comprised of two DFAs, a "forward" DFA and a
65
    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
66
    /// a match while the reverse DFA is responsible for detecting the start
67
    /// of a match. Thus, in order to find the bounds of any given match, a
68
    /// forward search must first be run followed by a reverse search. A match
69
    /// found by the forward DFA guarantees that the reverse DFA will also find
70
    /// a match.
71
    ///
72
    /// The type of the DFA used by a `Regex` corresponds to the `A` type
73
    /// parameter, which must satisfy the [`Automaton`] trait. Typically, `A`
74
    /// is either a [`dense::DFA`] or a [`sparse::DFA`], where dense DFAs use
75
    /// more memory but search faster, while sparse DFAs use less memory but
76
    /// search more slowly.
77
    ///
78
    /// # Crate features
79
    ///
80
    /// Note that despite what the documentation auto-generates, the _only_
81
    /// crate feature needed to use this type is `dfa-search`. You do _not_
82
    /// need to enable the `alloc` feature.
83
    ///
84
    /// By default, a regex's automaton type parameter is set to
85
    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
86
    /// in-memory workloads, this is the most convenient type that gives the
87
    /// best search performance. When the `alloc` feature is disabled, no
88
    /// default type is used.
89
    ///
90
    /// # When should I use this?
91
    ///
92
    /// Generally speaking, if you can afford the overhead of building a full
93
    /// DFA for your regex, and you don't need things like capturing groups,
94
    /// then this is a good choice if you're looking to optimize for matching
95
    /// speed. Note however that its speed may be worse than a general purpose
96
    /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
97
    /// underlying DFA.
98
    ///
99
    /// # Sparse DFAs
100
    ///
101
    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
102
    /// used with any kind of DFA. While this crate constructs dense DFAs by
103
    /// default, it is easy enough to build corresponding sparse DFAs, and then
104
    /// build a regex from them:
105
    ///
106
    /// ```
107
    /// use regex_automata::dfa::regex::Regex;
108
    ///
109
    /// // First, build a regex that uses dense DFAs.
110
    /// let dense_re = Regex::new("foo[0-9]+")?;
111
    ///
112
    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
113
    /// let fwd = dense_re.forward().to_sparse()?;
114
    /// let rev = dense_re.reverse().to_sparse()?;
115
    ///
116
    /// // Third, build a new regex from the constituent sparse DFAs.
117
    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
118
    ///
119
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
120
    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
121
    ///
122
    /// # Ok::<(), Box<dyn std::error::Error>>(())
123
    /// ```
124
    ///
125
    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
126
    /// more succinctly. (Note though that dense DFAs are still constructed
127
    /// first internally, and then converted to sparse DFAs, as in the example
128
    /// above.)
129
    ///
130
    /// ```
131
    /// use regex_automata::dfa::regex::Regex;
132
    ///
133
    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
134
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
135
    /// assert!(sparse_re.is_match(b"foo123"));
136
    ///
137
    /// # Ok::<(), Box<dyn std::error::Error>>(())
138
    /// ```
139
    ///
140
    /// # Fallibility
141
    ///
142
    /// Most of the search routines defined on this type will _panic_ when the
143
    /// underlying search fails. This might be because the DFA gave up because
144
    /// it saw a quit byte, whether configured explicitly or via heuristic
145
    /// Unicode word boundary support, although neither is enabled by default.
146
    /// Or it might fail because an invalid `Input` configuration is given,
147
    /// for example, with an unsupported [`Anchored`] mode.
148
    ///
149
    /// If you need to handle these error cases instead of allowing them to
150
    /// trigger a panic, then the lower level [`Regex::try_search`] provides
151
    /// a fallible API that never panics.
152
    ///
153
    /// # Example
154
    ///
155
    /// This example shows how to cause a search to terminate if it sees a
156
    /// `\n` byte, and handle the error returned. This could be useful if, for
157
    /// example, you wanted to prevent a user supplied pattern from matching
158
    /// across a line boundary.
159
    ///
160
    /// ```
161
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
162
    /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
163
    ///
164
    /// let re = Regex::builder()
165
    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
166
    ///     .build(r"foo\p{any}+bar")?;
167
    ///
168
    /// let input = Input::new("foo\nbar");
169
    /// // Normally this would produce a match, since \p{any} contains '\n'.
170
    /// // But since we instructed the automaton to enter a quit state if a
171
    /// // '\n' is observed, this produces a match error instead.
172
    /// let expected = MatchError::quit(b'\n', 3);
173
    /// let got = re.try_search(&input).unwrap_err();
174
    /// assert_eq!(expected, got);
175
    ///
176
    /// # Ok::<(), Box<dyn std::error::Error>>(())
177
    /// ```
178
    #[derive(Clone, Debug)]
179
);
180
181
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
182
impl Regex {
183
    /// Parse the given regular expression using the default configuration and
184
    /// return the corresponding regex.
185
    ///
186
    /// If you want a non-default configuration, then use the [`Builder`] to
187
    /// set your own configuration.
188
    ///
189
    /// # Example
190
    ///
191
    /// ```
192
    /// use regex_automata::{Match, dfa::regex::Regex};
193
    ///
194
    /// let re = Regex::new("foo[0-9]+bar")?;
195
    /// assert_eq!(
196
    ///     Some(Match::must(0, 3..14)),
197
    ///     re.find(b"zzzfoo12345barzzz"),
198
    /// );
199
    /// # Ok::<(), Box<dyn std::error::Error>>(())
200
    /// ```
201
0
    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
202
0
        Builder::new().build(pattern)
203
0
    }
204
205
    /// Like `new`, but parses multiple patterns into a single "regex set."
206
    /// This similarly uses the default regex configuration.
207
    ///
208
    /// # Example
209
    ///
210
    /// ```
211
    /// use regex_automata::{Match, dfa::regex::Regex};
212
    ///
213
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
214
    ///
215
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
216
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
217
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
218
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
219
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
220
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
221
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
222
    /// assert_eq!(None, it.next());
223
    /// # Ok::<(), Box<dyn std::error::Error>>(())
224
    /// ```
225
    pub fn new_many<P: AsRef<str>>(
226
        patterns: &[P],
227
    ) -> Result<Regex, BuildError> {
228
        Builder::new().build_many(patterns)
229
    }
230
}
231
232
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
233
impl Regex<sparse::DFA<Vec<u8>>> {
234
    /// Parse the given regular expression using the default configuration,
235
    /// except using sparse DFAs, and return the corresponding regex.
236
    ///
237
    /// If you want a non-default configuration, then use the [`Builder`] to
238
    /// set your own configuration.
239
    ///
240
    /// # Example
241
    ///
242
    /// ```
243
    /// use regex_automata::{Match, dfa::regex::Regex};
244
    ///
245
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
246
    /// assert_eq!(
247
    ///     Some(Match::must(0, 3..14)),
248
    ///     re.find(b"zzzfoo12345barzzz"),
249
    /// );
250
    /// # Ok::<(), Box<dyn std::error::Error>>(())
251
    /// ```
252
0
    pub fn new_sparse(
253
0
        pattern: &str,
254
0
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
255
0
        Builder::new().build_sparse(pattern)
256
0
    }
257
258
    /// Like `new`, but parses multiple patterns into a single "regex set"
259
    /// using sparse DFAs. This otherwise similarly uses the default regex
260
    /// configuration.
261
    ///
262
    /// # Example
263
    ///
264
    /// ```
265
    /// use regex_automata::{Match, dfa::regex::Regex};
266
    ///
267
    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
268
    ///
269
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
270
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
271
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
272
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
273
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
274
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
275
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
276
    /// assert_eq!(None, it.next());
277
    /// # Ok::<(), Box<dyn std::error::Error>>(())
278
    /// ```
279
    pub fn new_many_sparse<P: AsRef<str>>(
280
        patterns: &[P],
281
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
282
        Builder::new().build_many_sparse(patterns)
283
    }
284
}
285
286
/// Convenience routines for regex construction.
287
impl Regex<dense::DFA<&'static [u32]>> {
288
    /// Return a builder for configuring the construction of a `Regex`.
289
    ///
290
    /// This is a convenience routine to avoid needing to import the
291
    /// [`Builder`] type in common cases.
292
    ///
293
    /// # Example
294
    ///
295
    /// This example shows how to use the builder to disable UTF-8 mode
296
    /// everywhere.
297
    ///
298
    /// ```
299
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
300
    /// use regex_automata::{
301
    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
302
    /// };
303
    ///
304
    /// let re = Regex::builder()
305
    ///     .syntax(syntax::Config::new().utf8(false))
306
    ///     .thompson(thompson::Config::new().utf8(false))
307
    ///     .build(r"foo(?-u:[^b])ar.*")?;
308
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
309
    /// let expected = Some(Match::must(0, 1..9));
310
    /// let got = re.find(haystack);
311
    /// assert_eq!(expected, got);
312
    ///
313
    /// # Ok::<(), Box<dyn std::error::Error>>(())
314
    /// ```
315
0
    pub fn builder() -> Builder {
316
0
        Builder::new()
317
0
    }
318
}
319
320
/// Standard search routines for finding and iterating over matches.
321
impl<A: Automaton> Regex<A> {
322
    /// Returns true if and only if this regex matches the given haystack.
323
    ///
324
    /// This routine may short circuit if it knows that scanning future input
325
    /// will never lead to a different result. In particular, if the underlying
326
    /// DFA enters a match state or a dead state, then this routine will return
327
    /// `true` or `false`, respectively, without inspecting any future input.
328
    ///
329
    /// # Panics
330
    ///
331
    /// This routine panics if the search could not complete. This can occur
332
    /// in a number of circumstances:
333
    ///
334
    /// * The configuration of the DFA may permit it to "quit" the search.
335
    /// For example, setting quit bytes or enabling heuristic support for
336
    /// Unicode word boundaries. The default configuration does not enable any
337
    /// option that could result in the DFA quitting.
338
    /// * When the provided `Input` configuration is not supported. For
339
    /// example, by providing an unsupported anchor mode.
340
    ///
341
    /// When a search panics, callers cannot know whether a match exists or
342
    /// not.
343
    ///
344
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
345
    ///
346
    /// # Example
347
    ///
348
    /// ```
349
    /// use regex_automata::dfa::regex::Regex;
350
    ///
351
    /// let re = Regex::new("foo[0-9]+bar")?;
352
    /// assert_eq!(true, re.is_match("foo12345bar"));
353
    /// assert_eq!(false, re.is_match("foobar"));
354
    /// # Ok::<(), Box<dyn std::error::Error>>(())
355
    /// ```
356
    #[inline]
357
    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
358
        // Not only can we do an "earliest" search, but we can avoid doing a
359
        // reverse scan too.
360
        let input = input.into().earliest(true);
361
        self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
362
    }
363
364
    /// Returns the start and end offset of the leftmost match. If no match
365
    /// exists, then `None` is returned.
366
    ///
367
    /// # Panics
368
    ///
369
    /// This routine panics if the search could not complete. This can occur
370
    /// in a number of circumstances:
371
    ///
372
    /// * The configuration of the DFA may permit it to "quit" the search.
373
    /// For example, setting quit bytes or enabling heuristic support for
374
    /// Unicode word boundaries. The default configuration does not enable any
375
    /// option that could result in the DFA quitting.
376
    /// * When the provided `Input` configuration is not supported. For
377
    /// example, by providing an unsupported anchor mode.
378
    ///
379
    /// When a search panics, callers cannot know whether a match exists or
380
    /// not.
381
    ///
382
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
383
    ///
384
    /// # Example
385
    ///
386
    /// ```
387
    /// use regex_automata::{Match, dfa::regex::Regex};
388
    ///
389
    /// // Greediness is applied appropriately.
390
    /// let re = Regex::new("foo[0-9]+")?;
391
    /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
392
    ///
393
    /// // Even though a match is found after reading the first byte (`a`),
394
    /// // the default leftmost-first match semantics demand that we find the
395
    /// // earliest match that prefers earlier parts of the pattern over later
396
    /// // parts.
397
    /// let re = Regex::new("abc|a")?;
398
    /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
399
    /// # Ok::<(), Box<dyn std::error::Error>>(())
400
    /// ```
401
    #[inline]
402
    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
403
        self.try_search(&input.into()).unwrap()
404
    }
405
406
    /// Returns an iterator over all non-overlapping leftmost matches in the
407
    /// given bytes. If no match exists, then the iterator yields no elements.
408
    ///
409
    /// This corresponds to the "standard" regex search iterator.
410
    ///
411
    /// # Panics
412
    ///
413
    /// If the search returns an error during iteration, then iteration
414
    /// panics. See [`Regex::find`] for the panic conditions.
415
    ///
416
    /// Use [`Regex::try_search`] with
417
    /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
418
    /// handle these error conditions.
419
    ///
420
    /// # Example
421
    ///
422
    /// ```
423
    /// use regex_automata::{Match, dfa::regex::Regex};
424
    ///
425
    /// let re = Regex::new("foo[0-9]+")?;
426
    /// let text = "foo1 foo12 foo123";
427
    /// let matches: Vec<Match> = re.find_iter(text).collect();
428
    /// assert_eq!(matches, vec![
429
    ///     Match::must(0, 0..4),
430
    ///     Match::must(0, 5..10),
431
    ///     Match::must(0, 11..17),
432
    /// ]);
433
    /// # Ok::<(), Box<dyn std::error::Error>>(())
434
    /// ```
435
    #[inline]
436
    pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
437
        &'r self,
438
        input: I,
439
    ) -> FindMatches<'r, 'h, A> {
440
        let it = iter::Searcher::new(input.into());
441
        FindMatches { re: self, it }
442
    }
443
}
444
445
/// Lower level fallible search routines that permit controlling where the
446
/// search starts and ends in a particular sequence.
447
impl<A: Automaton> Regex<A> {
448
    /// Returns the start and end offset of the leftmost match. If no match
449
    /// exists, then `None` is returned.
450
    ///
451
    /// This is like [`Regex::find`] but with two differences:
452
    ///
453
    /// 1. It is not generic over `Into<Input>` and instead accepts a
454
    /// `&Input`. This permits reusing the same `Input` for multiple searches
455
    /// without needing to create a new one. This _may_ help with latency.
456
    /// 2. It returns an error if the search could not complete whereas
457
    /// [`Regex::find`] will panic.
458
    ///
459
    /// # Errors
460
    ///
461
    /// This routine errors if the search could not complete. This can occur
462
    /// in the following circumstances:
463
    ///
464
    /// * The configuration of the DFA may permit it to "quit" the search.
465
    /// For example, setting quit bytes or enabling heuristic support for
466
    /// Unicode word boundaries. The default configuration does not enable any
467
    /// option that could result in the DFA quitting.
468
    /// * When the provided `Input` configuration is not supported. For
469
    /// example, by providing an unsupported anchor mode.
470
    ///
471
    /// When a search returns an error, callers cannot know whether a match
472
    /// exists or not.
473
    #[inline]
474
35.8k
    pub fn try_search(
475
35.8k
        &self,
476
35.8k
        input: &Input<'_>,
477
35.8k
    ) -> Result<Option<Match>, MatchError> {
478
35.8k
        let (fwd, rev) = (self.forward(), self.reverse());
479
35.8k
        let end = match fwd.try_search_fwd(input)? {
480
12.5k
            None => return Ok(None),
481
8.81k
            Some(end) => end,
482
        };
483
        // This special cases an empty match at the beginning of the search. If
484
        // our end matches our start, then since a reverse DFA can't match past
485
        // the start, it must follow that our starting position is also our end
486
        // position. So short circuit and skip the reverse search.
487
8.81k
        if input.start() == end.offset() {
488
5.77k
            return Ok(Some(Match::new(
489
5.77k
                end.pattern(),
490
5.77k
                end.offset()..end.offset(),
491
5.77k
            )));
492
3.04k
        }
493
        // We can also skip the reverse search if we know our search was
494
        // anchored. This occurs either when the input config is anchored or
495
        // when we know the regex itself is anchored. In this case, we know the
496
        // start of the match, if one is found, must be the start of the
497
        // search.
498
3.04k
        if self.is_anchored(input) {
499
34
            return Ok(Some(Match::new(
500
34
                end.pattern(),
501
34
                input.start()..end.offset(),
502
34
            )));
503
3.01k
        }
504
        // N.B. I have tentatively convinced myself that it isn't necessary
505
        // to specify the specific pattern for the reverse search since the
506
        // reverse search will always find the same pattern to match as the
507
        // forward search. But I lack a rigorous proof. Why not just provide
508
        // the pattern anyway? Well, if it is needed, then leaving it out
509
        // gives us a chance to find a witness. (Also, if we don't need to
510
        // specify the pattern, then we don't need to build the reverse DFA
511
        // with 'starts_for_each_pattern' enabled.)
512
        //
513
        // We also need to be careful to disable 'earliest' for the reverse
514
        // search, since it could be enabled for the forward search. In the
515
        // reverse case, to satisfy "leftmost" criteria, we need to match
516
        // as much as we can. We also need to be careful to make the search
517
        // anchored. We don't want the reverse search to report any matches
518
        // other than the one beginning at the end of our forward search.
519
3.01k
        let revsearch = input
520
3.01k
            .clone()
521
3.01k
            .span(input.start()..end.offset())
522
3.01k
            .anchored(Anchored::Yes)
523
3.01k
            .earliest(false);
524
3.01k
        let start = rev
525
3.01k
            .try_search_rev(&revsearch)?
526
2.83k
            .expect("reverse search must match if forward search does");
527
2.83k
        assert_eq!(
528
2.83k
            start.pattern(),
529
2.83k
            end.pattern(),
530
0
            "forward and reverse search must match same pattern",
531
        );
532
2.83k
        assert!(start.offset() <= end.offset());
533
2.83k
        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
534
35.8k
    }
535
536
    /// Returns true if either the given input specifies an anchored search
537
    /// or if the underlying DFA is always anchored.
538
3.04k
    fn is_anchored(&self, input: &Input<'_>) -> bool {
539
3.04k
        match input.get_anchored() {
540
3.04k
            Anchored::No => self.forward().is_always_start_anchored(),
541
0
            Anchored::Yes | Anchored::Pattern(_) => true,
542
        }
543
3.04k
    }
544
}
545
546
/// Non-search APIs for querying information about the regex and setting a
547
/// prefilter.
548
impl<A: Automaton> Regex<A> {
549
    /// Return the underlying DFA responsible for forward matching.
550
    ///
551
    /// This is useful for accessing the underlying DFA and converting it to
552
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
553
    /// for an example of where this might be useful.
554
70.4k
    pub fn forward(&self) -> &A {
555
70.4k
        &self.forward
556
70.4k
    }
557
558
    /// Return the underlying DFA responsible for reverse matching.
559
    ///
560
    /// This is useful for accessing the underlying DFA and converting it to
561
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
562
    /// for an example of where this might be useful.
563
306k
    pub fn reverse(&self) -> &A {
564
306k
        &self.reverse
565
306k
    }
566
567
    /// Returns the total number of patterns matched by this regex.
568
    ///
569
    /// # Example
570
    ///
571
    /// ```
572
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
573
    /// use regex_automata::dfa::regex::Regex;
574
    ///
575
    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
576
    /// assert_eq!(3, re.pattern_len());
577
    /// # Ok::<(), Box<dyn std::error::Error>>(())
578
    /// ```
579
    pub fn pattern_len(&self) -> usize {
580
        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
581
        self.forward().pattern_len()
582
    }
583
}
584
585
/// An iterator over all non-overlapping matches for an infallible search.
586
///
587
/// The iterator yields a [`Match`] value until no more matches could be found.
588
/// If the underlying regex engine returns an error, then a panic occurs.
589
///
590
/// The type parameters are as follows:
591
///
592
/// * `A` represents the type of the underlying DFA that implements the
593
/// [`Automaton`] trait.
594
///
595
/// The lifetime parameters are as follows:
596
///
597
/// * `'h` represents the lifetime of the haystack being searched.
598
/// * `'r` represents the lifetime of the regex object itself.
599
///
600
/// This iterator can be created with the [`Regex::find_iter`] method.
601
#[derive(Debug)]
602
pub struct FindMatches<'r, 'h, A> {
603
    re: &'r Regex<A>,
604
    it: iter::Searcher<'h>,
605
}
606
607
impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
608
    type Item = Match;
609
610
    #[inline]
611
    fn next(&mut self) -> Option<Match> {
612
        let FindMatches { re, ref mut it } = *self;
613
        it.advance(|input| re.try_search(input))
614
    }
615
}
616
617
/// A builder for a regex based on deterministic finite automata.
618
///
619
/// This builder permits configuring options for the syntax of a pattern, the
620
/// NFA construction, the DFA construction and finally the regex searching
621
/// itself. This builder is different from a general purpose regex builder in
622
/// that it permits fine-grained configuration of the construction process. The
623
/// trade off for this is complexity, and the possibility of setting a
624
/// configuration that might not make sense. For example, there are two
625
/// different UTF-8 modes:
626
///
627
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
628
/// whether the pattern itself can contain sub-expressions that match invalid
629
/// UTF-8.
630
/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
631
/// how the regex iterators themselves advance the starting position of the
632
/// next search when a match with zero length is found.
633
///
634
/// Generally speaking, callers will want to either enable all of these or
635
/// disable all of these.
636
///
637
/// Internally, building a regex requires building two DFAs, where one is
638
/// responsible for finding the end of a match and the other is responsible
639
/// for finding the start of a match. If you only need to detect whether
640
/// something matched, or only the end of a match, then you should use a
641
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
642
/// building two DFAs.
643
///
644
/// # Build methods
645
///
646
/// This builder has a few "build" methods. In general, it's the result of
647
/// combining the following parameters:
648
///
649
/// * Building one or many regexes.
650
/// * Building a regex with dense or sparse DFAs.
651
///
652
/// The simplest "build" method is [`Builder::build`]. It accepts a single
653
/// pattern and builds a dense DFA using `usize` for the state identifier
654
/// representation.
655
///
656
/// The most general "build" method is [`Builder::build_many`], which permits
657
/// building a regex that searches for multiple patterns simultaneously while
658
/// using a specific state identifier representation.
659
///
660
/// The most flexible "build" method, but hardest to use, is
661
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
662
/// just a pair of DFAs, and this method allows you to specify those DFAs
663
/// exactly.
664
///
665
/// # Example
666
///
667
/// This example shows how to disable UTF-8 mode in the syntax and the regex
668
/// itself. This is generally what you want for matching on arbitrary bytes.
669
///
670
/// ```
671
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
672
/// use regex_automata::{
673
///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
674
/// };
675
///
676
/// let re = Regex::builder()
677
///     .syntax(syntax::Config::new().utf8(false))
678
///     .thompson(thompson::Config::new().utf8(false))
679
///     .build(r"foo(?-u:[^b])ar.*")?;
680
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
681
/// let expected = Some(Match::must(0, 1..9));
682
/// let got = re.find(haystack);
683
/// assert_eq!(expected, got);
684
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
685
/// // but the subsequent `.*` does not! Disabling UTF-8
686
/// // on the syntax permits this.
687
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
688
///
689
/// # Ok::<(), Box<dyn std::error::Error>>(())
690
/// ```
691
#[derive(Clone, Debug)]
692
pub struct Builder {
693
    #[cfg(feature = "dfa-build")]
694
    dfa: dense::Builder,
695
}
696
697
impl Builder {
698
    /// Create a new regex builder with the default configuration.
699
38.5k
    pub fn new() -> Builder {
700
38.5k
        Builder {
701
38.5k
            #[cfg(feature = "dfa-build")]
702
38.5k
            dfa: dense::Builder::new(),
703
38.5k
        }
704
38.5k
    }
705
706
    /// Build a regex from the given pattern.
707
    ///
708
    /// If there was a problem parsing or compiling the pattern, then an error
709
    /// is returned.
710
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
711
0
    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
712
0
        self.build_many(&[pattern])
713
0
    }
714
715
    /// Build a regex from the given pattern using sparse DFAs.
716
    ///
717
    /// If there was a problem parsing or compiling the pattern, then an error
718
    /// is returned.
719
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
720
0
    pub fn build_sparse(
721
0
        &self,
722
0
        pattern: &str,
723
0
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
724
0
        self.build_many_sparse(&[pattern])
725
0
    }
726
727
    /// Build a regex from the given patterns.
728
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
729
0
    pub fn build_many<P: AsRef<str>>(
730
0
        &self,
731
0
        patterns: &[P],
732
0
    ) -> Result<Regex, BuildError> {
733
0
        let forward = self.dfa.build_many(patterns)?;
734
0
        let reverse = self
735
0
            .dfa
736
0
            .clone()
737
0
            .configure(
738
0
                dense::Config::new()
739
0
                    .prefilter(None)
740
0
                    .specialize_start_states(false)
741
0
                    .start_kind(StartKind::Anchored)
742
0
                    .match_kind(MatchKind::All),
743
0
            )
744
0
            .thompson(crate::nfa::thompson::Config::new().reverse(true))
745
0
            .build_many(patterns)?;
746
0
        Ok(self.build_from_dfas(forward, reverse))
747
0
    }
748
749
    /// Build a sparse regex from the given patterns.
750
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
751
0
    pub fn build_many_sparse<P: AsRef<str>>(
752
0
        &self,
753
0
        patterns: &[P],
754
0
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
755
0
        let re = self.build_many(patterns)?;
756
0
        let forward = re.forward().to_sparse()?;
757
0
        let reverse = re.reverse().to_sparse()?;
758
0
        Ok(self.build_from_dfas(forward, reverse))
759
0
    }
760
761
    /// Build a regex from its component forward and reverse DFAs.
762
    ///
763
    /// This is useful when deserializing a regex from some arbitrary
764
    /// memory region. This is also useful for building regexes from other
765
    /// types of DFAs.
766
    ///
767
    /// If you're building the DFAs from scratch instead of building new DFAs
768
    /// from other DFAs, then you'll need to make sure that the reverse DFA is
769
    /// configured correctly to match the intended semantics. Namely:
770
    ///
771
    /// * It should be anchored.
772
    /// * It should use [`MatchKind::All`] semantics.
773
    /// * It should match in reverse.
774
    /// * Otherwise, its configuration should match the forward DFA.
775
    ///
776
    /// If these conditions aren't satisfied, then the behavior of searches is
777
    /// unspecified.
778
    ///
779
    /// Note that when using this constructor, no configuration is applied.
780
    /// Since this routine provides the DFAs to the builder, there is no
781
    /// opportunity to apply other configuration options.
782
    ///
783
    /// # Example
784
    ///
785
    /// This example is a bit contrived. The usual use of these methods
786
    /// would involve serializing `initial_re` somewhere and then deserializing
787
    /// it later to build a regex. But in this case, we do everything in
788
    /// memory.
789
    ///
790
    /// ```
791
    /// use regex_automata::dfa::regex::Regex;
792
    ///
793
    /// let initial_re = Regex::new("foo[0-9]+")?;
794
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
795
    ///
796
    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
797
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
798
    /// assert_eq!(true, re.is_match(b"foo123"));
799
    /// # Ok::<(), Box<dyn std::error::Error>>(())
800
    /// ```
801
    ///
802
    /// This example shows how to build a `Regex` that uses sparse DFAs instead
803
    /// of dense DFAs without using one of the convenience `build_sparse`
804
    /// routines:
805
    ///
806
    /// ```
807
    /// use regex_automata::dfa::regex::Regex;
808
    ///
809
    /// let initial_re = Regex::new("foo[0-9]+")?;
810
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
811
    ///
812
    /// let fwd = initial_re.forward().to_sparse()?;
813
    /// let rev = initial_re.reverse().to_sparse()?;
814
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
815
    /// assert_eq!(true, re.is_match(b"foo123"));
816
    /// # Ok::<(), Box<dyn std::error::Error>>(())
817
    /// ```
818
38.5k
    pub fn build_from_dfas<A: Automaton>(
819
38.5k
        &self,
820
38.5k
        forward: A,
821
38.5k
        reverse: A,
822
38.5k
    ) -> Regex<A> {
823
38.5k
        Regex { forward, reverse }
824
38.5k
    }
<regex_automata::dfa::regex::Builder>::build_from_dfas::<regex_automata::dfa::dense::DFA<alloc::vec::Vec<u32>>>
Line
Count
Source
818
38.5k
    pub fn build_from_dfas<A: Automaton>(
819
38.5k
        &self,
820
38.5k
        forward: A,
821
38.5k
        reverse: A,
822
38.5k
    ) -> Regex<A> {
823
38.5k
        Regex { forward, reverse }
824
38.5k
    }
Unexecuted instantiation: <regex_automata::dfa::regex::Builder>::build_from_dfas::<regex_automata::dfa::sparse::DFA<alloc::vec::Vec<u8>>>
825
826
    /// Set the syntax configuration for this builder using
827
    /// [`syntax::Config`](crate::util::syntax::Config).
828
    ///
829
    /// This permits setting things like case insensitivity, Unicode and multi
830
    /// line mode.
831
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
832
0
    pub fn syntax(
833
0
        &mut self,
834
0
        config: crate::util::syntax::Config,
835
0
    ) -> &mut Builder {
836
0
        self.dfa.syntax(config);
837
0
        self
838
0
    }
839
840
    /// Set the Thompson NFA configuration for this builder using
841
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
842
    ///
843
    /// This permits setting things like whether additional time should be
844
    /// spent shrinking the size of the NFA.
845
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
846
0
    pub fn thompson(
847
0
        &mut self,
848
0
        config: crate::nfa::thompson::Config,
849
0
    ) -> &mut Builder {
850
0
        self.dfa.thompson(config);
851
0
        self
852
0
    }
853
854
    /// Set the dense DFA compilation configuration for this builder using
855
    /// [`dense::Config`].
856
    ///
857
    /// This permits setting things like whether the underlying DFAs should
858
    /// be minimized.
859
    #[cfg(feature = "dfa-build")]
860
0
    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
861
0
        self.dfa.configure(config);
862
0
        self
863
0
    }
864
}
865
866
impl Default for Builder {
867
0
    fn default() -> Builder {
868
0
        Builder::new()
869
0
    }
870
}