Coverage Report

Created: 2025-09-27 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.2.0/src/dfa/regex.rs
Line
Count
Source
1
/*!
2
A DFA-backed `Regex`.
3
4
This module provides [`Regex`], which is defined generically over the
5
[`Automaton`] trait. A `Regex` implements convenience routines you might have
6
come to expect, such as finding the start/end of a match and iterating over
7
all non-overlapping matches. This `Regex` type is limited in its capabilities
8
to what a DFA can provide. Therefore, APIs involving capturing groups, for
9
example, are not provided.
10
11
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12
finds the end offset of a match, where as the other is a "reverse" DFA that
13
find the start offset of a match.
14
15
See the [parent module](crate::dfa) for examples.
16
*/
17
18
#[cfg(feature = "alloc")]
19
use alloc::vec::Vec;
20
21
use crate::{
22
    dfa::automaton::{Automaton, OverlappingState},
23
    util::prefilter::{self, Prefilter},
24
    MatchError, MultiMatch,
25
};
26
#[cfg(feature = "alloc")]
27
use crate::{
28
    dfa::{dense, error::Error, sparse},
29
    nfa::thompson,
30
    util::matchtypes::MatchKind,
31
};
32
33
// When the alloc feature is enabled, the regex type sets its A type parameter
34
// to default to an owned dense DFA. But without alloc, we set no default. This
35
// makes things a lot more convenient in the common case, since writing out the
36
// DFA types is pretty annoying.
37
//
38
// Since we have two different definitions but only want to write one doc
39
// string, we use a macro to capture the doc and other attributes once and then
40
// repeat them for each definition.
41
macro_rules! define_regex_type {
42
    ($(#[$doc:meta])*) => {
43
        #[cfg(feature = "alloc")]
44
        $(#[$doc])*
45
        pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> {
46
            prefilter: Option<P>,
47
            forward: A,
48
            reverse: A,
49
            utf8: bool,
50
        }
51
52
        #[cfg(not(feature = "alloc"))]
53
        $(#[$doc])*
54
        pub struct Regex<A, P = prefilter::None> {
55
            prefilter: Option<P>,
56
            forward: A,
57
            reverse: A,
58
            utf8: bool,
59
        }
60
    };
61
}
62
63
define_regex_type!(
64
    /// A regular expression that uses deterministic finite automata for fast
65
    /// searching.
66
    ///
67
    /// A regular expression is comprised of two DFAs, a "forward" DFA and a
68
    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
69
    /// a match while the reverse DFA is responsible for detecting the start
70
    /// of a match. Thus, in order to find the bounds of any given match, a
71
    /// forward search must first be run followed by a reverse search. A match
72
    /// found by the forward DFA guarantees that the reverse DFA will also find
73
    /// a match.
74
    ///
75
    /// The type of the DFA used by a `Regex` corresponds to the `A` type
76
    /// parameter, which must satisfy the [`Automaton`] trait. Typically,
77
    /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
78
    /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
79
    /// memory but search faster, while sparse DFAs use less memory but search
80
    /// more slowly.
81
    ///
82
    /// By default, a regex's automaton type parameter is set to
83
    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
84
    /// in-memory work loads, this is the most convenient type that gives the
85
    /// best search performance. When the `alloc` feature is disabled, no
86
    /// default type is used.
87
    ///
88
    /// A `Regex` also has a `P` type parameter, which is used to select the
89
    /// prefilter used during search. By default, no prefilter is enabled by
90
    /// setting the type to default to [`prefilter::None`]. A prefilter can be
91
    /// enabled by using the [`Regex::prefilter`] method.
92
    ///
93
    /// # When should I use this?
94
    ///
95
    /// Generally speaking, if you can afford the overhead of building a full
96
    /// DFA for your regex, and you don't need things like capturing groups,
97
    /// then this is a good choice if you're looking to optimize for matching
98
    /// speed. Note however that its speed may be worse than a general purpose
99
    /// regex engine if you don't select a good [prefilter].
100
    ///
101
    /// # Earliest vs Leftmost vs Overlapping
102
    ///
103
    /// The search routines exposed on a `Regex` reflect three different ways
104
    /// of searching:
105
    ///
106
    /// * "earliest" means to stop as soon as a match has been detected.
107
    /// * "leftmost" means to continue matching until the underlying
108
    ///   automaton cannot advance. This reflects "standard" searching you
109
    ///   might be used to in other regex engines. e.g., This permits
110
    ///   non-greedy and greedy searching to work as you would expect.
111
    /// * "overlapping" means to find all possible matches, even if they
112
    ///   overlap.
113
    ///
114
    /// Generally speaking, when doing an overlapping search, you'll want to
115
    /// build your regex DFAs with [`MatchKind::All`] semantics. Using
116
    /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
117
    /// likely to lead to odd behavior since `LeftmostFirst` specifically omits
118
    /// some matches that can never be reported due to its semantics.
119
    ///
120
    /// The following example shows the differences between how these different
121
    /// types of searches impact looking for matches of `[a-z]+` in the
122
    /// haystack `abc`.
123
    ///
124
    /// ```
125
    /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch};
126
    ///
127
    /// let pattern = r"[a-z]+";
128
    /// let haystack = "abc".as_bytes();
129
    ///
130
    /// // With leftmost-first semantics, we test "earliest" and "leftmost".
131
    /// let re = dfa::regex::Builder::new()
132
    ///     .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst))
133
    ///     .build(pattern)?;
134
    ///
135
    /// // "earliest" searching isn't impacted by greediness
136
    /// let mut it = re.find_earliest_iter(haystack);
137
    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
138
    /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
139
    /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
140
    /// assert_eq!(None, it.next());
141
    ///
142
    /// // "leftmost" searching supports greediness (and non-greediness)
143
    /// let mut it = re.find_leftmost_iter(haystack);
144
    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
145
    /// assert_eq!(None, it.next());
146
    ///
147
    /// // For overlapping, we want "all" match kind semantics.
148
    /// let re = dfa::regex::Builder::new()
149
    ///     .dense(dense::Config::new().match_kind(MatchKind::All))
150
    ///     .build(pattern)?;
151
    ///
152
    /// // In the overlapping search, we find all three possible matches
153
    /// // starting at the beginning of the haystack.
154
    /// let mut it = re.find_overlapping_iter(haystack);
155
    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
156
    /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
157
    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
158
    /// assert_eq!(None, it.next());
159
    ///
160
    /// # Ok::<(), Box<dyn std::error::Error>>(())
161
    /// ```
162
    ///
163
    /// # Sparse DFAs
164
    ///
165
    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
166
    /// used with any kind of DFA. While this crate constructs dense DFAs by
167
    /// default, it is easy enough to build corresponding sparse DFAs, and then
168
    /// build a regex from them:
169
    ///
170
    /// ```
171
    /// use regex_automata::dfa::regex::Regex;
172
    ///
173
    /// // First, build a regex that uses dense DFAs.
174
    /// let dense_re = Regex::new("foo[0-9]+")?;
175
    ///
176
    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
177
    /// let fwd = dense_re.forward().to_sparse()?;
178
    /// let rev = dense_re.reverse().to_sparse()?;
179
    ///
180
    /// // Third, build a new regex from the constituent sparse DFAs.
181
    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
182
    ///
183
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
184
    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
185
    ///
186
    /// # Ok::<(), Box<dyn std::error::Error>>(())
187
    /// ```
188
    ///
189
    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
190
    /// more succinctly. (Note though that dense DFAs are still constructed
191
    /// first internally, and then converted to sparse DFAs, as in the example
192
    /// above.)
193
    ///
194
    /// ```
195
    /// use regex_automata::dfa::regex::Regex;
196
    ///
197
    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
198
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
199
    /// assert!(sparse_re.is_match(b"foo123"));
200
    ///
201
    /// # Ok::<(), Box<dyn std::error::Error>>(())
202
    /// ```
203
    ///
204
    /// # Fallibility
205
    ///
206
    /// In non-default configurations, the DFAs generated in this module may
207
    /// return an error during a search. (Currently, the only way this happens
208
    /// is if quit bytes are added or Unicode word boundaries are heuristically
209
    /// enabled, both of which are turned off by default.) For convenience, the
210
    /// main search routines, like [`find_leftmost`](Regex::find_leftmost),
211
    /// will panic if an error occurs. However, if you need to use DFAs
212
    /// which may produce an error at search time, then there are fallible
213
    /// equivalents of all search routines. For example, for `find_leftmost`,
214
    /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost).
215
    /// The routines prefixed with `try_` return `Result<Option<MultiMatch>,
216
    /// MatchError>`, where as the infallible routines simply return
217
    /// `Option<MultiMatch>`.
218
    ///
219
    /// # Example
220
    ///
221
    /// This example shows how to cause a search to terminate if it sees a
222
    /// `\n` byte, and handle the error returned. This could be useful if, for
223
    /// example, you wanted to prevent a user supplied pattern from matching
224
    /// across a line boundary.
225
    ///
226
    /// ```
227
    /// use regex_automata::{dfa::{self, regex::Regex}, MatchError};
228
    ///
229
    /// let re = Regex::builder()
230
    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
231
    ///     .build(r"foo\p{any}+bar")?;
232
    ///
233
    /// let haystack = "foo\nbar".as_bytes();
234
    /// // Normally this would produce a match, since \p{any} contains '\n'.
235
    /// // But since we instructed the automaton to enter a quit state if a
236
    /// // '\n' is observed, this produces a match error instead.
237
    /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
238
    /// let got = re.try_find_leftmost(haystack).unwrap_err();
239
    /// assert_eq!(expected, got);
240
    ///
241
    /// # Ok::<(), Box<dyn std::error::Error>>(())
242
    /// ```
243
    #[derive(Clone, Debug)]
244
);
245
246
#[cfg(feature = "alloc")]
247
impl Regex {
248
    /// Parse the given regular expression using the default configuration and
249
    /// return the corresponding regex.
250
    ///
251
    /// If you want a non-default configuration, then use the [`Builder`] to
252
    /// set your own configuration.
253
    ///
254
    /// # Example
255
    ///
256
    /// ```
257
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
258
    ///
259
    /// let re = Regex::new("foo[0-9]+bar")?;
260
    /// assert_eq!(
261
    ///     Some(MultiMatch::must(0, 3, 14)),
262
    ///     re.find_leftmost(b"zzzfoo12345barzzz"),
263
    /// );
264
    /// # Ok::<(), Box<dyn std::error::Error>>(())
265
    /// ```
266
    pub fn new(pattern: &str) -> Result<Regex, Error> {
267
        Builder::new().build(pattern)
268
    }
269
270
    /// Like `new`, but parses multiple patterns into a single "regex set."
271
    /// This similarly uses the default regex configuration.
272
    ///
273
    /// # Example
274
    ///
275
    /// ```
276
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
277
    ///
278
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
279
    ///
280
    /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
281
    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
282
    /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
283
    /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
284
    /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
285
    /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
286
    /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
287
    /// assert_eq!(None, it.next());
288
    /// # Ok::<(), Box<dyn std::error::Error>>(())
289
    /// ```
290
    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> {
291
        Builder::new().build_many(patterns)
292
    }
293
}
294
295
#[cfg(feature = "alloc")]
296
impl Regex<sparse::DFA<Vec<u8>>> {
297
    /// Parse the given regular expression using the default configuration,
298
    /// except using sparse DFAs, and return the corresponding regex.
299
    ///
300
    /// If you want a non-default configuration, then use the [`Builder`] to
301
    /// set your own configuration.
302
    ///
303
    /// # Example
304
    ///
305
    /// ```
306
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
307
    ///
308
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
309
    /// assert_eq!(
310
    ///     Some(MultiMatch::must(0, 3, 14)),
311
    ///     re.find_leftmost(b"zzzfoo12345barzzz"),
312
    /// );
313
    /// # Ok::<(), Box<dyn std::error::Error>>(())
314
    /// ```
315
    pub fn new_sparse(
316
        pattern: &str,
317
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
318
        Builder::new().build_sparse(pattern)
319
    }
320
321
    /// Like `new`, but parses multiple patterns into a single "regex set"
322
    /// using sparse DFAs. This otherwise similarly uses the default regex
323
    /// configuration.
324
    ///
325
    /// # Example
326
    ///
327
    /// ```
328
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
329
    ///
330
    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
331
    ///
332
    /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
333
    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
334
    /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
335
    /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
336
    /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
337
    /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
338
    /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
339
    /// assert_eq!(None, it.next());
340
    /// # Ok::<(), Box<dyn std::error::Error>>(())
341
    /// ```
342
    pub fn new_many_sparse<P: AsRef<str>>(
343
        patterns: &[P],
344
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
345
        Builder::new().build_many_sparse(patterns)
346
    }
347
}
348
349
/// Convenience routines for regex construction.
350
#[cfg(feature = "alloc")]
351
impl Regex {
352
    /// Return a default configuration for a `Regex`.
353
    ///
354
    /// This is a convenience routine to avoid needing to import the `Config`
355
    /// type when customizing the construction of a regex.
356
    ///
357
    /// # Example
358
    ///
359
    /// This example shows how to disable UTF-8 mode for `Regex` iteration.
360
    /// When UTF-8 mode is disabled, the position immediately following an
361
    /// empty match is where the next search begins, instead of the next
362
    /// position of a UTF-8 encoded codepoint.
363
    ///
364
    /// ```
365
    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
366
    ///
367
    /// let re = Regex::builder()
368
    ///     .configure(Regex::config().utf8(false))
369
    ///     .build(r"")?;
370
    /// let haystack = "a☃z".as_bytes();
371
    /// let mut it = re.find_leftmost_iter(haystack);
372
    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
373
    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
374
    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
375
    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
376
    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
377
    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
378
    /// assert_eq!(None, it.next());
379
    ///
380
    /// # Ok::<(), Box<dyn std::error::Error>>(())
381
    /// ```
382
    pub fn config() -> Config {
383
        Config::new()
384
    }
385
386
    /// Return a builder for configuring the construction of a `Regex`.
387
    ///
388
    /// This is a convenience routine to avoid needing to import the
389
    /// [`Builder`] type in common cases.
390
    ///
391
    /// # Example
392
    ///
393
    /// This example shows how to use the builder to disable UTF-8 mode
394
    /// everywhere.
395
    ///
396
    /// ```
397
    /// use regex_automata::{
398
    ///     dfa::regex::Regex,
399
    ///     nfa::thompson,
400
    ///     MultiMatch, SyntaxConfig,
401
    /// };
402
    ///
403
    /// let re = Regex::builder()
404
    ///     .configure(Regex::config().utf8(false))
405
    ///     .syntax(SyntaxConfig::new().utf8(false))
406
    ///     .thompson(thompson::Config::new().utf8(false))
407
    ///     .build(r"foo(?-u:[^b])ar.*")?;
408
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
409
    /// let expected = Some(MultiMatch::must(0, 1, 9));
410
    /// let got = re.find_leftmost(haystack);
411
    /// assert_eq!(expected, got);
412
    ///
413
    /// # Ok::<(), Box<dyn std::error::Error>>(())
414
    /// ```
415
    pub fn builder() -> Builder {
416
        Builder::new()
417
    }
418
}
419
420
/// Standard search routines for finding and iterating over matches.
421
impl<A: Automaton, P: Prefilter> Regex<A, P> {
422
    /// Returns true if and only if this regex matches the given haystack.
423
    ///
424
    /// This routine may short circuit if it knows that scanning future input
425
    /// will never lead to a different result. In particular, if the underlying
426
    /// DFA enters a match state or a dead state, then this routine will return
427
    /// `true` or `false`, respectively, without inspecting any future input.
428
    ///
429
    /// # Panics
430
    ///
431
    /// If the underlying DFAs return an error, then this routine panics. This
432
    /// only occurs in non-default configurations where quit bytes are used or
433
    /// Unicode word boundaries are heuristically enabled.
434
    ///
435
    /// The fallible version of this routine is
436
    /// [`try_is_match`](Regex::try_is_match).
437
    ///
438
    /// # Example
439
    ///
440
    /// ```
441
    /// use regex_automata::dfa::regex::Regex;
442
    ///
443
    /// let re = Regex::new("foo[0-9]+bar")?;
444
    /// assert_eq!(true, re.is_match(b"foo12345bar"));
445
    /// assert_eq!(false, re.is_match(b"foobar"));
446
    /// # Ok::<(), Box<dyn std::error::Error>>(())
447
    /// ```
448
0
    pub fn is_match(&self, haystack: &[u8]) -> bool {
449
0
        self.is_match_at(haystack, 0, haystack.len())
450
0
    }
451
452
    /// Returns the first position at which a match is found.
453
    ///
454
    /// This routine stops scanning input in precisely the same circumstances
455
    /// as `is_match`. The key difference is that this routine returns the
456
    /// position at which it stopped scanning input if and only if a match
457
    /// was found. If no match is found, then `None` is returned.
458
    ///
459
    /// # Panics
460
    ///
461
    /// If the underlying DFAs return an error, then this routine panics. This
462
    /// only occurs in non-default configurations where quit bytes are used or
463
    /// Unicode word boundaries are heuristically enabled.
464
    ///
465
    /// The fallible version of this routine is
466
    /// [`try_find_earliest`](Regex::try_find_earliest).
467
    ///
468
    /// # Example
469
    ///
470
    /// ```
471
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
472
    ///
473
    /// // Normally, the leftmost first match would greedily consume as many
474
    /// // decimal digits as it could. But a match is detected as soon as one
475
    /// // digit is seen.
476
    /// let re = Regex::new("foo[0-9]+")?;
477
    /// assert_eq!(
478
    ///     Some(MultiMatch::must(0, 0, 4)),
479
    ///     re.find_earliest(b"foo12345"),
480
    /// );
481
    ///
482
    /// // Normally, the end of the leftmost first match here would be 3,
483
    /// // but the "earliest" match semantics detect a match earlier.
484
    /// let re = Regex::new("abc|a")?;
485
    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc"));
486
    /// # Ok::<(), Box<dyn std::error::Error>>(())
487
    /// ```
488
0
    pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> {
489
0
        self.find_earliest_at(haystack, 0, haystack.len())
490
0
    }
491
492
    /// Returns the start and end offset of the leftmost match. If no match
493
    /// exists, then `None` is returned.
494
    ///
495
    /// # Panics
496
    ///
497
    /// If the underlying DFAs return an error, then this routine panics. This
498
    /// only occurs in non-default configurations where quit bytes are used or
499
    /// Unicode word boundaries are heuristically enabled.
500
    ///
501
    /// The fallible version of this routine is
502
    /// [`try_find_leftmost`](Regex::try_find_leftmost).
503
    ///
504
    /// # Example
505
    ///
506
    /// ```
507
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
508
    ///
509
    /// // Greediness is applied appropriately when compared to find_earliest.
510
    /// let re = Regex::new("foo[0-9]+")?;
511
    /// assert_eq!(
512
    ///     Some(MultiMatch::must(0, 3, 11)),
513
    ///     re.find_leftmost(b"zzzfoo12345zzz"),
514
    /// );
515
    ///
516
    /// // Even though a match is found after reading the first byte (`a`),
517
    /// // the default leftmost-first match semantics demand that we find the
518
    /// // earliest match that prefers earlier parts of the pattern over latter
519
    /// // parts.
520
    /// let re = Regex::new("abc|a")?;
521
    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc"));
522
    /// # Ok::<(), Box<dyn std::error::Error>>(())
523
    /// ```
524
0
    pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> {
525
0
        self.find_leftmost_at(haystack, 0, haystack.len())
526
0
    }
527
528
    /// Search for the first overlapping match in `haystack`.
529
    ///
530
    /// This routine is principally useful when searching for multiple patterns
531
    /// on inputs where multiple patterns may match the same regions of text.
532
    /// In particular, callers must preserve the automaton's search state from
533
    /// prior calls so that the implementation knows where the last match
534
    /// occurred and which pattern was reported.
535
    ///
536
    /// # Panics
537
    ///
538
    /// If the underlying DFAs return an error, then this routine panics. This
539
    /// only occurs in non-default configurations where quit bytes are used or
540
    /// Unicode word boundaries are heuristically enabled.
541
    ///
542
    /// The fallible version of this routine is
543
    /// [`try_find_overlapping`](Regex::try_find_overlapping).
544
    ///
545
    /// # Example
546
    ///
547
    /// This example shows how to run an overlapping search with multiple
548
    /// regexes.
549
    ///
550
    /// ```
551
    /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
552
    ///
553
    /// let re = Regex::builder()
554
    ///     .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
555
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
556
    /// let haystack = "@foo".as_bytes();
557
    /// let mut state = dfa::OverlappingState::start();
558
    ///
559
    /// let expected = Some(MultiMatch::must(1, 0, 4));
560
    /// let got = re.find_overlapping(haystack, &mut state);
561
    /// assert_eq!(expected, got);
562
    ///
563
    /// // The first pattern also matches at the same position, so re-running
564
    /// // the search will yield another match. Notice also that the first
565
    /// // pattern is returned after the second. This is because the second
566
    /// // pattern begins its match before the first, is therefore an earlier
567
    /// // match and is thus reported first.
568
    /// let expected = Some(MultiMatch::must(0, 1, 4));
569
    /// let got = re.find_overlapping(haystack, &mut state);
570
    /// assert_eq!(expected, got);
571
    ///
572
    /// # Ok::<(), Box<dyn std::error::Error>>(())
573
    /// ```
574
0
    pub fn find_overlapping(
575
0
        &self,
576
0
        haystack: &[u8],
577
0
        state: &mut OverlappingState,
578
0
    ) -> Option<MultiMatch> {
579
0
        self.find_overlapping_at(haystack, 0, haystack.len(), state)
580
0
    }
581
582
    /// Returns an iterator over all non-overlapping "earliest" matches.
583
    ///
584
    /// Match positions are reported as soon as a match is known to occur, even
585
    /// if the standard leftmost match would be longer.
586
    ///
587
    /// # Panics
588
    ///
589
    /// If the underlying DFAs return an error during iteration, then iteration
590
    /// panics. This only occurs in non-default configurations where quit bytes
591
    /// are used or Unicode word boundaries are heuristically enabled.
592
    ///
593
    /// The fallible version of this routine is
594
    /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
595
    ///
596
    /// # Example
597
    ///
598
    /// This example shows how to run an "earliest" iterator.
599
    ///
600
    /// ```
601
    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
602
    ///
603
    /// let re = Regex::new("[0-9]+")?;
604
    /// let haystack = "123".as_bytes();
605
    ///
606
    /// // Normally, a standard leftmost iterator would return a single
607
    /// // match, but since "earliest" detects matches earlier, we get
608
    /// // three matches.
609
    /// let mut it = re.find_earliest_iter(haystack);
610
    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
611
    /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
612
    /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
613
    /// assert_eq!(None, it.next());
614
    ///
615
    /// # Ok::<(), Box<dyn std::error::Error>>(())
616
    /// ```
617
0
    pub fn find_earliest_iter<'r, 't>(
618
0
        &'r self,
619
0
        haystack: &'t [u8],
620
0
    ) -> FindEarliestMatches<'r, 't, A, P> {
621
0
        FindEarliestMatches::new(self, haystack)
622
0
    }
623
624
    /// Returns an iterator over all non-overlapping leftmost matches in the
625
    /// given bytes. If no match exists, then the iterator yields no elements.
626
    ///
627
    /// This corresponds to the "standard" regex search iterator.
628
    ///
629
    /// # Panics
630
    ///
631
    /// If the underlying DFAs return an error during iteration, then iteration
632
    /// panics. This only occurs in non-default configurations where quit bytes
633
    /// are used or Unicode word boundaries are heuristically enabled.
634
    ///
635
    /// The fallible version of this routine is
636
    /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
637
    ///
638
    /// # Example
639
    ///
640
    /// ```
641
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
642
    ///
643
    /// let re = Regex::new("foo[0-9]+")?;
644
    /// let text = b"foo1 foo12 foo123";
645
    /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
646
    /// assert_eq!(matches, vec![
647
    ///     MultiMatch::must(0, 0, 4),
648
    ///     MultiMatch::must(0, 5, 10),
649
    ///     MultiMatch::must(0, 11, 17),
650
    /// ]);
651
    /// # Ok::<(), Box<dyn std::error::Error>>(())
652
    /// ```
653
0
    pub fn find_leftmost_iter<'r, 't>(
654
0
        &'r self,
655
0
        haystack: &'t [u8],
656
0
    ) -> FindLeftmostMatches<'r, 't, A, P> {
657
0
        FindLeftmostMatches::new(self, haystack)
658
0
    }
659
660
    /// Returns an iterator over all overlapping matches in the given haystack.
661
    ///
662
    /// This routine is principally useful when searching for multiple patterns
663
    /// on inputs where multiple patterns may match the same regions of text.
664
    /// The iterator takes care of handling the overlapping state that must be
665
    /// threaded through every search.
666
    ///
667
    /// # Panics
668
    ///
669
    /// If the underlying DFAs return an error during iteration, then iteration
670
    /// panics. This only occurs in non-default configurations where quit bytes
671
    /// are used or Unicode word boundaries are heuristically enabled.
672
    ///
673
    /// The fallible version of this routine is
674
    /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
675
    ///
676
    /// # Example
677
    ///
678
    /// This example shows how to run an overlapping search with multiple
679
    /// regexes.
680
    ///
681
    /// ```
682
    /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
683
    ///
684
    /// let re = Regex::builder()
685
    ///     .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
686
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
687
    /// let haystack = "@foo".as_bytes();
688
    ///
689
    /// let mut it = re.find_overlapping_iter(haystack);
690
    /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
691
    /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
692
    /// assert_eq!(None, it.next());
693
    ///
694
    /// # Ok::<(), Box<dyn std::error::Error>>(())
695
    /// ```
696
0
    pub fn find_overlapping_iter<'r, 't>(
697
0
        &'r self,
698
0
        haystack: &'t [u8],
699
0
    ) -> FindOverlappingMatches<'r, 't, A, P> {
700
0
        FindOverlappingMatches::new(self, haystack)
701
0
    }
702
}
703
704
/// Lower level infallible search routines that permit controlling where
705
/// the search starts and ends in a particular sequence. This is useful for
706
/// executing searches that need to take surrounding context into account. This
707
/// is required for correctly implementing iteration because of look-around
708
/// operators (`^`, `$`, `\b`).
709
impl<A: Automaton, P: Prefilter> Regex<A, P> {
710
    /// Returns true if and only if this regex matches the given haystack.
711
    ///
712
    /// This routine may short circuit if it knows that scanning future input
713
    /// will never lead to a different result. In particular, if the underlying
714
    /// DFA enters a match state or a dead state, then this routine will return
715
    /// `true` or `false`, respectively, without inspecting any future input.
716
    ///
717
    /// # Searching a substring of the haystack
718
    ///
719
    /// Being an "at" search routine, this permits callers to search a
720
    /// substring of `haystack` by specifying a range in `haystack`.
721
    /// Why expose this as an API instead of just asking callers to use
722
    /// `&input[start..end]`? The reason is that regex matching often wants
723
    /// to take the surrounding context into account in order to handle
724
    /// look-around (`^`, `$` and `\b`).
725
    ///
726
    /// # Panics
727
    ///
728
    /// If the underlying DFAs return an error, then this routine panics. This
729
    /// only occurs in non-default configurations where quit bytes are used or
730
    /// Unicode word boundaries are heuristically enabled.
731
    ///
732
    /// The fallible version of this routine is
733
    /// [`try_is_match_at`](Regex::try_is_match_at).
734
0
    pub fn is_match_at(
735
0
        &self,
736
0
        haystack: &[u8],
737
0
        start: usize,
738
0
        end: usize,
739
0
    ) -> bool {
740
0
        self.try_is_match_at(haystack, start, end).unwrap()
741
0
    }
742
743
    /// Returns the first position at which a match is found.
744
    ///
745
    /// This routine stops scanning input in precisely the same circumstances
746
    /// as `is_match`. The key difference is that this routine returns the
747
    /// position at which it stopped scanning input if and only if a match
748
    /// was found. If no match is found, then `None` is returned.
749
    ///
750
    /// # Searching a substring of the haystack
751
    ///
752
    /// Being an "at" search routine, this permits callers to search a
753
    /// substring of `haystack` by specifying a range in `haystack`.
754
    /// Why expose this as an API instead of just asking callers to use
755
    /// `&input[start..end]`? The reason is that regex matching often wants
756
    /// to take the surrounding context into account in order to handle
757
    /// look-around (`^`, `$` and `\b`).
758
    ///
759
    /// This is useful when implementing an iterator over matches
760
    /// within the same haystack, which cannot be done correctly by simply
761
    /// providing a subslice of `haystack`.
762
    ///
763
    /// # Panics
764
    ///
765
    /// If the underlying DFAs return an error, then this routine panics. This
766
    /// only occurs in non-default configurations where quit bytes are used or
767
    /// Unicode word boundaries are heuristically enabled.
768
    ///
769
    /// The fallible version of this routine is
770
    /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
771
0
    pub fn find_earliest_at(
772
0
        &self,
773
0
        haystack: &[u8],
774
0
        start: usize,
775
0
        end: usize,
776
0
    ) -> Option<MultiMatch> {
777
0
        self.try_find_earliest_at(haystack, start, end).unwrap()
778
0
    }
779
780
    /// Returns the same as `find_leftmost`, but starts the search at the given
781
    /// offset.
782
    ///
783
    /// The significance of the starting point is that it takes the surrounding
784
    /// context into consideration. For example, if the DFA is anchored, then
785
    /// a match can only occur when `start == 0`.
786
    ///
787
    /// # Searching a substring of the haystack
788
    ///
789
    /// Being an "at" search routine, this permits callers to search a
790
    /// substring of `haystack` by specifying a range in `haystack`.
791
    /// Why expose this as an API instead of just asking callers to use
792
    /// `&input[start..end]`? The reason is that regex matching often wants
793
    /// to take the surrounding context into account in order to handle
794
    /// look-around (`^`, `$` and `\b`).
795
    ///
796
    /// This is useful when implementing an iterator over matches within the
797
    /// same haystack, which cannot be done correctly by simply providing a
798
    /// subslice of `haystack`.
799
    ///
800
    /// # Panics
801
    ///
802
    /// If the underlying DFAs return an error, then this routine panics. This
803
    /// only occurs in non-default configurations where quit bytes are used or
804
    /// Unicode word boundaries are heuristically enabled.
805
    ///
806
    /// The fallible version of this routine is
807
    /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
808
0
    pub fn find_leftmost_at(
809
0
        &self,
810
0
        haystack: &[u8],
811
0
        start: usize,
812
0
        end: usize,
813
0
    ) -> Option<MultiMatch> {
814
0
        self.try_find_leftmost_at(haystack, start, end).unwrap()
815
0
    }
816
817
    /// Search for the first overlapping match within a given range of
818
    /// `haystack`.
819
    ///
820
    /// This routine is principally useful when searching for multiple patterns
821
    /// on inputs where multiple patterns may match the same regions of text.
822
    /// In particular, callers must preserve the automaton's search state from
823
    /// prior calls so that the implementation knows where the last match
824
    /// occurred and which pattern was reported.
825
    ///
826
    /// # Searching a substring of the haystack
827
    ///
828
    /// Being an "at" search routine, this permits callers to search a
829
    /// substring of `haystack` by specifying a range in `haystack`.
830
    /// Why expose this as an API instead of just asking callers to use
831
    /// `&input[start..end]`? The reason is that regex matching often wants
832
    /// to take the surrounding context into account in order to handle
833
    /// look-around (`^`, `$` and `\b`).
834
    ///
835
    /// This is useful when implementing an iterator over matches
836
    /// within the same haystack, which cannot be done correctly by simply
837
    /// providing a subslice of `haystack`.
838
    ///
839
    /// # Panics
840
    ///
841
    /// If the underlying DFAs return an error, then this routine panics. This
842
    /// only occurs in non-default configurations where quit bytes are used or
843
    /// Unicode word boundaries are heuristically enabled.
844
    ///
845
    /// The fallible version of this routine is
846
    /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
847
0
    pub fn find_overlapping_at(
848
0
        &self,
849
0
        haystack: &[u8],
850
0
        start: usize,
851
0
        end: usize,
852
0
        state: &mut OverlappingState,
853
0
    ) -> Option<MultiMatch> {
854
0
        self.try_find_overlapping_at(haystack, start, end, state).unwrap()
855
0
    }
856
}
857
858
/// Fallible search routines. These may return an error when the underlying
859
/// DFAs have been configured in a way that permits them to fail during a
860
/// search.
861
///
862
/// Errors during search only occur when the DFA has been explicitly
863
/// configured to do so, usually by specifying one or more "quit" bytes or by
864
/// heuristically enabling Unicode word boundaries.
865
///
866
/// Errors will never be returned using the default configuration. So these
867
/// fallible routines are only needed for particular configurations.
868
impl<A: Automaton, P: Prefilter> Regex<A, P> {
869
    /// Returns true if and only if this regex matches the given haystack.
870
    ///
871
    /// This routine may short circuit if it knows that scanning future input
872
    /// will never lead to a different result. In particular, if the underlying
873
    /// DFA enters a match state or a dead state, then this routine will return
874
    /// `true` or `false`, respectively, without inspecting any future input.
875
    ///
876
    /// # Errors
877
    ///
878
    /// This routine only errors if the search could not complete. For
879
    /// DFA-based regexes, this only occurs in a non-default configuration
880
    /// where quit bytes are used or Unicode word boundaries are heuristically
881
    /// enabled.
882
    ///
883
    /// When a search cannot complete, callers cannot know whether a match
884
    /// exists or not.
885
    ///
886
    /// The infallible (panics on error) version of this routine is
887
    /// [`is_match`](Regex::is_match).
888
0
    pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> {
889
0
        self.try_is_match_at(haystack, 0, haystack.len())
890
0
    }
891
892
    /// Returns the first position at which a match is found.
893
    ///
894
    /// This routine stops scanning input in precisely the same circumstances
895
    /// as `is_match`. The key difference is that this routine returns the
896
    /// position at which it stopped scanning input if and only if a match
897
    /// was found. If no match is found, then `None` is returned.
898
    ///
899
    /// # Errors
900
    ///
901
    /// This routine only errors if the search could not complete. For
902
    /// DFA-based regexes, this only occurs in a non-default configuration
903
    /// where quit bytes are used or Unicode word boundaries are heuristically
904
    /// enabled.
905
    ///
906
    /// When a search cannot complete, callers cannot know whether a match
907
    /// exists or not.
908
    ///
909
    /// The infallible (panics on error) version of this routine is
910
    /// [`find_earliest`](Regex::find_earliest).
911
0
    pub fn try_find_earliest(
912
0
        &self,
913
0
        haystack: &[u8],
914
0
    ) -> Result<Option<MultiMatch>, MatchError> {
915
0
        self.try_find_earliest_at(haystack, 0, haystack.len())
916
0
    }
917
918
    /// Returns the start and end offset of the leftmost match. If no match
919
    /// exists, then `None` is returned.
920
    ///
921
    /// # Errors
922
    ///
923
    /// This routine only errors if the search could not complete. For
924
    /// DFA-based regexes, this only occurs in a non-default configuration
925
    /// where quit bytes are used or Unicode word boundaries are heuristically
926
    /// enabled.
927
    ///
928
    /// When a search cannot complete, callers cannot know whether a match
929
    /// exists or not.
930
    ///
931
    /// The infallible (panics on error) version of this routine is
932
    /// [`find_leftmost`](Regex::find_leftmost).
933
0
    pub fn try_find_leftmost(
934
0
        &self,
935
0
        haystack: &[u8],
936
0
    ) -> Result<Option<MultiMatch>, MatchError> {
937
0
        self.try_find_leftmost_at(haystack, 0, haystack.len())
938
0
    }
939
940
    /// Search for the first overlapping match in `haystack`.
941
    ///
942
    /// This routine is principally useful when searching for multiple patterns
943
    /// on inputs where multiple patterns may match the same regions of text.
944
    /// In particular, callers must preserve the automaton's search state from
945
    /// prior calls so that the implementation knows where the last match
946
    /// occurred and which pattern was reported.
947
    ///
948
    /// # Errors
949
    ///
950
    /// This routine only errors if the search could not complete. For
951
    /// DFA-based regexes, this only occurs in a non-default configuration
952
    /// where quit bytes are used or Unicode word boundaries are heuristically
953
    /// enabled.
954
    ///
955
    /// When a search cannot complete, callers cannot know whether a match
956
    /// exists or not.
957
    ///
958
    /// The infallible (panics on error) version of this routine is
959
    /// [`find_overlapping`](Regex::find_overlapping).
960
0
    pub fn try_find_overlapping(
961
0
        &self,
962
0
        haystack: &[u8],
963
0
        state: &mut OverlappingState,
964
0
    ) -> Result<Option<MultiMatch>, MatchError> {
965
0
        self.try_find_overlapping_at(haystack, 0, haystack.len(), state)
966
0
    }
967
968
    /// Returns an iterator over all non-overlapping "earliest" matches.
969
    ///
970
    /// Match positions are reported as soon as a match is known to occur, even
971
    /// if the standard leftmost match would be longer.
972
    ///
973
    /// # Errors
974
    ///
975
    /// This iterator only yields errors if the search could not complete. For
976
    /// DFA-based regexes, this only occurs in a non-default configuration
977
    /// where quit bytes are used or Unicode word boundaries are heuristically
978
    /// enabled.
979
    ///
980
    /// When a search cannot complete, callers cannot know whether a match
981
    /// exists or not.
982
    ///
983
    /// The infallible (panics on error) version of this routine is
984
    /// [`find_earliest_iter`](Regex::find_earliest_iter).
985
0
    pub fn try_find_earliest_iter<'r, 't>(
986
0
        &'r self,
987
0
        haystack: &'t [u8],
988
0
    ) -> TryFindEarliestMatches<'r, 't, A, P> {
989
0
        TryFindEarliestMatches::new(self, haystack)
990
0
    }
991
992
    /// Returns an iterator over all non-overlapping leftmost matches in the
993
    /// given bytes. If no match exists, then the iterator yields no elements.
994
    ///
995
    /// This corresponds to the "standard" regex search iterator.
996
    ///
997
    /// # Errors
998
    ///
999
    /// This iterator only yields errors if the search could not complete. For
1000
    /// DFA-based regexes, this only occurs in a non-default configuration
1001
    /// where quit bytes are used or Unicode word boundaries are heuristically
1002
    /// enabled.
1003
    ///
1004
    /// When a search cannot complete, callers cannot know whether a match
1005
    /// exists or not.
1006
    ///
1007
    /// The infallible (panics on error) version of this routine is
1008
    /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
1009
0
    pub fn try_find_leftmost_iter<'r, 't>(
1010
0
        &'r self,
1011
0
        haystack: &'t [u8],
1012
0
    ) -> TryFindLeftmostMatches<'r, 't, A, P> {
1013
0
        TryFindLeftmostMatches::new(self, haystack)
1014
0
    }
1015
1016
    /// Returns an iterator over all overlapping matches in the given haystack.
1017
    ///
1018
    /// This routine is principally useful when searching for multiple patterns
1019
    /// on inputs where multiple patterns may match the same regions of text.
1020
    /// The iterator takes care of handling the overlapping state that must be
1021
    /// threaded through every search.
1022
    ///
1023
    /// # Errors
1024
    ///
1025
    /// This iterator only yields errors if the search could not complete. For
1026
    /// DFA-based regexes, this only occurs in a non-default configuration
1027
    /// where quit bytes are used or Unicode word boundaries are heuristically
1028
    /// enabled.
1029
    ///
1030
    /// When a search cannot complete, callers cannot know whether a match
1031
    /// exists or not.
1032
    ///
1033
    /// The infallible (panics on error) version of this routine is
1034
    /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
1035
0
    pub fn try_find_overlapping_iter<'r, 't>(
1036
0
        &'r self,
1037
0
        haystack: &'t [u8],
1038
0
    ) -> TryFindOverlappingMatches<'r, 't, A, P> {
1039
0
        TryFindOverlappingMatches::new(self, haystack)
1040
0
    }
1041
}
1042
1043
/// Lower level fallible search routines that permit controlling where the
1044
/// search starts and ends in a particular sequence.
1045
impl<A: Automaton, P: Prefilter> Regex<A, P> {
1046
    /// Returns true if and only if this regex matches the given haystack.
1047
    ///
1048
    /// This routine may short circuit if it knows that scanning future input
1049
    /// will never lead to a different result. In particular, if the underlying
1050
    /// DFA enters a match state or a dead state, then this routine will return
1051
    /// `true` or `false`, respectively, without inspecting any future input.
1052
    ///
1053
    /// # Searching a substring of the haystack
1054
    ///
1055
    /// Being an "at" search routine, this permits callers to search a
1056
    /// substring of `haystack` by specifying a range in `haystack`.
1057
    /// Why expose this as an API instead of just asking callers to use
1058
    /// `&input[start..end]`? The reason is that regex matching often wants
1059
    /// to take the surrounding context into account in order to handle
1060
    /// look-around (`^`, `$` and `\b`).
1061
    ///
1062
    /// # Errors
1063
    ///
1064
    /// This routine only errors if the search could not complete. For
1065
    /// DFA-based regexes, this only occurs in a non-default configuration
1066
    /// where quit bytes are used, Unicode word boundaries are heuristically
1067
    /// enabled or limits are set on the number of times the lazy DFA's cache
1068
    /// may be cleared.
1069
    ///
1070
    /// When a search cannot complete, callers cannot know whether a match
1071
    /// exists or not.
1072
    ///
1073
    /// The infallible (panics on error) version of this routine is
1074
    /// [`is_match_at`](Regex::is_match_at).
1075
0
    pub fn try_is_match_at(
1076
0
        &self,
1077
0
        haystack: &[u8],
1078
0
        start: usize,
1079
0
        end: usize,
1080
0
    ) -> Result<bool, MatchError> {
1081
0
        self.forward()
1082
0
            .find_earliest_fwd_at(
1083
0
                self.scanner().as_mut(),
1084
0
                None,
1085
0
                haystack,
1086
0
                start,
1087
0
                end,
1088
            )
1089
0
            .map(|x| x.is_some())
1090
0
    }
1091
1092
    /// Returns the first position at which a match is found.
1093
    ///
1094
    /// This routine stops scanning input in precisely the same circumstances
1095
    /// as `is_match`. The key difference is that this routine returns the
1096
    /// position at which it stopped scanning input if and only if a match
1097
    /// was found. If no match is found, then `None` is returned.
1098
    ///
1099
    /// # Searching a substring of the haystack
1100
    ///
1101
    /// Being an "at" search routine, this permits callers to search a
1102
    /// substring of `haystack` by specifying a range in `haystack`.
1103
    /// Why expose this as an API instead of just asking callers to use
1104
    /// `&input[start..end]`? The reason is that regex matching often wants
1105
    /// to take the surrounding context into account in order to handle
1106
    /// look-around (`^`, `$` and `\b`).
1107
    ///
1108
    /// This is useful when implementing an iterator over matches
1109
    /// within the same haystack, which cannot be done correctly by simply
1110
    /// providing a subslice of `haystack`.
1111
    ///
1112
    /// # Errors
1113
    ///
1114
    /// This routine only errors if the search could not complete. For
1115
    /// DFA-based regexes, this only occurs in a non-default configuration
1116
    /// where quit bytes are used or Unicode word boundaries are heuristically
1117
    /// enabled.
1118
    ///
1119
    /// When a search cannot complete, callers cannot know whether a match
1120
    /// exists or not.
1121
    ///
1122
    /// The infallible (panics on error) version of this routine is
1123
    /// [`find_earliest_at`](Regex::find_earliest_at).
1124
0
    pub fn try_find_earliest_at(
1125
0
        &self,
1126
0
        haystack: &[u8],
1127
0
        start: usize,
1128
0
        end: usize,
1129
0
    ) -> Result<Option<MultiMatch>, MatchError> {
1130
0
        self.try_find_earliest_at_imp(
1131
0
            self.scanner().as_mut(),
1132
0
            haystack,
1133
0
            start,
1134
0
            end,
1135
        )
1136
0
    }
1137
1138
    /// The implementation of "earliest" searching, where a prefilter scanner
1139
    /// may be given.
1140
0
    fn try_find_earliest_at_imp(
1141
0
        &self,
1142
0
        pre: Option<&mut prefilter::Scanner>,
1143
0
        haystack: &[u8],
1144
0
        start: usize,
1145
0
        end: usize,
1146
0
    ) -> Result<Option<MultiMatch>, MatchError> {
1147
        // N.B. We use `&&A` here to call `Automaton` methods, which ensures
1148
        // that we always use the `impl Automaton for &A` for calling methods.
1149
        // Since this is the usual way that automata are used, this helps
1150
        // reduce the number of monomorphized copies of the search code.
1151
0
        let (fwd, rev) = (self.forward(), self.reverse());
1152
0
        let end = match (&fwd)
1153
0
            .find_earliest_fwd_at(pre, None, haystack, start, end)?
1154
        {
1155
0
            None => return Ok(None),
1156
0
            Some(end) => end,
1157
        };
1158
        // N.B. The only time we need to tell the reverse searcher the pattern
1159
        // to match is in the overlapping case, since it's ambiguous. In the
1160
        // leftmost case, I have tentatively convinced myself that it isn't
1161
        // necessary and the reverse search will always find the same pattern
1162
        // to match as the forward search. But I lack a rigorous proof.
1163
0
        let start = (&rev)
1164
0
            .find_earliest_rev_at(None, haystack, start, end.offset())?
1165
0
            .expect("reverse search must match if forward search does");
1166
0
        assert_eq!(
1167
0
            start.pattern(),
1168
0
            end.pattern(),
1169
0
            "forward and reverse search must match same pattern"
1170
        );
1171
0
        assert!(start.offset() <= end.offset());
1172
0
        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
1173
0
    }
1174
1175
    /// Returns the start and end offset of the leftmost match. If no match
1176
    /// exists, then `None` is returned.
1177
    ///
1178
    /// # Searching a substring of the haystack
1179
    ///
1180
    /// Being an "at" search routine, this permits callers to search a
1181
    /// substring of `haystack` by specifying a range in `haystack`.
1182
    /// Why expose this as an API instead of just asking callers to use
1183
    /// `&input[start..end]`? The reason is that regex matching often wants
1184
    /// to take the surrounding context into account in order to handle
1185
    /// look-around (`^`, `$` and `\b`).
1186
    ///
1187
    /// This is useful when implementing an iterator over matches
1188
    /// within the same haystack, which cannot be done correctly by simply
1189
    /// providing a subslice of `haystack`.
1190
    ///
1191
    /// # Errors
1192
    ///
1193
    /// This routine only errors if the search could not complete. For
1194
    /// DFA-based regexes, this only occurs in a non-default configuration
1195
    /// where quit bytes are used or Unicode word boundaries are heuristically
1196
    /// enabled.
1197
    ///
1198
    /// When a search cannot complete, callers cannot know whether a match
1199
    /// exists or not.
1200
    ///
1201
    /// The infallible (panics on error) version of this routine is
1202
    /// [`find_leftmost_at`](Regex::find_leftmost_at).
1203
0
    pub fn try_find_leftmost_at(
1204
0
        &self,
1205
0
        haystack: &[u8],
1206
0
        start: usize,
1207
0
        end: usize,
1208
0
    ) -> Result<Option<MultiMatch>, MatchError> {
1209
0
        self.try_find_leftmost_at_imp(
1210
0
            self.scanner().as_mut(),
1211
0
            haystack,
1212
0
            start,
1213
0
            end,
1214
        )
1215
0
    }
1216
1217
    /// The implementation of leftmost searching, where a prefilter scanner
1218
    /// may be given.
1219
0
    fn try_find_leftmost_at_imp(
1220
0
        &self,
1221
0
        scanner: Option<&mut prefilter::Scanner>,
1222
0
        haystack: &[u8],
1223
0
        start: usize,
1224
0
        end: usize,
1225
0
    ) -> Result<Option<MultiMatch>, MatchError> {
1226
        // N.B. We use `&&A` here to call `Automaton` methods, which ensures
1227
        // that we always use the `impl Automaton for &A` for calling methods.
1228
        // Since this is the usual way that automata are used, this helps
1229
        // reduce the number of monomorphized copies of the search code.
1230
0
        let (fwd, rev) = (self.forward(), self.reverse());
1231
0
        let end = match (&fwd)
1232
0
            .find_leftmost_fwd_at(scanner, None, haystack, start, end)?
1233
        {
1234
0
            None => return Ok(None),
1235
0
            Some(end) => end,
1236
        };
1237
        // N.B. The only time we need to tell the reverse searcher the pattern
1238
        // to match is in the overlapping case, since it's ambiguous. In the
1239
        // leftmost case, I have tentatively convinced myself that it isn't
1240
        // necessary and the reverse search will always find the same pattern
1241
        // to match as the forward search. But I lack a rigorous proof. Why not
1242
        // just provide the pattern anyway? Well, if it is needed, then leaving
1243
        // it out gives us a chance to find a witness.
1244
0
        let start = (&rev)
1245
0
            .find_leftmost_rev_at(None, haystack, start, end.offset())?
1246
0
            .expect("reverse search must match if forward search does");
1247
0
        assert_eq!(
1248
0
            start.pattern(),
1249
0
            end.pattern(),
1250
0
            "forward and reverse search must match same pattern",
1251
        );
1252
0
        assert!(start.offset() <= end.offset());
1253
0
        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
1254
0
    }
1255
1256
    /// Search for the first overlapping match within a given range of
1257
    /// `haystack`.
1258
    ///
1259
    /// This routine is principally useful when searching for multiple patterns
1260
    /// on inputs where multiple patterns may match the same regions of text.
1261
    /// In particular, callers must preserve the automaton's search state from
1262
    /// prior calls so that the implementation knows where the last match
1263
    /// occurred and which pattern was reported.
1264
    ///
1265
    /// # Searching a substring of the haystack
1266
    ///
1267
    /// Being an "at" search routine, this permits callers to search a
1268
    /// substring of `haystack` by specifying a range in `haystack`.
1269
    /// Why expose this as an API instead of just asking callers to use
1270
    /// `&input[start..end]`? The reason is that regex matching often wants
1271
    /// to take the surrounding context into account in order to handle
1272
    /// look-around (`^`, `$` and `\b`).
1273
    ///
1274
    /// This is useful when implementing an iterator over matches
1275
    /// within the same haystack, which cannot be done correctly by simply
1276
    /// providing a subslice of `haystack`.
1277
    ///
1278
    /// # Errors
1279
    ///
1280
    /// This routine only errors if the search could not complete. For
1281
    /// DFA-based regexes, this only occurs in a non-default configuration
1282
    /// where quit bytes are used or Unicode word boundaries are heuristically
1283
    /// enabled.
1284
    ///
1285
    /// When a search cannot complete, callers cannot know whether a match
1286
    /// exists or not.
1287
    ///
1288
    /// The infallible (panics on error) version of this routine is
1289
    /// [`find_overlapping_at`](Regex::find_overlapping_at).
1290
0
    pub fn try_find_overlapping_at(
1291
0
        &self,
1292
0
        haystack: &[u8],
1293
0
        start: usize,
1294
0
        end: usize,
1295
0
        state: &mut OverlappingState,
1296
0
    ) -> Result<Option<MultiMatch>, MatchError> {
1297
0
        self.try_find_overlapping_at_imp(
1298
0
            self.scanner().as_mut(),
1299
0
            haystack,
1300
0
            start,
1301
0
            end,
1302
0
            state,
1303
        )
1304
0
    }
1305
1306
    /// The implementation of overlapping search at a given range in
1307
    /// `haystack`, where `scanner` is a prefilter (if active) and `state` is
1308
    /// the current state of the search.
1309
0
    fn try_find_overlapping_at_imp(
1310
0
        &self,
1311
0
        scanner: Option<&mut prefilter::Scanner>,
1312
0
        haystack: &[u8],
1313
0
        start: usize,
1314
0
        end: usize,
1315
0
        state: &mut OverlappingState,
1316
0
    ) -> Result<Option<MultiMatch>, MatchError> {
1317
        // N.B. We use `&&A` here to call `Automaton` methods, which ensures
1318
        // that we always use the `impl Automaton for &A` for calling methods.
1319
        // Since this is the usual way that automata are used, this helps
1320
        // reduce the number of monomorphized copies of the search code.
1321
0
        let (fwd, rev) = (self.forward(), self.reverse());
1322
        // TODO: Decide whether it's worth making this assert work. It doesn't
1323
        // work currently because 'has_starts_for_each_pattern' isn't on the
1324
        // Automaton trait. Without this assert, we still get a panic, but it's
1325
        // a bit more inscrutable.
1326
        // assert!(
1327
        // rev.has_starts_for_each_pattern(),
1328
        // "overlapping searches require that the reverse DFA is \
1329
        // compiled with the 'starts_for_each_pattern' option",
1330
        // );
1331
0
        let end = match (&fwd).find_overlapping_fwd_at(
1332
0
            scanner, None, haystack, start, end, state,
1333
0
        )? {
1334
0
            None => return Ok(None),
1335
0
            Some(end) => end,
1336
        };
1337
        // Unlike the leftmost cases, the reverse overlapping search may match
1338
        // a different pattern than the forward search. See test failures when
1339
        // using `None` instead of `Some(end.pattern())` below. Thus, we must
1340
        // run our reverse search using the pattern that matched in the forward
1341
        // direction.
1342
0
        let start = (&rev)
1343
0
            .find_leftmost_rev_at(
1344
0
                Some(end.pattern()),
1345
0
                haystack,
1346
                0,
1347
0
                end.offset(),
1348
0
            )?
1349
0
            .expect("reverse search must match if forward search does");
1350
0
        assert!(start.offset() <= end.offset());
1351
0
        assert_eq!(start.pattern(), end.pattern());
1352
0
        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
1353
0
    }
1354
}
1355
1356
/// Non-search APIs for querying information about the regex and setting a
1357
/// prefilter.
1358
impl<A: Automaton, P: Prefilter> Regex<A, P> {
1359
    /// Attach the given prefilter to this regex.
1360
0
    pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> {
1361
0
        Regex {
1362
0
            prefilter: Some(prefilter),
1363
0
            forward: self.forward,
1364
0
            reverse: self.reverse,
1365
0
            utf8: self.utf8,
1366
0
        }
1367
0
    }
1368
1369
    /// Remove any prefilter from this regex.
1370
0
    pub fn without_prefilter(self) -> Regex<A> {
1371
0
        Regex {
1372
0
            prefilter: None,
1373
0
            forward: self.forward,
1374
0
            reverse: self.reverse,
1375
0
            utf8: self.utf8,
1376
0
        }
1377
0
    }
1378
1379
    /// Return the underlying DFA responsible for forward matching.
1380
    ///
1381
    /// This is useful for accessing the underlying DFA and converting it to
1382
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
1383
    /// for an example of where this might be useful.
1384
0
    pub fn forward(&self) -> &A {
1385
0
        &self.forward
1386
0
    }
1387
1388
    /// Return the underlying DFA responsible for reverse matching.
1389
    ///
1390
    /// This is useful for accessing the underlying DFA and converting it to
1391
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
1392
    /// for an example of where this might be useful.
1393
0
    pub fn reverse(&self) -> &A {
1394
0
        &self.reverse
1395
0
    }
1396
1397
    /// Returns the total number of patterns matched by this regex.
1398
    ///
1399
    /// # Example
1400
    ///
1401
    /// ```
1402
    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
1403
    ///
1404
    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
1405
    /// assert_eq!(3, re.pattern_count());
1406
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1407
    /// ```
1408
0
    pub fn pattern_count(&self) -> usize {
1409
0
        assert_eq!(
1410
0
            self.forward().pattern_count(),
1411
0
            self.reverse().pattern_count()
1412
        );
1413
0
        self.forward().pattern_count()
1414
0
    }
1415
1416
    /// Convenience function for returning this regex's prefilter as a trait
1417
    /// object.
1418
    ///
1419
    /// If this regex doesn't have a prefilter, then `None` is returned.
1420
0
    pub fn prefilter(&self) -> Option<&dyn Prefilter> {
1421
0
        match self.prefilter {
1422
0
            None => None,
1423
0
            Some(ref x) => Some(&*x),
1424
        }
1425
0
    }
1426
1427
    /// Convenience function for returning a prefilter scanner.
1428
0
    fn scanner(&self) -> Option<prefilter::Scanner> {
1429
0
        self.prefilter().map(prefilter::Scanner::new)
1430
0
    }
1431
}
1432
1433
/// An iterator over all non-overlapping earliest matches for a particular
1434
/// infallible search.
1435
///
1436
/// The iterator yields a [`MultiMatch`] value until no more matches could be
1437
/// found. If the underlying search returns an error, then this panics.
1438
///
1439
/// `A` is the type used to represent the underlying DFAs used by the regex,
1440
/// while `P` is the type of prefilter used, if any. The lifetime variables are
1441
/// as follows:
1442
///
1443
/// * `'r` is the lifetime of the regular expression itself.
1444
/// * `'t` is the lifetime of the text being searched.
1445
#[derive(Clone, Debug)]
1446
pub struct FindEarliestMatches<'r, 't, A, P>(
1447
    TryFindEarliestMatches<'r, 't, A, P>,
1448
);
1449
1450
impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> {
1451
0
    fn new(
1452
0
        re: &'r Regex<A, P>,
1453
0
        text: &'t [u8],
1454
0
    ) -> FindEarliestMatches<'r, 't, A, P> {
1455
0
        FindEarliestMatches(TryFindEarliestMatches::new(re, text))
1456
0
    }
1457
}
1458
1459
impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1460
    for FindEarliestMatches<'r, 't, A, P>
1461
{
1462
    type Item = MultiMatch;
1463
1464
0
    fn next(&mut self) -> Option<MultiMatch> {
1465
0
        next_unwrap(self.0.next())
1466
0
    }
1467
}
1468
1469
/// An iterator over all non-overlapping leftmost matches for a particular
1470
/// infallible search.
1471
///
1472
/// The iterator yields a [`MultiMatch`] value until no more matches could be
1473
/// found. If the underlying search returns an error, then this panics.
1474
///
1475
/// `A` is the type used to represent the underlying DFAs used by the regex,
1476
/// while `P` is the type of prefilter used, if any. The lifetime variables are
1477
/// as follows:
1478
///
1479
/// * `'r` is the lifetime of the regular expression itself.
1480
/// * `'t` is the lifetime of the text being searched.
1481
#[derive(Clone, Debug)]
1482
pub struct FindLeftmostMatches<'r, 't, A, P>(
1483
    TryFindLeftmostMatches<'r, 't, A, P>,
1484
);
1485
1486
impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> {
1487
0
    fn new(
1488
0
        re: &'r Regex<A, P>,
1489
0
        text: &'t [u8],
1490
0
    ) -> FindLeftmostMatches<'r, 't, A, P> {
1491
0
        FindLeftmostMatches(TryFindLeftmostMatches::new(re, text))
1492
0
    }
1493
}
1494
1495
impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1496
    for FindLeftmostMatches<'r, 't, A, P>
1497
{
1498
    type Item = MultiMatch;
1499
1500
0
    fn next(&mut self) -> Option<MultiMatch> {
1501
0
        next_unwrap(self.0.next())
1502
0
    }
1503
}
1504
1505
/// An iterator over all overlapping matches for a particular infallible
1506
/// search.
1507
///
1508
/// The iterator yields a [`MultiMatch`] value until no more matches could be
1509
/// found. If the underlying search returns an error, then this panics.
1510
///
1511
/// `A` is the type used to represent the underlying DFAs used by the regex,
1512
/// while `P` is the type of prefilter used, if any. The lifetime variables are
1513
/// as follows:
1514
///
1515
/// * `'r` is the lifetime of the regular expression itself.
1516
/// * `'t` is the lifetime of the text being searched.
1517
#[derive(Clone, Debug)]
1518
pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>(
1519
    TryFindOverlappingMatches<'r, 't, A, P>,
1520
);
1521
1522
impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> {
1523
0
    fn new(
1524
0
        re: &'r Regex<A, P>,
1525
0
        text: &'t [u8],
1526
0
    ) -> FindOverlappingMatches<'r, 't, A, P> {
1527
0
        FindOverlappingMatches(TryFindOverlappingMatches::new(re, text))
1528
0
    }
1529
}
1530
1531
impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1532
    for FindOverlappingMatches<'r, 't, A, P>
1533
{
1534
    type Item = MultiMatch;
1535
1536
0
    fn next(&mut self) -> Option<MultiMatch> {
1537
0
        next_unwrap(self.0.next())
1538
0
    }
1539
}
1540
1541
/// An iterator over all non-overlapping earliest matches for a particular
1542
/// fallible search.
1543
///
1544
/// The iterator yields a [`MultiMatch`] value until no more matches could be
1545
/// found.
1546
///
1547
/// `A` is the type used to represent the underlying DFAs used by the regex,
1548
/// while `P` is the type of prefilter used, if any. The lifetime variables are
1549
/// as follows:
1550
///
1551
/// * `'r` is the lifetime of the regular expression itself.
1552
/// * `'t` is the lifetime of the text being searched.
1553
#[derive(Clone, Debug)]
1554
pub struct TryFindEarliestMatches<'r, 't, A, P> {
1555
    re: &'r Regex<A, P>,
1556
    scanner: Option<prefilter::Scanner<'r>>,
1557
    text: &'t [u8],
1558
    last_end: usize,
1559
    last_match: Option<usize>,
1560
}
1561
1562
impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> {
1563
0
    fn new(
1564
0
        re: &'r Regex<A, P>,
1565
0
        text: &'t [u8],
1566
0
    ) -> TryFindEarliestMatches<'r, 't, A, P> {
1567
0
        let scanner = re.scanner();
1568
0
        TryFindEarliestMatches {
1569
0
            re,
1570
0
            scanner,
1571
0
            text,
1572
0
            last_end: 0,
1573
0
            last_match: None,
1574
0
        }
1575
0
    }
1576
}
1577
1578
impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1579
    for TryFindEarliestMatches<'r, 't, A, P>
1580
{
1581
    type Item = Result<MultiMatch, MatchError>;
1582
1583
0
    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
1584
0
        if self.last_end > self.text.len() {
1585
0
            return None;
1586
0
        }
1587
0
        let result = self.re.try_find_earliest_at_imp(
1588
0
            self.scanner.as_mut(),
1589
0
            self.text,
1590
0
            self.last_end,
1591
0
            self.text.len(),
1592
        );
1593
0
        let m = match result {
1594
0
            Err(err) => return Some(Err(err)),
1595
0
            Ok(None) => return None,
1596
0
            Ok(Some(m)) => m,
1597
        };
1598
0
        if m.is_empty() {
1599
            // This is an empty match. To ensure we make progress, start
1600
            // the next search at the smallest possible starting position
1601
            // of the next match following this one.
1602
0
            self.last_end = if self.re.utf8 {
1603
0
                crate::util::next_utf8(self.text, m.end())
1604
            } else {
1605
0
                m.end() + 1
1606
            };
1607
            // Don't accept empty matches immediately following a match.
1608
            // Just move on to the next match.
1609
0
            if Some(m.end()) == self.last_match {
1610
0
                return self.next();
1611
0
            }
1612
0
        } else {
1613
0
            self.last_end = m.end();
1614
0
        }
1615
0
        self.last_match = Some(m.end());
1616
0
        Some(Ok(m))
1617
0
    }
1618
}
1619
1620
/// An iterator over all non-overlapping leftmost matches for a particular
1621
/// fallible search.
1622
///
1623
/// The iterator yields a [`MultiMatch`] value until no more matches could be
1624
/// found.
1625
///
1626
/// `A` is the type used to represent the underlying DFAs used by the regex,
1627
/// while `P` is the type of prefilter used, if any. The lifetime variables are
1628
/// as follows:
1629
///
1630
/// * `'r` is the lifetime of the regular expression itself.
1631
/// * `'t` is the lifetime of the text being searched.
1632
#[derive(Clone, Debug)]
1633
pub struct TryFindLeftmostMatches<'r, 't, A, P> {
1634
    re: &'r Regex<A, P>,
1635
    scanner: Option<prefilter::Scanner<'r>>,
1636
    text: &'t [u8],
1637
    last_end: usize,
1638
    last_match: Option<usize>,
1639
}
1640
1641
impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> {
1642
0
    fn new(
1643
0
        re: &'r Regex<A, P>,
1644
0
        text: &'t [u8],
1645
0
    ) -> TryFindLeftmostMatches<'r, 't, A, P> {
1646
0
        let scanner = re.scanner();
1647
0
        TryFindLeftmostMatches {
1648
0
            re,
1649
0
            scanner,
1650
0
            text,
1651
0
            last_end: 0,
1652
0
            last_match: None,
1653
0
        }
1654
0
    }
1655
}
1656
1657
impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1658
    for TryFindLeftmostMatches<'r, 't, A, P>
1659
{
1660
    type Item = Result<MultiMatch, MatchError>;
1661
1662
0
    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
1663
0
        if self.last_end > self.text.len() {
1664
0
            return None;
1665
0
        }
1666
0
        let result = self.re.try_find_leftmost_at_imp(
1667
0
            self.scanner.as_mut(),
1668
0
            self.text,
1669
0
            self.last_end,
1670
0
            self.text.len(),
1671
        );
1672
0
        let m = match result {
1673
0
            Err(err) => return Some(Err(err)),
1674
0
            Ok(None) => return None,
1675
0
            Ok(Some(m)) => m,
1676
        };
1677
0
        if m.is_empty() {
1678
            // This is an empty match. To ensure we make progress, start
1679
            // the next search at the smallest possible starting position
1680
            // of the next match following this one.
1681
0
            self.last_end = if self.re.utf8 {
1682
0
                crate::util::next_utf8(self.text, m.end())
1683
            } else {
1684
0
                m.end() + 1
1685
            };
1686
            // Don't accept empty matches immediately following a match.
1687
            // Just move on to the next match.
1688
0
            if Some(m.end()) == self.last_match {
1689
0
                return self.next();
1690
0
            }
1691
0
        } else {
1692
0
            self.last_end = m.end();
1693
0
        }
1694
0
        self.last_match = Some(m.end());
1695
0
        Some(Ok(m))
1696
0
    }
1697
}
1698
1699
/// An iterator over all overlapping matches for a particular fallible search.
1700
///
1701
/// The iterator yields a [`MultiMatch`] value until no more matches could be
1702
/// found.
1703
///
1704
/// `A` is the type used to represent the underlying DFAs used by the regex,
1705
/// while `P` is the type of prefilter used, if any. The lifetime variables are
1706
/// as follows:
1707
///
1708
/// * `'r` is the lifetime of the regular expression itself.
1709
/// * `'t` is the lifetime of the text being searched.
1710
#[derive(Clone, Debug)]
1711
pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> {
1712
    re: &'r Regex<A, P>,
1713
    scanner: Option<prefilter::Scanner<'r>>,
1714
    text: &'t [u8],
1715
    last_end: usize,
1716
    state: OverlappingState,
1717
}
1718
1719
impl<'r, 't, A: Automaton, P: Prefilter>
1720
    TryFindOverlappingMatches<'r, 't, A, P>
1721
{
1722
0
    fn new(
1723
0
        re: &'r Regex<A, P>,
1724
0
        text: &'t [u8],
1725
0
    ) -> TryFindOverlappingMatches<'r, 't, A, P> {
1726
0
        let scanner = re.scanner();
1727
0
        TryFindOverlappingMatches {
1728
0
            re,
1729
0
            scanner,
1730
0
            text,
1731
0
            last_end: 0,
1732
0
            state: OverlappingState::start(),
1733
0
        }
1734
0
    }
1735
}
1736
1737
impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1738
    for TryFindOverlappingMatches<'r, 't, A, P>
1739
{
1740
    type Item = Result<MultiMatch, MatchError>;
1741
1742
0
    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
1743
0
        if self.last_end > self.text.len() {
1744
0
            return None;
1745
0
        }
1746
0
        let result = self.re.try_find_overlapping_at_imp(
1747
0
            self.scanner.as_mut(),
1748
0
            self.text,
1749
0
            self.last_end,
1750
0
            self.text.len(),
1751
0
            &mut self.state,
1752
        );
1753
0
        let m = match result {
1754
0
            Err(err) => return Some(Err(err)),
1755
0
            Ok(None) => return None,
1756
0
            Ok(Some(m)) => m,
1757
        };
1758
        // Unlike the non-overlapping case, we're OK with empty matches at this
1759
        // level. In particular, the overlapping search algorithm is itself
1760
        // responsible for ensuring that progress is always made.
1761
0
        self.last_end = m.end();
1762
0
        Some(Ok(m))
1763
0
    }
1764
}
1765
1766
/// The configuration used for compiling a DFA-backed regex.
1767
///
1768
/// A regex configuration is a simple data object that is typically used with
1769
/// [`Builder::configure`].
1770
#[cfg(feature = "alloc")]
1771
#[derive(Clone, Copy, Debug, Default)]
1772
pub struct Config {
1773
    utf8: Option<bool>,
1774
}
1775
1776
#[cfg(feature = "alloc")]
1777
impl Config {
1778
    /// Return a new default regex compiler configuration.
1779
    pub fn new() -> Config {
1780
        Config::default()
1781
    }
1782
1783
    /// Whether to enable UTF-8 mode or not.
1784
    ///
1785
    /// When UTF-8 mode is enabled (the default) and an empty match is seen,
1786
    /// the iterators on [`Regex`] will always start the next search at the
1787
    /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
1788
    /// mode is disabled, such searches are begun at the next byte offset.
1789
    ///
1790
    /// If this mode is enabled and invalid UTF-8 is given to search, then
1791
    /// behavior is unspecified.
1792
    ///
1793
    /// Generally speaking, one should enable this when
1794
    /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
1795
    /// and
1796
    /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
1797
    /// are enabled, and disable it otherwise.
1798
    ///
1799
    /// # Example
1800
    ///
1801
    /// This example demonstrates the differences between when this option is
1802
    /// enabled and disabled. The differences only arise when the regex can
1803
    /// return matches of length zero.
1804
    ///
1805
    /// In this first snippet, we show the results when UTF-8 mode is disabled.
1806
    ///
1807
    /// ```
1808
    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
1809
    ///
1810
    /// let re = Regex::builder()
1811
    ///     .configure(Regex::config().utf8(false))
1812
    ///     .build(r"")?;
1813
    /// let haystack = "a☃z".as_bytes();
1814
    /// let mut it = re.find_leftmost_iter(haystack);
1815
    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
1816
    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
1817
    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
1818
    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
1819
    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
1820
    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
1821
    /// assert_eq!(None, it.next());
1822
    ///
1823
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1824
    /// ```
1825
    ///
1826
    /// And in this snippet, we execute the same search on the same haystack,
1827
    /// but with UTF-8 mode enabled. Notice that byte offsets that would
1828
    /// otherwise split the encoding of `☃` are not returned.
1829
    ///
1830
    /// ```
1831
    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
1832
    ///
1833
    /// let re = Regex::builder()
1834
    ///     .configure(Regex::config().utf8(true))
1835
    ///     .build(r"")?;
1836
    /// let haystack = "a☃z".as_bytes();
1837
    /// let mut it = re.find_leftmost_iter(haystack);
1838
    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
1839
    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
1840
    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
1841
    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
1842
    /// assert_eq!(None, it.next());
1843
    ///
1844
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1845
    /// ```
1846
    pub fn utf8(mut self, yes: bool) -> Config {
1847
        self.utf8 = Some(yes);
1848
        self
1849
    }
1850
1851
    /// Returns true if and only if this configuration has UTF-8 mode enabled.
1852
    ///
1853
    /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
1854
    /// [`Regex`] will always start the next search at the next UTF-8 encoded
1855
    /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
1856
    /// next byte offset.
1857
    pub fn get_utf8(&self) -> bool {
1858
        self.utf8.unwrap_or(true)
1859
    }
1860
1861
    /// Overwrite the default configuration such that the options in `o` are
1862
    /// always used. If an option in `o` is not set, then the corresponding
1863
    /// option in `self` is used. If it's not set in `self` either, then it
1864
    /// remains not set.
1865
    pub(crate) fn overwrite(self, o: Config) -> Config {
1866
        Config { utf8: o.utf8.or(self.utf8) }
1867
    }
1868
}
1869
1870
/// A builder for a regex based on deterministic finite automatons.
1871
///
1872
/// This builder permits configuring options for the syntax of a pattern, the
1873
/// NFA construction, the DFA construction and finally the regex searching
1874
/// itself. This builder is different from a general purpose regex builder in
1875
/// that it permits fine grain configuration of the construction process. The
1876
/// trade off for this is complexity, and the possibility of setting a
1877
/// configuration that might not make sense. For example, there are three
1878
/// different UTF-8 modes:
1879
///
1880
/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
1881
/// pattern itself can contain sub-expressions that match invalid UTF-8.
1882
/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
1883
/// controls whether the implicit unanchored prefix added to the NFA can
1884
/// match through invalid UTF-8 or not.
1885
/// * [`Config::utf8`] controls how the regex iterators themselves advance
1886
/// the starting position of the next search when a match with zero length is
1887
/// found.
1888
///
1889
/// Generally speaking, callers will want to either enable all of these or
1890
/// disable all of these.
1891
///
1892
/// Internally, building a regex requires building two DFAs, where one is
1893
/// responsible for finding the end of a match and the other is responsible
1894
/// for finding the start of a match. If you only need to detect whether
1895
/// something matched, or only the end of a match, then you should use a
1896
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
1897
/// building two DFAs.
1898
///
1899
/// # Build methods
1900
///
1901
/// This builder has a few "build" methods. In general, it's the result of
1902
/// combining the following parameters:
1903
///
1904
/// * Building one or many regexes.
1905
/// * Building a regex with dense or sparse DFAs.
1906
///
1907
/// The simplest "build" method is [`Builder::build`]. It accepts a single
1908
/// pattern and builds a dense DFA using `usize` for the state identifier
1909
/// representation.
1910
///
1911
/// The most general "build" method is [`Builder::build_many`], which permits
1912
/// building a regex that searches for multiple patterns simultaneously while
1913
/// using a specific state identifier representation.
1914
///
1915
/// The most flexible "build" method, but hardest to use, is
1916
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
1917
/// just a pair of DFAs, and this method allows you to specify those DFAs
1918
/// exactly.
1919
///
1920
/// # Example
1921
///
1922
/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
1923
/// the regex itself. This is generally what you want for matching on
1924
/// arbitrary bytes.
1925
///
1926
/// ```
1927
/// use regex_automata::{
1928
///     dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
1929
/// };
1930
///
1931
/// let re = Regex::builder()
1932
///     .configure(Regex::config().utf8(false))
1933
///     .syntax(SyntaxConfig::new().utf8(false))
1934
///     .thompson(thompson::Config::new().utf8(false))
1935
///     .build(r"foo(?-u:[^b])ar.*")?;
1936
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
1937
/// let expected = Some(MultiMatch::must(0, 1, 9));
1938
/// let got = re.find_leftmost(haystack);
1939
/// assert_eq!(expected, got);
1940
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
1941
/// // but the subsequent `.*` does not! Disabling UTF-8
1942
/// // on the syntax permits this. Notice also that the
1943
/// // search was unanchored and skipped over invalid UTF-8.
1944
/// // Disabling UTF-8 on the Thompson NFA permits this.
1945
/// //
1946
/// // N.B. This example does not show the impact of
1947
/// // disabling UTF-8 mode on Config, since that
1948
/// // only impacts regexes that can produce matches of
1949
/// // length 0.
1950
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
1951
///
1952
/// # Ok::<(), Box<dyn std::error::Error>>(())
1953
/// ```
1954
#[cfg(feature = "alloc")]
1955
#[derive(Clone, Debug)]
1956
pub struct Builder {
1957
    config: Config,
1958
    dfa: dense::Builder,
1959
}
1960
1961
#[cfg(feature = "alloc")]
1962
impl Builder {
1963
    /// Create a new regex builder with the default configuration.
1964
    pub fn new() -> Builder {
1965
        Builder { config: Config::default(), dfa: dense::Builder::new() }
1966
    }
1967
1968
    /// Build a regex from the given pattern.
1969
    ///
1970
    /// If there was a problem parsing or compiling the pattern, then an error
1971
    /// is returned.
1972
    pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
1973
        self.build_many(&[pattern])
1974
    }
1975
1976
    /// Build a regex from the given pattern using sparse DFAs.
1977
    ///
1978
    /// If there was a problem parsing or compiling the pattern, then an error
1979
    /// is returned.
1980
    pub fn build_sparse(
1981
        &self,
1982
        pattern: &str,
1983
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
1984
        self.build_many_sparse(&[pattern])
1985
    }
1986
1987
    /// Build a regex from the given patterns.
1988
    pub fn build_many<P: AsRef<str>>(
1989
        &self,
1990
        patterns: &[P],
1991
    ) -> Result<Regex, Error> {
1992
        let forward = self.dfa.build_many(patterns)?;
1993
        let reverse = self
1994
            .dfa
1995
            .clone()
1996
            .configure(
1997
                dense::Config::new()
1998
                    .anchored(true)
1999
                    .match_kind(MatchKind::All)
2000
                    .starts_for_each_pattern(true),
2001
            )
2002
            .thompson(thompson::Config::new().reverse(true))
2003
            .build_many(patterns)?;
2004
        Ok(self.build_from_dfas(forward, reverse))
2005
    }
2006
2007
    /// Build a sparse regex from the given patterns.
2008
    pub fn build_many_sparse<P: AsRef<str>>(
2009
        &self,
2010
        patterns: &[P],
2011
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
2012
        let re = self.build_many(patterns)?;
2013
        let forward = re.forward().to_sparse()?;
2014
        let reverse = re.reverse().to_sparse()?;
2015
        Ok(self.build_from_dfas(forward, reverse))
2016
    }
2017
2018
    /// Build a regex from its component forward and reverse DFAs.
2019
    ///
2020
    /// This is useful when deserializing a regex from some arbitrary
2021
    /// memory region. This is also useful for building regexes from other
2022
    /// types of DFAs.
2023
    ///
2024
    /// If you're building the DFAs from scratch instead of building new DFAs
2025
    /// from other DFAs, then you'll need to make sure that the reverse DFA is
2026
    /// configured correctly to match the intended semantics. Namely:
2027
    ///
2028
    /// * It should be anchored.
2029
    /// * It should use [`MatchKind::All`] semantics.
2030
    /// * It should match in reverse.
2031
    /// * It should have anchored start states compiled for each pattern.
2032
    /// * Otherwise, its configuration should match the forward DFA.
2033
    ///
2034
    /// If these conditions are satisfied, then behavior of searches is
2035
    /// unspecified.
2036
    ///
2037
    /// Note that when using this constructor, only the configuration from
2038
    /// [`Config`] is applied. The only configuration settings on this builder
2039
    /// only apply when the builder owns the construction of the DFAs
2040
    /// themselves.
2041
    ///
2042
    /// # Example
2043
    ///
2044
    /// This example is a bit a contrived. The usual use of these methods
2045
    /// would involve serializing `initial_re` somewhere and then deserializing
2046
    /// it later to build a regex. But in this case, we do everything in
2047
    /// memory.
2048
    ///
2049
    /// ```
2050
    /// use regex_automata::dfa::regex::Regex;
2051
    ///
2052
    /// let initial_re = Regex::new("foo[0-9]+")?;
2053
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
2054
    ///
2055
    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
2056
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
2057
    /// assert_eq!(true, re.is_match(b"foo123"));
2058
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2059
    /// ```
2060
    ///
2061
    /// This example shows how to build a `Regex` that uses sparse DFAs instead
2062
    /// of dense DFAs without using one of the convenience `build_sparse`
2063
    /// routines:
2064
    ///
2065
    /// ```
2066
    /// use regex_automata::dfa::regex::Regex;
2067
    ///
2068
    /// let initial_re = Regex::new("foo[0-9]+")?;
2069
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
2070
    ///
2071
    /// let fwd = initial_re.forward().to_sparse()?;
2072
    /// let rev = initial_re.reverse().to_sparse()?;
2073
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
2074
    /// assert_eq!(true, re.is_match(b"foo123"));
2075
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2076
    /// ```
2077
    pub fn build_from_dfas<A: Automaton>(
2078
        &self,
2079
        forward: A,
2080
        reverse: A,
2081
    ) -> Regex<A> {
2082
        let utf8 = self.config.get_utf8();
2083
        Regex { prefilter: None, forward, reverse, utf8 }
2084
    }
2085
2086
    /// Apply the given regex configuration options to this builder.
2087
    pub fn configure(&mut self, config: Config) -> &mut Builder {
2088
        self.config = self.config.overwrite(config);
2089
        self
2090
    }
2091
2092
    /// Set the syntax configuration for this builder using
2093
    /// [`SyntaxConfig`](crate::SyntaxConfig).
2094
    ///
2095
    /// This permits setting things like case insensitivity, Unicode and multi
2096
    /// line mode.
2097
    pub fn syntax(
2098
        &mut self,
2099
        config: crate::util::syntax::SyntaxConfig,
2100
    ) -> &mut Builder {
2101
        self.dfa.syntax(config);
2102
        self
2103
    }
2104
2105
    /// Set the Thompson NFA configuration for this builder using
2106
    /// [`nfa::thompson::Config`](thompson::Config).
2107
    ///
2108
    /// This permits setting things like whether additional time should be
2109
    /// spent shrinking the size of the NFA.
2110
    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
2111
        self.dfa.thompson(config);
2112
        self
2113
    }
2114
2115
    /// Set the dense DFA compilation configuration for this builder using
2116
    /// [`dense::Config`](dense::Config).
2117
    ///
2118
    /// This permits setting things like whether the underlying DFAs should
2119
    /// be minimized.
2120
    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
2121
        self.dfa.configure(config);
2122
        self
2123
    }
2124
}
2125
2126
#[cfg(feature = "alloc")]
2127
impl Default for Builder {
2128
    fn default() -> Builder {
2129
        Builder::new()
2130
    }
2131
}
2132
2133
#[inline(always)]
2134
0
fn next_unwrap(
2135
0
    item: Option<Result<MultiMatch, MatchError>>,
2136
0
) -> Option<MultiMatch> {
2137
0
    match item {
2138
0
        None => None,
2139
0
        Some(Ok(m)) => Some(m),
2140
0
        Some(Err(err)) => panic!(
2141
0
            "unexpected regex search error: {}\n\
2142
0
             to handle search errors, use try_ methods",
2143
            err,
2144
        ),
2145
    }
2146
0
}