/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.2.0/src/dfa/regex.rs
Line | Count | Source |
1 | | /*! |
2 | | A DFA-backed `Regex`. |
3 | | |
4 | | This module provides [`Regex`], which is defined generically over the |
5 | | [`Automaton`] trait. A `Regex` implements convenience routines you might have |
6 | | come to expect, such as finding the start/end of a match and iterating over |
7 | | all non-overlapping matches. This `Regex` type is limited in its capabilities |
8 | | to what a DFA can provide. Therefore, APIs involving capturing groups, for |
9 | | example, are not provided. |
10 | | |
11 | | Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that |
12 | | finds the end offset of a match, where as the other is a "reverse" DFA that |
13 | | find the start offset of a match. |
14 | | |
15 | | See the [parent module](crate::dfa) for examples. |
16 | | */ |
17 | | |
18 | | #[cfg(feature = "alloc")] |
19 | | use alloc::vec::Vec; |
20 | | |
21 | | use crate::{ |
22 | | dfa::automaton::{Automaton, OverlappingState}, |
23 | | util::prefilter::{self, Prefilter}, |
24 | | MatchError, MultiMatch, |
25 | | }; |
26 | | #[cfg(feature = "alloc")] |
27 | | use crate::{ |
28 | | dfa::{dense, error::Error, sparse}, |
29 | | nfa::thompson, |
30 | | util::matchtypes::MatchKind, |
31 | | }; |
32 | | |
33 | | // When the alloc feature is enabled, the regex type sets its A type parameter |
34 | | // to default to an owned dense DFA. But without alloc, we set no default. This |
35 | | // makes things a lot more convenient in the common case, since writing out the |
36 | | // DFA types is pretty annoying. |
37 | | // |
38 | | // Since we have two different definitions but only want to write one doc |
39 | | // string, we use a macro to capture the doc and other attributes once and then |
40 | | // repeat them for each definition. |
41 | | macro_rules! define_regex_type { |
42 | | ($(#[$doc:meta])*) => { |
43 | | #[cfg(feature = "alloc")] |
44 | | $(#[$doc])* |
45 | | pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> { |
46 | | prefilter: Option<P>, |
47 | | forward: A, |
48 | | reverse: A, |
49 | | utf8: bool, |
50 | | } |
51 | | |
52 | | #[cfg(not(feature = "alloc"))] |
53 | | $(#[$doc])* |
54 | | pub struct Regex<A, P = prefilter::None> { |
55 | | prefilter: Option<P>, |
56 | | forward: A, |
57 | | reverse: A, |
58 | | utf8: bool, |
59 | | } |
60 | | }; |
61 | | } |
62 | | |
63 | | define_regex_type!( |
64 | | /// A regular expression that uses deterministic finite automata for fast |
65 | | /// searching. |
66 | | /// |
67 | | /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
68 | | /// "reverse" DFA. The forward DFA is responsible for detecting the end of |
69 | | /// a match while the reverse DFA is responsible for detecting the start |
70 | | /// of a match. Thus, in order to find the bounds of any given match, a |
71 | | /// forward search must first be run followed by a reverse search. A match |
72 | | /// found by the forward DFA guarantees that the reverse DFA will also find |
73 | | /// a match. |
74 | | /// |
75 | | /// The type of the DFA used by a `Regex` corresponds to the `A` type |
76 | | /// parameter, which must satisfy the [`Automaton`] trait. Typically, |
77 | | /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a |
78 | | /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more |
79 | | /// memory but search faster, while sparse DFAs use less memory but search |
80 | | /// more slowly. |
81 | | /// |
82 | | /// By default, a regex's automaton type parameter is set to |
83 | | /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most |
84 | | /// in-memory work loads, this is the most convenient type that gives the |
85 | | /// best search performance. When the `alloc` feature is disabled, no |
86 | | /// default type is used. |
87 | | /// |
88 | | /// A `Regex` also has a `P` type parameter, which is used to select the |
89 | | /// prefilter used during search. By default, no prefilter is enabled by |
90 | | /// setting the type to default to [`prefilter::None`]. A prefilter can be |
91 | | /// enabled by using the [`Regex::prefilter`] method. |
92 | | /// |
93 | | /// # When should I use this? |
94 | | /// |
95 | | /// Generally speaking, if you can afford the overhead of building a full |
96 | | /// DFA for your regex, and you don't need things like capturing groups, |
97 | | /// then this is a good choice if you're looking to optimize for matching |
98 | | /// speed. Note however that its speed may be worse than a general purpose |
99 | | /// regex engine if you don't select a good [prefilter]. |
100 | | /// |
101 | | /// # Earliest vs Leftmost vs Overlapping |
102 | | /// |
103 | | /// The search routines exposed on a `Regex` reflect three different ways |
104 | | /// of searching: |
105 | | /// |
106 | | /// * "earliest" means to stop as soon as a match has been detected. |
107 | | /// * "leftmost" means to continue matching until the underlying |
108 | | /// automaton cannot advance. This reflects "standard" searching you |
109 | | /// might be used to in other regex engines. e.g., This permits |
110 | | /// non-greedy and greedy searching to work as you would expect. |
111 | | /// * "overlapping" means to find all possible matches, even if they |
112 | | /// overlap. |
113 | | /// |
114 | | /// Generally speaking, when doing an overlapping search, you'll want to |
115 | | /// build your regex DFAs with [`MatchKind::All`] semantics. Using |
116 | | /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is |
117 | | /// likely to lead to odd behavior since `LeftmostFirst` specifically omits |
118 | | /// some matches that can never be reported due to its semantics. |
119 | | /// |
120 | | /// The following example shows the differences between how these different |
121 | | /// types of searches impact looking for matches of `[a-z]+` in the |
122 | | /// haystack `abc`. |
123 | | /// |
124 | | /// ``` |
125 | | /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; |
126 | | /// |
127 | | /// let pattern = r"[a-z]+"; |
128 | | /// let haystack = "abc".as_bytes(); |
129 | | /// |
130 | | /// // With leftmost-first semantics, we test "earliest" and "leftmost". |
131 | | /// let re = dfa::regex::Builder::new() |
132 | | /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) |
133 | | /// .build(pattern)?; |
134 | | /// |
135 | | /// // "earliest" searching isn't impacted by greediness |
136 | | /// let mut it = re.find_earliest_iter(haystack); |
137 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
138 | | /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); |
139 | | /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); |
140 | | /// assert_eq!(None, it.next()); |
141 | | /// |
142 | | /// // "leftmost" searching supports greediness (and non-greediness) |
143 | | /// let mut it = re.find_leftmost_iter(haystack); |
144 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
145 | | /// assert_eq!(None, it.next()); |
146 | | /// |
147 | | /// // For overlapping, we want "all" match kind semantics. |
148 | | /// let re = dfa::regex::Builder::new() |
149 | | /// .dense(dense::Config::new().match_kind(MatchKind::All)) |
150 | | /// .build(pattern)?; |
151 | | /// |
152 | | /// // In the overlapping search, we find all three possible matches |
153 | | /// // starting at the beginning of the haystack. |
154 | | /// let mut it = re.find_overlapping_iter(haystack); |
155 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
156 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); |
157 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
158 | | /// assert_eq!(None, it.next()); |
159 | | /// |
160 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
161 | | /// ``` |
162 | | /// |
163 | | /// # Sparse DFAs |
164 | | /// |
165 | | /// Since a `Regex` is generic over the [`Automaton`] trait, it can be |
166 | | /// used with any kind of DFA. While this crate constructs dense DFAs by |
167 | | /// default, it is easy enough to build corresponding sparse DFAs, and then |
168 | | /// build a regex from them: |
169 | | /// |
170 | | /// ``` |
171 | | /// use regex_automata::dfa::regex::Regex; |
172 | | /// |
173 | | /// // First, build a regex that uses dense DFAs. |
174 | | /// let dense_re = Regex::new("foo[0-9]+")?; |
175 | | /// |
176 | | /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
177 | | /// let fwd = dense_re.forward().to_sparse()?; |
178 | | /// let rev = dense_re.reverse().to_sparse()?; |
179 | | /// |
180 | | /// // Third, build a new regex from the constituent sparse DFAs. |
181 | | /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); |
182 | | /// |
183 | | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
184 | | /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
185 | | /// |
186 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
187 | | /// ``` |
188 | | /// |
189 | | /// Alternatively, one can use a [`Builder`] to construct a sparse DFA |
190 | | /// more succinctly. (Note though that dense DFAs are still constructed |
191 | | /// first internally, and then converted to sparse DFAs, as in the example |
192 | | /// above.) |
193 | | /// |
194 | | /// ``` |
195 | | /// use regex_automata::dfa::regex::Regex; |
196 | | /// |
197 | | /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; |
198 | | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
199 | | /// assert!(sparse_re.is_match(b"foo123")); |
200 | | /// |
201 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
202 | | /// ``` |
203 | | /// |
204 | | /// # Fallibility |
205 | | /// |
206 | | /// In non-default configurations, the DFAs generated in this module may |
207 | | /// return an error during a search. (Currently, the only way this happens |
208 | | /// is if quit bytes are added or Unicode word boundaries are heuristically |
209 | | /// enabled, both of which are turned off by default.) For convenience, the |
210 | | /// main search routines, like [`find_leftmost`](Regex::find_leftmost), |
211 | | /// will panic if an error occurs. However, if you need to use DFAs |
212 | | /// which may produce an error at search time, then there are fallible |
213 | | /// equivalents of all search routines. For example, for `find_leftmost`, |
214 | | /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). |
215 | | /// The routines prefixed with `try_` return `Result<Option<MultiMatch>, |
216 | | /// MatchError>`, where as the infallible routines simply return |
217 | | /// `Option<MultiMatch>`. |
218 | | /// |
219 | | /// # Example |
220 | | /// |
221 | | /// This example shows how to cause a search to terminate if it sees a |
222 | | /// `\n` byte, and handle the error returned. This could be useful if, for |
223 | | /// example, you wanted to prevent a user supplied pattern from matching |
224 | | /// across a line boundary. |
225 | | /// |
226 | | /// ``` |
227 | | /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; |
228 | | /// |
229 | | /// let re = Regex::builder() |
230 | | /// .dense(dfa::dense::Config::new().quit(b'\n', true)) |
231 | | /// .build(r"foo\p{any}+bar")?; |
232 | | /// |
233 | | /// let haystack = "foo\nbar".as_bytes(); |
234 | | /// // Normally this would produce a match, since \p{any} contains '\n'. |
235 | | /// // But since we instructed the automaton to enter a quit state if a |
236 | | /// // '\n' is observed, this produces a match error instead. |
237 | | /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; |
238 | | /// let got = re.try_find_leftmost(haystack).unwrap_err(); |
239 | | /// assert_eq!(expected, got); |
240 | | /// |
241 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
242 | | /// ``` |
243 | | #[derive(Clone, Debug)] |
244 | | ); |
245 | | |
246 | | #[cfg(feature = "alloc")] |
247 | | impl Regex { |
248 | | /// Parse the given regular expression using the default configuration and |
249 | | /// return the corresponding regex. |
250 | | /// |
251 | | /// If you want a non-default configuration, then use the [`Builder`] to |
252 | | /// set your own configuration. |
253 | | /// |
254 | | /// # Example |
255 | | /// |
256 | | /// ``` |
257 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
258 | | /// |
259 | | /// let re = Regex::new("foo[0-9]+bar")?; |
260 | | /// assert_eq!( |
261 | | /// Some(MultiMatch::must(0, 3, 14)), |
262 | | /// re.find_leftmost(b"zzzfoo12345barzzz"), |
263 | | /// ); |
264 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
265 | | /// ``` |
266 | | pub fn new(pattern: &str) -> Result<Regex, Error> { |
267 | | Builder::new().build(pattern) |
268 | | } |
269 | | |
270 | | /// Like `new`, but parses multiple patterns into a single "regex set." |
271 | | /// This similarly uses the default regex configuration. |
272 | | /// |
273 | | /// # Example |
274 | | /// |
275 | | /// ``` |
276 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
277 | | /// |
278 | | /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; |
279 | | /// |
280 | | /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); |
281 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
282 | | /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); |
283 | | /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); |
284 | | /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); |
285 | | /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); |
286 | | /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); |
287 | | /// assert_eq!(None, it.next()); |
288 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
289 | | /// ``` |
290 | | pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> { |
291 | | Builder::new().build_many(patterns) |
292 | | } |
293 | | } |
294 | | |
295 | | #[cfg(feature = "alloc")] |
296 | | impl Regex<sparse::DFA<Vec<u8>>> { |
297 | | /// Parse the given regular expression using the default configuration, |
298 | | /// except using sparse DFAs, and return the corresponding regex. |
299 | | /// |
300 | | /// If you want a non-default configuration, then use the [`Builder`] to |
301 | | /// set your own configuration. |
302 | | /// |
303 | | /// # Example |
304 | | /// |
305 | | /// ``` |
306 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
307 | | /// |
308 | | /// let re = Regex::new_sparse("foo[0-9]+bar")?; |
309 | | /// assert_eq!( |
310 | | /// Some(MultiMatch::must(0, 3, 14)), |
311 | | /// re.find_leftmost(b"zzzfoo12345barzzz"), |
312 | | /// ); |
313 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
314 | | /// ``` |
315 | | pub fn new_sparse( |
316 | | pattern: &str, |
317 | | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
318 | | Builder::new().build_sparse(pattern) |
319 | | } |
320 | | |
321 | | /// Like `new`, but parses multiple patterns into a single "regex set" |
322 | | /// using sparse DFAs. This otherwise similarly uses the default regex |
323 | | /// configuration. |
324 | | /// |
325 | | /// # Example |
326 | | /// |
327 | | /// ``` |
328 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
329 | | /// |
330 | | /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; |
331 | | /// |
332 | | /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); |
333 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
334 | | /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); |
335 | | /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); |
336 | | /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); |
337 | | /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); |
338 | | /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); |
339 | | /// assert_eq!(None, it.next()); |
340 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
341 | | /// ``` |
342 | | pub fn new_many_sparse<P: AsRef<str>>( |
343 | | patterns: &[P], |
344 | | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
345 | | Builder::new().build_many_sparse(patterns) |
346 | | } |
347 | | } |
348 | | |
349 | | /// Convenience routines for regex construction. |
350 | | #[cfg(feature = "alloc")] |
351 | | impl Regex { |
352 | | /// Return a default configuration for a `Regex`. |
353 | | /// |
354 | | /// This is a convenience routine to avoid needing to import the `Config` |
355 | | /// type when customizing the construction of a regex. |
356 | | /// |
357 | | /// # Example |
358 | | /// |
359 | | /// This example shows how to disable UTF-8 mode for `Regex` iteration. |
360 | | /// When UTF-8 mode is disabled, the position immediately following an |
361 | | /// empty match is where the next search begins, instead of the next |
362 | | /// position of a UTF-8 encoded codepoint. |
363 | | /// |
364 | | /// ``` |
365 | | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
366 | | /// |
367 | | /// let re = Regex::builder() |
368 | | /// .configure(Regex::config().utf8(false)) |
369 | | /// .build(r"")?; |
370 | | /// let haystack = "a☃z".as_bytes(); |
371 | | /// let mut it = re.find_leftmost_iter(haystack); |
372 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
373 | | /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
374 | | /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); |
375 | | /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); |
376 | | /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
377 | | /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
378 | | /// assert_eq!(None, it.next()); |
379 | | /// |
380 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
381 | | /// ``` |
382 | | pub fn config() -> Config { |
383 | | Config::new() |
384 | | } |
385 | | |
386 | | /// Return a builder for configuring the construction of a `Regex`. |
387 | | /// |
388 | | /// This is a convenience routine to avoid needing to import the |
389 | | /// [`Builder`] type in common cases. |
390 | | /// |
391 | | /// # Example |
392 | | /// |
393 | | /// This example shows how to use the builder to disable UTF-8 mode |
394 | | /// everywhere. |
395 | | /// |
396 | | /// ``` |
397 | | /// use regex_automata::{ |
398 | | /// dfa::regex::Regex, |
399 | | /// nfa::thompson, |
400 | | /// MultiMatch, SyntaxConfig, |
401 | | /// }; |
402 | | /// |
403 | | /// let re = Regex::builder() |
404 | | /// .configure(Regex::config().utf8(false)) |
405 | | /// .syntax(SyntaxConfig::new().utf8(false)) |
406 | | /// .thompson(thompson::Config::new().utf8(false)) |
407 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
408 | | /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
409 | | /// let expected = Some(MultiMatch::must(0, 1, 9)); |
410 | | /// let got = re.find_leftmost(haystack); |
411 | | /// assert_eq!(expected, got); |
412 | | /// |
413 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
414 | | /// ``` |
415 | | pub fn builder() -> Builder { |
416 | | Builder::new() |
417 | | } |
418 | | } |
419 | | |
420 | | /// Standard search routines for finding and iterating over matches. |
421 | | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
422 | | /// Returns true if and only if this regex matches the given haystack. |
423 | | /// |
424 | | /// This routine may short circuit if it knows that scanning future input |
425 | | /// will never lead to a different result. In particular, if the underlying |
426 | | /// DFA enters a match state or a dead state, then this routine will return |
427 | | /// `true` or `false`, respectively, without inspecting any future input. |
428 | | /// |
429 | | /// # Panics |
430 | | /// |
431 | | /// If the underlying DFAs return an error, then this routine panics. This |
432 | | /// only occurs in non-default configurations where quit bytes are used or |
433 | | /// Unicode word boundaries are heuristically enabled. |
434 | | /// |
435 | | /// The fallible version of this routine is |
436 | | /// [`try_is_match`](Regex::try_is_match). |
437 | | /// |
438 | | /// # Example |
439 | | /// |
440 | | /// ``` |
441 | | /// use regex_automata::dfa::regex::Regex; |
442 | | /// |
443 | | /// let re = Regex::new("foo[0-9]+bar")?; |
444 | | /// assert_eq!(true, re.is_match(b"foo12345bar")); |
445 | | /// assert_eq!(false, re.is_match(b"foobar")); |
446 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
447 | | /// ``` |
448 | 0 | pub fn is_match(&self, haystack: &[u8]) -> bool { |
449 | 0 | self.is_match_at(haystack, 0, haystack.len()) |
450 | 0 | } |
451 | | |
452 | | /// Returns the first position at which a match is found. |
453 | | /// |
454 | | /// This routine stops scanning input in precisely the same circumstances |
455 | | /// as `is_match`. The key difference is that this routine returns the |
456 | | /// position at which it stopped scanning input if and only if a match |
457 | | /// was found. If no match is found, then `None` is returned. |
458 | | /// |
459 | | /// # Panics |
460 | | /// |
461 | | /// If the underlying DFAs return an error, then this routine panics. This |
462 | | /// only occurs in non-default configurations where quit bytes are used or |
463 | | /// Unicode word boundaries are heuristically enabled. |
464 | | /// |
465 | | /// The fallible version of this routine is |
466 | | /// [`try_find_earliest`](Regex::try_find_earliest). |
467 | | /// |
468 | | /// # Example |
469 | | /// |
470 | | /// ``` |
471 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
472 | | /// |
473 | | /// // Normally, the leftmost first match would greedily consume as many |
474 | | /// // decimal digits as it could. But a match is detected as soon as one |
475 | | /// // digit is seen. |
476 | | /// let re = Regex::new("foo[0-9]+")?; |
477 | | /// assert_eq!( |
478 | | /// Some(MultiMatch::must(0, 0, 4)), |
479 | | /// re.find_earliest(b"foo12345"), |
480 | | /// ); |
481 | | /// |
482 | | /// // Normally, the end of the leftmost first match here would be 3, |
483 | | /// // but the "earliest" match semantics detect a match earlier. |
484 | | /// let re = Regex::new("abc|a")?; |
485 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); |
486 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
487 | | /// ``` |
488 | 0 | pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> { |
489 | 0 | self.find_earliest_at(haystack, 0, haystack.len()) |
490 | 0 | } |
491 | | |
492 | | /// Returns the start and end offset of the leftmost match. If no match |
493 | | /// exists, then `None` is returned. |
494 | | /// |
495 | | /// # Panics |
496 | | /// |
497 | | /// If the underlying DFAs return an error, then this routine panics. This |
498 | | /// only occurs in non-default configurations where quit bytes are used or |
499 | | /// Unicode word boundaries are heuristically enabled. |
500 | | /// |
501 | | /// The fallible version of this routine is |
502 | | /// [`try_find_leftmost`](Regex::try_find_leftmost). |
503 | | /// |
504 | | /// # Example |
505 | | /// |
506 | | /// ``` |
507 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
508 | | /// |
509 | | /// // Greediness is applied appropriately when compared to find_earliest. |
510 | | /// let re = Regex::new("foo[0-9]+")?; |
511 | | /// assert_eq!( |
512 | | /// Some(MultiMatch::must(0, 3, 11)), |
513 | | /// re.find_leftmost(b"zzzfoo12345zzz"), |
514 | | /// ); |
515 | | /// |
516 | | /// // Even though a match is found after reading the first byte (`a`), |
517 | | /// // the default leftmost-first match semantics demand that we find the |
518 | | /// // earliest match that prefers earlier parts of the pattern over latter |
519 | | /// // parts. |
520 | | /// let re = Regex::new("abc|a")?; |
521 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); |
522 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
523 | | /// ``` |
524 | 0 | pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> { |
525 | 0 | self.find_leftmost_at(haystack, 0, haystack.len()) |
526 | 0 | } |
527 | | |
528 | | /// Search for the first overlapping match in `haystack`. |
529 | | /// |
530 | | /// This routine is principally useful when searching for multiple patterns |
531 | | /// on inputs where multiple patterns may match the same regions of text. |
532 | | /// In particular, callers must preserve the automaton's search state from |
533 | | /// prior calls so that the implementation knows where the last match |
534 | | /// occurred and which pattern was reported. |
535 | | /// |
536 | | /// # Panics |
537 | | /// |
538 | | /// If the underlying DFAs return an error, then this routine panics. This |
539 | | /// only occurs in non-default configurations where quit bytes are used or |
540 | | /// Unicode word boundaries are heuristically enabled. |
541 | | /// |
542 | | /// The fallible version of this routine is |
543 | | /// [`try_find_overlapping`](Regex::try_find_overlapping). |
544 | | /// |
545 | | /// # Example |
546 | | /// |
547 | | /// This example shows how to run an overlapping search with multiple |
548 | | /// regexes. |
549 | | /// |
550 | | /// ``` |
551 | | /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; |
552 | | /// |
553 | | /// let re = Regex::builder() |
554 | | /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) |
555 | | /// .build_many(&[r"\w+$", r"\S+$"])?; |
556 | | /// let haystack = "@foo".as_bytes(); |
557 | | /// let mut state = dfa::OverlappingState::start(); |
558 | | /// |
559 | | /// let expected = Some(MultiMatch::must(1, 0, 4)); |
560 | | /// let got = re.find_overlapping(haystack, &mut state); |
561 | | /// assert_eq!(expected, got); |
562 | | /// |
563 | | /// // The first pattern also matches at the same position, so re-running |
564 | | /// // the search will yield another match. Notice also that the first |
565 | | /// // pattern is returned after the second. This is because the second |
566 | | /// // pattern begins its match before the first, is therefore an earlier |
567 | | /// // match and is thus reported first. |
568 | | /// let expected = Some(MultiMatch::must(0, 1, 4)); |
569 | | /// let got = re.find_overlapping(haystack, &mut state); |
570 | | /// assert_eq!(expected, got); |
571 | | /// |
572 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
573 | | /// ``` |
574 | 0 | pub fn find_overlapping( |
575 | 0 | &self, |
576 | 0 | haystack: &[u8], |
577 | 0 | state: &mut OverlappingState, |
578 | 0 | ) -> Option<MultiMatch> { |
579 | 0 | self.find_overlapping_at(haystack, 0, haystack.len(), state) |
580 | 0 | } |
581 | | |
582 | | /// Returns an iterator over all non-overlapping "earliest" matches. |
583 | | /// |
584 | | /// Match positions are reported as soon as a match is known to occur, even |
585 | | /// if the standard leftmost match would be longer. |
586 | | /// |
587 | | /// # Panics |
588 | | /// |
589 | | /// If the underlying DFAs return an error during iteration, then iteration |
590 | | /// panics. This only occurs in non-default configurations where quit bytes |
591 | | /// are used or Unicode word boundaries are heuristically enabled. |
592 | | /// |
593 | | /// The fallible version of this routine is |
594 | | /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). |
595 | | /// |
596 | | /// # Example |
597 | | /// |
598 | | /// This example shows how to run an "earliest" iterator. |
599 | | /// |
600 | | /// ``` |
601 | | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
602 | | /// |
603 | | /// let re = Regex::new("[0-9]+")?; |
604 | | /// let haystack = "123".as_bytes(); |
605 | | /// |
606 | | /// // Normally, a standard leftmost iterator would return a single |
607 | | /// // match, but since "earliest" detects matches earlier, we get |
608 | | /// // three matches. |
609 | | /// let mut it = re.find_earliest_iter(haystack); |
610 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
611 | | /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); |
612 | | /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); |
613 | | /// assert_eq!(None, it.next()); |
614 | | /// |
615 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
616 | | /// ``` |
617 | 0 | pub fn find_earliest_iter<'r, 't>( |
618 | 0 | &'r self, |
619 | 0 | haystack: &'t [u8], |
620 | 0 | ) -> FindEarliestMatches<'r, 't, A, P> { |
621 | 0 | FindEarliestMatches::new(self, haystack) |
622 | 0 | } |
623 | | |
624 | | /// Returns an iterator over all non-overlapping leftmost matches in the |
625 | | /// given bytes. If no match exists, then the iterator yields no elements. |
626 | | /// |
627 | | /// This corresponds to the "standard" regex search iterator. |
628 | | /// |
629 | | /// # Panics |
630 | | /// |
631 | | /// If the underlying DFAs return an error during iteration, then iteration |
632 | | /// panics. This only occurs in non-default configurations where quit bytes |
633 | | /// are used or Unicode word boundaries are heuristically enabled. |
634 | | /// |
635 | | /// The fallible version of this routine is |
636 | | /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). |
637 | | /// |
638 | | /// # Example |
639 | | /// |
640 | | /// ``` |
641 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
642 | | /// |
643 | | /// let re = Regex::new("foo[0-9]+")?; |
644 | | /// let text = b"foo1 foo12 foo123"; |
645 | | /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); |
646 | | /// assert_eq!(matches, vec![ |
647 | | /// MultiMatch::must(0, 0, 4), |
648 | | /// MultiMatch::must(0, 5, 10), |
649 | | /// MultiMatch::must(0, 11, 17), |
650 | | /// ]); |
651 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
652 | | /// ``` |
653 | 0 | pub fn find_leftmost_iter<'r, 't>( |
654 | 0 | &'r self, |
655 | 0 | haystack: &'t [u8], |
656 | 0 | ) -> FindLeftmostMatches<'r, 't, A, P> { |
657 | 0 | FindLeftmostMatches::new(self, haystack) |
658 | 0 | } |
659 | | |
660 | | /// Returns an iterator over all overlapping matches in the given haystack. |
661 | | /// |
662 | | /// This routine is principally useful when searching for multiple patterns |
663 | | /// on inputs where multiple patterns may match the same regions of text. |
664 | | /// The iterator takes care of handling the overlapping state that must be |
665 | | /// threaded through every search. |
666 | | /// |
667 | | /// # Panics |
668 | | /// |
669 | | /// If the underlying DFAs return an error during iteration, then iteration |
670 | | /// panics. This only occurs in non-default configurations where quit bytes |
671 | | /// are used or Unicode word boundaries are heuristically enabled. |
672 | | /// |
673 | | /// The fallible version of this routine is |
674 | | /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). |
675 | | /// |
676 | | /// # Example |
677 | | /// |
678 | | /// This example shows how to run an overlapping search with multiple |
679 | | /// regexes. |
680 | | /// |
681 | | /// ``` |
682 | | /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; |
683 | | /// |
684 | | /// let re = Regex::builder() |
685 | | /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) |
686 | | /// .build_many(&[r"\w+$", r"\S+$"])?; |
687 | | /// let haystack = "@foo".as_bytes(); |
688 | | /// |
689 | | /// let mut it = re.find_overlapping_iter(haystack); |
690 | | /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); |
691 | | /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); |
692 | | /// assert_eq!(None, it.next()); |
693 | | /// |
694 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
695 | | /// ``` |
696 | 0 | pub fn find_overlapping_iter<'r, 't>( |
697 | 0 | &'r self, |
698 | 0 | haystack: &'t [u8], |
699 | 0 | ) -> FindOverlappingMatches<'r, 't, A, P> { |
700 | 0 | FindOverlappingMatches::new(self, haystack) |
701 | 0 | } |
702 | | } |
703 | | |
704 | | /// Lower level infallible search routines that permit controlling where |
705 | | /// the search starts and ends in a particular sequence. This is useful for |
706 | | /// executing searches that need to take surrounding context into account. This |
707 | | /// is required for correctly implementing iteration because of look-around |
708 | | /// operators (`^`, `$`, `\b`). |
709 | | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
710 | | /// Returns true if and only if this regex matches the given haystack. |
711 | | /// |
712 | | /// This routine may short circuit if it knows that scanning future input |
713 | | /// will never lead to a different result. In particular, if the underlying |
714 | | /// DFA enters a match state or a dead state, then this routine will return |
715 | | /// `true` or `false`, respectively, without inspecting any future input. |
716 | | /// |
717 | | /// # Searching a substring of the haystack |
718 | | /// |
719 | | /// Being an "at" search routine, this permits callers to search a |
720 | | /// substring of `haystack` by specifying a range in `haystack`. |
721 | | /// Why expose this as an API instead of just asking callers to use |
722 | | /// `&input[start..end]`? The reason is that regex matching often wants |
723 | | /// to take the surrounding context into account in order to handle |
724 | | /// look-around (`^`, `$` and `\b`). |
725 | | /// |
726 | | /// # Panics |
727 | | /// |
728 | | /// If the underlying DFAs return an error, then this routine panics. This |
729 | | /// only occurs in non-default configurations where quit bytes are used or |
730 | | /// Unicode word boundaries are heuristically enabled. |
731 | | /// |
732 | | /// The fallible version of this routine is |
733 | | /// [`try_is_match_at`](Regex::try_is_match_at). |
734 | 0 | pub fn is_match_at( |
735 | 0 | &self, |
736 | 0 | haystack: &[u8], |
737 | 0 | start: usize, |
738 | 0 | end: usize, |
739 | 0 | ) -> bool { |
740 | 0 | self.try_is_match_at(haystack, start, end).unwrap() |
741 | 0 | } |
742 | | |
743 | | /// Returns the first position at which a match is found. |
744 | | /// |
745 | | /// This routine stops scanning input in precisely the same circumstances |
746 | | /// as `is_match`. The key difference is that this routine returns the |
747 | | /// position at which it stopped scanning input if and only if a match |
748 | | /// was found. If no match is found, then `None` is returned. |
749 | | /// |
750 | | /// # Searching a substring of the haystack |
751 | | /// |
752 | | /// Being an "at" search routine, this permits callers to search a |
753 | | /// substring of `haystack` by specifying a range in `haystack`. |
754 | | /// Why expose this as an API instead of just asking callers to use |
755 | | /// `&input[start..end]`? The reason is that regex matching often wants |
756 | | /// to take the surrounding context into account in order to handle |
757 | | /// look-around (`^`, `$` and `\b`). |
758 | | /// |
759 | | /// This is useful when implementing an iterator over matches |
760 | | /// within the same haystack, which cannot be done correctly by simply |
761 | | /// providing a subslice of `haystack`. |
762 | | /// |
763 | | /// # Panics |
764 | | /// |
765 | | /// If the underlying DFAs return an error, then this routine panics. This |
766 | | /// only occurs in non-default configurations where quit bytes are used or |
767 | | /// Unicode word boundaries are heuristically enabled. |
768 | | /// |
769 | | /// The fallible version of this routine is |
770 | | /// [`try_find_earliest_at`](Regex::try_find_earliest_at). |
771 | 0 | pub fn find_earliest_at( |
772 | 0 | &self, |
773 | 0 | haystack: &[u8], |
774 | 0 | start: usize, |
775 | 0 | end: usize, |
776 | 0 | ) -> Option<MultiMatch> { |
777 | 0 | self.try_find_earliest_at(haystack, start, end).unwrap() |
778 | 0 | } |
779 | | |
780 | | /// Returns the same as `find_leftmost`, but starts the search at the given |
781 | | /// offset. |
782 | | /// |
783 | | /// The significance of the starting point is that it takes the surrounding |
784 | | /// context into consideration. For example, if the DFA is anchored, then |
785 | | /// a match can only occur when `start == 0`. |
786 | | /// |
787 | | /// # Searching a substring of the haystack |
788 | | /// |
789 | | /// Being an "at" search routine, this permits callers to search a |
790 | | /// substring of `haystack` by specifying a range in `haystack`. |
791 | | /// Why expose this as an API instead of just asking callers to use |
792 | | /// `&input[start..end]`? The reason is that regex matching often wants |
793 | | /// to take the surrounding context into account in order to handle |
794 | | /// look-around (`^`, `$` and `\b`). |
795 | | /// |
796 | | /// This is useful when implementing an iterator over matches within the |
797 | | /// same haystack, which cannot be done correctly by simply providing a |
798 | | /// subslice of `haystack`. |
799 | | /// |
800 | | /// # Panics |
801 | | /// |
802 | | /// If the underlying DFAs return an error, then this routine panics. This |
803 | | /// only occurs in non-default configurations where quit bytes are used or |
804 | | /// Unicode word boundaries are heuristically enabled. |
805 | | /// |
806 | | /// The fallible version of this routine is |
807 | | /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). |
808 | 0 | pub fn find_leftmost_at( |
809 | 0 | &self, |
810 | 0 | haystack: &[u8], |
811 | 0 | start: usize, |
812 | 0 | end: usize, |
813 | 0 | ) -> Option<MultiMatch> { |
814 | 0 | self.try_find_leftmost_at(haystack, start, end).unwrap() |
815 | 0 | } |
816 | | |
817 | | /// Search for the first overlapping match within a given range of |
818 | | /// `haystack`. |
819 | | /// |
820 | | /// This routine is principally useful when searching for multiple patterns |
821 | | /// on inputs where multiple patterns may match the same regions of text. |
822 | | /// In particular, callers must preserve the automaton's search state from |
823 | | /// prior calls so that the implementation knows where the last match |
824 | | /// occurred and which pattern was reported. |
825 | | /// |
826 | | /// # Searching a substring of the haystack |
827 | | /// |
828 | | /// Being an "at" search routine, this permits callers to search a |
829 | | /// substring of `haystack` by specifying a range in `haystack`. |
830 | | /// Why expose this as an API instead of just asking callers to use |
831 | | /// `&input[start..end]`? The reason is that regex matching often wants |
832 | | /// to take the surrounding context into account in order to handle |
833 | | /// look-around (`^`, `$` and `\b`). |
834 | | /// |
835 | | /// This is useful when implementing an iterator over matches |
836 | | /// within the same haystack, which cannot be done correctly by simply |
837 | | /// providing a subslice of `haystack`. |
838 | | /// |
839 | | /// # Panics |
840 | | /// |
841 | | /// If the underlying DFAs return an error, then this routine panics. This |
842 | | /// only occurs in non-default configurations where quit bytes are used or |
843 | | /// Unicode word boundaries are heuristically enabled. |
844 | | /// |
845 | | /// The fallible version of this routine is |
846 | | /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). |
847 | 0 | pub fn find_overlapping_at( |
848 | 0 | &self, |
849 | 0 | haystack: &[u8], |
850 | 0 | start: usize, |
851 | 0 | end: usize, |
852 | 0 | state: &mut OverlappingState, |
853 | 0 | ) -> Option<MultiMatch> { |
854 | 0 | self.try_find_overlapping_at(haystack, start, end, state).unwrap() |
855 | 0 | } |
856 | | } |
857 | | |
858 | | /// Fallible search routines. These may return an error when the underlying |
859 | | /// DFAs have been configured in a way that permits them to fail during a |
860 | | /// search. |
861 | | /// |
862 | | /// Errors during search only occur when the DFA has been explicitly |
863 | | /// configured to do so, usually by specifying one or more "quit" bytes or by |
864 | | /// heuristically enabling Unicode word boundaries. |
865 | | /// |
866 | | /// Errors will never be returned using the default configuration. So these |
867 | | /// fallible routines are only needed for particular configurations. |
868 | | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
869 | | /// Returns true if and only if this regex matches the given haystack. |
870 | | /// |
871 | | /// This routine may short circuit if it knows that scanning future input |
872 | | /// will never lead to a different result. In particular, if the underlying |
873 | | /// DFA enters a match state or a dead state, then this routine will return |
874 | | /// `true` or `false`, respectively, without inspecting any future input. |
875 | | /// |
876 | | /// # Errors |
877 | | /// |
878 | | /// This routine only errors if the search could not complete. For |
879 | | /// DFA-based regexes, this only occurs in a non-default configuration |
880 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
881 | | /// enabled. |
882 | | /// |
883 | | /// When a search cannot complete, callers cannot know whether a match |
884 | | /// exists or not. |
885 | | /// |
886 | | /// The infallible (panics on error) version of this routine is |
887 | | /// [`is_match`](Regex::is_match). |
888 | 0 | pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> { |
889 | 0 | self.try_is_match_at(haystack, 0, haystack.len()) |
890 | 0 | } |
891 | | |
892 | | /// Returns the first position at which a match is found. |
893 | | /// |
894 | | /// This routine stops scanning input in precisely the same circumstances |
895 | | /// as `is_match`. The key difference is that this routine returns the |
896 | | /// position at which it stopped scanning input if and only if a match |
897 | | /// was found. If no match is found, then `None` is returned. |
898 | | /// |
899 | | /// # Errors |
900 | | /// |
901 | | /// This routine only errors if the search could not complete. For |
902 | | /// DFA-based regexes, this only occurs in a non-default configuration |
903 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
904 | | /// enabled. |
905 | | /// |
906 | | /// When a search cannot complete, callers cannot know whether a match |
907 | | /// exists or not. |
908 | | /// |
909 | | /// The infallible (panics on error) version of this routine is |
910 | | /// [`find_earliest`](Regex::find_earliest). |
911 | 0 | pub fn try_find_earliest( |
912 | 0 | &self, |
913 | 0 | haystack: &[u8], |
914 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
915 | 0 | self.try_find_earliest_at(haystack, 0, haystack.len()) |
916 | 0 | } |
917 | | |
918 | | /// Returns the start and end offset of the leftmost match. If no match |
919 | | /// exists, then `None` is returned. |
920 | | /// |
921 | | /// # Errors |
922 | | /// |
923 | | /// This routine only errors if the search could not complete. For |
924 | | /// DFA-based regexes, this only occurs in a non-default configuration |
925 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
926 | | /// enabled. |
927 | | /// |
928 | | /// When a search cannot complete, callers cannot know whether a match |
929 | | /// exists or not. |
930 | | /// |
931 | | /// The infallible (panics on error) version of this routine is |
932 | | /// [`find_leftmost`](Regex::find_leftmost). |
933 | 0 | pub fn try_find_leftmost( |
934 | 0 | &self, |
935 | 0 | haystack: &[u8], |
936 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
937 | 0 | self.try_find_leftmost_at(haystack, 0, haystack.len()) |
938 | 0 | } |
939 | | |
940 | | /// Search for the first overlapping match in `haystack`. |
941 | | /// |
942 | | /// This routine is principally useful when searching for multiple patterns |
943 | | /// on inputs where multiple patterns may match the same regions of text. |
944 | | /// In particular, callers must preserve the automaton's search state from |
945 | | /// prior calls so that the implementation knows where the last match |
946 | | /// occurred and which pattern was reported. |
947 | | /// |
948 | | /// # Errors |
949 | | /// |
950 | | /// This routine only errors if the search could not complete. For |
951 | | /// DFA-based regexes, this only occurs in a non-default configuration |
952 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
953 | | /// enabled. |
954 | | /// |
955 | | /// When a search cannot complete, callers cannot know whether a match |
956 | | /// exists or not. |
957 | | /// |
958 | | /// The infallible (panics on error) version of this routine is |
959 | | /// [`find_overlapping`](Regex::find_overlapping). |
960 | 0 | pub fn try_find_overlapping( |
961 | 0 | &self, |
962 | 0 | haystack: &[u8], |
963 | 0 | state: &mut OverlappingState, |
964 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
965 | 0 | self.try_find_overlapping_at(haystack, 0, haystack.len(), state) |
966 | 0 | } |
967 | | |
968 | | /// Returns an iterator over all non-overlapping "earliest" matches. |
969 | | /// |
970 | | /// Match positions are reported as soon as a match is known to occur, even |
971 | | /// if the standard leftmost match would be longer. |
972 | | /// |
973 | | /// # Errors |
974 | | /// |
975 | | /// This iterator only yields errors if the search could not complete. For |
976 | | /// DFA-based regexes, this only occurs in a non-default configuration |
977 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
978 | | /// enabled. |
979 | | /// |
980 | | /// When a search cannot complete, callers cannot know whether a match |
981 | | /// exists or not. |
982 | | /// |
983 | | /// The infallible (panics on error) version of this routine is |
984 | | /// [`find_earliest_iter`](Regex::find_earliest_iter). |
985 | 0 | pub fn try_find_earliest_iter<'r, 't>( |
986 | 0 | &'r self, |
987 | 0 | haystack: &'t [u8], |
988 | 0 | ) -> TryFindEarliestMatches<'r, 't, A, P> { |
989 | 0 | TryFindEarliestMatches::new(self, haystack) |
990 | 0 | } |
991 | | |
992 | | /// Returns an iterator over all non-overlapping leftmost matches in the |
993 | | /// given bytes. If no match exists, then the iterator yields no elements. |
994 | | /// |
995 | | /// This corresponds to the "standard" regex search iterator. |
996 | | /// |
997 | | /// # Errors |
998 | | /// |
999 | | /// This iterator only yields errors if the search could not complete. For |
1000 | | /// DFA-based regexes, this only occurs in a non-default configuration |
1001 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
1002 | | /// enabled. |
1003 | | /// |
1004 | | /// When a search cannot complete, callers cannot know whether a match |
1005 | | /// exists or not. |
1006 | | /// |
1007 | | /// The infallible (panics on error) version of this routine is |
1008 | | /// [`find_leftmost_iter`](Regex::find_leftmost_iter). |
1009 | 0 | pub fn try_find_leftmost_iter<'r, 't>( |
1010 | 0 | &'r self, |
1011 | 0 | haystack: &'t [u8], |
1012 | 0 | ) -> TryFindLeftmostMatches<'r, 't, A, P> { |
1013 | 0 | TryFindLeftmostMatches::new(self, haystack) |
1014 | 0 | } |
1015 | | |
1016 | | /// Returns an iterator over all overlapping matches in the given haystack. |
1017 | | /// |
1018 | | /// This routine is principally useful when searching for multiple patterns |
1019 | | /// on inputs where multiple patterns may match the same regions of text. |
1020 | | /// The iterator takes care of handling the overlapping state that must be |
1021 | | /// threaded through every search. |
1022 | | /// |
1023 | | /// # Errors |
1024 | | /// |
1025 | | /// This iterator only yields errors if the search could not complete. For |
1026 | | /// DFA-based regexes, this only occurs in a non-default configuration |
1027 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
1028 | | /// enabled. |
1029 | | /// |
1030 | | /// When a search cannot complete, callers cannot know whether a match |
1031 | | /// exists or not. |
1032 | | /// |
1033 | | /// The infallible (panics on error) version of this routine is |
1034 | | /// [`find_overlapping_iter`](Regex::find_overlapping_iter). |
1035 | 0 | pub fn try_find_overlapping_iter<'r, 't>( |
1036 | 0 | &'r self, |
1037 | 0 | haystack: &'t [u8], |
1038 | 0 | ) -> TryFindOverlappingMatches<'r, 't, A, P> { |
1039 | 0 | TryFindOverlappingMatches::new(self, haystack) |
1040 | 0 | } |
1041 | | } |
1042 | | |
1043 | | /// Lower level fallible search routines that permit controlling where the |
1044 | | /// search starts and ends in a particular sequence. |
1045 | | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
1046 | | /// Returns true if and only if this regex matches the given haystack. |
1047 | | /// |
1048 | | /// This routine may short circuit if it knows that scanning future input |
1049 | | /// will never lead to a different result. In particular, if the underlying |
1050 | | /// DFA enters a match state or a dead state, then this routine will return |
1051 | | /// `true` or `false`, respectively, without inspecting any future input. |
1052 | | /// |
1053 | | /// # Searching a substring of the haystack |
1054 | | /// |
1055 | | /// Being an "at" search routine, this permits callers to search a |
1056 | | /// substring of `haystack` by specifying a range in `haystack`. |
1057 | | /// Why expose this as an API instead of just asking callers to use |
1058 | | /// `&input[start..end]`? The reason is that regex matching often wants |
1059 | | /// to take the surrounding context into account in order to handle |
1060 | | /// look-around (`^`, `$` and `\b`). |
1061 | | /// |
1062 | | /// # Errors |
1063 | | /// |
1064 | | /// This routine only errors if the search could not complete. For |
1065 | | /// DFA-based regexes, this only occurs in a non-default configuration |
1066 | | /// where quit bytes are used, Unicode word boundaries are heuristically |
1067 | | /// enabled or limits are set on the number of times the lazy DFA's cache |
1068 | | /// may be cleared. |
1069 | | /// |
1070 | | /// When a search cannot complete, callers cannot know whether a match |
1071 | | /// exists or not. |
1072 | | /// |
1073 | | /// The infallible (panics on error) version of this routine is |
1074 | | /// [`is_match_at`](Regex::is_match_at). |
1075 | 0 | pub fn try_is_match_at( |
1076 | 0 | &self, |
1077 | 0 | haystack: &[u8], |
1078 | 0 | start: usize, |
1079 | 0 | end: usize, |
1080 | 0 | ) -> Result<bool, MatchError> { |
1081 | 0 | self.forward() |
1082 | 0 | .find_earliest_fwd_at( |
1083 | 0 | self.scanner().as_mut(), |
1084 | 0 | None, |
1085 | 0 | haystack, |
1086 | 0 | start, |
1087 | 0 | end, |
1088 | | ) |
1089 | 0 | .map(|x| x.is_some()) |
1090 | 0 | } |
1091 | | |
1092 | | /// Returns the first position at which a match is found. |
1093 | | /// |
1094 | | /// This routine stops scanning input in precisely the same circumstances |
1095 | | /// as `is_match`. The key difference is that this routine returns the |
1096 | | /// position at which it stopped scanning input if and only if a match |
1097 | | /// was found. If no match is found, then `None` is returned. |
1098 | | /// |
1099 | | /// # Searching a substring of the haystack |
1100 | | /// |
1101 | | /// Being an "at" search routine, this permits callers to search a |
1102 | | /// substring of `haystack` by specifying a range in `haystack`. |
1103 | | /// Why expose this as an API instead of just asking callers to use |
1104 | | /// `&input[start..end]`? The reason is that regex matching often wants |
1105 | | /// to take the surrounding context into account in order to handle |
1106 | | /// look-around (`^`, `$` and `\b`). |
1107 | | /// |
1108 | | /// This is useful when implementing an iterator over matches |
1109 | | /// within the same haystack, which cannot be done correctly by simply |
1110 | | /// providing a subslice of `haystack`. |
1111 | | /// |
1112 | | /// # Errors |
1113 | | /// |
1114 | | /// This routine only errors if the search could not complete. For |
1115 | | /// DFA-based regexes, this only occurs in a non-default configuration |
1116 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
1117 | | /// enabled. |
1118 | | /// |
1119 | | /// When a search cannot complete, callers cannot know whether a match |
1120 | | /// exists or not. |
1121 | | /// |
1122 | | /// The infallible (panics on error) version of this routine is |
1123 | | /// [`find_earliest_at`](Regex::find_earliest_at). |
1124 | 0 | pub fn try_find_earliest_at( |
1125 | 0 | &self, |
1126 | 0 | haystack: &[u8], |
1127 | 0 | start: usize, |
1128 | 0 | end: usize, |
1129 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
1130 | 0 | self.try_find_earliest_at_imp( |
1131 | 0 | self.scanner().as_mut(), |
1132 | 0 | haystack, |
1133 | 0 | start, |
1134 | 0 | end, |
1135 | | ) |
1136 | 0 | } |
1137 | | |
1138 | | /// The implementation of "earliest" searching, where a prefilter scanner |
1139 | | /// may be given. |
1140 | 0 | fn try_find_earliest_at_imp( |
1141 | 0 | &self, |
1142 | 0 | pre: Option<&mut prefilter::Scanner>, |
1143 | 0 | haystack: &[u8], |
1144 | 0 | start: usize, |
1145 | 0 | end: usize, |
1146 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
1147 | | // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
1148 | | // that we always use the `impl Automaton for &A` for calling methods. |
1149 | | // Since this is the usual way that automata are used, this helps |
1150 | | // reduce the number of monomorphized copies of the search code. |
1151 | 0 | let (fwd, rev) = (self.forward(), self.reverse()); |
1152 | 0 | let end = match (&fwd) |
1153 | 0 | .find_earliest_fwd_at(pre, None, haystack, start, end)? |
1154 | | { |
1155 | 0 | None => return Ok(None), |
1156 | 0 | Some(end) => end, |
1157 | | }; |
1158 | | // N.B. The only time we need to tell the reverse searcher the pattern |
1159 | | // to match is in the overlapping case, since it's ambiguous. In the |
1160 | | // leftmost case, I have tentatively convinced myself that it isn't |
1161 | | // necessary and the reverse search will always find the same pattern |
1162 | | // to match as the forward search. But I lack a rigorous proof. |
1163 | 0 | let start = (&rev) |
1164 | 0 | .find_earliest_rev_at(None, haystack, start, end.offset())? |
1165 | 0 | .expect("reverse search must match if forward search does"); |
1166 | 0 | assert_eq!( |
1167 | 0 | start.pattern(), |
1168 | 0 | end.pattern(), |
1169 | 0 | "forward and reverse search must match same pattern" |
1170 | | ); |
1171 | 0 | assert!(start.offset() <= end.offset()); |
1172 | 0 | Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
1173 | 0 | } |
1174 | | |
1175 | | /// Returns the start and end offset of the leftmost match. If no match |
1176 | | /// exists, then `None` is returned. |
1177 | | /// |
1178 | | /// # Searching a substring of the haystack |
1179 | | /// |
1180 | | /// Being an "at" search routine, this permits callers to search a |
1181 | | /// substring of `haystack` by specifying a range in `haystack`. |
1182 | | /// Why expose this as an API instead of just asking callers to use |
1183 | | /// `&input[start..end]`? The reason is that regex matching often wants |
1184 | | /// to take the surrounding context into account in order to handle |
1185 | | /// look-around (`^`, `$` and `\b`). |
1186 | | /// |
1187 | | /// This is useful when implementing an iterator over matches |
1188 | | /// within the same haystack, which cannot be done correctly by simply |
1189 | | /// providing a subslice of `haystack`. |
1190 | | /// |
1191 | | /// # Errors |
1192 | | /// |
1193 | | /// This routine only errors if the search could not complete. For |
1194 | | /// DFA-based regexes, this only occurs in a non-default configuration |
1195 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
1196 | | /// enabled. |
1197 | | /// |
1198 | | /// When a search cannot complete, callers cannot know whether a match |
1199 | | /// exists or not. |
1200 | | /// |
1201 | | /// The infallible (panics on error) version of this routine is |
1202 | | /// [`find_leftmost_at`](Regex::find_leftmost_at). |
1203 | 0 | pub fn try_find_leftmost_at( |
1204 | 0 | &self, |
1205 | 0 | haystack: &[u8], |
1206 | 0 | start: usize, |
1207 | 0 | end: usize, |
1208 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
1209 | 0 | self.try_find_leftmost_at_imp( |
1210 | 0 | self.scanner().as_mut(), |
1211 | 0 | haystack, |
1212 | 0 | start, |
1213 | 0 | end, |
1214 | | ) |
1215 | 0 | } |
1216 | | |
1217 | | /// The implementation of leftmost searching, where a prefilter scanner |
1218 | | /// may be given. |
1219 | 0 | fn try_find_leftmost_at_imp( |
1220 | 0 | &self, |
1221 | 0 | scanner: Option<&mut prefilter::Scanner>, |
1222 | 0 | haystack: &[u8], |
1223 | 0 | start: usize, |
1224 | 0 | end: usize, |
1225 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
1226 | | // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
1227 | | // that we always use the `impl Automaton for &A` for calling methods. |
1228 | | // Since this is the usual way that automata are used, this helps |
1229 | | // reduce the number of monomorphized copies of the search code. |
1230 | 0 | let (fwd, rev) = (self.forward(), self.reverse()); |
1231 | 0 | let end = match (&fwd) |
1232 | 0 | .find_leftmost_fwd_at(scanner, None, haystack, start, end)? |
1233 | | { |
1234 | 0 | None => return Ok(None), |
1235 | 0 | Some(end) => end, |
1236 | | }; |
1237 | | // N.B. The only time we need to tell the reverse searcher the pattern |
1238 | | // to match is in the overlapping case, since it's ambiguous. In the |
1239 | | // leftmost case, I have tentatively convinced myself that it isn't |
1240 | | // necessary and the reverse search will always find the same pattern |
1241 | | // to match as the forward search. But I lack a rigorous proof. Why not |
1242 | | // just provide the pattern anyway? Well, if it is needed, then leaving |
1243 | | // it out gives us a chance to find a witness. |
1244 | 0 | let start = (&rev) |
1245 | 0 | .find_leftmost_rev_at(None, haystack, start, end.offset())? |
1246 | 0 | .expect("reverse search must match if forward search does"); |
1247 | 0 | assert_eq!( |
1248 | 0 | start.pattern(), |
1249 | 0 | end.pattern(), |
1250 | 0 | "forward and reverse search must match same pattern", |
1251 | | ); |
1252 | 0 | assert!(start.offset() <= end.offset()); |
1253 | 0 | Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
1254 | 0 | } |
1255 | | |
1256 | | /// Search for the first overlapping match within a given range of |
1257 | | /// `haystack`. |
1258 | | /// |
1259 | | /// This routine is principally useful when searching for multiple patterns |
1260 | | /// on inputs where multiple patterns may match the same regions of text. |
1261 | | /// In particular, callers must preserve the automaton's search state from |
1262 | | /// prior calls so that the implementation knows where the last match |
1263 | | /// occurred and which pattern was reported. |
1264 | | /// |
1265 | | /// # Searching a substring of the haystack |
1266 | | /// |
1267 | | /// Being an "at" search routine, this permits callers to search a |
1268 | | /// substring of `haystack` by specifying a range in `haystack`. |
1269 | | /// Why expose this as an API instead of just asking callers to use |
1270 | | /// `&input[start..end]`? The reason is that regex matching often wants |
1271 | | /// to take the surrounding context into account in order to handle |
1272 | | /// look-around (`^`, `$` and `\b`). |
1273 | | /// |
1274 | | /// This is useful when implementing an iterator over matches |
1275 | | /// within the same haystack, which cannot be done correctly by simply |
1276 | | /// providing a subslice of `haystack`. |
1277 | | /// |
1278 | | /// # Errors |
1279 | | /// |
1280 | | /// This routine only errors if the search could not complete. For |
1281 | | /// DFA-based regexes, this only occurs in a non-default configuration |
1282 | | /// where quit bytes are used or Unicode word boundaries are heuristically |
1283 | | /// enabled. |
1284 | | /// |
1285 | | /// When a search cannot complete, callers cannot know whether a match |
1286 | | /// exists or not. |
1287 | | /// |
1288 | | /// The infallible (panics on error) version of this routine is |
1289 | | /// [`find_overlapping_at`](Regex::find_overlapping_at). |
1290 | 0 | pub fn try_find_overlapping_at( |
1291 | 0 | &self, |
1292 | 0 | haystack: &[u8], |
1293 | 0 | start: usize, |
1294 | 0 | end: usize, |
1295 | 0 | state: &mut OverlappingState, |
1296 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
1297 | 0 | self.try_find_overlapping_at_imp( |
1298 | 0 | self.scanner().as_mut(), |
1299 | 0 | haystack, |
1300 | 0 | start, |
1301 | 0 | end, |
1302 | 0 | state, |
1303 | | ) |
1304 | 0 | } |
1305 | | |
1306 | | /// The implementation of overlapping search at a given range in |
1307 | | /// `haystack`, where `scanner` is a prefilter (if active) and `state` is |
1308 | | /// the current state of the search. |
1309 | 0 | fn try_find_overlapping_at_imp( |
1310 | 0 | &self, |
1311 | 0 | scanner: Option<&mut prefilter::Scanner>, |
1312 | 0 | haystack: &[u8], |
1313 | 0 | start: usize, |
1314 | 0 | end: usize, |
1315 | 0 | state: &mut OverlappingState, |
1316 | 0 | ) -> Result<Option<MultiMatch>, MatchError> { |
1317 | | // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
1318 | | // that we always use the `impl Automaton for &A` for calling methods. |
1319 | | // Since this is the usual way that automata are used, this helps |
1320 | | // reduce the number of monomorphized copies of the search code. |
1321 | 0 | let (fwd, rev) = (self.forward(), self.reverse()); |
1322 | | // TODO: Decide whether it's worth making this assert work. It doesn't |
1323 | | // work currently because 'has_starts_for_each_pattern' isn't on the |
1324 | | // Automaton trait. Without this assert, we still get a panic, but it's |
1325 | | // a bit more inscrutable. |
1326 | | // assert!( |
1327 | | // rev.has_starts_for_each_pattern(), |
1328 | | // "overlapping searches require that the reverse DFA is \ |
1329 | | // compiled with the 'starts_for_each_pattern' option", |
1330 | | // ); |
1331 | 0 | let end = match (&fwd).find_overlapping_fwd_at( |
1332 | 0 | scanner, None, haystack, start, end, state, |
1333 | 0 | )? { |
1334 | 0 | None => return Ok(None), |
1335 | 0 | Some(end) => end, |
1336 | | }; |
1337 | | // Unlike the leftmost cases, the reverse overlapping search may match |
1338 | | // a different pattern than the forward search. See test failures when |
1339 | | // using `None` instead of `Some(end.pattern())` below. Thus, we must |
1340 | | // run our reverse search using the pattern that matched in the forward |
1341 | | // direction. |
1342 | 0 | let start = (&rev) |
1343 | 0 | .find_leftmost_rev_at( |
1344 | 0 | Some(end.pattern()), |
1345 | 0 | haystack, |
1346 | | 0, |
1347 | 0 | end.offset(), |
1348 | 0 | )? |
1349 | 0 | .expect("reverse search must match if forward search does"); |
1350 | 0 | assert!(start.offset() <= end.offset()); |
1351 | 0 | assert_eq!(start.pattern(), end.pattern()); |
1352 | 0 | Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
1353 | 0 | } |
1354 | | } |
1355 | | |
1356 | | /// Non-search APIs for querying information about the regex and setting a |
1357 | | /// prefilter. |
1358 | | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
1359 | | /// Attach the given prefilter to this regex. |
1360 | 0 | pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> { |
1361 | 0 | Regex { |
1362 | 0 | prefilter: Some(prefilter), |
1363 | 0 | forward: self.forward, |
1364 | 0 | reverse: self.reverse, |
1365 | 0 | utf8: self.utf8, |
1366 | 0 | } |
1367 | 0 | } |
1368 | | |
1369 | | /// Remove any prefilter from this regex. |
1370 | 0 | pub fn without_prefilter(self) -> Regex<A> { |
1371 | 0 | Regex { |
1372 | 0 | prefilter: None, |
1373 | 0 | forward: self.forward, |
1374 | 0 | reverse: self.reverse, |
1375 | 0 | utf8: self.utf8, |
1376 | 0 | } |
1377 | 0 | } |
1378 | | |
1379 | | /// Return the underlying DFA responsible for forward matching. |
1380 | | /// |
1381 | | /// This is useful for accessing the underlying DFA and converting it to |
1382 | | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
1383 | | /// for an example of where this might be useful. |
1384 | 0 | pub fn forward(&self) -> &A { |
1385 | 0 | &self.forward |
1386 | 0 | } |
1387 | | |
1388 | | /// Return the underlying DFA responsible for reverse matching. |
1389 | | /// |
1390 | | /// This is useful for accessing the underlying DFA and converting it to |
1391 | | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
1392 | | /// for an example of where this might be useful. |
1393 | 0 | pub fn reverse(&self) -> &A { |
1394 | 0 | &self.reverse |
1395 | 0 | } |
1396 | | |
1397 | | /// Returns the total number of patterns matched by this regex. |
1398 | | /// |
1399 | | /// # Example |
1400 | | /// |
1401 | | /// ``` |
1402 | | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
1403 | | /// |
1404 | | /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; |
1405 | | /// assert_eq!(3, re.pattern_count()); |
1406 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1407 | | /// ``` |
1408 | 0 | pub fn pattern_count(&self) -> usize { |
1409 | 0 | assert_eq!( |
1410 | 0 | self.forward().pattern_count(), |
1411 | 0 | self.reverse().pattern_count() |
1412 | | ); |
1413 | 0 | self.forward().pattern_count() |
1414 | 0 | } |
1415 | | |
1416 | | /// Convenience function for returning this regex's prefilter as a trait |
1417 | | /// object. |
1418 | | /// |
1419 | | /// If this regex doesn't have a prefilter, then `None` is returned. |
1420 | 0 | pub fn prefilter(&self) -> Option<&dyn Prefilter> { |
1421 | 0 | match self.prefilter { |
1422 | 0 | None => None, |
1423 | 0 | Some(ref x) => Some(&*x), |
1424 | | } |
1425 | 0 | } |
1426 | | |
1427 | | /// Convenience function for returning a prefilter scanner. |
1428 | 0 | fn scanner(&self) -> Option<prefilter::Scanner> { |
1429 | 0 | self.prefilter().map(prefilter::Scanner::new) |
1430 | 0 | } |
1431 | | } |
1432 | | |
1433 | | /// An iterator over all non-overlapping earliest matches for a particular |
1434 | | /// infallible search. |
1435 | | /// |
1436 | | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1437 | | /// found. If the underlying search returns an error, then this panics. |
1438 | | /// |
1439 | | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1440 | | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1441 | | /// as follows: |
1442 | | /// |
1443 | | /// * `'r` is the lifetime of the regular expression itself. |
1444 | | /// * `'t` is the lifetime of the text being searched. |
1445 | | #[derive(Clone, Debug)] |
1446 | | pub struct FindEarliestMatches<'r, 't, A, P>( |
1447 | | TryFindEarliestMatches<'r, 't, A, P>, |
1448 | | ); |
1449 | | |
1450 | | impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { |
1451 | 0 | fn new( |
1452 | 0 | re: &'r Regex<A, P>, |
1453 | 0 | text: &'t [u8], |
1454 | 0 | ) -> FindEarliestMatches<'r, 't, A, P> { |
1455 | 0 | FindEarliestMatches(TryFindEarliestMatches::new(re, text)) |
1456 | 0 | } |
1457 | | } |
1458 | | |
1459 | | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1460 | | for FindEarliestMatches<'r, 't, A, P> |
1461 | | { |
1462 | | type Item = MultiMatch; |
1463 | | |
1464 | 0 | fn next(&mut self) -> Option<MultiMatch> { |
1465 | 0 | next_unwrap(self.0.next()) |
1466 | 0 | } |
1467 | | } |
1468 | | |
1469 | | /// An iterator over all non-overlapping leftmost matches for a particular |
1470 | | /// infallible search. |
1471 | | /// |
1472 | | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1473 | | /// found. If the underlying search returns an error, then this panics. |
1474 | | /// |
1475 | | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1476 | | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1477 | | /// as follows: |
1478 | | /// |
1479 | | /// * `'r` is the lifetime of the regular expression itself. |
1480 | | /// * `'t` is the lifetime of the text being searched. |
1481 | | #[derive(Clone, Debug)] |
1482 | | pub struct FindLeftmostMatches<'r, 't, A, P>( |
1483 | | TryFindLeftmostMatches<'r, 't, A, P>, |
1484 | | ); |
1485 | | |
1486 | | impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { |
1487 | 0 | fn new( |
1488 | 0 | re: &'r Regex<A, P>, |
1489 | 0 | text: &'t [u8], |
1490 | 0 | ) -> FindLeftmostMatches<'r, 't, A, P> { |
1491 | 0 | FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) |
1492 | 0 | } |
1493 | | } |
1494 | | |
1495 | | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1496 | | for FindLeftmostMatches<'r, 't, A, P> |
1497 | | { |
1498 | | type Item = MultiMatch; |
1499 | | |
1500 | 0 | fn next(&mut self) -> Option<MultiMatch> { |
1501 | 0 | next_unwrap(self.0.next()) |
1502 | 0 | } |
1503 | | } |
1504 | | |
1505 | | /// An iterator over all overlapping matches for a particular infallible |
1506 | | /// search. |
1507 | | /// |
1508 | | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1509 | | /// found. If the underlying search returns an error, then this panics. |
1510 | | /// |
1511 | | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1512 | | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1513 | | /// as follows: |
1514 | | /// |
1515 | | /// * `'r` is the lifetime of the regular expression itself. |
1516 | | /// * `'t` is the lifetime of the text being searched. |
1517 | | #[derive(Clone, Debug)] |
1518 | | pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( |
1519 | | TryFindOverlappingMatches<'r, 't, A, P>, |
1520 | | ); |
1521 | | |
1522 | | impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { |
1523 | 0 | fn new( |
1524 | 0 | re: &'r Regex<A, P>, |
1525 | 0 | text: &'t [u8], |
1526 | 0 | ) -> FindOverlappingMatches<'r, 't, A, P> { |
1527 | 0 | FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) |
1528 | 0 | } |
1529 | | } |
1530 | | |
1531 | | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1532 | | for FindOverlappingMatches<'r, 't, A, P> |
1533 | | { |
1534 | | type Item = MultiMatch; |
1535 | | |
1536 | 0 | fn next(&mut self) -> Option<MultiMatch> { |
1537 | 0 | next_unwrap(self.0.next()) |
1538 | 0 | } |
1539 | | } |
1540 | | |
1541 | | /// An iterator over all non-overlapping earliest matches for a particular |
1542 | | /// fallible search. |
1543 | | /// |
1544 | | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1545 | | /// found. |
1546 | | /// |
1547 | | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1548 | | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1549 | | /// as follows: |
1550 | | /// |
1551 | | /// * `'r` is the lifetime of the regular expression itself. |
1552 | | /// * `'t` is the lifetime of the text being searched. |
1553 | | #[derive(Clone, Debug)] |
1554 | | pub struct TryFindEarliestMatches<'r, 't, A, P> { |
1555 | | re: &'r Regex<A, P>, |
1556 | | scanner: Option<prefilter::Scanner<'r>>, |
1557 | | text: &'t [u8], |
1558 | | last_end: usize, |
1559 | | last_match: Option<usize>, |
1560 | | } |
1561 | | |
1562 | | impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { |
1563 | 0 | fn new( |
1564 | 0 | re: &'r Regex<A, P>, |
1565 | 0 | text: &'t [u8], |
1566 | 0 | ) -> TryFindEarliestMatches<'r, 't, A, P> { |
1567 | 0 | let scanner = re.scanner(); |
1568 | 0 | TryFindEarliestMatches { |
1569 | 0 | re, |
1570 | 0 | scanner, |
1571 | 0 | text, |
1572 | 0 | last_end: 0, |
1573 | 0 | last_match: None, |
1574 | 0 | } |
1575 | 0 | } |
1576 | | } |
1577 | | |
1578 | | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1579 | | for TryFindEarliestMatches<'r, 't, A, P> |
1580 | | { |
1581 | | type Item = Result<MultiMatch, MatchError>; |
1582 | | |
1583 | 0 | fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
1584 | 0 | if self.last_end > self.text.len() { |
1585 | 0 | return None; |
1586 | 0 | } |
1587 | 0 | let result = self.re.try_find_earliest_at_imp( |
1588 | 0 | self.scanner.as_mut(), |
1589 | 0 | self.text, |
1590 | 0 | self.last_end, |
1591 | 0 | self.text.len(), |
1592 | | ); |
1593 | 0 | let m = match result { |
1594 | 0 | Err(err) => return Some(Err(err)), |
1595 | 0 | Ok(None) => return None, |
1596 | 0 | Ok(Some(m)) => m, |
1597 | | }; |
1598 | 0 | if m.is_empty() { |
1599 | | // This is an empty match. To ensure we make progress, start |
1600 | | // the next search at the smallest possible starting position |
1601 | | // of the next match following this one. |
1602 | 0 | self.last_end = if self.re.utf8 { |
1603 | 0 | crate::util::next_utf8(self.text, m.end()) |
1604 | | } else { |
1605 | 0 | m.end() + 1 |
1606 | | }; |
1607 | | // Don't accept empty matches immediately following a match. |
1608 | | // Just move on to the next match. |
1609 | 0 | if Some(m.end()) == self.last_match { |
1610 | 0 | return self.next(); |
1611 | 0 | } |
1612 | 0 | } else { |
1613 | 0 | self.last_end = m.end(); |
1614 | 0 | } |
1615 | 0 | self.last_match = Some(m.end()); |
1616 | 0 | Some(Ok(m)) |
1617 | 0 | } |
1618 | | } |
1619 | | |
1620 | | /// An iterator over all non-overlapping leftmost matches for a particular |
1621 | | /// fallible search. |
1622 | | /// |
1623 | | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1624 | | /// found. |
1625 | | /// |
1626 | | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1627 | | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1628 | | /// as follows: |
1629 | | /// |
1630 | | /// * `'r` is the lifetime of the regular expression itself. |
1631 | | /// * `'t` is the lifetime of the text being searched. |
1632 | | #[derive(Clone, Debug)] |
1633 | | pub struct TryFindLeftmostMatches<'r, 't, A, P> { |
1634 | | re: &'r Regex<A, P>, |
1635 | | scanner: Option<prefilter::Scanner<'r>>, |
1636 | | text: &'t [u8], |
1637 | | last_end: usize, |
1638 | | last_match: Option<usize>, |
1639 | | } |
1640 | | |
1641 | | impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { |
1642 | 0 | fn new( |
1643 | 0 | re: &'r Regex<A, P>, |
1644 | 0 | text: &'t [u8], |
1645 | 0 | ) -> TryFindLeftmostMatches<'r, 't, A, P> { |
1646 | 0 | let scanner = re.scanner(); |
1647 | 0 | TryFindLeftmostMatches { |
1648 | 0 | re, |
1649 | 0 | scanner, |
1650 | 0 | text, |
1651 | 0 | last_end: 0, |
1652 | 0 | last_match: None, |
1653 | 0 | } |
1654 | 0 | } |
1655 | | } |
1656 | | |
1657 | | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1658 | | for TryFindLeftmostMatches<'r, 't, A, P> |
1659 | | { |
1660 | | type Item = Result<MultiMatch, MatchError>; |
1661 | | |
1662 | 0 | fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
1663 | 0 | if self.last_end > self.text.len() { |
1664 | 0 | return None; |
1665 | 0 | } |
1666 | 0 | let result = self.re.try_find_leftmost_at_imp( |
1667 | 0 | self.scanner.as_mut(), |
1668 | 0 | self.text, |
1669 | 0 | self.last_end, |
1670 | 0 | self.text.len(), |
1671 | | ); |
1672 | 0 | let m = match result { |
1673 | 0 | Err(err) => return Some(Err(err)), |
1674 | 0 | Ok(None) => return None, |
1675 | 0 | Ok(Some(m)) => m, |
1676 | | }; |
1677 | 0 | if m.is_empty() { |
1678 | | // This is an empty match. To ensure we make progress, start |
1679 | | // the next search at the smallest possible starting position |
1680 | | // of the next match following this one. |
1681 | 0 | self.last_end = if self.re.utf8 { |
1682 | 0 | crate::util::next_utf8(self.text, m.end()) |
1683 | | } else { |
1684 | 0 | m.end() + 1 |
1685 | | }; |
1686 | | // Don't accept empty matches immediately following a match. |
1687 | | // Just move on to the next match. |
1688 | 0 | if Some(m.end()) == self.last_match { |
1689 | 0 | return self.next(); |
1690 | 0 | } |
1691 | 0 | } else { |
1692 | 0 | self.last_end = m.end(); |
1693 | 0 | } |
1694 | 0 | self.last_match = Some(m.end()); |
1695 | 0 | Some(Ok(m)) |
1696 | 0 | } |
1697 | | } |
1698 | | |
1699 | | /// An iterator over all overlapping matches for a particular fallible search. |
1700 | | /// |
1701 | | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1702 | | /// found. |
1703 | | /// |
1704 | | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1705 | | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1706 | | /// as follows: |
1707 | | /// |
1708 | | /// * `'r` is the lifetime of the regular expression itself. |
1709 | | /// * `'t` is the lifetime of the text being searched. |
1710 | | #[derive(Clone, Debug)] |
1711 | | pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { |
1712 | | re: &'r Regex<A, P>, |
1713 | | scanner: Option<prefilter::Scanner<'r>>, |
1714 | | text: &'t [u8], |
1715 | | last_end: usize, |
1716 | | state: OverlappingState, |
1717 | | } |
1718 | | |
1719 | | impl<'r, 't, A: Automaton, P: Prefilter> |
1720 | | TryFindOverlappingMatches<'r, 't, A, P> |
1721 | | { |
1722 | 0 | fn new( |
1723 | 0 | re: &'r Regex<A, P>, |
1724 | 0 | text: &'t [u8], |
1725 | 0 | ) -> TryFindOverlappingMatches<'r, 't, A, P> { |
1726 | 0 | let scanner = re.scanner(); |
1727 | 0 | TryFindOverlappingMatches { |
1728 | 0 | re, |
1729 | 0 | scanner, |
1730 | 0 | text, |
1731 | 0 | last_end: 0, |
1732 | 0 | state: OverlappingState::start(), |
1733 | 0 | } |
1734 | 0 | } |
1735 | | } |
1736 | | |
1737 | | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1738 | | for TryFindOverlappingMatches<'r, 't, A, P> |
1739 | | { |
1740 | | type Item = Result<MultiMatch, MatchError>; |
1741 | | |
1742 | 0 | fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
1743 | 0 | if self.last_end > self.text.len() { |
1744 | 0 | return None; |
1745 | 0 | } |
1746 | 0 | let result = self.re.try_find_overlapping_at_imp( |
1747 | 0 | self.scanner.as_mut(), |
1748 | 0 | self.text, |
1749 | 0 | self.last_end, |
1750 | 0 | self.text.len(), |
1751 | 0 | &mut self.state, |
1752 | | ); |
1753 | 0 | let m = match result { |
1754 | 0 | Err(err) => return Some(Err(err)), |
1755 | 0 | Ok(None) => return None, |
1756 | 0 | Ok(Some(m)) => m, |
1757 | | }; |
1758 | | // Unlike the non-overlapping case, we're OK with empty matches at this |
1759 | | // level. In particular, the overlapping search algorithm is itself |
1760 | | // responsible for ensuring that progress is always made. |
1761 | 0 | self.last_end = m.end(); |
1762 | 0 | Some(Ok(m)) |
1763 | 0 | } |
1764 | | } |
1765 | | |
1766 | | /// The configuration used for compiling a DFA-backed regex. |
1767 | | /// |
1768 | | /// A regex configuration is a simple data object that is typically used with |
1769 | | /// [`Builder::configure`]. |
1770 | | #[cfg(feature = "alloc")] |
1771 | | #[derive(Clone, Copy, Debug, Default)] |
1772 | | pub struct Config { |
1773 | | utf8: Option<bool>, |
1774 | | } |
1775 | | |
1776 | | #[cfg(feature = "alloc")] |
1777 | | impl Config { |
1778 | | /// Return a new default regex compiler configuration. |
1779 | | pub fn new() -> Config { |
1780 | | Config::default() |
1781 | | } |
1782 | | |
1783 | | /// Whether to enable UTF-8 mode or not. |
1784 | | /// |
1785 | | /// When UTF-8 mode is enabled (the default) and an empty match is seen, |
1786 | | /// the iterators on [`Regex`] will always start the next search at the |
1787 | | /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 |
1788 | | /// mode is disabled, such searches are begun at the next byte offset. |
1789 | | /// |
1790 | | /// If this mode is enabled and invalid UTF-8 is given to search, then |
1791 | | /// behavior is unspecified. |
1792 | | /// |
1793 | | /// Generally speaking, one should enable this when |
1794 | | /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) |
1795 | | /// and |
1796 | | /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) |
1797 | | /// are enabled, and disable it otherwise. |
1798 | | /// |
1799 | | /// # Example |
1800 | | /// |
1801 | | /// This example demonstrates the differences between when this option is |
1802 | | /// enabled and disabled. The differences only arise when the regex can |
1803 | | /// return matches of length zero. |
1804 | | /// |
1805 | | /// In this first snippet, we show the results when UTF-8 mode is disabled. |
1806 | | /// |
1807 | | /// ``` |
1808 | | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
1809 | | /// |
1810 | | /// let re = Regex::builder() |
1811 | | /// .configure(Regex::config().utf8(false)) |
1812 | | /// .build(r"")?; |
1813 | | /// let haystack = "a☃z".as_bytes(); |
1814 | | /// let mut it = re.find_leftmost_iter(haystack); |
1815 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
1816 | | /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
1817 | | /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); |
1818 | | /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); |
1819 | | /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
1820 | | /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
1821 | | /// assert_eq!(None, it.next()); |
1822 | | /// |
1823 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1824 | | /// ``` |
1825 | | /// |
1826 | | /// And in this snippet, we execute the same search on the same haystack, |
1827 | | /// but with UTF-8 mode enabled. Notice that byte offsets that would |
1828 | | /// otherwise split the encoding of `☃` are not returned. |
1829 | | /// |
1830 | | /// ``` |
1831 | | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
1832 | | /// |
1833 | | /// let re = Regex::builder() |
1834 | | /// .configure(Regex::config().utf8(true)) |
1835 | | /// .build(r"")?; |
1836 | | /// let haystack = "a☃z".as_bytes(); |
1837 | | /// let mut it = re.find_leftmost_iter(haystack); |
1838 | | /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
1839 | | /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
1840 | | /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
1841 | | /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
1842 | | /// assert_eq!(None, it.next()); |
1843 | | /// |
1844 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1845 | | /// ``` |
1846 | | pub fn utf8(mut self, yes: bool) -> Config { |
1847 | | self.utf8 = Some(yes); |
1848 | | self |
1849 | | } |
1850 | | |
1851 | | /// Returns true if and only if this configuration has UTF-8 mode enabled. |
1852 | | /// |
1853 | | /// When UTF-8 mode is enabled and an empty match is seen, the iterators on |
1854 | | /// [`Regex`] will always start the next search at the next UTF-8 encoded |
1855 | | /// codepoint. When UTF-8 mode is disabled, such searches are begun at the |
1856 | | /// next byte offset. |
1857 | | pub fn get_utf8(&self) -> bool { |
1858 | | self.utf8.unwrap_or(true) |
1859 | | } |
1860 | | |
1861 | | /// Overwrite the default configuration such that the options in `o` are |
1862 | | /// always used. If an option in `o` is not set, then the corresponding |
1863 | | /// option in `self` is used. If it's not set in `self` either, then it |
1864 | | /// remains not set. |
1865 | | pub(crate) fn overwrite(self, o: Config) -> Config { |
1866 | | Config { utf8: o.utf8.or(self.utf8) } |
1867 | | } |
1868 | | } |
1869 | | |
1870 | | /// A builder for a regex based on deterministic finite automatons. |
1871 | | /// |
1872 | | /// This builder permits configuring options for the syntax of a pattern, the |
1873 | | /// NFA construction, the DFA construction and finally the regex searching |
1874 | | /// itself. This builder is different from a general purpose regex builder in |
1875 | | /// that it permits fine grain configuration of the construction process. The |
1876 | | /// trade off for this is complexity, and the possibility of setting a |
1877 | | /// configuration that might not make sense. For example, there are three |
1878 | | /// different UTF-8 modes: |
1879 | | /// |
1880 | | /// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the |
1881 | | /// pattern itself can contain sub-expressions that match invalid UTF-8. |
1882 | | /// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) |
1883 | | /// controls whether the implicit unanchored prefix added to the NFA can |
1884 | | /// match through invalid UTF-8 or not. |
1885 | | /// * [`Config::utf8`] controls how the regex iterators themselves advance |
1886 | | /// the starting position of the next search when a match with zero length is |
1887 | | /// found. |
1888 | | /// |
1889 | | /// Generally speaking, callers will want to either enable all of these or |
1890 | | /// disable all of these. |
1891 | | /// |
1892 | | /// Internally, building a regex requires building two DFAs, where one is |
1893 | | /// responsible for finding the end of a match and the other is responsible |
1894 | | /// for finding the start of a match. If you only need to detect whether |
1895 | | /// something matched, or only the end of a match, then you should use a |
1896 | | /// [`dense::Builder`] to construct a single DFA, which is cheaper than |
1897 | | /// building two DFAs. |
1898 | | /// |
1899 | | /// # Build methods |
1900 | | /// |
1901 | | /// This builder has a few "build" methods. In general, it's the result of |
1902 | | /// combining the following parameters: |
1903 | | /// |
1904 | | /// * Building one or many regexes. |
1905 | | /// * Building a regex with dense or sparse DFAs. |
1906 | | /// |
1907 | | /// The simplest "build" method is [`Builder::build`]. It accepts a single |
1908 | | /// pattern and builds a dense DFA using `usize` for the state identifier |
1909 | | /// representation. |
1910 | | /// |
1911 | | /// The most general "build" method is [`Builder::build_many`], which permits |
1912 | | /// building a regex that searches for multiple patterns simultaneously while |
1913 | | /// using a specific state identifier representation. |
1914 | | /// |
1915 | | /// The most flexible "build" method, but hardest to use, is |
1916 | | /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is |
1917 | | /// just a pair of DFAs, and this method allows you to specify those DFAs |
1918 | | /// exactly. |
1919 | | /// |
1920 | | /// # Example |
1921 | | /// |
1922 | | /// This example shows how to disable UTF-8 mode in the syntax, the NFA and |
1923 | | /// the regex itself. This is generally what you want for matching on |
1924 | | /// arbitrary bytes. |
1925 | | /// |
1926 | | /// ``` |
1927 | | /// use regex_automata::{ |
1928 | | /// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig |
1929 | | /// }; |
1930 | | /// |
1931 | | /// let re = Regex::builder() |
1932 | | /// .configure(Regex::config().utf8(false)) |
1933 | | /// .syntax(SyntaxConfig::new().utf8(false)) |
1934 | | /// .thompson(thompson::Config::new().utf8(false)) |
1935 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
1936 | | /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
1937 | | /// let expected = Some(MultiMatch::must(0, 1, 9)); |
1938 | | /// let got = re.find_leftmost(haystack); |
1939 | | /// assert_eq!(expected, got); |
1940 | | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
1941 | | /// // but the subsequent `.*` does not! Disabling UTF-8 |
1942 | | /// // on the syntax permits this. Notice also that the |
1943 | | /// // search was unanchored and skipped over invalid UTF-8. |
1944 | | /// // Disabling UTF-8 on the Thompson NFA permits this. |
1945 | | /// // |
1946 | | /// // N.B. This example does not show the impact of |
1947 | | /// // disabling UTF-8 mode on Config, since that |
1948 | | /// // only impacts regexes that can produce matches of |
1949 | | /// // length 0. |
1950 | | /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); |
1951 | | /// |
1952 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1953 | | /// ``` |
1954 | | #[cfg(feature = "alloc")] |
1955 | | #[derive(Clone, Debug)] |
1956 | | pub struct Builder { |
1957 | | config: Config, |
1958 | | dfa: dense::Builder, |
1959 | | } |
1960 | | |
1961 | | #[cfg(feature = "alloc")] |
1962 | | impl Builder { |
1963 | | /// Create a new regex builder with the default configuration. |
1964 | | pub fn new() -> Builder { |
1965 | | Builder { config: Config::default(), dfa: dense::Builder::new() } |
1966 | | } |
1967 | | |
1968 | | /// Build a regex from the given pattern. |
1969 | | /// |
1970 | | /// If there was a problem parsing or compiling the pattern, then an error |
1971 | | /// is returned. |
1972 | | pub fn build(&self, pattern: &str) -> Result<Regex, Error> { |
1973 | | self.build_many(&[pattern]) |
1974 | | } |
1975 | | |
1976 | | /// Build a regex from the given pattern using sparse DFAs. |
1977 | | /// |
1978 | | /// If there was a problem parsing or compiling the pattern, then an error |
1979 | | /// is returned. |
1980 | | pub fn build_sparse( |
1981 | | &self, |
1982 | | pattern: &str, |
1983 | | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
1984 | | self.build_many_sparse(&[pattern]) |
1985 | | } |
1986 | | |
1987 | | /// Build a regex from the given patterns. |
1988 | | pub fn build_many<P: AsRef<str>>( |
1989 | | &self, |
1990 | | patterns: &[P], |
1991 | | ) -> Result<Regex, Error> { |
1992 | | let forward = self.dfa.build_many(patterns)?; |
1993 | | let reverse = self |
1994 | | .dfa |
1995 | | .clone() |
1996 | | .configure( |
1997 | | dense::Config::new() |
1998 | | .anchored(true) |
1999 | | .match_kind(MatchKind::All) |
2000 | | .starts_for_each_pattern(true), |
2001 | | ) |
2002 | | .thompson(thompson::Config::new().reverse(true)) |
2003 | | .build_many(patterns)?; |
2004 | | Ok(self.build_from_dfas(forward, reverse)) |
2005 | | } |
2006 | | |
2007 | | /// Build a sparse regex from the given patterns. |
2008 | | pub fn build_many_sparse<P: AsRef<str>>( |
2009 | | &self, |
2010 | | patterns: &[P], |
2011 | | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
2012 | | let re = self.build_many(patterns)?; |
2013 | | let forward = re.forward().to_sparse()?; |
2014 | | let reverse = re.reverse().to_sparse()?; |
2015 | | Ok(self.build_from_dfas(forward, reverse)) |
2016 | | } |
2017 | | |
2018 | | /// Build a regex from its component forward and reverse DFAs. |
2019 | | /// |
2020 | | /// This is useful when deserializing a regex from some arbitrary |
2021 | | /// memory region. This is also useful for building regexes from other |
2022 | | /// types of DFAs. |
2023 | | /// |
2024 | | /// If you're building the DFAs from scratch instead of building new DFAs |
2025 | | /// from other DFAs, then you'll need to make sure that the reverse DFA is |
2026 | | /// configured correctly to match the intended semantics. Namely: |
2027 | | /// |
2028 | | /// * It should be anchored. |
2029 | | /// * It should use [`MatchKind::All`] semantics. |
2030 | | /// * It should match in reverse. |
2031 | | /// * It should have anchored start states compiled for each pattern. |
2032 | | /// * Otherwise, its configuration should match the forward DFA. |
2033 | | /// |
2034 | | /// If these conditions are satisfied, then behavior of searches is |
2035 | | /// unspecified. |
2036 | | /// |
2037 | | /// Note that when using this constructor, only the configuration from |
2038 | | /// [`Config`] is applied. The only configuration settings on this builder |
2039 | | /// only apply when the builder owns the construction of the DFAs |
2040 | | /// themselves. |
2041 | | /// |
2042 | | /// # Example |
2043 | | /// |
2044 | | /// This example is a bit a contrived. The usual use of these methods |
2045 | | /// would involve serializing `initial_re` somewhere and then deserializing |
2046 | | /// it later to build a regex. But in this case, we do everything in |
2047 | | /// memory. |
2048 | | /// |
2049 | | /// ``` |
2050 | | /// use regex_automata::dfa::regex::Regex; |
2051 | | /// |
2052 | | /// let initial_re = Regex::new("foo[0-9]+")?; |
2053 | | /// assert_eq!(true, initial_re.is_match(b"foo123")); |
2054 | | /// |
2055 | | /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
2056 | | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
2057 | | /// assert_eq!(true, re.is_match(b"foo123")); |
2058 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2059 | | /// ``` |
2060 | | /// |
2061 | | /// This example shows how to build a `Regex` that uses sparse DFAs instead |
2062 | | /// of dense DFAs without using one of the convenience `build_sparse` |
2063 | | /// routines: |
2064 | | /// |
2065 | | /// ``` |
2066 | | /// use regex_automata::dfa::regex::Regex; |
2067 | | /// |
2068 | | /// let initial_re = Regex::new("foo[0-9]+")?; |
2069 | | /// assert_eq!(true, initial_re.is_match(b"foo123")); |
2070 | | /// |
2071 | | /// let fwd = initial_re.forward().to_sparse()?; |
2072 | | /// let rev = initial_re.reverse().to_sparse()?; |
2073 | | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
2074 | | /// assert_eq!(true, re.is_match(b"foo123")); |
2075 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2076 | | /// ``` |
2077 | | pub fn build_from_dfas<A: Automaton>( |
2078 | | &self, |
2079 | | forward: A, |
2080 | | reverse: A, |
2081 | | ) -> Regex<A> { |
2082 | | let utf8 = self.config.get_utf8(); |
2083 | | Regex { prefilter: None, forward, reverse, utf8 } |
2084 | | } |
2085 | | |
2086 | | /// Apply the given regex configuration options to this builder. |
2087 | | pub fn configure(&mut self, config: Config) -> &mut Builder { |
2088 | | self.config = self.config.overwrite(config); |
2089 | | self |
2090 | | } |
2091 | | |
2092 | | /// Set the syntax configuration for this builder using |
2093 | | /// [`SyntaxConfig`](crate::SyntaxConfig). |
2094 | | /// |
2095 | | /// This permits setting things like case insensitivity, Unicode and multi |
2096 | | /// line mode. |
2097 | | pub fn syntax( |
2098 | | &mut self, |
2099 | | config: crate::util::syntax::SyntaxConfig, |
2100 | | ) -> &mut Builder { |
2101 | | self.dfa.syntax(config); |
2102 | | self |
2103 | | } |
2104 | | |
2105 | | /// Set the Thompson NFA configuration for this builder using |
2106 | | /// [`nfa::thompson::Config`](thompson::Config). |
2107 | | /// |
2108 | | /// This permits setting things like whether additional time should be |
2109 | | /// spent shrinking the size of the NFA. |
2110 | | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
2111 | | self.dfa.thompson(config); |
2112 | | self |
2113 | | } |
2114 | | |
2115 | | /// Set the dense DFA compilation configuration for this builder using |
2116 | | /// [`dense::Config`](dense::Config). |
2117 | | /// |
2118 | | /// This permits setting things like whether the underlying DFAs should |
2119 | | /// be minimized. |
2120 | | pub fn dense(&mut self, config: dense::Config) -> &mut Builder { |
2121 | | self.dfa.configure(config); |
2122 | | self |
2123 | | } |
2124 | | } |
2125 | | |
2126 | | #[cfg(feature = "alloc")] |
2127 | | impl Default for Builder { |
2128 | | fn default() -> Builder { |
2129 | | Builder::new() |
2130 | | } |
2131 | | } |
2132 | | |
2133 | | #[inline(always)] |
2134 | 0 | fn next_unwrap( |
2135 | 0 | item: Option<Result<MultiMatch, MatchError>>, |
2136 | 0 | ) -> Option<MultiMatch> { |
2137 | 0 | match item { |
2138 | 0 | None => None, |
2139 | 0 | Some(Ok(m)) => Some(m), |
2140 | 0 | Some(Err(err)) => panic!( |
2141 | 0 | "unexpected regex search error: {}\n\ |
2142 | 0 | to handle search errors, use try_ methods", |
2143 | | err, |
2144 | | ), |
2145 | | } |
2146 | 0 | } |