/src/regex/regex-automata/src/dfa/regex.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | A DFA-backed `Regex`. |
3 | | |
4 | | This module provides [`Regex`], which is defined generically over the |
5 | | [`Automaton`] trait. A `Regex` implements convenience routines you might have |
6 | | come to expect, such as finding the start/end of a match and iterating over |
7 | | all non-overlapping matches. This `Regex` type is limited in its capabilities |
8 | | to what a DFA can provide. Therefore, APIs involving capturing groups, for |
9 | | example, are not provided. |
10 | | |
11 | | Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that |
12 | | finds the end offset of a match, where as the other is a "reverse" DFA that |
13 | | find the start offset of a match. |
14 | | |
15 | | See the [parent module](crate::dfa) for examples. |
16 | | */ |
17 | | |
18 | | #[cfg(feature = "alloc")] |
19 | | use alloc::vec::Vec; |
20 | | |
21 | | #[cfg(feature = "dfa-build")] |
22 | | use crate::dfa::dense::BuildError; |
23 | | use crate::{ |
24 | | dfa::{automaton::Automaton, dense}, |
25 | | util::{iter, search::Input}, |
26 | | Anchored, Match, MatchError, |
27 | | }; |
28 | | #[cfg(feature = "alloc")] |
29 | | use crate::{ |
30 | | dfa::{sparse, StartKind}, |
31 | | util::search::MatchKind, |
32 | | }; |
33 | | |
34 | | // When the alloc feature is enabled, the regex type sets its A type parameter |
35 | | // to default to an owned dense DFA. But without alloc, we set no default. This |
36 | | // makes things a lot more convenient in the common case, since writing out the |
37 | | // DFA types is pretty annoying. |
38 | | // |
39 | | // Since we have two different definitions but only want to write one doc |
40 | | // string, we use a macro to capture the doc and other attributes once and then |
41 | | // repeat them for each definition. |
42 | | macro_rules! define_regex_type { |
43 | | ($(#[$doc:meta])*) => { |
44 | | #[cfg(feature = "alloc")] |
45 | | $(#[$doc])* |
46 | | pub struct Regex<A = dense::OwnedDFA> { |
47 | | forward: A, |
48 | | reverse: A, |
49 | | } |
50 | | |
51 | | #[cfg(not(feature = "alloc"))] |
52 | | $(#[$doc])* |
53 | | pub struct Regex<A> { |
54 | | forward: A, |
55 | | reverse: A, |
56 | | } |
57 | | }; |
58 | | } |
59 | | |
60 | | define_regex_type!( |
61 | | /// A regular expression that uses deterministic finite automata for fast |
62 | | /// searching. |
63 | | /// |
64 | | /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
65 | | /// "reverse" DFA. The forward DFA is responsible for detecting the end of |
66 | | /// a match while the reverse DFA is responsible for detecting the start |
67 | | /// of a match. Thus, in order to find the bounds of any given match, a |
68 | | /// forward search must first be run followed by a reverse search. A match |
69 | | /// found by the forward DFA guarantees that the reverse DFA will also find |
70 | | /// a match. |
71 | | /// |
72 | | /// The type of the DFA used by a `Regex` corresponds to the `A` type |
73 | | /// parameter, which must satisfy the [`Automaton`] trait. Typically, |
74 | | /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a |
75 | | /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more |
76 | | /// memory but search faster, while sparse DFAs use less memory but search |
77 | | /// more slowly. |
78 | | /// |
79 | | /// # Crate features |
80 | | /// |
81 | | /// Note that despite what the documentation auto-generates, the _only_ |
82 | | /// crate feature needed to use this type is `dfa-search`. You do _not_ |
83 | | /// need to enable the `alloc` feature. |
84 | | /// |
85 | | /// By default, a regex's automaton type parameter is set to |
86 | | /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most |
87 | | /// in-memory work loads, this is the most convenient type that gives the |
88 | | /// best search performance. When the `alloc` feature is disabled, no |
89 | | /// default type is used. |
90 | | /// |
91 | | /// # When should I use this? |
92 | | /// |
93 | | /// Generally speaking, if you can afford the overhead of building a full |
94 | | /// DFA for your regex, and you don't need things like capturing groups, |
95 | | /// then this is a good choice if you're looking to optimize for matching |
96 | | /// speed. Note however that its speed may be worse than a general purpose |
97 | | /// regex engine if you don't provide a [`dense::Config::prefilter`] to the |
98 | | /// underlying DFA. |
99 | | /// |
100 | | /// # Sparse DFAs |
101 | | /// |
102 | | /// Since a `Regex` is generic over the [`Automaton`] trait, it can be |
103 | | /// used with any kind of DFA. While this crate constructs dense DFAs by |
104 | | /// default, it is easy enough to build corresponding sparse DFAs, and then |
105 | | /// build a regex from them: |
106 | | /// |
107 | | /// ``` |
108 | | /// use regex_automata::dfa::regex::Regex; |
109 | | /// |
110 | | /// // First, build a regex that uses dense DFAs. |
111 | | /// let dense_re = Regex::new("foo[0-9]+")?; |
112 | | /// |
113 | | /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
114 | | /// let fwd = dense_re.forward().to_sparse()?; |
115 | | /// let rev = dense_re.reverse().to_sparse()?; |
116 | | /// |
117 | | /// // Third, build a new regex from the constituent sparse DFAs. |
118 | | /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); |
119 | | /// |
120 | | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
121 | | /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
122 | | /// |
123 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
124 | | /// ``` |
125 | | /// |
126 | | /// Alternatively, one can use a [`Builder`] to construct a sparse DFA |
127 | | /// more succinctly. (Note though that dense DFAs are still constructed |
128 | | /// first internally, and then converted to sparse DFAs, as in the example |
129 | | /// above.) |
130 | | /// |
131 | | /// ``` |
132 | | /// use regex_automata::dfa::regex::Regex; |
133 | | /// |
134 | | /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; |
135 | | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
136 | | /// assert!(sparse_re.is_match(b"foo123")); |
137 | | /// |
138 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
139 | | /// ``` |
140 | | /// |
141 | | /// # Fallibility |
142 | | /// |
143 | | /// Most of the search routines defined on this type will _panic_ when the |
144 | | /// underlying search fails. This might be because the DFA gave up because |
145 | | /// it saw a quit byte, whether configured explicitly or via heuristic |
146 | | /// Unicode word boundary support, although neither are enabled by default. |
147 | | /// Or it might fail because an invalid `Input` configuration is given, |
148 | | /// for example, with an unsupported [`Anchored`] mode. |
149 | | /// |
150 | | /// If you need to handle these error cases instead of allowing them to |
151 | | /// trigger a panic, then the lower level [`Regex::try_search`] provides |
152 | | /// a fallible API that never panics. |
153 | | /// |
154 | | /// # Example |
155 | | /// |
156 | | /// This example shows how to cause a search to terminate if it sees a |
157 | | /// `\n` byte, and handle the error returned. This could be useful if, for |
158 | | /// example, you wanted to prevent a user supplied pattern from matching |
159 | | /// across a line boundary. |
160 | | /// |
161 | | /// ``` |
162 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
163 | | /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError}; |
164 | | /// |
165 | | /// let re = Regex::builder() |
166 | | /// .dense(dfa::dense::Config::new().quit(b'\n', true)) |
167 | | /// .build(r"foo\p{any}+bar")?; |
168 | | /// |
169 | | /// let input = Input::new("foo\nbar"); |
170 | | /// // Normally this would produce a match, since \p{any} contains '\n'. |
171 | | /// // But since we instructed the automaton to enter a quit state if a |
172 | | /// // '\n' is observed, this produces a match error instead. |
173 | | /// let expected = MatchError::quit(b'\n', 3); |
174 | | /// let got = re.try_search(&input).unwrap_err(); |
175 | | /// assert_eq!(expected, got); |
176 | | /// |
177 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
178 | | /// ``` |
179 | | #[derive(Clone, Debug)] |
180 | | ); |
181 | | |
182 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
183 | | impl Regex { |
184 | | /// Parse the given regular expression using the default configuration and |
185 | | /// return the corresponding regex. |
186 | | /// |
187 | | /// If you want a non-default configuration, then use the [`Builder`] to |
188 | | /// set your own configuration. |
189 | | /// |
190 | | /// # Example |
191 | | /// |
192 | | /// ``` |
193 | | /// use regex_automata::{Match, dfa::regex::Regex}; |
194 | | /// |
195 | | /// let re = Regex::new("foo[0-9]+bar")?; |
196 | | /// assert_eq!( |
197 | | /// Some(Match::must(0, 3..14)), |
198 | | /// re.find(b"zzzfoo12345barzzz"), |
199 | | /// ); |
200 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
201 | | /// ``` |
202 | 0 | pub fn new(pattern: &str) -> Result<Regex, BuildError> { |
203 | 0 | Builder::new().build(pattern) |
204 | 0 | } |
205 | | |
206 | | /// Like `new`, but parses multiple patterns into a single "regex set." |
207 | | /// This similarly uses the default regex configuration. |
208 | | /// |
209 | | /// # Example |
210 | | /// |
211 | | /// ``` |
212 | | /// use regex_automata::{Match, dfa::regex::Regex}; |
213 | | /// |
214 | | /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; |
215 | | /// |
216 | | /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); |
217 | | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
218 | | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
219 | | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
220 | | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
221 | | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
222 | | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
223 | | /// assert_eq!(None, it.next()); |
224 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
225 | | /// ``` |
226 | | pub fn new_many<P: AsRef<str>>( |
227 | | patterns: &[P], |
228 | | ) -> Result<Regex, BuildError> { |
229 | | Builder::new().build_many(patterns) |
230 | | } |
231 | | } |
232 | | |
233 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
234 | | impl Regex<sparse::DFA<Vec<u8>>> { |
235 | | /// Parse the given regular expression using the default configuration, |
236 | | /// except using sparse DFAs, and return the corresponding regex. |
237 | | /// |
238 | | /// If you want a non-default configuration, then use the [`Builder`] to |
239 | | /// set your own configuration. |
240 | | /// |
241 | | /// # Example |
242 | | /// |
243 | | /// ``` |
244 | | /// use regex_automata::{Match, dfa::regex::Regex}; |
245 | | /// |
246 | | /// let re = Regex::new_sparse("foo[0-9]+bar")?; |
247 | | /// assert_eq!( |
248 | | /// Some(Match::must(0, 3..14)), |
249 | | /// re.find(b"zzzfoo12345barzzz"), |
250 | | /// ); |
251 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
252 | | /// ``` |
253 | 0 | pub fn new_sparse( |
254 | 0 | pattern: &str, |
255 | 0 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
256 | 0 | Builder::new().build_sparse(pattern) |
257 | 0 | } |
258 | | |
259 | | /// Like `new`, but parses multiple patterns into a single "regex set" |
260 | | /// using sparse DFAs. This otherwise similarly uses the default regex |
261 | | /// configuration. |
262 | | /// |
263 | | /// # Example |
264 | | /// |
265 | | /// ``` |
266 | | /// use regex_automata::{Match, dfa::regex::Regex}; |
267 | | /// |
268 | | /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; |
269 | | /// |
270 | | /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); |
271 | | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
272 | | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
273 | | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
274 | | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
275 | | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
276 | | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
277 | | /// assert_eq!(None, it.next()); |
278 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
279 | | /// ``` |
280 | | pub fn new_many_sparse<P: AsRef<str>>( |
281 | | patterns: &[P], |
282 | | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
283 | | Builder::new().build_many_sparse(patterns) |
284 | | } |
285 | | } |
286 | | |
287 | | /// Convenience routines for regex construction. |
288 | | impl Regex<dense::DFA<&'static [u32]>> { |
289 | | /// Return a builder for configuring the construction of a `Regex`. |
290 | | /// |
291 | | /// This is a convenience routine to avoid needing to import the |
292 | | /// [`Builder`] type in common cases. |
293 | | /// |
294 | | /// # Example |
295 | | /// |
296 | | /// This example shows how to use the builder to disable UTF-8 mode |
297 | | /// everywhere. |
298 | | /// |
299 | | /// ``` |
300 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
301 | | /// use regex_automata::{ |
302 | | /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, |
303 | | /// }; |
304 | | /// |
305 | | /// let re = Regex::builder() |
306 | | /// .syntax(syntax::Config::new().utf8(false)) |
307 | | /// .thompson(thompson::Config::new().utf8(false)) |
308 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
309 | | /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
310 | | /// let expected = Some(Match::must(0, 1..9)); |
311 | | /// let got = re.find(haystack); |
312 | | /// assert_eq!(expected, got); |
313 | | /// |
314 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
315 | | /// ``` |
316 | 0 | pub fn builder() -> Builder { |
317 | 0 | Builder::new() |
318 | 0 | } |
319 | | } |
320 | | |
321 | | /// Standard search routines for finding and iterating over matches. |
322 | | impl<A: Automaton> Regex<A> { |
323 | | /// Returns true if and only if this regex matches the given haystack. |
324 | | /// |
325 | | /// This routine may short circuit if it knows that scanning future input |
326 | | /// will never lead to a different result. In particular, if the underlying |
327 | | /// DFA enters a match state or a dead state, then this routine will return |
328 | | /// `true` or `false`, respectively, without inspecting any future input. |
329 | | /// |
330 | | /// # Panics |
331 | | /// |
332 | | /// This routine panics if the search could not complete. This can occur |
333 | | /// in a number of circumstances: |
334 | | /// |
335 | | /// * The configuration of the DFA may permit it to "quit" the search. |
336 | | /// For example, setting quit bytes or enabling heuristic support for |
337 | | /// Unicode word boundaries. The default configuration does not enable any |
338 | | /// option that could result in the DFA quitting. |
339 | | /// * When the provided `Input` configuration is not supported. For |
340 | | /// example, by providing an unsupported anchor mode. |
341 | | /// |
342 | | /// When a search panics, callers cannot know whether a match exists or |
343 | | /// not. |
344 | | /// |
345 | | /// Use [`Regex::try_search`] if you want to handle these error conditions. |
346 | | /// |
347 | | /// # Example |
348 | | /// |
349 | | /// ``` |
350 | | /// use regex_automata::dfa::regex::Regex; |
351 | | /// |
352 | | /// let re = Regex::new("foo[0-9]+bar")?; |
353 | | /// assert_eq!(true, re.is_match("foo12345bar")); |
354 | | /// assert_eq!(false, re.is_match("foobar")); |
355 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
356 | | /// ``` |
357 | | #[inline] |
358 | | pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { |
359 | | // Not only can we do an "earliest" search, but we can avoid doing a |
360 | | // reverse scan too. |
361 | | let input = input.into().earliest(true); |
362 | | self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap() |
363 | | } |
364 | | |
365 | | /// Returns the start and end offset of the leftmost match. If no match |
366 | | /// exists, then `None` is returned. |
367 | | /// |
368 | | /// # Panics |
369 | | /// |
370 | | /// This routine panics if the search could not complete. This can occur |
371 | | /// in a number of circumstances: |
372 | | /// |
373 | | /// * The configuration of the DFA may permit it to "quit" the search. |
374 | | /// For example, setting quit bytes or enabling heuristic support for |
375 | | /// Unicode word boundaries. The default configuration does not enable any |
376 | | /// option that could result in the DFA quitting. |
377 | | /// * When the provided `Input` configuration is not supported. For |
378 | | /// example, by providing an unsupported anchor mode. |
379 | | /// |
380 | | /// When a search panics, callers cannot know whether a match exists or |
381 | | /// not. |
382 | | /// |
383 | | /// Use [`Regex::try_search`] if you want to handle these error conditions. |
384 | | /// |
385 | | /// # Example |
386 | | /// |
387 | | /// ``` |
388 | | /// use regex_automata::{Match, dfa::regex::Regex}; |
389 | | /// |
390 | | /// // Greediness is applied appropriately. |
391 | | /// let re = Regex::new("foo[0-9]+")?; |
392 | | /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz")); |
393 | | /// |
394 | | /// // Even though a match is found after reading the first byte (`a`), |
395 | | /// // the default leftmost-first match semantics demand that we find the |
396 | | /// // earliest match that prefers earlier parts of the pattern over latter |
397 | | /// // parts. |
398 | | /// let re = Regex::new("abc|a")?; |
399 | | /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc")); |
400 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
401 | | /// ``` |
402 | | #[inline] |
403 | | pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { |
404 | | self.try_search(&input.into()).unwrap() |
405 | | } |
406 | | |
407 | | /// Returns an iterator over all non-overlapping leftmost matches in the |
408 | | /// given bytes. If no match exists, then the iterator yields no elements. |
409 | | /// |
410 | | /// This corresponds to the "standard" regex search iterator. |
411 | | /// |
412 | | /// # Panics |
413 | | /// |
414 | | /// If the search returns an error during iteration, then iteration |
415 | | /// panics. See [`Regex::find`] for the panic conditions. |
416 | | /// |
417 | | /// Use [`Regex::try_search`] with |
418 | | /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to |
419 | | /// handle these error conditions. |
420 | | /// |
421 | | /// # Example |
422 | | /// |
423 | | /// ``` |
424 | | /// use regex_automata::{Match, dfa::regex::Regex}; |
425 | | /// |
426 | | /// let re = Regex::new("foo[0-9]+")?; |
427 | | /// let text = "foo1 foo12 foo123"; |
428 | | /// let matches: Vec<Match> = re.find_iter(text).collect(); |
429 | | /// assert_eq!(matches, vec![ |
430 | | /// Match::must(0, 0..4), |
431 | | /// Match::must(0, 5..10), |
432 | | /// Match::must(0, 11..17), |
433 | | /// ]); |
434 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
435 | | /// ``` |
436 | | #[inline] |
437 | | pub fn find_iter<'r, 'h, I: Into<Input<'h>>>( |
438 | | &'r self, |
439 | | input: I, |
440 | | ) -> FindMatches<'r, 'h, A> { |
441 | | let it = iter::Searcher::new(input.into()); |
442 | | FindMatches { re: self, it } |
443 | | } |
444 | | } |
445 | | |
446 | | /// Lower level fallible search routines that permit controlling where the |
447 | | /// search starts and ends in a particular sequence. |
448 | | impl<A: Automaton> Regex<A> { |
449 | | /// Returns the start and end offset of the leftmost match. If no match |
450 | | /// exists, then `None` is returned. |
451 | | /// |
452 | | /// This is like [`Regex::find`] but with two differences: |
453 | | /// |
454 | | /// 1. It is not generic over `Into<Input>` and instead accepts a |
455 | | /// `&Input`. This permits reusing the same `Input` for multiple searches |
456 | | /// without needing to create a new one. This _may_ help with latency. |
457 | | /// 2. It returns an error if the search could not complete where as |
458 | | /// [`Regex::find`] will panic. |
459 | | /// |
460 | | /// # Errors |
461 | | /// |
462 | | /// This routine errors if the search could not complete. This can occur |
463 | | /// in the following circumstances: |
464 | | /// |
465 | | /// * The configuration of the DFA may permit it to "quit" the search. |
466 | | /// For example, setting quit bytes or enabling heuristic support for |
467 | | /// Unicode word boundaries. The default configuration does not enable any |
468 | | /// option that could result in the DFA quitting. |
469 | | /// * When the provided `Input` configuration is not supported. For |
470 | | /// example, by providing an unsupported anchor mode. |
471 | | /// |
472 | | /// When a search returns an error, callers cannot know whether a match |
473 | | /// exists or not. |
474 | | #[inline] |
475 | 34.2k | pub fn try_search( |
476 | 34.2k | &self, |
477 | 34.2k | input: &Input<'_>, |
478 | 34.2k | ) -> Result<Option<Match>, MatchError> { |
479 | 34.2k | let (fwd, rev) = (self.forward(), self.reverse()); |
480 | 34.2k | let end = match fwd.try_search_fwd(input)? { |
481 | 11.9k | None => return Ok(None), |
482 | 8.94k | Some(end) => end, |
483 | 8.94k | }; |
484 | 8.94k | // This special cases an empty match at the beginning of the search. If |
485 | 8.94k | // our end matches our start, then since a reverse DFA can't match past |
486 | 8.94k | // the start, it must follow that our starting position is also our end |
487 | 8.94k | // position. So short circuit and skip the reverse search. |
488 | 8.94k | if input.start() == end.offset() { |
489 | 5.65k | return Ok(Some(Match::new( |
490 | 5.65k | end.pattern(), |
491 | 5.65k | end.offset()..end.offset(), |
492 | 5.65k | ))); |
493 | 3.29k | } |
494 | 3.29k | // We can also skip the reverse search if we know our search was |
495 | 3.29k | // anchored. This occurs either when the input config is anchored or |
496 | 3.29k | // when we know the regex itself is anchored. In this case, we know the |
497 | 3.29k | // start of the match, if one is found, must be the start of the |
498 | 3.29k | // search. |
499 | 3.29k | if self.is_anchored(input) { |
500 | 42 | return Ok(Some(Match::new( |
501 | 42 | end.pattern(), |
502 | 42 | input.start()..end.offset(), |
503 | 42 | ))); |
504 | 3.25k | } |
505 | 3.25k | // N.B. I have tentatively convinced myself that it isn't necessary |
506 | 3.25k | // to specify the specific pattern for the reverse search since the |
507 | 3.25k | // reverse search will always find the same pattern to match as the |
508 | 3.25k | // forward search. But I lack a rigorous proof. Why not just provide |
509 | 3.25k | // the pattern anyway? Well, if it is needed, then leaving it out |
510 | 3.25k | // gives us a chance to find a witness. (Also, if we don't need to |
511 | 3.25k | // specify the pattern, then we don't need to build the reverse DFA |
512 | 3.25k | // with 'starts_for_each_pattern' enabled.) |
513 | 3.25k | // |
514 | 3.25k | // We also need to be careful to disable 'earliest' for the reverse |
515 | 3.25k | // search, since it could be enabled for the forward search. In the |
516 | 3.25k | // reverse case, to satisfy "leftmost" criteria, we need to match |
517 | 3.25k | // as much as we can. We also need to be careful to make the search |
518 | 3.25k | // anchored. We don't want the reverse search to report any matches |
519 | 3.25k | // other than the one beginning at the end of our forward search. |
520 | 3.25k | let revsearch = input |
521 | 3.25k | .clone() |
522 | 3.25k | .span(input.start()..end.offset()) |
523 | 3.25k | .anchored(Anchored::Yes) |
524 | 3.25k | .earliest(false); |
525 | 3.25k | let start = rev |
526 | 3.25k | .try_search_rev(&revsearch)? |
527 | 3.09k | .expect("reverse search must match if forward search does"); |
528 | 3.09k | assert_eq!( |
529 | 3.09k | start.pattern(), |
530 | 3.09k | end.pattern(), |
531 | 0 | "forward and reverse search must match same pattern", |
532 | | ); |
533 | 3.09k | assert!(start.offset() <= end.offset()); |
534 | 3.09k | Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) |
535 | 34.2k | } |
536 | | |
537 | | /// Returns true if either the given input specifies an anchored search |
538 | | /// or if the underlying DFA is always anchored. |
539 | 3.29k | fn is_anchored(&self, input: &Input<'_>) -> bool { |
540 | 3.29k | match input.get_anchored() { |
541 | 3.29k | Anchored::No => self.forward().is_always_start_anchored(), |
542 | 0 | Anchored::Yes | Anchored::Pattern(_) => true, |
543 | | } |
544 | 3.29k | } |
545 | | } |
546 | | |
547 | | /// Non-search APIs for querying information about the regex and setting a |
548 | | /// prefilter. |
549 | | impl<A: Automaton> Regex<A> { |
550 | | /// Return the underlying DFA responsible for forward matching. |
551 | | /// |
552 | | /// This is useful for accessing the underlying DFA and converting it to |
553 | | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
554 | | /// for an example of where this might be useful. |
555 | 66.5k | pub fn forward(&self) -> &A { |
556 | 66.5k | &self.forward |
557 | 66.5k | } |
558 | | |
559 | | /// Return the underlying DFA responsible for reverse matching. |
560 | | /// |
561 | | /// This is useful for accessing the underlying DFA and converting it to |
562 | | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
563 | | /// for an example of where this might be useful. |
564 | 275k | pub fn reverse(&self) -> &A { |
565 | 275k | &self.reverse |
566 | 275k | } |
567 | | |
568 | | /// Returns the total number of patterns matched by this regex. |
569 | | /// |
570 | | /// # Example |
571 | | /// |
572 | | /// ``` |
573 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
574 | | /// use regex_automata::dfa::regex::Regex; |
575 | | /// |
576 | | /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; |
577 | | /// assert_eq!(3, re.pattern_len()); |
578 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
579 | | /// ``` |
580 | | pub fn pattern_len(&self) -> usize { |
581 | | assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); |
582 | | self.forward().pattern_len() |
583 | | } |
584 | | } |
585 | | |
586 | | /// An iterator over all non-overlapping matches for an infallible search. |
587 | | /// |
588 | | /// The iterator yields a [`Match`] value until no more matches could be found. |
589 | | /// If the underlying regex engine returns an error, then a panic occurs. |
590 | | /// |
591 | | /// The type parameters are as follows: |
592 | | /// |
593 | | /// * `A` represents the type of the underlying DFA that implements the |
594 | | /// [`Automaton`] trait. |
595 | | /// |
596 | | /// The lifetime parameters are as follows: |
597 | | /// |
598 | | /// * `'h` represents the lifetime of the haystack being searched. |
599 | | /// * `'r` represents the lifetime of the regex object itself. |
600 | | /// |
601 | | /// This iterator can be created with the [`Regex::find_iter`] method. |
602 | | #[derive(Debug)] |
603 | | pub struct FindMatches<'r, 'h, A> { |
604 | | re: &'r Regex<A>, |
605 | | it: iter::Searcher<'h>, |
606 | | } |
607 | | |
608 | | impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> { |
609 | | type Item = Match; |
610 | | |
611 | | #[inline] |
612 | | fn next(&mut self) -> Option<Match> { |
613 | | let FindMatches { re, ref mut it } = *self; |
614 | | it.advance(|input| re.try_search(input)) |
615 | | } |
616 | | } |
617 | | |
618 | | /// A builder for a regex based on deterministic finite automatons. |
619 | | /// |
620 | | /// This builder permits configuring options for the syntax of a pattern, the |
621 | | /// NFA construction, the DFA construction and finally the regex searching |
622 | | /// itself. This builder is different from a general purpose regex builder in |
623 | | /// that it permits fine grain configuration of the construction process. The |
624 | | /// trade off for this is complexity, and the possibility of setting a |
625 | | /// configuration that might not make sense. For example, there are two |
626 | | /// different UTF-8 modes: |
627 | | /// |
628 | | /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls |
629 | | /// whether the pattern itself can contain sub-expressions that match invalid |
630 | | /// UTF-8. |
631 | | /// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls |
632 | | /// how the regex iterators themselves advance the starting position of the |
633 | | /// next search when a match with zero length is found. |
634 | | /// |
635 | | /// Generally speaking, callers will want to either enable all of these or |
636 | | /// disable all of these. |
637 | | /// |
638 | | /// Internally, building a regex requires building two DFAs, where one is |
639 | | /// responsible for finding the end of a match and the other is responsible |
640 | | /// for finding the start of a match. If you only need to detect whether |
641 | | /// something matched, or only the end of a match, then you should use a |
642 | | /// [`dense::Builder`] to construct a single DFA, which is cheaper than |
643 | | /// building two DFAs. |
644 | | /// |
645 | | /// # Build methods |
646 | | /// |
647 | | /// This builder has a few "build" methods. In general, it's the result of |
648 | | /// combining the following parameters: |
649 | | /// |
650 | | /// * Building one or many regexes. |
651 | | /// * Building a regex with dense or sparse DFAs. |
652 | | /// |
653 | | /// The simplest "build" method is [`Builder::build`]. It accepts a single |
654 | | /// pattern and builds a dense DFA using `usize` for the state identifier |
655 | | /// representation. |
656 | | /// |
657 | | /// The most general "build" method is [`Builder::build_many`], which permits |
658 | | /// building a regex that searches for multiple patterns simultaneously while |
659 | | /// using a specific state identifier representation. |
660 | | /// |
661 | | /// The most flexible "build" method, but hardest to use, is |
662 | | /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is |
663 | | /// just a pair of DFAs, and this method allows you to specify those DFAs |
664 | | /// exactly. |
665 | | /// |
666 | | /// # Example |
667 | | /// |
668 | | /// This example shows how to disable UTF-8 mode in the syntax and the regex |
669 | | /// itself. This is generally what you want for matching on arbitrary bytes. |
670 | | /// |
671 | | /// ``` |
672 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
673 | | /// use regex_automata::{ |
674 | | /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, |
675 | | /// }; |
676 | | /// |
677 | | /// let re = Regex::builder() |
678 | | /// .syntax(syntax::Config::new().utf8(false)) |
679 | | /// .thompson(thompson::Config::new().utf8(false)) |
680 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
681 | | /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
682 | | /// let expected = Some(Match::must(0, 1..9)); |
683 | | /// let got = re.find(haystack); |
684 | | /// assert_eq!(expected, got); |
685 | | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
686 | | /// // but the subsequent `.*` does not! Disabling UTF-8 |
687 | | /// // on the syntax permits this. |
688 | | /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); |
689 | | /// |
690 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
691 | | /// ``` |
692 | | #[derive(Clone, Debug)] |
693 | | pub struct Builder { |
694 | | #[cfg(feature = "dfa-build")] |
695 | | dfa: dense::Builder, |
696 | | } |
697 | | |
698 | | impl Builder { |
699 | | /// Create a new regex builder with the default configuration. |
700 | 38.5k | pub fn new() -> Builder { |
701 | 38.5k | Builder { |
702 | 38.5k | #[cfg(feature = "dfa-build")] |
703 | 38.5k | dfa: dense::Builder::new(), |
704 | 38.5k | } |
705 | 38.5k | } |
706 | | |
707 | | /// Build a regex from the given pattern. |
708 | | /// |
709 | | /// If there was a problem parsing or compiling the pattern, then an error |
710 | | /// is returned. |
711 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
712 | 0 | pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { |
713 | 0 | self.build_many(&[pattern]) |
714 | 0 | } |
715 | | |
716 | | /// Build a regex from the given pattern using sparse DFAs. |
717 | | /// |
718 | | /// If there was a problem parsing or compiling the pattern, then an error |
719 | | /// is returned. |
720 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
721 | 0 | pub fn build_sparse( |
722 | 0 | &self, |
723 | 0 | pattern: &str, |
724 | 0 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
725 | 0 | self.build_many_sparse(&[pattern]) |
726 | 0 | } |
727 | | |
728 | | /// Build a regex from the given patterns. |
729 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
730 | 0 | pub fn build_many<P: AsRef<str>>( |
731 | 0 | &self, |
732 | 0 | patterns: &[P], |
733 | 0 | ) -> Result<Regex, BuildError> { |
734 | 0 | let forward = self.dfa.build_many(patterns)?; |
735 | 0 | let reverse = self |
736 | 0 | .dfa |
737 | 0 | .clone() |
738 | 0 | .configure( |
739 | 0 | dense::Config::new() |
740 | 0 | .prefilter(None) |
741 | 0 | .specialize_start_states(false) |
742 | 0 | .start_kind(StartKind::Anchored) |
743 | 0 | .match_kind(MatchKind::All), |
744 | 0 | ) |
745 | 0 | .thompson(crate::nfa::thompson::Config::new().reverse(true)) |
746 | 0 | .build_many(patterns)?; |
747 | 0 | Ok(self.build_from_dfas(forward, reverse)) |
748 | 0 | } |
749 | | |
750 | | /// Build a sparse regex from the given patterns. |
751 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
752 | 0 | pub fn build_many_sparse<P: AsRef<str>>( |
753 | 0 | &self, |
754 | 0 | patterns: &[P], |
755 | 0 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
756 | 0 | let re = self.build_many(patterns)?; |
757 | 0 | let forward = re.forward().to_sparse()?; |
758 | 0 | let reverse = re.reverse().to_sparse()?; |
759 | 0 | Ok(self.build_from_dfas(forward, reverse)) |
760 | 0 | } |
761 | | |
762 | | /// Build a regex from its component forward and reverse DFAs. |
763 | | /// |
764 | | /// This is useful when deserializing a regex from some arbitrary |
765 | | /// memory region. This is also useful for building regexes from other |
766 | | /// types of DFAs. |
767 | | /// |
768 | | /// If you're building the DFAs from scratch instead of building new DFAs |
769 | | /// from other DFAs, then you'll need to make sure that the reverse DFA is |
770 | | /// configured correctly to match the intended semantics. Namely: |
771 | | /// |
772 | | /// * It should be anchored. |
773 | | /// * It should use [`MatchKind::All`] semantics. |
774 | | /// * It should match in reverse. |
775 | | /// * Otherwise, its configuration should match the forward DFA. |
776 | | /// |
777 | | /// If these conditions aren't satisfied, then the behavior of searches is |
778 | | /// unspecified. |
779 | | /// |
780 | | /// Note that when using this constructor, no configuration is applied. |
781 | | /// Since this routine provides the DFAs to the builder, there is no |
782 | | /// opportunity to apply other configuration options. |
783 | | /// |
784 | | /// # Example |
785 | | /// |
786 | | /// This example is a bit a contrived. The usual use of these methods |
787 | | /// would involve serializing `initial_re` somewhere and then deserializing |
788 | | /// it later to build a regex. But in this case, we do everything in |
789 | | /// memory. |
790 | | /// |
791 | | /// ``` |
792 | | /// use regex_automata::dfa::regex::Regex; |
793 | | /// |
794 | | /// let initial_re = Regex::new("foo[0-9]+")?; |
795 | | /// assert_eq!(true, initial_re.is_match(b"foo123")); |
796 | | /// |
797 | | /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
798 | | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
799 | | /// assert_eq!(true, re.is_match(b"foo123")); |
800 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
801 | | /// ``` |
802 | | /// |
803 | | /// This example shows how to build a `Regex` that uses sparse DFAs instead |
804 | | /// of dense DFAs without using one of the convenience `build_sparse` |
805 | | /// routines: |
806 | | /// |
807 | | /// ``` |
808 | | /// use regex_automata::dfa::regex::Regex; |
809 | | /// |
810 | | /// let initial_re = Regex::new("foo[0-9]+")?; |
811 | | /// assert_eq!(true, initial_re.is_match(b"foo123")); |
812 | | /// |
813 | | /// let fwd = initial_re.forward().to_sparse()?; |
814 | | /// let rev = initial_re.reverse().to_sparse()?; |
815 | | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
816 | | /// assert_eq!(true, re.is_match(b"foo123")); |
817 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
818 | | /// ``` |
819 | 38.5k | pub fn build_from_dfas<A: Automaton>( |
820 | 38.5k | &self, |
821 | 38.5k | forward: A, |
822 | 38.5k | reverse: A, |
823 | 38.5k | ) -> Regex<A> { |
824 | 38.5k | Regex { forward, reverse } |
825 | 38.5k | } <regex_automata::dfa::regex::Builder>::build_from_dfas::<regex_automata::dfa::dense::DFA<alloc::vec::Vec<u32>>> Line | Count | Source | 819 | 38.5k | pub fn build_from_dfas<A: Automaton>( | 820 | 38.5k | &self, | 821 | 38.5k | forward: A, | 822 | 38.5k | reverse: A, | 823 | 38.5k | ) -> Regex<A> { | 824 | 38.5k | Regex { forward, reverse } | 825 | 38.5k | } |
Unexecuted instantiation: <regex_automata::dfa::regex::Builder>::build_from_dfas::<regex_automata::dfa::sparse::DFA<alloc::vec::Vec<u8>>> |
826 | | |
827 | | /// Set the syntax configuration for this builder using |
828 | | /// [`syntax::Config`](crate::util::syntax::Config). |
829 | | /// |
830 | | /// This permits setting things like case insensitivity, Unicode and multi |
831 | | /// line mode. |
832 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
833 | 0 | pub fn syntax( |
834 | 0 | &mut self, |
835 | 0 | config: crate::util::syntax::Config, |
836 | 0 | ) -> &mut Builder { |
837 | 0 | self.dfa.syntax(config); |
838 | 0 | self |
839 | 0 | } |
840 | | |
841 | | /// Set the Thompson NFA configuration for this builder using |
842 | | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
843 | | /// |
844 | | /// This permits setting things like whether additional time should be |
845 | | /// spent shrinking the size of the NFA. |
846 | | #[cfg(all(feature = "syntax", feature = "dfa-build"))] |
847 | 0 | pub fn thompson( |
848 | 0 | &mut self, |
849 | 0 | config: crate::nfa::thompson::Config, |
850 | 0 | ) -> &mut Builder { |
851 | 0 | self.dfa.thompson(config); |
852 | 0 | self |
853 | 0 | } |
854 | | |
855 | | /// Set the dense DFA compilation configuration for this builder using |
856 | | /// [`dense::Config`]. |
857 | | /// |
858 | | /// This permits setting things like whether the underlying DFAs should |
859 | | /// be minimized. |
860 | | #[cfg(feature = "dfa-build")] |
861 | 0 | pub fn dense(&mut self, config: dense::Config) -> &mut Builder { |
862 | 0 | self.dfa.configure(config); |
863 | 0 | self |
864 | 0 | } |
865 | | } |
866 | | |
867 | | impl Default for Builder { |
868 | 0 | fn default() -> Builder { |
869 | 0 | Builder::new() |
870 | 0 | } |
871 | | } |