/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/util/search.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Types and routines that support the search APIs of most regex engines. |
3 | | |
4 | | This sub-module isn't exposed directly, but rather, its contents are exported |
5 | | at the crate root due to the universality of most of the types and routines in |
6 | | this module. |
7 | | */ |
8 | | |
9 | | use core::ops::{Range, RangeBounds}; |
10 | | |
11 | | use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; |
12 | | |
13 | | /// The parameters for a regex search including the haystack to search. |
14 | | /// |
15 | | /// It turns out that regex searches have a few parameters, and in most cases, |
16 | | /// those parameters have defaults that work in the vast majority of cases. |
17 | | /// This `Input` type exists to make that common case seamnless while also |
18 | | /// providing an avenue for changing the parameters of a search. In particular, |
19 | | /// this type enables doing so without a combinatorial explosion of different |
20 | | /// methods and/or superfluous parameters in the common cases. |
21 | | /// |
22 | | /// An `Input` permits configuring the following things: |
23 | | /// |
24 | | /// * Search only a substring of a haystack, while taking the broader context |
25 | | /// into account for resolving look-around assertions. |
26 | | /// * Indicating whether to search for all patterns in a regex, or to |
27 | | /// only search for one pattern in particular. |
28 | | /// * Whether to perform an anchored on unanchored search. |
29 | | /// * Whether to report a match as early as possible. |
30 | | /// |
31 | | /// All of these parameters, except for the haystack, have sensible default |
32 | | /// values. This means that the minimal search configuration is simply a call |
33 | | /// to [`Input::new`] with your haystack. Setting any other parameter is |
34 | | /// optional. |
35 | | /// |
36 | | /// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a |
37 | | /// `From<H> for Input` implementation. This is useful because many of the |
38 | | /// search APIs in this crate accept an `Into<Input>`. This means you can |
39 | | /// provide string or byte strings to these routines directly, and they'll |
40 | | /// automatically get converted into an `Input` for you. |
41 | | /// |
42 | | /// The lifetime parameter `'h` refers to the lifetime of the haystack. |
43 | | /// |
44 | | /// # Organization |
45 | | /// |
46 | | /// The API of `Input` is split into a few different parts: |
47 | | /// |
48 | | /// * A builder-like API that transforms a `Input` by value. Examples: |
49 | | /// [`Input::span`] and [`Input::anchored`]. |
50 | | /// * A setter API that permits mutating parameters in place. Examples: |
51 | | /// [`Input::set_span`] and [`Input::set_anchored`]. |
52 | | /// * A getter API that permits retrieving any of the search parameters. |
53 | | /// Examples: [`Input::get_span`] and [`Input::get_anchored`]. |
54 | | /// * A few convenience getter routines that don't conform to the above naming |
55 | | /// pattern due to how common they are. Examples: [`Input::haystack`], |
56 | | /// [`Input::start`] and [`Input::end`]. |
57 | | /// * Miscellaneous predicates and other helper routines that are useful |
58 | | /// in some contexts. Examples: [`Input::is_char_boundary`]. |
59 | | /// |
60 | | /// A `Input` exposes so much because it is meant to be used by both callers of |
61 | | /// regex engines _and_ implementors of regex engines. A constraining factor is |
62 | | /// that regex engines should accept a `&Input` as its lowest level API, which |
63 | | /// means that implementors should only use the "getter" APIs of a `Input`. |
64 | | /// |
65 | | /// # Valid bounds and search termination |
66 | | /// |
67 | | /// An `Input` permits setting the bounds of a search via either |
68 | | /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or |
69 | | /// else a panic will occur. Bounds are valid if and only if: |
70 | | /// |
71 | | /// * The bounds represent a valid range into the input's haystack. |
72 | | /// * **or** the end bound is a valid ending bound for the haystack *and* |
73 | | /// the start bound is exactly one greater than the start bound. |
74 | | /// |
75 | | /// In the latter case, [`Input::is_done`] will return true and indicates any |
76 | | /// search receiving such an input should immediately return with no match. |
77 | | /// |
78 | | /// Note that while `Input` is used for reverse searches in this crate, the |
79 | | /// `Input::is_done` predicate assumes a forward search. Because unsigned |
80 | | /// offsets are used internally, there is no way to tell from only the offsets |
81 | | /// whether a reverse search is done or not. |
82 | | /// |
83 | | /// # Regex engine support |
84 | | /// |
85 | | /// Any regex engine accepting an `Input` must support at least the following |
86 | | /// things: |
87 | | /// |
88 | | /// * Searching a `&[u8]` for matches. |
89 | | /// * Searching a substring of `&[u8]` for a match, such that any match |
90 | | /// reported must appear entirely within that substring. |
91 | | /// * For a forwards search, a match should never be reported when |
92 | | /// [`Input::is_done`] returns true. (For reverse searches, termination should |
93 | | /// be handled outside of `Input`.) |
94 | | /// |
95 | | /// Supporting other aspects of an `Input` are optional, but regex engines |
96 | | /// should handle aspects they don't support gracefully. How this is done is |
97 | | /// generally up to the regex engine. This crate generally treats unsupported |
98 | | /// anchored modes as an error to report for example, but for simplicity, in |
99 | | /// the meta regex engine, trying to search with an invalid pattern ID just |
100 | | /// results in no match being reported. |
101 | | #[derive(Clone)] |
102 | | pub struct Input<'h> { |
103 | | haystack: &'h [u8], |
104 | | span: Span, |
105 | | anchored: Anchored, |
106 | | earliest: bool, |
107 | | } |
108 | | |
109 | | impl<'h> Input<'h> { |
110 | | /// Create a new search configuration for the given haystack. |
111 | | #[inline] |
112 | 0 | pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { |
113 | 0 | // Perform only one call to `haystack.as_ref()` to protect from incorrect |
114 | 0 | // implementations that return different values from multiple calls. |
115 | 0 | // This is important because there's code that relies on `span` not being |
116 | 0 | // out of bounds with respect to the stored `haystack`. |
117 | 0 | let haystack = haystack.as_ref(); |
118 | 0 | Input { |
119 | 0 | haystack, |
120 | 0 | span: Span { start: 0, end: haystack.len() }, |
121 | 0 | anchored: Anchored::No, |
122 | 0 | earliest: false, |
123 | 0 | } |
124 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::new::<str> Unexecuted instantiation: <regex_automata::util::search::Input>::new::<_> |
125 | | |
126 | | /// Set the span for this search. |
127 | | /// |
128 | | /// This routine does not panic if the span given is not a valid range for |
129 | | /// this search's haystack. If this search is run with an invalid range, |
130 | | /// then the most likely outcome is that the actual search execution will |
131 | | /// panic. |
132 | | /// |
133 | | /// This routine is generic over how a span is provided. While |
134 | | /// a [`Span`] may be given directly, one may also provide a |
135 | | /// `std::ops::Range<usize>`. To provide anything supported by range |
136 | | /// syntax, use the [`Input::range`] method. |
137 | | /// |
138 | | /// The default span is the entire haystack. |
139 | | /// |
140 | | /// Note that [`Input::range`] overrides this method and vice versa. |
141 | | /// |
142 | | /// # Panics |
143 | | /// |
144 | | /// This panics if the given span does not correspond to valid bounds in |
145 | | /// the haystack or the termination of a search. |
146 | | /// |
147 | | /// # Example |
148 | | /// |
149 | | /// This example shows how the span of the search can impact whether a |
150 | | /// match is reported or not. This is particularly relevant for look-around |
151 | | /// operators, which might take things outside of the span into account |
152 | | /// when determining whether they match. |
153 | | /// |
154 | | /// ``` |
155 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
156 | | /// use regex_automata::{ |
157 | | /// nfa::thompson::pikevm::PikeVM, |
158 | | /// Match, Input, |
159 | | /// }; |
160 | | /// |
161 | | /// // Look for 'at', but as a distinct word. |
162 | | /// let re = PikeVM::new(r"\bat\b")?; |
163 | | /// let mut cache = re.create_cache(); |
164 | | /// let mut caps = re.create_captures(); |
165 | | /// |
166 | | /// // Our haystack contains 'at', but not as a distinct word. |
167 | | /// let haystack = "batter"; |
168 | | /// |
169 | | /// // A standard search finds nothing, as expected. |
170 | | /// let input = Input::new(haystack); |
171 | | /// re.search(&mut cache, &input, &mut caps); |
172 | | /// assert_eq!(None, caps.get_match()); |
173 | | /// |
174 | | /// // But if we wanted to search starting at position '1', we might |
175 | | /// // slice the haystack. If we do this, it's impossible for the \b |
176 | | /// // anchors to take the surrounding context into account! And thus, |
177 | | /// // a match is produced. |
178 | | /// let input = Input::new(&haystack[1..3]); |
179 | | /// re.search(&mut cache, &input, &mut caps); |
180 | | /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); |
181 | | /// |
182 | | /// // But if we specify the span of the search instead of slicing the |
183 | | /// // haystack, then the regex engine can "see" outside of the span |
184 | | /// // and resolve the anchors correctly. |
185 | | /// let input = Input::new(haystack).span(1..3); |
186 | | /// re.search(&mut cache, &input, &mut caps); |
187 | | /// assert_eq!(None, caps.get_match()); |
188 | | /// |
189 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
190 | | /// ``` |
191 | | /// |
192 | | /// This may seem a little ham-fisted, but this scenario tends to come up |
193 | | /// if some other regex engine found the match span and now you need to |
194 | | /// re-process that span to look for capturing groups. (e.g., Run a faster |
195 | | /// DFA first, find a match, then run the PikeVM on just the match span to |
196 | | /// resolve capturing groups.) In order to implement that sort of logic |
197 | | /// correctly, you need to set the span on the search instead of slicing |
198 | | /// the haystack directly. |
199 | | /// |
200 | | /// The other advantage of using this routine to specify the bounds of the |
201 | | /// search is that the match offsets are still reported in terms of the |
202 | | /// original haystack. For example, the second search in the example above |
203 | | /// reported a match at position `0`, even though `at` starts at offset |
204 | | /// `1` because we sliced the haystack. |
205 | | #[inline] |
206 | 0 | pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { |
207 | 0 | self.set_span(span); |
208 | 0 | self |
209 | 0 | } |
210 | | |
211 | | /// Like `Input::span`, but accepts any range instead. |
212 | | /// |
213 | | /// This routine does not panic if the range given is not a valid range for |
214 | | /// this search's haystack. If this search is run with an invalid range, |
215 | | /// then the most likely outcome is that the actual search execution will |
216 | | /// panic. |
217 | | /// |
218 | | /// The default range is the entire haystack. |
219 | | /// |
220 | | /// Note that [`Input::span`] overrides this method and vice versa. |
221 | | /// |
222 | | /// # Panics |
223 | | /// |
224 | | /// This routine will panic if the given range could not be converted |
225 | | /// to a valid [`Range`]. For example, this would panic when given |
226 | | /// `0..=usize::MAX` since it cannot be represented using a half-open |
227 | | /// interval in terms of `usize`. |
228 | | /// |
229 | | /// This also panics if the given range does not correspond to valid bounds |
230 | | /// in the haystack or the termination of a search. |
231 | | /// |
232 | | /// # Example |
233 | | /// |
234 | | /// ``` |
235 | | /// use regex_automata::Input; |
236 | | /// |
237 | | /// let input = Input::new("foobar"); |
238 | | /// assert_eq!(0..6, input.get_range()); |
239 | | /// |
240 | | /// let input = Input::new("foobar").range(2..=4); |
241 | | /// assert_eq!(2..5, input.get_range()); |
242 | | /// ``` |
243 | | #[inline] |
244 | 0 | pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { |
245 | 0 | self.set_range(range); |
246 | 0 | self |
247 | 0 | } |
248 | | |
249 | | /// Sets the anchor mode of a search. |
250 | | /// |
251 | | /// When a search is anchored (so that's [`Anchored::Yes`] or |
252 | | /// [`Anchored::Pattern`]), a match must begin at the start of a search. |
253 | | /// When a search is not anchored (that's [`Anchored::No`]), regex engines |
254 | | /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix |
255 | | /// permits a match to appear anywhere. |
256 | | /// |
257 | | /// By default, the anchored mode is [`Anchored::No`]. |
258 | | /// |
259 | | /// **WARNING:** this is subtly different than using a `^` at the start of |
260 | | /// your regex. A `^` forces a regex to match exclusively at the start of |
261 | | /// a haystack, regardless of where you begin your search. In contrast, |
262 | | /// anchoring a search will allow your regex to match anywhere in your |
263 | | /// haystack, but the match must start at the beginning of a search. |
264 | | /// |
265 | | /// For example, consider the haystack `aba` and the following searches: |
266 | | /// |
267 | | /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` |
268 | | /// starting at position `2`. Since `^` requires the match to start at |
269 | | /// the beginning of the haystack and `2 > 0`, no match is found. |
270 | | /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
271 | | /// starting at position `2`. This reports a match at `[2, 3]` since |
272 | | /// the match starts where the search started. Since there is no `^`, |
273 | | /// there is no requirement for the match to start at the beginning of |
274 | | /// the haystack. |
275 | | /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
276 | | /// starting at position `1`. Since `b` corresponds to position `1` and |
277 | | /// since the search is anchored, it finds no match. While the regex |
278 | | /// matches at other positions, configuring the search to be anchored |
279 | | /// requires that it only report a match that begins at the same offset |
280 | | /// as the beginning of the search. |
281 | | /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` |
282 | | /// starting at position `1`. Since the search is not anchored and |
283 | | /// the regex does not start with `^`, the search executes as if there |
284 | | /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it |
285 | | /// reports a match at `[2, 3]`. |
286 | | /// |
287 | | /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, |
288 | | /// except it only reports matches for a particular pattern. |
289 | | /// |
290 | | /// # Example |
291 | | /// |
292 | | /// This demonstrates the differences between an anchored search and |
293 | | /// a pattern that begins with `^` (as described in the above warning |
294 | | /// message). |
295 | | /// |
296 | | /// ``` |
297 | | /// use regex_automata::{ |
298 | | /// nfa::thompson::pikevm::PikeVM, |
299 | | /// Anchored, Match, Input, |
300 | | /// }; |
301 | | /// |
302 | | /// let haystack = "aba"; |
303 | | /// |
304 | | /// let re = PikeVM::new(r"^a")?; |
305 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
306 | | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); |
307 | | /// re.search(&mut cache, &input, &mut caps); |
308 | | /// // No match is found because 2 is not the beginning of the haystack, |
309 | | /// // which is what ^ requires. |
310 | | /// assert_eq!(None, caps.get_match()); |
311 | | /// |
312 | | /// let re = PikeVM::new(r"a")?; |
313 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
314 | | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); |
315 | | /// re.search(&mut cache, &input, &mut caps); |
316 | | /// // An anchored search can still match anywhere in the haystack, it just |
317 | | /// // must begin at the start of the search which is '2' in this case. |
318 | | /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); |
319 | | /// |
320 | | /// let re = PikeVM::new(r"a")?; |
321 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
322 | | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); |
323 | | /// re.search(&mut cache, &input, &mut caps); |
324 | | /// // No match is found since we start searching at offset 1 which |
325 | | /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match |
326 | | /// // is found. |
327 | | /// assert_eq!(None, caps.get_match()); |
328 | | /// |
329 | | /// let re = PikeVM::new(r"a")?; |
330 | | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
331 | | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); |
332 | | /// re.search(&mut cache, &input, &mut caps); |
333 | | /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the |
334 | | /// // pattern. Even though the search starts at 'b', the 'match anything' |
335 | | /// // prefix allows the search to match 'a'. |
336 | | /// let expected = Some(Match::must(0, 2..3)); |
337 | | /// assert_eq!(expected, caps.get_match()); |
338 | | /// |
339 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
340 | | /// ``` |
341 | | #[inline] |
342 | 0 | pub fn anchored(mut self, mode: Anchored) -> Input<'h> { |
343 | 0 | self.set_anchored(mode); |
344 | 0 | self |
345 | 0 | } |
346 | | |
347 | | /// Whether to execute an "earliest" search or not. |
348 | | /// |
349 | | /// When running a non-overlapping search, an "earliest" search will return |
350 | | /// the match location as early as possible. For example, given a pattern |
351 | | /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search |
352 | | /// will return `foo12345` as a match. But an "earliest" search for regex |
353 | | /// engines that support "earliest" semantics will return `foo1` as a |
354 | | /// match, since as soon as the first digit following `foo` is seen, it is |
355 | | /// known to have found a match. |
356 | | /// |
357 | | /// Note that "earliest" semantics generally depend on the regex engine. |
358 | | /// Different regex engines may determine there is a match at different |
359 | | /// points. So there is no guarantee that "earliest" matches will always |
360 | | /// return the same offsets for all regex engines. The "earliest" notion |
361 | | /// is really about when the particular regex engine determines there is |
362 | | /// a match rather than a consistent semantic unto itself. This is often |
363 | | /// useful for implementing "did a match occur or not" predicates, but |
364 | | /// sometimes the offset is useful as well. |
365 | | /// |
366 | | /// This is disabled by default. |
367 | | /// |
368 | | /// # Example |
369 | | /// |
370 | | /// This example shows the difference between "earliest" searching and |
371 | | /// normal searching. |
372 | | /// |
373 | | /// ``` |
374 | | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
375 | | /// |
376 | | /// let re = PikeVM::new(r"foo[0-9]+")?; |
377 | | /// let mut cache = re.create_cache(); |
378 | | /// let mut caps = re.create_captures(); |
379 | | /// |
380 | | /// // A normal search implements greediness like you expect. |
381 | | /// let input = Input::new("foo12345"); |
382 | | /// re.search(&mut cache, &input, &mut caps); |
383 | | /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); |
384 | | /// |
385 | | /// // When 'earliest' is enabled and the regex engine supports |
386 | | /// // it, the search will bail once it knows a match has been |
387 | | /// // found. |
388 | | /// let input = Input::new("foo12345").earliest(true); |
389 | | /// re.search(&mut cache, &input, &mut caps); |
390 | | /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); |
391 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
392 | | /// ``` |
393 | | #[inline] |
394 | 0 | pub fn earliest(mut self, yes: bool) -> Input<'h> { |
395 | 0 | self.set_earliest(yes); |
396 | 0 | self |
397 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::earliest Unexecuted instantiation: <regex_automata::util::search::Input>::earliest |
398 | | |
399 | | /// Set the span for this search configuration. |
400 | | /// |
401 | | /// This is like the [`Input::span`] method, except this mutates the |
402 | | /// span in place. |
403 | | /// |
404 | | /// This routine is generic over how a span is provided. While |
405 | | /// a [`Span`] may be given directly, one may also provide a |
406 | | /// `std::ops::Range<usize>`. |
407 | | /// |
408 | | /// # Panics |
409 | | /// |
410 | | /// This panics if the given span does not correspond to valid bounds in |
411 | | /// the haystack or the termination of a search. |
412 | | /// |
413 | | /// # Example |
414 | | /// |
415 | | /// ``` |
416 | | /// use regex_automata::Input; |
417 | | /// |
418 | | /// let mut input = Input::new("foobar"); |
419 | | /// assert_eq!(0..6, input.get_range()); |
420 | | /// input.set_span(2..4); |
421 | | /// assert_eq!(2..4, input.get_range()); |
422 | | /// ``` |
423 | | #[inline] |
424 | 0 | pub fn set_span<S: Into<Span>>(&mut self, span: S) { |
425 | 0 | let span = span.into(); |
426 | 0 | assert!( |
427 | 0 | span.end <= self.haystack.len() |
428 | 0 | && span.start <= span.end.wrapping_add(1), |
429 | 0 | "invalid span {:?} for haystack of length {}", |
430 | 0 | span, |
431 | 0 | self.haystack.len(), |
432 | | ); |
433 | 0 | self.span = span; |
434 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::set_span::<core::ops::range::Range<usize>> Unexecuted instantiation: <regex_automata::util::search::Input>::set_span::<regex_automata::util::search::Span> |
435 | | |
436 | | /// Set the span for this search configuration given any range. |
437 | | /// |
438 | | /// This is like the [`Input::range`] method, except this mutates the |
439 | | /// span in place. |
440 | | /// |
441 | | /// This routine does not panic if the range given is not a valid range for |
442 | | /// this search's haystack. If this search is run with an invalid range, |
443 | | /// then the most likely outcome is that the actual search execution will |
444 | | /// panic. |
445 | | /// |
446 | | /// # Panics |
447 | | /// |
448 | | /// This routine will panic if the given range could not be converted |
449 | | /// to a valid [`Range`]. For example, this would panic when given |
450 | | /// `0..=usize::MAX` since it cannot be represented using a half-open |
451 | | /// interval in terms of `usize`. |
452 | | /// |
453 | | /// This also panics if the given span does not correspond to valid bounds |
454 | | /// in the haystack or the termination of a search. |
455 | | /// |
456 | | /// # Example |
457 | | /// |
458 | | /// ``` |
459 | | /// use regex_automata::Input; |
460 | | /// |
461 | | /// let mut input = Input::new("foobar"); |
462 | | /// assert_eq!(0..6, input.get_range()); |
463 | | /// input.set_range(2..=4); |
464 | | /// assert_eq!(2..5, input.get_range()); |
465 | | /// ``` |
466 | | #[inline] |
467 | 0 | pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { |
468 | | use core::ops::Bound; |
469 | | |
470 | | // It's a little weird to convert ranges into spans, and then spans |
471 | | // back into ranges when we actually slice the haystack. Because |
472 | | // of that process, we always represent everything as a half-open |
473 | | // internal. Therefore, handling things like m..=n is a little awkward. |
474 | 0 | let start = match range.start_bound() { |
475 | 0 | Bound::Included(&i) => i, |
476 | | // Can this case ever happen? Range syntax doesn't support it... |
477 | 0 | Bound::Excluded(&i) => i.checked_add(1).unwrap(), |
478 | 0 | Bound::Unbounded => 0, |
479 | | }; |
480 | 0 | let end = match range.end_bound() { |
481 | 0 | Bound::Included(&i) => i.checked_add(1).unwrap(), |
482 | 0 | Bound::Excluded(&i) => i, |
483 | 0 | Bound::Unbounded => self.haystack().len(), |
484 | | }; |
485 | 0 | self.set_span(Span { start, end }); |
486 | 0 | } |
487 | | |
488 | | /// Set the starting offset for the span for this search configuration. |
489 | | /// |
490 | | /// This is a convenience routine for only mutating the start of a span |
491 | | /// without having to set the entire span. |
492 | | /// |
493 | | /// # Panics |
494 | | /// |
495 | | /// This panics if the span resulting from the new start position does not |
496 | | /// correspond to valid bounds in the haystack or the termination of a |
497 | | /// search. |
498 | | /// |
499 | | /// # Example |
500 | | /// |
501 | | /// ``` |
502 | | /// use regex_automata::Input; |
503 | | /// |
504 | | /// let mut input = Input::new("foobar"); |
505 | | /// assert_eq!(0..6, input.get_range()); |
506 | | /// input.set_start(5); |
507 | | /// assert_eq!(5..6, input.get_range()); |
508 | | /// ``` |
509 | | #[inline] |
510 | 0 | pub fn set_start(&mut self, start: usize) { |
511 | 0 | self.set_span(Span { start, ..self.get_span() }); |
512 | 0 | } |
513 | | |
514 | | /// Set the ending offset for the span for this search configuration. |
515 | | /// |
516 | | /// This is a convenience routine for only mutating the end of a span |
517 | | /// without having to set the entire span. |
518 | | /// |
519 | | /// # Panics |
520 | | /// |
521 | | /// This panics if the span resulting from the new end position does not |
522 | | /// correspond to valid bounds in the haystack or the termination of a |
523 | | /// search. |
524 | | /// |
525 | | /// # Example |
526 | | /// |
527 | | /// ``` |
528 | | /// use regex_automata::Input; |
529 | | /// |
530 | | /// let mut input = Input::new("foobar"); |
531 | | /// assert_eq!(0..6, input.get_range()); |
532 | | /// input.set_end(5); |
533 | | /// assert_eq!(0..5, input.get_range()); |
534 | | /// ``` |
535 | | #[inline] |
536 | 0 | pub fn set_end(&mut self, end: usize) { |
537 | 0 | self.set_span(Span { end, ..self.get_span() }); |
538 | 0 | } |
539 | | |
540 | | /// Set the anchor mode of a search. |
541 | | /// |
542 | | /// This is like [`Input::anchored`], except it mutates the search |
543 | | /// configuration in place. |
544 | | /// |
545 | | /// # Example |
546 | | /// |
547 | | /// ``` |
548 | | /// use regex_automata::{Anchored, Input, PatternID}; |
549 | | /// |
550 | | /// let mut input = Input::new("foobar"); |
551 | | /// assert_eq!(Anchored::No, input.get_anchored()); |
552 | | /// |
553 | | /// let pid = PatternID::must(5); |
554 | | /// input.set_anchored(Anchored::Pattern(pid)); |
555 | | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
556 | | /// ``` |
557 | | #[inline] |
558 | 0 | pub fn set_anchored(&mut self, mode: Anchored) { |
559 | 0 | self.anchored = mode; |
560 | 0 | } |
561 | | |
562 | | /// Set whether the search should execute in "earliest" mode or not. |
563 | | /// |
564 | | /// This is like [`Input::earliest`], except it mutates the search |
565 | | /// configuration in place. |
566 | | /// |
567 | | /// # Example |
568 | | /// |
569 | | /// ``` |
570 | | /// use regex_automata::Input; |
571 | | /// |
572 | | /// let mut input = Input::new("foobar"); |
573 | | /// assert!(!input.get_earliest()); |
574 | | /// input.set_earliest(true); |
575 | | /// assert!(input.get_earliest()); |
576 | | /// ``` |
577 | | #[inline] |
578 | 0 | pub fn set_earliest(&mut self, yes: bool) { |
579 | 0 | self.earliest = yes; |
580 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::set_earliest Unexecuted instantiation: <regex_automata::util::search::Input>::set_earliest |
581 | | |
582 | | /// Return a borrow of the underlying haystack as a slice of bytes. |
583 | | /// |
584 | | /// # Example |
585 | | /// |
586 | | /// ``` |
587 | | /// use regex_automata::Input; |
588 | | /// |
589 | | /// let input = Input::new("foobar"); |
590 | | /// assert_eq!(b"foobar", input.haystack()); |
591 | | /// ``` |
592 | | #[inline] |
593 | 0 | pub fn haystack(&self) -> &[u8] { |
594 | 0 | self.haystack |
595 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::haystack Unexecuted instantiation: <regex_automata::util::search::Input>::haystack |
596 | | |
597 | | /// Return the start position of this search. |
598 | | /// |
599 | | /// This is a convenience routine for `search.get_span().start()`. |
600 | | /// |
601 | | /// When [`Input::is_done`] is `false`, this is guaranteed to return |
602 | | /// an offset that is less than or equal to [`Input::end`]. Otherwise, |
603 | | /// the offset is one greater than [`Input::end`]. |
604 | | /// |
605 | | /// # Example |
606 | | /// |
607 | | /// ``` |
608 | | /// use regex_automata::Input; |
609 | | /// |
610 | | /// let input = Input::new("foobar"); |
611 | | /// assert_eq!(0, input.start()); |
612 | | /// |
613 | | /// let input = Input::new("foobar").span(2..4); |
614 | | /// assert_eq!(2, input.start()); |
615 | | /// ``` |
616 | | #[inline] |
617 | 0 | pub fn start(&self) -> usize { |
618 | 0 | self.get_span().start |
619 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::start Unexecuted instantiation: <regex_automata::util::search::Input>::start |
620 | | |
621 | | /// Return the end position of this search. |
622 | | /// |
623 | | /// This is a convenience routine for `search.get_span().end()`. |
624 | | /// |
625 | | /// This is guaranteed to return an offset that is a valid exclusive end |
626 | | /// bound for this input's haystack. |
627 | | /// |
628 | | /// # Example |
629 | | /// |
630 | | /// ``` |
631 | | /// use regex_automata::Input; |
632 | | /// |
633 | | /// let input = Input::new("foobar"); |
634 | | /// assert_eq!(6, input.end()); |
635 | | /// |
636 | | /// let input = Input::new("foobar").span(2..4); |
637 | | /// assert_eq!(4, input.end()); |
638 | | /// ``` |
639 | | #[inline] |
640 | 0 | pub fn end(&self) -> usize { |
641 | 0 | self.get_span().end |
642 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::end Unexecuted instantiation: <regex_automata::util::search::Input>::end |
643 | | |
644 | | /// Return the span for this search configuration. |
645 | | /// |
646 | | /// If one was not explicitly set, then the span corresponds to the entire |
647 | | /// range of the haystack. |
648 | | /// |
649 | | /// When [`Input::is_done`] is `false`, the span returned is guaranteed |
650 | | /// to correspond to valid bounds for this input's haystack. |
651 | | /// |
652 | | /// # Example |
653 | | /// |
654 | | /// ``` |
655 | | /// use regex_automata::{Input, Span}; |
656 | | /// |
657 | | /// let input = Input::new("foobar"); |
658 | | /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); |
659 | | /// ``` |
660 | | #[inline] |
661 | 0 | pub fn get_span(&self) -> Span { |
662 | 0 | self.span |
663 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::get_span Unexecuted instantiation: <regex_automata::util::search::Input>::get_span |
664 | | |
665 | | /// Return the span as a range for this search configuration. |
666 | | /// |
667 | | /// If one was not explicitly set, then the span corresponds to the entire |
668 | | /// range of the haystack. |
669 | | /// |
670 | | /// When [`Input::is_done`] is `false`, the range returned is guaranteed |
671 | | /// to correspond to valid bounds for this input's haystack. |
672 | | /// |
673 | | /// # Example |
674 | | /// |
675 | | /// ``` |
676 | | /// use regex_automata::Input; |
677 | | /// |
678 | | /// let input = Input::new("foobar"); |
679 | | /// assert_eq!(0..6, input.get_range()); |
680 | | /// ``` |
681 | | #[inline] |
682 | 0 | pub fn get_range(&self) -> Range<usize> { |
683 | 0 | self.get_span().range() |
684 | 0 | } |
685 | | |
686 | | /// Return the anchored mode for this search configuration. |
687 | | /// |
688 | | /// If no anchored mode was set, then it defaults to [`Anchored::No`]. |
689 | | /// |
690 | | /// # Example |
691 | | /// |
692 | | /// ``` |
693 | | /// use regex_automata::{Anchored, Input, PatternID}; |
694 | | /// |
695 | | /// let mut input = Input::new("foobar"); |
696 | | /// assert_eq!(Anchored::No, input.get_anchored()); |
697 | | /// |
698 | | /// let pid = PatternID::must(5); |
699 | | /// input.set_anchored(Anchored::Pattern(pid)); |
700 | | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
701 | | /// ``` |
702 | | #[inline] |
703 | 0 | pub fn get_anchored(&self) -> Anchored { |
704 | 0 | self.anchored |
705 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Input>::get_anchored Unexecuted instantiation: <regex_automata::util::search::Input>::get_anchored |
706 | | |
707 | | /// Return whether this search should execute in "earliest" mode. |
708 | | /// |
709 | | /// # Example |
710 | | /// |
711 | | /// ``` |
712 | | /// use regex_automata::Input; |
713 | | /// |
714 | | /// let input = Input::new("foobar"); |
715 | | /// assert!(!input.get_earliest()); |
716 | | /// ``` |
717 | | #[inline] |
718 | 0 | pub fn get_earliest(&self) -> bool { |
719 | 0 | self.earliest |
720 | 0 | } |
721 | | |
722 | | /// Return true if and only if this search can never return any other |
723 | | /// matches. |
724 | | /// |
725 | | /// This occurs when the start position of this search is greater than the |
726 | | /// end position of the search. |
727 | | /// |
728 | | /// # Example |
729 | | /// |
730 | | /// ``` |
731 | | /// use regex_automata::Input; |
732 | | /// |
733 | | /// let mut input = Input::new("foobar"); |
734 | | /// assert!(!input.is_done()); |
735 | | /// input.set_start(6); |
736 | | /// assert!(!input.is_done()); |
737 | | /// input.set_start(7); |
738 | | /// assert!(input.is_done()); |
739 | | /// ``` |
740 | | #[inline] |
741 | 0 | pub fn is_done(&self) -> bool { |
742 | 0 | self.get_span().start > self.get_span().end |
743 | 0 | } |
744 | | |
745 | | /// Returns true if and only if the given offset in this search's haystack |
746 | | /// falls on a valid UTF-8 encoded codepoint boundary. |
747 | | /// |
748 | | /// If the haystack is not valid UTF-8, then the behavior of this routine |
749 | | /// is unspecified. |
750 | | /// |
751 | | /// # Example |
752 | | /// |
753 | | /// This shows where codepoint boundaries do and don't exist in valid |
754 | | /// UTF-8. |
755 | | /// |
756 | | /// ``` |
757 | | /// use regex_automata::Input; |
758 | | /// |
759 | | /// let input = Input::new("☃"); |
760 | | /// assert!(input.is_char_boundary(0)); |
761 | | /// assert!(!input.is_char_boundary(1)); |
762 | | /// assert!(!input.is_char_boundary(2)); |
763 | | /// assert!(input.is_char_boundary(3)); |
764 | | /// assert!(!input.is_char_boundary(4)); |
765 | | /// ``` |
766 | | #[inline] |
767 | 0 | pub fn is_char_boundary(&self, offset: usize) -> bool { |
768 | 0 | utf8::is_boundary(self.haystack(), offset) |
769 | 0 | } |
770 | | } |
771 | | |
772 | | impl<'h> core::fmt::Debug for Input<'h> { |
773 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
774 | | use crate::util::escape::DebugHaystack; |
775 | | |
776 | 0 | f.debug_struct("Input") |
777 | 0 | .field("haystack", &DebugHaystack(self.haystack())) |
778 | 0 | .field("span", &self.span) |
779 | 0 | .field("anchored", &self.anchored) |
780 | 0 | .field("earliest", &self.earliest) |
781 | 0 | .finish() |
782 | 0 | } |
783 | | } |
784 | | |
785 | | impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { |
786 | 0 | fn from(haystack: &'h H) -> Input<'h> { |
787 | 0 | Input::new(haystack) |
788 | 0 | } |
789 | | } |
790 | | |
791 | | /// A representation of a span reported by a regex engine. |
792 | | /// |
793 | | /// A span corresponds to the starting and ending _byte offsets_ of a |
794 | | /// contiguous region of bytes. The starting offset is inclusive while the |
795 | | /// ending offset is exclusive. That is, a span is a half-open interval. |
796 | | /// |
797 | | /// A span is used to report the offsets of a match, but it is also used to |
798 | | /// convey which region of a haystack should be searched via routines like |
799 | | /// [`Input::span`]. |
800 | | /// |
801 | | /// This is basically equivalent to a `std::ops::Range<usize>`, except this |
802 | | /// type implements `Copy` which makes it more ergonomic to use in the context |
803 | | /// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, |
804 | | /// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, |
805 | | /// which means things like `Span::from(5..10)` work. |
806 | | #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
807 | | pub struct Span { |
808 | | /// The start offset of the span, inclusive. |
809 | | pub start: usize, |
810 | | /// The end offset of the span, exclusive. |
811 | | pub end: usize, |
812 | | } |
813 | | |
814 | | impl Span { |
815 | | /// Returns this span as a range. |
816 | | #[inline] |
817 | 0 | pub fn range(&self) -> Range<usize> { |
818 | 0 | Range::from(*self) |
819 | 0 | } |
820 | | |
821 | | /// Returns true when this span is empty. That is, when `start >= end`. |
822 | | #[inline] |
823 | 0 | pub fn is_empty(&self) -> bool { |
824 | 0 | self.start >= self.end |
825 | 0 | } |
826 | | |
827 | | /// Returns the length of this span. |
828 | | /// |
829 | | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
830 | | #[inline] |
831 | 0 | pub fn len(&self) -> usize { |
832 | 0 | self.end.saturating_sub(self.start) |
833 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Span>::len Unexecuted instantiation: <regex_automata::util::search::Span>::len |
834 | | |
835 | | /// Returns true when the given offset is contained within this span. |
836 | | /// |
837 | | /// Note that an empty span contains no offsets and will always return |
838 | | /// false. |
839 | | #[inline] |
840 | 0 | pub fn contains(&self, offset: usize) -> bool { |
841 | 0 | !self.is_empty() && self.start <= offset && offset <= self.end |
842 | 0 | } |
843 | | |
844 | | /// Returns a new span with `offset` added to this span's `start` and `end` |
845 | | /// values. |
846 | | #[inline] |
847 | 0 | pub fn offset(&self, offset: usize) -> Span { |
848 | 0 | Span { start: self.start + offset, end: self.end + offset } |
849 | 0 | } |
850 | | } |
851 | | |
852 | | impl core::fmt::Debug for Span { |
853 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
854 | 0 | write!(f, "{}..{}", self.start, self.end) |
855 | 0 | } |
856 | | } |
857 | | |
858 | | impl core::ops::Index<Span> for [u8] { |
859 | | type Output = [u8]; |
860 | | |
861 | | #[inline] |
862 | 0 | fn index(&self, index: Span) -> &[u8] { |
863 | 0 | &self[index.range()] |
864 | 0 | } |
865 | | } |
866 | | |
867 | | impl core::ops::IndexMut<Span> for [u8] { |
868 | | #[inline] |
869 | 0 | fn index_mut(&mut self, index: Span) -> &mut [u8] { |
870 | 0 | &mut self[index.range()] |
871 | 0 | } |
872 | | } |
873 | | |
874 | | impl core::ops::Index<Span> for str { |
875 | | type Output = str; |
876 | | |
877 | | #[inline] |
878 | 0 | fn index(&self, index: Span) -> &str { |
879 | 0 | &self[index.range()] |
880 | 0 | } |
881 | | } |
882 | | |
883 | | impl From<Range<usize>> for Span { |
884 | | #[inline] |
885 | 0 | fn from(range: Range<usize>) -> Span { |
886 | 0 | Span { start: range.start, end: range.end } |
887 | 0 | } |
888 | | } |
889 | | |
890 | | impl From<Span> for Range<usize> { |
891 | | #[inline] |
892 | 0 | fn from(span: Span) -> Range<usize> { |
893 | 0 | Range { start: span.start, end: span.end } |
894 | 0 | } |
895 | | } |
896 | | |
897 | | impl PartialEq<Range<usize>> for Span { |
898 | | #[inline] |
899 | 0 | fn eq(&self, range: &Range<usize>) -> bool { |
900 | 0 | self.start == range.start && self.end == range.end |
901 | 0 | } |
902 | | } |
903 | | |
904 | | impl PartialEq<Span> for Range<usize> { |
905 | | #[inline] |
906 | 0 | fn eq(&self, span: &Span) -> bool { |
907 | 0 | self.start == span.start && self.end == span.end |
908 | 0 | } |
909 | | } |
910 | | |
911 | | /// A representation of "half" of a match reported by a DFA. |
912 | | /// |
913 | | /// This is called a "half" match because it only includes the end location (or |
914 | | /// start location for a reverse search) of a match. This corresponds to the |
915 | | /// information that a single DFA scan can report. Getting the other half of |
916 | | /// the match requires a second scan with a reversed DFA. |
917 | | /// |
918 | | /// A half match also includes the pattern that matched. The pattern is |
919 | | /// identified by an ID, which corresponds to its position (starting from `0`) |
920 | | /// relative to other patterns used to construct the corresponding DFA. If only |
921 | | /// a single pattern is provided to the DFA, then all matches are guaranteed to |
922 | | /// have a pattern ID of `0`. |
923 | | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
924 | | pub struct HalfMatch { |
925 | | /// The pattern ID. |
926 | | pattern: PatternID, |
927 | | /// The offset of the match. |
928 | | /// |
929 | | /// For forward searches, the offset is exclusive. For reverse searches, |
930 | | /// the offset is inclusive. |
931 | | offset: usize, |
932 | | } |
933 | | |
934 | | impl HalfMatch { |
935 | | /// Create a new half match from a pattern ID and a byte offset. |
936 | | #[inline] |
937 | 0 | pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { |
938 | 0 | HalfMatch { pattern, offset } |
939 | 0 | } |
940 | | |
941 | | /// Create a new half match from a pattern ID and a byte offset. |
942 | | /// |
943 | | /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a |
944 | | /// [`PatternID`]. This panics if the given `usize` is not representable |
945 | | /// as a `PatternID`. |
946 | | #[inline] |
947 | 0 | pub fn must(pattern: usize, offset: usize) -> HalfMatch { |
948 | 0 | HalfMatch::new(PatternID::new(pattern).unwrap(), offset) |
949 | 0 | } |
950 | | |
951 | | /// Returns the ID of the pattern that matched. |
952 | | /// |
953 | | /// The ID of a pattern is derived from the position in which it was |
954 | | /// originally inserted into the corresponding DFA. The first pattern has |
955 | | /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
956 | | #[inline] |
957 | 0 | pub fn pattern(&self) -> PatternID { |
958 | 0 | self.pattern |
959 | 0 | } |
960 | | |
961 | | /// The position of the match. |
962 | | /// |
963 | | /// If this match was produced by a forward search, then the offset is |
964 | | /// exclusive. If this match was produced by a reverse search, then the |
965 | | /// offset is inclusive. |
966 | | #[inline] |
967 | 0 | pub fn offset(&self) -> usize { |
968 | 0 | self.offset |
969 | 0 | } |
970 | | } |
971 | | |
972 | | /// A representation of a match reported by a regex engine. |
973 | | /// |
974 | | /// A match has two essential pieces of information: the [`PatternID`] that |
975 | | /// matches, and the [`Span`] of the match in a haystack. |
976 | | /// |
977 | | /// The pattern is identified by an ID, which corresponds to its position |
978 | | /// (starting from `0`) relative to other patterns used to construct the |
979 | | /// corresponding regex engine. If only a single pattern is provided, then all |
980 | | /// matches are guaranteed to have a pattern ID of `0`. |
981 | | /// |
982 | | /// Every match reported by a regex engine guarantees that its span has its |
983 | | /// start offset as less than or equal to its end offset. |
984 | | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
985 | | pub struct Match { |
986 | | /// The pattern ID. |
987 | | pattern: PatternID, |
988 | | /// The underlying match span. |
989 | | span: Span, |
990 | | } |
991 | | |
992 | | impl Match { |
993 | | /// Create a new match from a pattern ID and a span. |
994 | | /// |
995 | | /// This constructor is generic over how a span is provided. While |
996 | | /// a [`Span`] may be given directly, one may also provide a |
997 | | /// `std::ops::Range<usize>`. |
998 | | /// |
999 | | /// # Panics |
1000 | | /// |
1001 | | /// This panics if `end < start`. |
1002 | | /// |
1003 | | /// # Example |
1004 | | /// |
1005 | | /// This shows how to create a match for the first pattern in a regex |
1006 | | /// object using convenient range syntax. |
1007 | | /// |
1008 | | /// ``` |
1009 | | /// use regex_automata::{Match, PatternID}; |
1010 | | /// |
1011 | | /// let m = Match::new(PatternID::ZERO, 5..10); |
1012 | | /// assert_eq!(0, m.pattern().as_usize()); |
1013 | | /// assert_eq!(5, m.start()); |
1014 | | /// assert_eq!(10, m.end()); |
1015 | | /// ``` |
1016 | | #[inline] |
1017 | 0 | pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { |
1018 | 0 | let span: Span = span.into(); |
1019 | 0 | assert!(span.start <= span.end, "invalid match span"); |
1020 | 0 | Match { pattern, span } |
1021 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Match>::new::<core::ops::range::Range<usize>> Unexecuted instantiation: <regex_automata::util::search::Match>::new::<regex_automata::util::search::Span> |
1022 | | |
1023 | | /// Create a new match from a pattern ID and a byte offset span. |
1024 | | /// |
1025 | | /// This constructor is generic over how a span is provided. While |
1026 | | /// a [`Span`] may be given directly, one may also provide a |
1027 | | /// `std::ops::Range<usize>`. |
1028 | | /// |
1029 | | /// This is like [`Match::new`], but accepts a `usize` instead of a |
1030 | | /// [`PatternID`]. This panics if the given `usize` is not representable |
1031 | | /// as a `PatternID`. |
1032 | | /// |
1033 | | /// # Panics |
1034 | | /// |
1035 | | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
1036 | | /// |
1037 | | /// # Example |
1038 | | /// |
1039 | | /// This shows how to create a match for the third pattern in a regex |
1040 | | /// object using convenient range syntax. |
1041 | | /// |
1042 | | /// ``` |
1043 | | /// use regex_automata::Match; |
1044 | | /// |
1045 | | /// let m = Match::must(3, 5..10); |
1046 | | /// assert_eq!(3, m.pattern().as_usize()); |
1047 | | /// assert_eq!(5, m.start()); |
1048 | | /// assert_eq!(10, m.end()); |
1049 | | /// ``` |
1050 | | #[inline] |
1051 | 0 | pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { |
1052 | 0 | Match::new(PatternID::must(pattern), span) |
1053 | 0 | } |
1054 | | |
1055 | | /// Returns the ID of the pattern that matched. |
1056 | | /// |
1057 | | /// The ID of a pattern is derived from the position in which it was |
1058 | | /// originally inserted into the corresponding regex engine. The first |
1059 | | /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and |
1060 | | /// so on. |
1061 | | #[inline] |
1062 | 0 | pub fn pattern(&self) -> PatternID { |
1063 | 0 | self.pattern |
1064 | 0 | } |
1065 | | |
1066 | | /// The starting position of the match. |
1067 | | /// |
1068 | | /// This is a convenience routine for `Match::span().start`. |
1069 | | #[inline] |
1070 | 0 | pub fn start(&self) -> usize { |
1071 | 0 | self.span().start |
1072 | 0 | } |
1073 | | |
1074 | | /// The ending position of the match. |
1075 | | /// |
1076 | | /// This is a convenience routine for `Match::span().end`. |
1077 | | #[inline] |
1078 | 0 | pub fn end(&self) -> usize { |
1079 | 0 | self.span().end |
1080 | 0 | } |
1081 | | |
1082 | | /// Returns the match span as a range. |
1083 | | /// |
1084 | | /// This is a convenience routine for `Match::span().range()`. |
1085 | | #[inline] |
1086 | 0 | pub fn range(&self) -> core::ops::Range<usize> { |
1087 | 0 | self.span().range() |
1088 | 0 | } |
1089 | | |
1090 | | /// Returns the span for this match. |
1091 | | #[inline] |
1092 | 0 | pub fn span(&self) -> Span { |
1093 | 0 | self.span |
1094 | 0 | } |
1095 | | |
1096 | | /// Returns true when the span in this match is empty. |
1097 | | /// |
1098 | | /// An empty match can only be returned when the regex itself can match |
1099 | | /// the empty string. |
1100 | | #[inline] |
1101 | 0 | pub fn is_empty(&self) -> bool { |
1102 | 0 | self.span().is_empty() |
1103 | 0 | } |
1104 | | |
1105 | | /// Returns the length of this match. |
1106 | | /// |
1107 | | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
1108 | | #[inline] |
1109 | 0 | pub fn len(&self) -> usize { |
1110 | 0 | self.span().len() |
1111 | 0 | } |
1112 | | } |
1113 | | |
1114 | | /// A set of `PatternID`s. |
1115 | | /// |
1116 | | /// A set of pattern identifiers is useful for recording which patterns have |
1117 | | /// matched a particular haystack. A pattern set _only_ includes pattern |
1118 | | /// identifiers. It does not include offset information. |
1119 | | /// |
1120 | | /// # Example |
1121 | | /// |
1122 | | /// This shows basic usage of a set. |
1123 | | /// |
1124 | | /// ``` |
1125 | | /// use regex_automata::{PatternID, PatternSet}; |
1126 | | /// |
1127 | | /// let pid1 = PatternID::must(5); |
1128 | | /// let pid2 = PatternID::must(8); |
1129 | | /// // Create a new empty set. |
1130 | | /// let mut set = PatternSet::new(10); |
1131 | | /// // Insert pattern IDs. |
1132 | | /// set.insert(pid1); |
1133 | | /// set.insert(pid2); |
1134 | | /// // Test membership. |
1135 | | /// assert!(set.contains(pid1)); |
1136 | | /// assert!(set.contains(pid2)); |
1137 | | /// // Get all members. |
1138 | | /// assert_eq!( |
1139 | | /// vec![5, 8], |
1140 | | /// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), |
1141 | | /// ); |
1142 | | /// // Clear the set. |
1143 | | /// set.clear(); |
1144 | | /// // Test that it is indeed empty. |
1145 | | /// assert!(set.is_empty()); |
1146 | | /// ``` |
1147 | | #[cfg(feature = "alloc")] |
1148 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1149 | | pub struct PatternSet { |
1150 | | /// The number of patterns set to 'true' in this set. |
1151 | | len: usize, |
1152 | | /// A map from PatternID to boolean of whether a pattern matches or not. |
1153 | | /// |
1154 | | /// This should probably be a bitset, but it's probably unlikely to matter |
1155 | | /// much in practice. |
1156 | | /// |
1157 | | /// The main downside of this representation (and similarly for a bitset) |
1158 | | /// is that iteration scales with the capacity of the set instead of |
1159 | | /// the length of the set. This doesn't seem likely to be a problem in |
1160 | | /// practice. |
1161 | | /// |
1162 | | /// Another alternative is to just use a 'SparseSet' for this. It does use |
1163 | | /// more memory (quite a bit more), but that seems fine I think compared |
1164 | | /// to the memory being used by the regex engine. The real hiccup with |
1165 | | /// it is that it yields pattern IDs in the order they were inserted. |
1166 | | /// Which is actually kind of nice, but at the time of writing, pattern |
1167 | | /// IDs are yielded in ascending order in the regex crate RegexSet API. |
1168 | | /// If we did change to 'SparseSet', we could provide an additional |
1169 | | /// 'iter_match_order' iterator, but keep the ascending order one for |
1170 | | /// compatibility. |
1171 | | which: alloc::boxed::Box<[bool]>, |
1172 | | } |
1173 | | |
1174 | | #[cfg(feature = "alloc")] |
1175 | | impl PatternSet { |
1176 | | /// Create a new set of pattern identifiers with the given capacity. |
1177 | | /// |
1178 | | /// The given capacity typically corresponds to (at least) the number of |
1179 | | /// patterns in a compiled regex object. |
1180 | | /// |
1181 | | /// # Panics |
1182 | | /// |
1183 | | /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is |
1184 | | /// impossible if you use the `pattern_len()` method as defined on any of |
1185 | | /// the regex engines in this crate. Namely, a regex will fail to build by |
1186 | | /// returning an error if the number of patterns given to it exceeds the |
1187 | | /// limit. Therefore, the number of patterns in a valid regex is always |
1188 | | /// a correct capacity to provide here. |
1189 | 0 | pub fn new(capacity: usize) -> PatternSet { |
1190 | 0 | assert!( |
1191 | 0 | capacity <= PatternID::LIMIT, |
1192 | 0 | "pattern set capacity exceeds limit of {}", |
1193 | | PatternID::LIMIT, |
1194 | | ); |
1195 | 0 | PatternSet { |
1196 | 0 | len: 0, |
1197 | 0 | which: alloc::vec![false; capacity].into_boxed_slice(), |
1198 | 0 | } |
1199 | 0 | } |
1200 | | |
1201 | | /// Clear this set such that it contains no pattern IDs. |
1202 | 0 | pub fn clear(&mut self) { |
1203 | 0 | self.len = 0; |
1204 | 0 | for matched in self.which.iter_mut() { |
1205 | 0 | *matched = false; |
1206 | 0 | } |
1207 | 0 | } |
1208 | | |
1209 | | /// Return true if and only if the given pattern identifier is in this set. |
1210 | 0 | pub fn contains(&self, pid: PatternID) -> bool { |
1211 | 0 | pid.as_usize() < self.capacity() && self.which[pid] |
1212 | 0 | } |
1213 | | |
1214 | | /// Insert the given pattern identifier into this set and return `true` if |
1215 | | /// the given pattern ID was not previously in this set. |
1216 | | /// |
1217 | | /// If the pattern identifier is already in this set, then this is a no-op. |
1218 | | /// |
1219 | | /// Use [`PatternSet::try_insert`] for a fallible version of this routine. |
1220 | | /// |
1221 | | /// # Panics |
1222 | | /// |
1223 | | /// This panics if this pattern set has insufficient capacity to |
1224 | | /// store the given pattern ID. |
1225 | 0 | pub fn insert(&mut self, pid: PatternID) -> bool { |
1226 | 0 | self.try_insert(pid) |
1227 | 0 | .expect("PatternSet should have sufficient capacity") |
1228 | 0 | } |
1229 | | |
1230 | | /// Insert the given pattern identifier into this set and return `true` if |
1231 | | /// the given pattern ID was not previously in this set. |
1232 | | /// |
1233 | | /// If the pattern identifier is already in this set, then this is a no-op. |
1234 | | /// |
1235 | | /// # Errors |
1236 | | /// |
1237 | | /// This returns an error if this pattern set has insufficient capacity to |
1238 | | /// store the given pattern ID. |
1239 | 0 | pub fn try_insert( |
1240 | 0 | &mut self, |
1241 | 0 | pid: PatternID, |
1242 | 0 | ) -> Result<bool, PatternSetInsertError> { |
1243 | 0 | if pid.as_usize() >= self.capacity() { |
1244 | 0 | return Err(PatternSetInsertError { |
1245 | 0 | attempted: pid, |
1246 | 0 | capacity: self.capacity(), |
1247 | 0 | }); |
1248 | 0 | } |
1249 | 0 | if self.which[pid] { |
1250 | 0 | return Ok(false); |
1251 | 0 | } |
1252 | 0 | self.len += 1; |
1253 | 0 | self.which[pid] = true; |
1254 | 0 | Ok(true) |
1255 | 0 | } |
1256 | | |
1257 | | /* |
1258 | | // This is currently commented out because it is unused and it is unclear |
1259 | | // whether it's useful or not. What's the harm in having it? When, if |
1260 | | // we ever wanted to change our representation to a 'SparseSet', then |
1261 | | // supporting this method would be a bit tricky. So in order to keep some |
1262 | | // API evolution flexibility, we leave it out for now. |
1263 | | |
1264 | | /// Remove the given pattern identifier from this set. |
1265 | | /// |
1266 | | /// If the pattern identifier was not previously in this set, then this |
1267 | | /// does not change the set and returns `false`. |
1268 | | /// |
1269 | | /// # Panics |
1270 | | /// |
1271 | | /// This panics if `pid` exceeds the capacity of this set. |
1272 | | pub fn remove(&mut self, pid: PatternID) -> bool { |
1273 | | if !self.which[pid] { |
1274 | | return false; |
1275 | | } |
1276 | | self.len -= 1; |
1277 | | self.which[pid] = false; |
1278 | | true |
1279 | | } |
1280 | | */ |
1281 | | |
1282 | | /// Return true if and only if this set has no pattern identifiers in it. |
1283 | 0 | pub fn is_empty(&self) -> bool { |
1284 | 0 | self.len() == 0 |
1285 | 0 | } |
1286 | | |
1287 | | /// Return true if and only if this set has the maximum number of pattern |
1288 | | /// identifiers in the set. This occurs precisely when `PatternSet::len() |
1289 | | /// == PatternSet::capacity()`. |
1290 | | /// |
1291 | | /// This particular property is useful to test because it may allow one to |
1292 | | /// stop a search earlier than you might otherwise. Namely, if a search is |
1293 | | /// only reporting which patterns match a haystack and if you know all of |
1294 | | /// the patterns match at a given point, then there's no new information |
1295 | | /// that can be learned by continuing the search. (Because a pattern set |
1296 | | /// does not keep track of offset information.) |
1297 | 0 | pub fn is_full(&self) -> bool { |
1298 | 0 | self.len() == self.capacity() |
1299 | 0 | } |
1300 | | |
1301 | | /// Returns the total number of pattern identifiers in this set. |
1302 | 0 | pub fn len(&self) -> usize { |
1303 | 0 | self.len |
1304 | 0 | } |
1305 | | |
1306 | | /// Returns the total number of pattern identifiers that may be stored |
1307 | | /// in this set. |
1308 | | /// |
1309 | | /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. |
1310 | | /// |
1311 | | /// Typically, the capacity of a pattern set matches the number of patterns |
1312 | | /// in a regex object with which you are searching. |
1313 | 0 | pub fn capacity(&self) -> usize { |
1314 | 0 | self.which.len() |
1315 | 0 | } |
1316 | | |
1317 | | /// Returns an iterator over all pattern identifiers in this set. |
1318 | | /// |
1319 | | /// The iterator yields pattern identifiers in ascending order, starting |
1320 | | /// at zero. |
1321 | 0 | pub fn iter(&self) -> PatternSetIter<'_> { |
1322 | 0 | PatternSetIter { it: self.which.iter().enumerate() } |
1323 | 0 | } |
1324 | | } |
1325 | | |
1326 | | /// An error that occurs when a `PatternID` failed to insert into a |
1327 | | /// `PatternSet`. |
1328 | | /// |
1329 | | /// An insert fails when the given `PatternID` exceeds the configured capacity |
1330 | | /// of the `PatternSet`. |
1331 | | /// |
1332 | | /// This error is created by the [`PatternSet::try_insert`] routine. |
1333 | | #[cfg(feature = "alloc")] |
1334 | | #[derive(Clone, Debug)] |
1335 | | pub struct PatternSetInsertError { |
1336 | | attempted: PatternID, |
1337 | | capacity: usize, |
1338 | | } |
1339 | | |
1340 | | #[cfg(feature = "std")] |
1341 | | impl std::error::Error for PatternSetInsertError {} |
1342 | | |
1343 | | #[cfg(feature = "alloc")] |
1344 | | impl core::fmt::Display for PatternSetInsertError { |
1345 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1346 | 0 | write!( |
1347 | 0 | f, |
1348 | 0 | "failed to insert pattern ID {} into pattern set \ |
1349 | 0 | with insufficiet capacity of {}", |
1350 | 0 | self.attempted.as_usize(), |
1351 | 0 | self.capacity, |
1352 | 0 | ) |
1353 | 0 | } |
1354 | | } |
1355 | | |
1356 | | /// An iterator over all pattern identifiers in a [`PatternSet`]. |
1357 | | /// |
1358 | | /// The lifetime parameter `'a` refers to the lifetime of the pattern set being |
1359 | | /// iterated over. |
1360 | | /// |
1361 | | /// This iterator is created by the [`PatternSet::iter`] method. |
1362 | | #[cfg(feature = "alloc")] |
1363 | | #[derive(Clone, Debug)] |
1364 | | pub struct PatternSetIter<'a> { |
1365 | | it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, |
1366 | | } |
1367 | | |
1368 | | #[cfg(feature = "alloc")] |
1369 | | impl<'a> Iterator for PatternSetIter<'a> { |
1370 | | type Item = PatternID; |
1371 | | |
1372 | 0 | fn next(&mut self) -> Option<PatternID> { |
1373 | 0 | while let Some((index, &yes)) = self.it.next() { |
1374 | 0 | if yes { |
1375 | | // Only valid 'PatternID' values can be inserted into the set |
1376 | | // and construction of the set panics if the capacity would |
1377 | | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
1378 | | // precisely when 'index' corresponds to a valid 'PatternID'. |
1379 | 0 | return Some(PatternID::new_unchecked(index)); |
1380 | 0 | } |
1381 | | } |
1382 | 0 | None |
1383 | 0 | } |
1384 | | |
1385 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
1386 | 0 | self.it.size_hint() |
1387 | 0 | } |
1388 | | } |
1389 | | |
1390 | | #[cfg(feature = "alloc")] |
1391 | | impl<'a> DoubleEndedIterator for PatternSetIter<'a> { |
1392 | 0 | fn next_back(&mut self) -> Option<PatternID> { |
1393 | 0 | while let Some((index, &yes)) = self.it.next_back() { |
1394 | 0 | if yes { |
1395 | | // Only valid 'PatternID' values can be inserted into the set |
1396 | | // and construction of the set panics if the capacity would |
1397 | | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
1398 | | // precisely when 'index' corresponds to a valid 'PatternID'. |
1399 | 0 | return Some(PatternID::new_unchecked(index)); |
1400 | 0 | } |
1401 | | } |
1402 | 0 | None |
1403 | 0 | } |
1404 | | } |
1405 | | |
1406 | | /// The type of anchored search to perform. |
1407 | | /// |
1408 | | /// This is *almost* a boolean option. That is, you can either do an unanchored |
1409 | | /// search for any pattern in a regex, or you can do an anchored search for any |
1410 | | /// pattern in a regex. |
1411 | | /// |
1412 | | /// A third option exists that, assuming the regex engine supports it, permits |
1413 | | /// you to do an anchored search for a specific pattern. |
1414 | | /// |
1415 | | /// Note that there is no way to run an unanchored search for a specific |
1416 | | /// pattern. If you need that, you'll need to build separate regexes for each |
1417 | | /// pattern. |
1418 | | /// |
1419 | | /// # Errors |
1420 | | /// |
1421 | | /// If a regex engine does not support the anchored mode selected, then the |
1422 | | /// regex engine will return an error. While any non-trivial regex engine |
1423 | | /// should support at least one of the available anchored modes, there is no |
1424 | | /// singular mode that is guaranteed to be universally supported. Some regex |
1425 | | /// engines might only support unanchored searches (DFAs compiled without |
1426 | | /// anchored starting states) and some regex engines might only support |
1427 | | /// anchored searches (like the one-pass DFA). |
1428 | | /// |
1429 | | /// The specific error returned is a [`MatchError`] with a |
1430 | | /// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the |
1431 | | /// `Anchored` value given that is unsupported. |
1432 | | /// |
1433 | | /// Note that regex engines should report "no match" if, for example, an |
1434 | | /// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where |
1435 | | /// anchored searches for a specific pattern are supported. This is smooths out |
1436 | | /// behavior such that it's possible to guarantee that an error never occurs |
1437 | | /// based on how the regex engine is configured. All regex engines in this |
1438 | | /// crate report "no match" when searching for an invalid pattern ID, but where |
1439 | | /// searching for a valid pattern ID is otherwise supported. |
1440 | | /// |
1441 | | /// # Example |
1442 | | /// |
1443 | | /// This example shows how to use the various `Anchored` modes to run a |
1444 | | /// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) |
1445 | | /// because it supports all modes unconditionally. Some regex engines, like |
1446 | | /// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored |
1447 | | /// searches. |
1448 | | /// |
1449 | | /// ``` |
1450 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1451 | | /// use regex_automata::{ |
1452 | | /// nfa::thompson::pikevm::PikeVM, |
1453 | | /// Anchored, Input, Match, PatternID, |
1454 | | /// }; |
1455 | | /// |
1456 | | /// let re = PikeVM::new_many(&[ |
1457 | | /// r"Mrs. \w+", |
1458 | | /// r"Miss \w+", |
1459 | | /// r"Mr. \w+", |
1460 | | /// r"Ms. \w+", |
1461 | | /// ])?; |
1462 | | /// let mut cache = re.create_cache(); |
1463 | | /// let hay = "Hello Mr. Springsteen!"; |
1464 | | /// |
1465 | | /// // The default is to do an unanchored search. |
1466 | | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
1467 | | /// // Explicitly ask for an unanchored search. Same as above. |
1468 | | /// let input = Input::new(hay).anchored(Anchored::No); |
1469 | | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
1470 | | /// |
1471 | | /// // Now try an anchored search. Since the match doesn't start at the |
1472 | | /// // beginning of the haystack, no match is found! |
1473 | | /// let input = Input::new(hay).anchored(Anchored::Yes); |
1474 | | /// assert_eq!(None, re.find(&mut cache, input)); |
1475 | | /// |
1476 | | /// // We can try an anchored search again, but move the location of where |
1477 | | /// // we start the search. Note that the offsets reported are still in |
1478 | | /// // terms of the overall haystack and not relative to where we started |
1479 | | /// // the search. |
1480 | | /// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); |
1481 | | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
1482 | | /// |
1483 | | /// // Now try an anchored search for a specific pattern. We specifically |
1484 | | /// // choose a pattern that we know doesn't match to prove that the search |
1485 | | /// // only looks for the pattern we provide. |
1486 | | /// let input = Input::new(hay) |
1487 | | /// .anchored(Anchored::Pattern(PatternID::must(1))) |
1488 | | /// .range(6..); |
1489 | | /// assert_eq!(None, re.find(&mut cache, input)); |
1490 | | /// |
1491 | | /// // But if we switch it to the pattern that we know matches, then we find |
1492 | | /// // the match. |
1493 | | /// let input = Input::new(hay) |
1494 | | /// .anchored(Anchored::Pattern(PatternID::must(2))) |
1495 | | /// .range(6..); |
1496 | | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
1497 | | /// |
1498 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1499 | | /// ``` |
1500 | | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1501 | | pub enum Anchored { |
1502 | | /// Run an unanchored search. This means a match may occur anywhere at or |
1503 | | /// after the start position of the search. |
1504 | | /// |
1505 | | /// This search can return a match for any pattern in the regex. |
1506 | | No, |
1507 | | /// Run an anchored search. This means that a match must begin at the |
1508 | | /// start position of the search. |
1509 | | /// |
1510 | | /// This search can return a match for any pattern in the regex. |
1511 | | Yes, |
1512 | | /// Run an anchored search for a specific pattern. This means that a match |
1513 | | /// must be for the given pattern and must begin at the start position of |
1514 | | /// the search. |
1515 | | Pattern(PatternID), |
1516 | | } |
1517 | | |
1518 | | impl Anchored { |
1519 | | /// Returns true if and only if this anchor mode corresponds to any kind of |
1520 | | /// anchored search. |
1521 | | /// |
1522 | | /// # Example |
1523 | | /// |
1524 | | /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` |
1525 | | /// are considered anchored searches. |
1526 | | /// |
1527 | | /// ``` |
1528 | | /// use regex_automata::{Anchored, PatternID}; |
1529 | | /// |
1530 | | /// assert!(!Anchored::No.is_anchored()); |
1531 | | /// assert!(Anchored::Yes.is_anchored()); |
1532 | | /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); |
1533 | | /// ``` |
1534 | | #[inline] |
1535 | 0 | pub fn is_anchored(&self) -> bool { |
1536 | 0 | matches!(*self, Anchored::Yes | Anchored::Pattern(_)) |
1537 | 0 | } Unexecuted instantiation: <regex_automata::util::search::Anchored>::is_anchored Unexecuted instantiation: <regex_automata::util::search::Anchored>::is_anchored |
1538 | | |
1539 | | /// Returns the pattern ID associated with this configuration if it is an |
1540 | | /// anchored search for a specific pattern. Otherwise `None` is returned. |
1541 | | /// |
1542 | | /// # Example |
1543 | | /// |
1544 | | /// ``` |
1545 | | /// use regex_automata::{Anchored, PatternID}; |
1546 | | /// |
1547 | | /// assert_eq!(None, Anchored::No.pattern()); |
1548 | | /// assert_eq!(None, Anchored::Yes.pattern()); |
1549 | | /// |
1550 | | /// let pid = PatternID::must(5); |
1551 | | /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); |
1552 | | /// ``` |
1553 | | #[inline] |
1554 | 0 | pub fn pattern(&self) -> Option<PatternID> { |
1555 | 0 | match *self { |
1556 | 0 | Anchored::Pattern(pid) => Some(pid), |
1557 | 0 | _ => None, |
1558 | | } |
1559 | 0 | } |
1560 | | } |
1561 | | |
1562 | | /// The kind of match semantics to use for a regex pattern. |
1563 | | /// |
1564 | | /// The default match kind is `LeftmostFirst`, and this corresponds to the |
1565 | | /// match semantics used by most backtracking engines, such as Perl. |
1566 | | /// |
1567 | | /// # Leftmost first or "preference order" match semantics |
1568 | | /// |
1569 | | /// Leftmost-first semantics determine which match to report when there are |
1570 | | /// multiple paths through a regex that match at the same position. The tie is |
1571 | | /// essentially broken by how a backtracker would behave. For example, consider |
1572 | | /// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this |
1573 | | /// case, both the `foofoo` and `foo` branches match at position `0`. So should |
1574 | | /// the end of the match be `3` or `6`? |
1575 | | /// |
1576 | | /// A backtracker will conceptually work by trying `foofoofoo` and failing. |
1577 | | /// Then it will try `foofoo`, find the match and stop there. Thus, the |
1578 | | /// leftmost-first match position is `6`. This is called "leftmost-first" or |
1579 | | /// "preference order" because the order of the branches as written in the |
1580 | | /// regex pattern is what determines how to break the tie. |
1581 | | /// |
1582 | | /// (Note that leftmost-longest match semantics, which break ties by always |
1583 | | /// taking the longest matching string, are not currently supported by this |
1584 | | /// crate. These match semantics tend to be found in POSIX regex engines.) |
1585 | | /// |
1586 | | /// This example shows how leftmost-first semantics work, and how it even |
1587 | | /// applies to multi-pattern regexes: |
1588 | | /// |
1589 | | /// ``` |
1590 | | /// use regex_automata::{ |
1591 | | /// nfa::thompson::pikevm::PikeVM, |
1592 | | /// Match, |
1593 | | /// }; |
1594 | | /// |
1595 | | /// let re = PikeVM::new_many(&[ |
1596 | | /// r"foofoofoo", |
1597 | | /// r"foofoo", |
1598 | | /// r"foo", |
1599 | | /// ])?; |
1600 | | /// let mut cache = re.create_cache(); |
1601 | | /// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect(); |
1602 | | /// let expected = vec![Match::must(1, 0..6)]; |
1603 | | /// assert_eq!(expected, got); |
1604 | | /// |
1605 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1606 | | /// ``` |
1607 | | /// |
1608 | | /// # All matches |
1609 | | /// |
1610 | | /// The `All` match semantics report any and all matches, and generally will |
1611 | | /// attempt to match as much as possible. It doesn't respect any sort of match |
1612 | | /// priority at all, so things like non-greedy matching don't work in this |
1613 | | /// mode. |
1614 | | /// |
1615 | | /// The fact that non-greedy matching doesn't work generally makes most forms |
1616 | | /// of unanchored non-overlapping searches have unintuitive behavior. Namely, |
1617 | | /// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the |
1618 | | /// beginning of the pattern, which is specifically non-greedy. Since it will |
1619 | | /// be treated as greedy in `All` match semantics, this generally means that |
1620 | | /// it will first attempt to consume all of the haystack and is likely to wind |
1621 | | /// up skipping matches. |
1622 | | /// |
1623 | | /// Generally speaking, `All` should only be used in two circumstances: |
1624 | | /// |
1625 | | /// * When running an anchored search and there is a desire to match as much as |
1626 | | /// possible. For example, when building a reverse regex matcher to find the |
1627 | | /// start of a match after finding the end. In this case, the reverse search |
1628 | | /// is anchored to the end of the match found by the forward search. |
1629 | | /// * When running overlapping searches. Since `All` encodes all possible |
1630 | | /// matches, this is generally what you want for an overlapping search. If you |
1631 | | /// try to use leftmost-first in an overlapping search, it is likely to produce |
1632 | | /// counter-intuitive results since leftmost-first specifically excludes some |
1633 | | /// matches from its underlying finite state machine. |
1634 | | /// |
1635 | | /// This example demonstrates the counter-intuitive behavior of `All` semantics |
1636 | | /// when using a standard leftmost unanchored search: |
1637 | | /// |
1638 | | /// ``` |
1639 | | /// use regex_automata::{ |
1640 | | /// nfa::thompson::pikevm::PikeVM, |
1641 | | /// Match, MatchKind, |
1642 | | /// }; |
1643 | | /// |
1644 | | /// let re = PikeVM::builder() |
1645 | | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
1646 | | /// .build("foo")?; |
1647 | | /// let hay = "first foo second foo wat"; |
1648 | | /// let mut cache = re.create_cache(); |
1649 | | /// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); |
1650 | | /// // Notice that it completely skips the first 'foo'! |
1651 | | /// let expected = vec![Match::must(0, 17..20)]; |
1652 | | /// assert_eq!(expected, got); |
1653 | | /// |
1654 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1655 | | /// ``` |
1656 | | /// |
1657 | | /// This second example shows how `All` semantics are useful for an overlapping |
1658 | | /// search. Note that we use lower level lazy DFA APIs here since the NFA |
1659 | | /// engines only currently support a very limited form of overlapping search. |
1660 | | /// |
1661 | | /// ``` |
1662 | | /// use regex_automata::{ |
1663 | | /// hybrid::dfa::{DFA, OverlappingState}, |
1664 | | /// HalfMatch, Input, MatchKind, |
1665 | | /// }; |
1666 | | /// |
1667 | | /// let re = DFA::builder() |
1668 | | /// // If we didn't set 'All' semantics here, then the regex would only |
1669 | | /// // match 'foo' at offset 3 and nothing else. Why? Because the state |
1670 | | /// // machine implements preference order and knows that the 'foofoo' and |
1671 | | /// // 'foofoofoo' branches can never match since 'foo' will always match |
1672 | | /// // when they match and take priority. |
1673 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1674 | | /// .build(r"foo|foofoo|foofoofoo")?; |
1675 | | /// let mut cache = re.create_cache(); |
1676 | | /// let mut state = OverlappingState::start(); |
1677 | | /// let input = Input::new("foofoofoo"); |
1678 | | /// let mut got = vec![]; |
1679 | | /// loop { |
1680 | | /// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; |
1681 | | /// let m = match state.get_match() { |
1682 | | /// None => break, |
1683 | | /// Some(m) => m, |
1684 | | /// }; |
1685 | | /// got.push(m); |
1686 | | /// } |
1687 | | /// let expected = vec![ |
1688 | | /// HalfMatch::must(0, 3), |
1689 | | /// HalfMatch::must(0, 6), |
1690 | | /// HalfMatch::must(0, 9), |
1691 | | /// ]; |
1692 | | /// assert_eq!(expected, got); |
1693 | | /// |
1694 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1695 | | /// ``` |
1696 | | #[non_exhaustive] |
1697 | | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1698 | | pub enum MatchKind { |
1699 | | /// Report all possible matches. |
1700 | | All, |
1701 | | /// Report only the leftmost matches. When multiple leftmost matches exist, |
1702 | | /// report the match corresponding to the part of the regex that appears |
1703 | | /// first in the syntax. |
1704 | | LeftmostFirst, |
1705 | | // There is prior art in RE2 that shows that we should be able to add |
1706 | | // LeftmostLongest too. The tricky part of it is supporting ungreedy |
1707 | | // repetitions. Instead of treating all NFA states as having equivalent |
1708 | | // priority (as in 'All') or treating all NFA states as having distinct |
1709 | | // priority based on order (as in 'LeftmostFirst'), we instead group NFA |
1710 | | // states into sets, and treat members of each set as having equivalent |
1711 | | // priority, but having greater priority than all following members |
1712 | | // of different sets. |
1713 | | // |
1714 | | // However, it's not clear whether it's really worth adding this. After |
1715 | | // all, leftmost-longest can be emulated when using literals by using |
1716 | | // leftmost-first and sorting the literals by length in descending order. |
1717 | | // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will |
1718 | | // always match `a` in `ab` when using leftmost-first, but leftmost-longest |
1719 | | // would match `ab`. |
1720 | | } |
1721 | | |
1722 | | impl MatchKind { |
1723 | | #[cfg(feature = "alloc")] |
1724 | 0 | pub(crate) fn continue_past_first_match(&self) -> bool { |
1725 | 0 | *self == MatchKind::All |
1726 | 0 | } |
1727 | | } |
1728 | | |
1729 | | impl Default for MatchKind { |
1730 | 0 | fn default() -> MatchKind { |
1731 | 0 | MatchKind::LeftmostFirst |
1732 | 0 | } |
1733 | | } |
1734 | | |
1735 | | /// An error indicating that a search stopped before reporting whether a |
1736 | | /// match exists or not. |
1737 | | /// |
1738 | | /// To be very clear, this error type implies that one cannot assume that no |
1739 | | /// matches occur, since the search stopped before completing. That is, if |
1740 | | /// you're looking for information about where a search determined that no |
1741 | | /// match can occur, then this error type does *not* give you that. (Indeed, at |
1742 | | /// the time of writing, if you need such a thing, you have to write your own |
1743 | | /// search routine.) |
1744 | | /// |
1745 | | /// Normally, when one searches for something, the response is either an |
1746 | | /// affirmative "it was found at this location" or a negative "not found at |
1747 | | /// all." However, in some cases, a regex engine can be configured to stop its |
1748 | | /// search before concluding whether a match exists or not. When this happens, |
1749 | | /// it may be important for the caller to know why the regex engine gave up and |
1750 | | /// where in the input it gave up at. This error type exposes the 'why' and the |
1751 | | /// 'where.' |
1752 | | /// |
1753 | | /// For example, the DFAs provided by this library generally cannot correctly |
1754 | | /// implement Unicode word boundaries. Instead, they provide an option to |
1755 | | /// eagerly support them on ASCII text (since Unicode word boundaries are |
1756 | | /// equivalent to ASCII word boundaries when searching ASCII text), but will |
1757 | | /// "give up" if a non-ASCII byte is seen. In such cases, one is usually |
1758 | | /// required to either report the failure to the caller (unergonomic) or |
1759 | | /// otherwise fall back to some other regex engine (ergonomic, but potentially |
1760 | | /// costly). |
1761 | | /// |
1762 | | /// More generally, some regex engines offer the ability for callers to specify |
1763 | | /// certain bytes that will trigger the regex engine to automatically quit if |
1764 | | /// they are seen. |
1765 | | /// |
1766 | | /// Still yet, there may be other reasons for a failed match. For example, |
1767 | | /// the hybrid DFA provided by this crate can be configured to give up if it |
1768 | | /// believes that it is not efficient. This in turn permits callers to choose a |
1769 | | /// different regex engine. |
1770 | | /// |
1771 | | /// (Note that DFAs are configured by default to never quit or give up in this |
1772 | | /// fashion. For example, by default, a DFA will fail to build if the regex |
1773 | | /// pattern contains a Unicode word boundary. One needs to opt into the "quit" |
1774 | | /// behavior via options, like |
1775 | | /// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) |
1776 | | /// |
1777 | | /// There are a couple other ways a search |
1778 | | /// can fail. For example, when using the |
1779 | | /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) |
1780 | | /// with a haystack that is too long, or trying to run an unanchored search |
1781 | | /// with a [one-pass DFA](crate::dfa::onepass). |
1782 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1783 | | pub struct MatchError( |
1784 | | #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>, |
1785 | | #[cfg(not(feature = "alloc"))] MatchErrorKind, |
1786 | | ); |
1787 | | |
1788 | | impl MatchError { |
1789 | | /// Create a new error value with the given kind. |
1790 | | /// |
1791 | | /// This is a more verbose version of the kind-specific constructors, |
1792 | | /// e.g., `MatchError::quit`. |
1793 | 0 | pub fn new(kind: MatchErrorKind) -> MatchError { |
1794 | 0 | #[cfg(feature = "alloc")] |
1795 | 0 | { |
1796 | 0 | MatchError(alloc::boxed::Box::new(kind)) |
1797 | 0 | } |
1798 | 0 | #[cfg(not(feature = "alloc"))] |
1799 | 0 | { |
1800 | 0 | MatchError(kind) |
1801 | 0 | } |
1802 | 0 | } |
1803 | | |
1804 | | /// Returns a reference to the underlying error kind. |
1805 | 0 | pub fn kind(&self) -> &MatchErrorKind { |
1806 | 0 | &self.0 |
1807 | 0 | } |
1808 | | |
1809 | | /// Create a new "quit" error. The given `byte` corresponds to the value |
1810 | | /// that tripped a search's quit condition, and `offset` corresponds to the |
1811 | | /// location in the haystack at which the search quit. |
1812 | | /// |
1813 | | /// This is the same as calling `MatchError::new` with a |
1814 | | /// [`MatchErrorKind::Quit`] kind. |
1815 | 0 | pub fn quit(byte: u8, offset: usize) -> MatchError { |
1816 | 0 | MatchError::new(MatchErrorKind::Quit { byte, offset }) |
1817 | 0 | } |
1818 | | |
1819 | | /// Create a new "gave up" error. The given `offset` corresponds to the |
1820 | | /// location in the haystack at which the search gave up. |
1821 | | /// |
1822 | | /// This is the same as calling `MatchError::new` with a |
1823 | | /// [`MatchErrorKind::GaveUp`] kind. |
1824 | 0 | pub fn gave_up(offset: usize) -> MatchError { |
1825 | 0 | MatchError::new(MatchErrorKind::GaveUp { offset }) |
1826 | 0 | } |
1827 | | |
1828 | | /// Create a new "haystack too long" error. The given `len` corresponds to |
1829 | | /// the length of the haystack that was problematic. |
1830 | | /// |
1831 | | /// This is the same as calling `MatchError::new` with a |
1832 | | /// [`MatchErrorKind::HaystackTooLong`] kind. |
1833 | 0 | pub fn haystack_too_long(len: usize) -> MatchError { |
1834 | 0 | MatchError::new(MatchErrorKind::HaystackTooLong { len }) |
1835 | 0 | } |
1836 | | |
1837 | | /// Create a new "unsupported anchored" error. This occurs when the caller |
1838 | | /// requests a search with an anchor mode that is not supported by the |
1839 | | /// regex engine. |
1840 | | /// |
1841 | | /// This is the same as calling `MatchError::new` with a |
1842 | | /// [`MatchErrorKind::UnsupportedAnchored`] kind. |
1843 | 0 | pub fn unsupported_anchored(mode: Anchored) -> MatchError { |
1844 | 0 | MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) |
1845 | 0 | } |
1846 | | } |
1847 | | |
1848 | | /// The underlying kind of a [`MatchError`]. |
1849 | | /// |
1850 | | /// This is a **non-exhaustive** enum. That means new variants may be added in |
1851 | | /// a semver-compatible release. |
1852 | | #[non_exhaustive] |
1853 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1854 | | pub enum MatchErrorKind { |
1855 | | /// The search saw a "quit" byte at which it was instructed to stop |
1856 | | /// searching. |
1857 | | Quit { |
1858 | | /// The "quit" byte that was observed that caused the search to stop. |
1859 | | byte: u8, |
1860 | | /// The offset at which the quit byte was observed. |
1861 | | offset: usize, |
1862 | | }, |
1863 | | /// The search, based on heuristics, determined that it would be better |
1864 | | /// to stop, typically to provide the caller an opportunity to use an |
1865 | | /// alternative regex engine. |
1866 | | /// |
1867 | | /// Currently, the only way for this to occur is via the lazy DFA and |
1868 | | /// only when it is configured to do so (it will not return this error by |
1869 | | /// default). |
1870 | | GaveUp { |
1871 | | /// The offset at which the search stopped. This corresponds to the |
1872 | | /// position immediately following the last byte scanned. |
1873 | | offset: usize, |
1874 | | }, |
1875 | | /// This error occurs if the haystack given to the regex engine was too |
1876 | | /// long to be searched. This occurs, for example, with regex engines |
1877 | | /// like the bounded backtracker that have a configurable fixed amount of |
1878 | | /// capacity that is tied to the length of the haystack. Anything beyond |
1879 | | /// that configured limit will result in an error at search time. |
1880 | | HaystackTooLong { |
1881 | | /// The length of the haystack that exceeded the limit. |
1882 | | len: usize, |
1883 | | }, |
1884 | | /// An error indicating that a particular type of anchored search was |
1885 | | /// requested, but that the regex engine does not support it. |
1886 | | /// |
1887 | | /// Note that this error should not be returned by a regex engine simply |
1888 | | /// because the pattern ID is invalid (i.e., equal to or exceeds the number |
1889 | | /// of patterns in the regex). In that case, the regex engine should report |
1890 | | /// a non-match. |
1891 | | UnsupportedAnchored { |
1892 | | /// The anchored mode given that is unsupported. |
1893 | | mode: Anchored, |
1894 | | }, |
1895 | | } |
1896 | | |
1897 | | #[cfg(feature = "std")] |
1898 | | impl std::error::Error for MatchError {} |
1899 | | |
1900 | | impl core::fmt::Display for MatchError { |
1901 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1902 | 0 | match *self.kind() { |
1903 | 0 | MatchErrorKind::Quit { byte, offset } => write!( |
1904 | 0 | f, |
1905 | 0 | "quit search after observing byte {:?} at offset {}", |
1906 | 0 | DebugByte(byte), |
1907 | 0 | offset, |
1908 | 0 | ), |
1909 | 0 | MatchErrorKind::GaveUp { offset } => { |
1910 | 0 | write!(f, "gave up searching at offset {}", offset) |
1911 | | } |
1912 | 0 | MatchErrorKind::HaystackTooLong { len } => { |
1913 | 0 | write!(f, "haystack of length {} is too long", len) |
1914 | | } |
1915 | | MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { |
1916 | 0 | write!(f, "anchored searches are not supported or enabled") |
1917 | | } |
1918 | | MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { |
1919 | 0 | write!(f, "unanchored searches are not supported or enabled") |
1920 | | } |
1921 | | MatchErrorKind::UnsupportedAnchored { |
1922 | 0 | mode: Anchored::Pattern(pid), |
1923 | 0 | } => { |
1924 | 0 | write!( |
1925 | 0 | f, |
1926 | 0 | "anchored searches for a specific pattern ({}) are \ |
1927 | 0 | not supported or enabled", |
1928 | 0 | pid.as_usize(), |
1929 | 0 | ) |
1930 | | } |
1931 | | } |
1932 | 0 | } |
1933 | | } |
1934 | | |
1935 | | #[cfg(test)] |
1936 | | mod tests { |
1937 | | use super::*; |
1938 | | |
1939 | | // We test that our 'MatchError' type is the size we expect. This isn't an |
1940 | | // API guarantee, but if the size increases, we really want to make sure we |
1941 | | // decide to do that intentionally. So this should be a speed bump. And in |
1942 | | // general, we should not increase the size without a very good reason. |
1943 | | // |
1944 | | // Why? Because low level search APIs return Result<.., MatchError>. When |
1945 | | // MatchError gets bigger, so to does the Result type. |
1946 | | // |
1947 | | // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes |
1948 | | // the importance of keeping a small error type. But without 'alloc', we |
1949 | | // still want things to be small. |
1950 | | #[test] |
1951 | | fn match_error_size() { |
1952 | | let expected_size = if cfg!(feature = "alloc") { |
1953 | | core::mem::size_of::<usize>() |
1954 | | } else { |
1955 | | 2 * core::mem::size_of::<usize>() |
1956 | | }; |
1957 | | assert_eq!(expected_size, core::mem::size_of::<MatchError>()); |
1958 | | } |
1959 | | |
1960 | | // Same as above, but for the underlying match error kind. |
1961 | | #[cfg(target_pointer_width = "64")] |
1962 | | #[test] |
1963 | | fn match_error_kind_size() { |
1964 | | let expected_size = 2 * core::mem::size_of::<usize>(); |
1965 | | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
1966 | | } |
1967 | | |
1968 | | #[cfg(target_pointer_width = "32")] |
1969 | | #[test] |
1970 | | fn match_error_kind_size() { |
1971 | | let expected_size = 3 * core::mem::size_of::<usize>(); |
1972 | | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
1973 | | } |
1974 | | |
1975 | | #[test] |
1976 | | fn incorrect_asref_guard() { |
1977 | | struct Bad(std::cell::Cell<bool>); |
1978 | | |
1979 | | impl AsRef<[u8]> for Bad { |
1980 | | fn as_ref(&self) -> &[u8] { |
1981 | | if self.0.replace(false) { |
1982 | | &[] |
1983 | | } else { |
1984 | | &[0; 1000] |
1985 | | } |
1986 | | } |
1987 | | } |
1988 | | |
1989 | | let bad = Bad(std::cell::Cell::new(true)); |
1990 | | let input = Input::new(&bad); |
1991 | | assert!(input.end() <= input.haystack().len()); |
1992 | | } |
1993 | | } |