/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/hir/literal.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Provides literal extraction from `Hir` expressions. |
3 | | |
4 | | An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a |
5 | | [`Seq`] of [`Literal`]s. |
6 | | |
7 | | The purpose of literal extraction is generally to provide avenues for |
8 | | optimizing regex searches. The main idea is that substring searches can be an |
9 | | order of magnitude faster than a regex search. Therefore, if one can execute |
10 | | a substring search to find candidate match locations and only run the regex |
11 | | search at those locations, then it is possible for huge improvements in |
12 | | performance to be realized. |
13 | | |
14 | | With that said, literal optimizations are generally a black art because even |
15 | | though substring search is generally faster, if the number of candidates |
16 | | produced is high, then it can create a lot of overhead by ping-ponging between |
17 | | the substring search and the regex search. |
18 | | |
19 | | Here are some heuristics that might be used to help increase the chances of |
20 | | effective literal optimizations: |
21 | | |
22 | | * Stick to small [`Seq`]s. If you search for too many literals, it's likely |
23 | | to lead to substring search that is only a little faster than a regex search, |
24 | | and thus the overhead of using literal optimizations in the first place might |
25 | | make things slower overall. |
26 | | * The literals in your [`Seq`] shouldn't be too short. In general, longer is |
27 | | better. A sequence corresponding to single bytes that occur frequently in the |
28 | | haystack, for example, is probably a bad literal optimization because it's |
29 | | likely to produce many false positive candidates. Longer literals are less |
30 | | likely to match, and thus probably produce fewer false positives. |
31 | | * If it's possible to estimate the approximate frequency of each byte according |
32 | | to some pre-computed background distribution, it is possible to compute a score |
33 | | of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider |
34 | | skipping the literal optimization and just use the regex engine. |
35 | | |
36 | | (It should be noted that there are always pathological cases that can make |
37 | | any kind of literal optimization be a net slower result. This is why it |
38 | | might be a good idea to be conservative, or to even provide a means for |
39 | | literal optimizations to be dynamically disabled if they are determined to be |
40 | | ineffective according to some measure.) |
41 | | |
42 | | You're encouraged to explore the methods on [`Seq`], which permit shrinking |
43 | | the size of sequences in a preference-order preserving fashion. |
44 | | |
45 | | Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely, |
46 | | an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types, |
47 | | so it is possible to implement your own extractor. For example, for n-grams |
48 | | or "inner" literals (i.e., not prefix or suffix literals). The `Extractor` |
49 | | is mostly responsible for the case analysis over `Hir` expressions. Much of |
50 | | the "trickier" parts are how to combine literal sequences, and that is all |
51 | | implemented on [`Seq`]. |
52 | | */ |
53 | | |
54 | | use core::{cmp, mem, num::NonZeroUsize}; |
55 | | |
56 | | use alloc::{vec, vec::Vec}; |
57 | | |
58 | | use crate::hir::{self, Hir}; |
59 | | |
60 | | /// Extracts prefix or suffix literal sequences from [`Hir`] expressions. |
61 | | /// |
62 | | /// Literal extraction is based on the following observations: |
63 | | /// |
64 | | /// * Many regexes start with one or a small number of literals. |
65 | | /// * Substring search for literals is often much faster (sometimes by an order |
66 | | /// of magnitude) than a regex search. |
67 | | /// |
68 | | /// Thus, in many cases, one can search for literals to find candidate starting |
69 | | /// locations of a match, and then only run the full regex engine at each such |
70 | | /// location instead of over the full haystack. |
71 | | /// |
72 | | /// The main downside of literal extraction is that it can wind up causing a |
73 | | /// search to be slower overall. For example, if there are many matches or if |
74 | | /// there are many candidates that don't ultimately lead to a match, then a |
75 | | /// lot of overhead will be spent in shuffling back-and-forth between substring |
76 | | /// search and the regex engine. This is the fundamental reason why literal |
77 | | /// optimizations for regex patterns are sometimes considered a "black art." |
78 | | /// |
79 | | /// # Look-around assertions |
80 | | /// |
81 | | /// Literal extraction treats all look-around assertions as-if they match every |
82 | | /// empty string. So for example, the regex `\bquux\b` will yield a sequence |
83 | | /// containing a single exact literal `quux`. However, not all occurrences |
84 | | /// of `quux` correspond to a match of the regex. For example, `\bquux\b` |
85 | | /// does not match `ZquuxZ` anywhere because `quux` does not fall on a word |
86 | | /// boundary. |
87 | | /// |
88 | | /// In effect, if your regex contains look-around assertions, then a match of |
89 | | /// an exact literal does not necessarily mean the regex overall matches. So |
90 | | /// you may still need to run the regex engine in such cases to confirm the |
91 | | /// match. |
92 | | /// |
93 | | /// The precise guarantee you get from a literal sequence is: if every literal |
94 | | /// in the sequence is exact and the original regex contains zero look-around |
95 | | /// assertions, then a preference-order multi-substring search of those |
96 | | /// literals will precisely match a preference-order search of the original |
97 | | /// regex. |
98 | | /// |
99 | | /// # Example |
100 | | /// |
101 | | /// This shows how to extract prefixes: |
102 | | /// |
103 | | /// ``` |
104 | | /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; |
105 | | /// |
106 | | /// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; |
107 | | /// let got = Extractor::new().extract(&hir); |
108 | | /// // All literals returned are "inexact" because none of them reach the |
109 | | /// // match state. |
110 | | /// let expected = Seq::from_iter([ |
111 | | /// Literal::inexact("ax"), |
112 | | /// Literal::inexact("ay"), |
113 | | /// Literal::inexact("az"), |
114 | | /// Literal::inexact("bx"), |
115 | | /// Literal::inexact("by"), |
116 | | /// Literal::inexact("bz"), |
117 | | /// Literal::inexact("cx"), |
118 | | /// Literal::inexact("cy"), |
119 | | /// Literal::inexact("cz"), |
120 | | /// ]); |
121 | | /// assert_eq!(expected, got); |
122 | | /// |
123 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
124 | | /// ``` |
125 | | /// |
126 | | /// This shows how to extract suffixes: |
127 | | /// |
128 | | /// ``` |
129 | | /// use regex_syntax::{ |
130 | | /// hir::literal::{Extractor, ExtractKind, Literal, Seq}, |
131 | | /// parse, |
132 | | /// }; |
133 | | /// |
134 | | /// let hir = parse(r"foo|[A-Z]+bar")?; |
135 | | /// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); |
136 | | /// // Since 'foo' gets to a match state, it is considered exact. But 'bar' |
137 | | /// // does not because of the '[A-Z]+', and thus is marked inexact. |
138 | | /// let expected = Seq::from_iter([ |
139 | | /// Literal::exact("foo"), |
140 | | /// Literal::inexact("bar"), |
141 | | /// ]); |
142 | | /// assert_eq!(expected, got); |
143 | | /// |
144 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
145 | | /// ``` |
146 | | #[derive(Clone, Debug)] |
147 | | pub struct Extractor { |
148 | | kind: ExtractKind, |
149 | | limit_class: usize, |
150 | | limit_repeat: usize, |
151 | | limit_literal_len: usize, |
152 | | limit_total: usize, |
153 | | } |
154 | | |
155 | | impl Extractor { |
156 | | /// Create a new extractor with a default configuration. |
157 | | /// |
158 | | /// The extractor can be optionally configured before calling |
159 | | /// [`Extractor::extract`] to get a literal sequence. |
160 | 0 | pub fn new() -> Extractor { |
161 | 0 | Extractor { |
162 | 0 | kind: ExtractKind::Prefix, |
163 | 0 | limit_class: 10, |
164 | 0 | limit_repeat: 10, |
165 | 0 | limit_literal_len: 100, |
166 | 0 | limit_total: 250, |
167 | 0 | } |
168 | 0 | } |
169 | | |
170 | | /// Execute the extractor and return a sequence of literals. |
171 | 0 | pub fn extract(&self, hir: &Hir) -> Seq { |
172 | | use crate::hir::HirKind::*; |
173 | | |
174 | 0 | match *hir.kind() { |
175 | 0 | Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), |
176 | 0 | Literal(hir::Literal(ref bytes)) => { |
177 | 0 | let mut seq = |
178 | 0 | Seq::singleton(self::Literal::exact(bytes.to_vec())); |
179 | 0 | self.enforce_literal_len(&mut seq); |
180 | 0 | seq |
181 | | } |
182 | 0 | Class(hir::Class::Unicode(ref cls)) => { |
183 | 0 | self.extract_class_unicode(cls) |
184 | | } |
185 | 0 | Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), |
186 | 0 | Repetition(ref rep) => self.extract_repetition(rep), |
187 | 0 | Capture(hir::Capture { ref sub, .. }) => self.extract(sub), |
188 | 0 | Concat(ref hirs) => match self.kind { |
189 | 0 | ExtractKind::Prefix => self.extract_concat(hirs.iter()), |
190 | 0 | ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), |
191 | | }, |
192 | 0 | Alternation(ref hirs) => { |
193 | 0 | // Unlike concat, we always union starting from the beginning, |
194 | 0 | // since the beginning corresponds to the highest preference, |
195 | 0 | // which doesn't change based on forwards vs reverse. |
196 | 0 | self.extract_alternation(hirs.iter()) |
197 | | } |
198 | | } |
199 | 0 | } |
200 | | |
201 | | /// Set the kind of literal sequence to extract from an [`Hir`] expression. |
202 | | /// |
203 | | /// The default is to extract prefixes, but suffixes can be selected |
204 | | /// instead. The contract for prefixes is that every match of the |
205 | | /// corresponding `Hir` must start with one of the literals in the sequence |
206 | | /// returned. Moreover, the _order_ of the sequence returned corresponds to |
207 | | /// the preference order. |
208 | | /// |
209 | | /// Suffixes satisfy a similar contract in that every match of the |
210 | | /// corresponding `Hir` must end with one of the literals in the sequence |
211 | | /// returned. However, there is no guarantee that the literals are in |
212 | | /// preference order. |
213 | | /// |
214 | | /// Remember that a sequence can be infinite. For example, unless the |
215 | | /// limits are configured to be impractically large, attempting to extract |
216 | | /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite |
217 | | /// sequence. Generally speaking, if the sequence returned is infinite, |
218 | | /// then it is presumed to be unwise to do prefix (or suffix) optimizations |
219 | | /// for the pattern. |
220 | 0 | pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor { |
221 | 0 | self.kind = kind; |
222 | 0 | self |
223 | 0 | } |
224 | | |
225 | | /// Configure a limit on the length of the sequence that is permitted for |
226 | | /// a character class. If a character class exceeds this limit, then the |
227 | | /// sequence returned for it is infinite. |
228 | | /// |
229 | | /// This prevents classes like `[A-Z]` or `\pL` from getting turned into |
230 | | /// huge and likely unproductive sequences of literals. |
231 | | /// |
232 | | /// # Example |
233 | | /// |
234 | | /// This example shows how this limit can be lowered to decrease the tolerance |
235 | | /// for character classes being turned into literal sequences. |
236 | | /// |
237 | | /// ``` |
238 | | /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse}; |
239 | | /// |
240 | | /// let hir = parse(r"[0-9]")?; |
241 | | /// |
242 | | /// let got = Extractor::new().extract(&hir); |
243 | | /// let expected = Seq::new([ |
244 | | /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", |
245 | | /// ]); |
246 | | /// assert_eq!(expected, got); |
247 | | /// |
248 | | /// // Now let's shrink the limit and see how that changes things. |
249 | | /// let got = Extractor::new().limit_class(4).extract(&hir); |
250 | | /// let expected = Seq::infinite(); |
251 | | /// assert_eq!(expected, got); |
252 | | /// |
253 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
254 | | /// ``` |
255 | 0 | pub fn limit_class(&mut self, limit: usize) -> &mut Extractor { |
256 | 0 | self.limit_class = limit; |
257 | 0 | self |
258 | 0 | } |
259 | | |
260 | | /// Configure a limit on the total number of repetitions that is permitted |
261 | | /// before literal extraction is stopped. |
262 | | /// |
263 | | /// This is useful for limiting things like `(abcde){50}`, or more |
264 | | /// insidiously, `(?:){1000000000}`. This limit prevents any one single |
265 | | /// repetition from adding too much to a literal sequence. |
266 | | /// |
267 | | /// With this limit set, repetitions that exceed it will be stopped and any |
268 | | /// literals extracted up to that point will be made inexact. |
269 | | /// |
270 | | /// # Example |
271 | | /// |
272 | | /// This shows how to decrease the limit and compares it with the default. |
273 | | /// |
274 | | /// ``` |
275 | | /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; |
276 | | /// |
277 | | /// let hir = parse(r"(abc){8}")?; |
278 | | /// |
279 | | /// let got = Extractor::new().extract(&hir); |
280 | | /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); |
281 | | /// assert_eq!(expected, got); |
282 | | /// |
283 | | /// // Now let's shrink the limit and see how that changes things. |
284 | | /// let got = Extractor::new().limit_repeat(4).extract(&hir); |
285 | | /// let expected = Seq::from_iter([ |
286 | | /// Literal::inexact("abcabcabcabc"), |
287 | | /// ]); |
288 | | /// assert_eq!(expected, got); |
289 | | /// |
290 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
291 | | /// ``` |
292 | 0 | pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor { |
293 | 0 | self.limit_repeat = limit; |
294 | 0 | self |
295 | 0 | } |
296 | | |
297 | | /// Configure a limit on the maximum length of any literal in a sequence. |
298 | | /// |
299 | | /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While |
300 | | /// each repetition or literal in that regex is small, when all the |
301 | | /// repetitions are applied, one ends up with a literal of length `5^4 = |
302 | | /// 625`. |
303 | | /// |
304 | | /// With this limit set, literals that exceed it will be made inexact and |
305 | | /// thus prevented from growing. |
306 | | /// |
307 | | /// # Example |
308 | | /// |
309 | | /// This shows how to decrease the limit and compares it with the default. |
310 | | /// |
311 | | /// ``` |
312 | | /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; |
313 | | /// |
314 | | /// let hir = parse(r"(abc){2}{2}{2}")?; |
315 | | /// |
316 | | /// let got = Extractor::new().extract(&hir); |
317 | | /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); |
318 | | /// assert_eq!(expected, got); |
319 | | /// |
320 | | /// // Now let's shrink the limit and see how that changes things. |
321 | | /// let got = Extractor::new().limit_literal_len(14).extract(&hir); |
322 | | /// let expected = Seq::from_iter([ |
323 | | /// Literal::inexact("abcabcabcabcab"), |
324 | | /// ]); |
325 | | /// assert_eq!(expected, got); |
326 | | /// |
327 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
328 | | /// ``` |
329 | 0 | pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor { |
330 | 0 | self.limit_literal_len = limit; |
331 | 0 | self |
332 | 0 | } |
333 | | |
334 | | /// Configure a limit on the total number of literals that will be |
335 | | /// returned. |
336 | | /// |
337 | | /// This is useful as a practical measure for avoiding the creation of |
338 | | /// large sequences of literals. While the extractor will automatically |
339 | | /// handle local creations of large sequences (for example, `[A-Z]` yields |
340 | | /// an infinite sequence by default), large sequences can be created |
341 | | /// through non-local means as well. |
342 | | /// |
343 | | /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9` |
344 | | /// despite each of the repetitions being small on their own. This limit |
345 | | /// thus represents a "catch all" for avoiding locally small sequences from |
346 | | /// combining into large sequences. |
347 | | /// |
348 | | /// # Example |
349 | | /// |
350 | | /// This example shows how reducing the limit will change the literal |
351 | | /// sequence returned. |
352 | | /// |
353 | | /// ``` |
354 | | /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; |
355 | | /// |
356 | | /// let hir = parse(r"[ab]{2}{2}")?; |
357 | | /// |
358 | | /// let got = Extractor::new().extract(&hir); |
359 | | /// let expected = Seq::new([ |
360 | | /// "aaaa", "aaab", "aaba", "aabb", |
361 | | /// "abaa", "abab", "abba", "abbb", |
362 | | /// "baaa", "baab", "baba", "babb", |
363 | | /// "bbaa", "bbab", "bbba", "bbbb", |
364 | | /// ]); |
365 | | /// assert_eq!(expected, got); |
366 | | /// |
367 | | /// // The default limit is not too big, but big enough to extract all |
368 | | /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16, |
369 | | /// // then we'll get a truncated set. Notice that it returns a sequence of |
370 | | /// // length 4 even though our limit was 10. This is because the sequence |
371 | | /// // is difficult to increase without blowing the limit. Notice also |
372 | | /// // that every literal in the sequence is now inexact because they were |
373 | | /// // stripped of some suffix. |
374 | | /// let got = Extractor::new().limit_total(10).extract(&hir); |
375 | | /// let expected = Seq::from_iter([ |
376 | | /// Literal::inexact("aa"), |
377 | | /// Literal::inexact("ab"), |
378 | | /// Literal::inexact("ba"), |
379 | | /// Literal::inexact("bb"), |
380 | | /// ]); |
381 | | /// assert_eq!(expected, got); |
382 | | /// |
383 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
384 | | /// ``` |
385 | 0 | pub fn limit_total(&mut self, limit: usize) -> &mut Extractor { |
386 | 0 | self.limit_total = limit; |
387 | 0 | self |
388 | 0 | } |
389 | | |
390 | | /// Extract a sequence from the given concatenation. Sequences from each of |
391 | | /// the child HIR expressions are combined via cross product. |
392 | | /// |
393 | | /// This short circuits once the cross product turns into a sequence |
394 | | /// containing only inexact literals. |
395 | 0 | fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq { |
396 | 0 | let mut seq = Seq::singleton(self::Literal::exact(vec![])); |
397 | 0 | for hir in it { |
398 | | // If every element in the sequence is inexact, then a cross |
399 | | // product will always be a no-op. Thus, there is nothing else we |
400 | | // can add to it and can quit early. Note that this also includes |
401 | | // infinite sequences. |
402 | 0 | if seq.is_inexact() { |
403 | 0 | break; |
404 | 0 | } |
405 | 0 | // Note that 'cross' also dispatches based on whether we're |
406 | 0 | // extracting prefixes or suffixes. |
407 | 0 | seq = self.cross(seq, &mut self.extract(hir)); |
408 | | } |
409 | 0 | seq |
410 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Extractor>::extract_concat::<core::slice::iter::Iter<regex_syntax::hir::Hir>> Unexecuted instantiation: <regex_syntax::hir::literal::Extractor>::extract_concat::<core::iter::adapters::rev::Rev<core::slice::iter::Iter<regex_syntax::hir::Hir>>> |
411 | | |
412 | | /// Extract a sequence from the given alternation. |
413 | | /// |
414 | | /// This short circuits once the union turns into an infinite sequence. |
415 | 0 | fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>( |
416 | 0 | &self, |
417 | 0 | it: I, |
418 | 0 | ) -> Seq { |
419 | 0 | let mut seq = Seq::empty(); |
420 | 0 | for hir in it { |
421 | | // Once our 'seq' is infinite, every subsequent union |
422 | | // operation on it will itself always result in an |
423 | | // infinite sequence. Thus, it can never change and we can |
424 | | // short-circuit. |
425 | 0 | if !seq.is_finite() { |
426 | 0 | break; |
427 | 0 | } |
428 | 0 | seq = self.union(seq, &mut self.extract(hir)); |
429 | | } |
430 | 0 | seq |
431 | 0 | } |
432 | | |
433 | | /// Extract a sequence of literals from the given repetition. We do our |
434 | | /// best. Some examples: |
435 | | /// |
436 | | /// 'a*' => [inexact(a), exact("")] |
437 | | /// 'a*?' => [exact(""), inexact(a)] |
438 | | /// 'a+' => [inexact(a)] |
439 | | /// 'a{3}' => [exact(aaa)] |
440 | | /// 'a{3,5} => [inexact(aaa)] |
441 | | /// |
442 | | /// The key here really is making sure we get the 'inexact' vs 'exact' |
443 | | /// attributes correct on each of the literals we add. For example, the |
444 | | /// fact that 'a*' gives us an inexact 'a' and an exact empty string means |
445 | | /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] |
446 | | /// literals being extracted, which might actually be a better prefilter |
447 | | /// than just 'a'. |
448 | 0 | fn extract_repetition(&self, rep: &hir::Repetition) -> Seq { |
449 | 0 | let mut subseq = self.extract(&rep.sub); |
450 | 0 | match *rep { |
451 | 0 | hir::Repetition { min: 0, max, greedy, .. } => { |
452 | 0 | // When 'max=1', we can retain exactness, since 'a?' is |
453 | 0 | // equivalent to 'a|'. Similarly below, 'a??' is equivalent to |
454 | 0 | // '|a'. |
455 | 0 | if max != Some(1) { |
456 | 0 | subseq.make_inexact(); |
457 | 0 | } |
458 | 0 | let mut empty = Seq::singleton(Literal::exact(vec![])); |
459 | 0 | if !greedy { |
460 | 0 | mem::swap(&mut subseq, &mut empty); |
461 | 0 | } |
462 | 0 | self.union(subseq, &mut empty) |
463 | | } |
464 | 0 | hir::Repetition { min, max: Some(max), .. } if min == max => { |
465 | 0 | assert!(min > 0); // handled above |
466 | 0 | let limit = |
467 | 0 | u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); |
468 | 0 | let mut seq = Seq::singleton(Literal::exact(vec![])); |
469 | 0 | for _ in 0..cmp::min(min, limit) { |
470 | 0 | if seq.is_inexact() { |
471 | 0 | break; |
472 | 0 | } |
473 | 0 | seq = self.cross(seq, &mut subseq.clone()); |
474 | | } |
475 | 0 | if usize::try_from(min).is_err() || min > limit { |
476 | 0 | seq.make_inexact(); |
477 | 0 | } |
478 | 0 | seq |
479 | | } |
480 | 0 | hir::Repetition { min, .. } => { |
481 | 0 | assert!(min > 0); // handled above |
482 | 0 | let limit = |
483 | 0 | u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); |
484 | 0 | let mut seq = Seq::singleton(Literal::exact(vec![])); |
485 | 0 | for _ in 0..cmp::min(min, limit) { |
486 | 0 | if seq.is_inexact() { |
487 | 0 | break; |
488 | 0 | } |
489 | 0 | seq = self.cross(seq, &mut subseq.clone()); |
490 | | } |
491 | 0 | seq.make_inexact(); |
492 | 0 | seq |
493 | | } |
494 | | } |
495 | 0 | } |
496 | | |
497 | | /// Convert the given Unicode class into a sequence of literals if the |
498 | | /// class is small enough. If the class is too big, return an infinite |
499 | | /// sequence. |
500 | 0 | fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq { |
501 | 0 | if self.class_over_limit_unicode(cls) { |
502 | 0 | return Seq::infinite(); |
503 | 0 | } |
504 | 0 | let mut seq = Seq::empty(); |
505 | 0 | for r in cls.iter() { |
506 | 0 | for ch in r.start()..=r.end() { |
507 | 0 | seq.push(Literal::from(ch)); |
508 | 0 | } |
509 | | } |
510 | 0 | self.enforce_literal_len(&mut seq); |
511 | 0 | seq |
512 | 0 | } |
513 | | |
514 | | /// Convert the given byte class into a sequence of literals if the class |
515 | | /// is small enough. If the class is too big, return an infinite sequence. |
516 | 0 | fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq { |
517 | 0 | if self.class_over_limit_bytes(cls) { |
518 | 0 | return Seq::infinite(); |
519 | 0 | } |
520 | 0 | let mut seq = Seq::empty(); |
521 | 0 | for r in cls.iter() { |
522 | 0 | for b in r.start()..=r.end() { |
523 | 0 | seq.push(Literal::from(b)); |
524 | 0 | } |
525 | | } |
526 | 0 | self.enforce_literal_len(&mut seq); |
527 | 0 | seq |
528 | 0 | } |
529 | | |
530 | | /// Returns true if the given Unicode class exceeds the configured limits |
531 | | /// on this extractor. |
532 | 0 | fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { |
533 | 0 | let mut count = 0; |
534 | 0 | for r in cls.iter() { |
535 | 0 | if count > self.limit_class { |
536 | 0 | return true; |
537 | 0 | } |
538 | 0 | count += r.len(); |
539 | | } |
540 | 0 | count > self.limit_class |
541 | 0 | } |
542 | | |
543 | | /// Returns true if the given byte class exceeds the configured limits on |
544 | | /// this extractor. |
545 | 0 | fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { |
546 | 0 | let mut count = 0; |
547 | 0 | for r in cls.iter() { |
548 | 0 | if count > self.limit_class { |
549 | 0 | return true; |
550 | 0 | } |
551 | 0 | count += r.len(); |
552 | | } |
553 | 0 | count > self.limit_class |
554 | 0 | } |
555 | | |
556 | | /// Compute the cross product of the two sequences if the result would be |
557 | | /// within configured limits. Otherwise, make `seq2` infinite and cross the |
558 | | /// infinite sequence with `seq1`. |
559 | 0 | fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { |
560 | 0 | if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total) |
561 | 0 | { |
562 | 0 | seq2.make_infinite(); |
563 | 0 | } |
564 | 0 | if let ExtractKind::Suffix = self.kind { |
565 | 0 | seq1.cross_reverse(seq2); |
566 | 0 | } else { |
567 | 0 | seq1.cross_forward(seq2); |
568 | 0 | } |
569 | 0 | assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); |
570 | 0 | self.enforce_literal_len(&mut seq1); |
571 | 0 | seq1 |
572 | 0 | } |
573 | | |
574 | | /// Union the two sequences if the result would be within configured |
575 | | /// limits. Otherwise, make `seq2` infinite and union the infinite sequence |
576 | | /// with `seq1`. |
577 | 0 | fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { |
578 | 0 | if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) |
579 | | { |
580 | | // We try to trim our literal sequences to see if we can make |
581 | | // room for more literals. The idea is that we'd rather trim down |
582 | | // literals already in our sequence if it means we can add a few |
583 | | // more and retain a finite sequence. Otherwise, we'll union with |
584 | | // an infinite sequence and that infects everything and effectively |
585 | | // stops literal extraction in its tracks. |
586 | | // |
587 | | // Why do we keep 4 bytes here? Well, it's a bit of an abstraction |
588 | | // leakage. Downstream, the literals may wind up getting fed to |
589 | | // the Teddy algorithm, which supports searching literals up to |
590 | | // length 4. So that's why we pick that number here. Arguably this |
591 | | // should be a tuneable parameter, but it seems a little tricky to |
592 | | // describe. And I'm still unsure if this is the right way to go |
593 | | // about culling literal sequences. |
594 | 0 | match self.kind { |
595 | 0 | ExtractKind::Prefix => { |
596 | 0 | seq1.keep_first_bytes(4); |
597 | 0 | seq2.keep_first_bytes(4); |
598 | 0 | } |
599 | 0 | ExtractKind::Suffix => { |
600 | 0 | seq1.keep_last_bytes(4); |
601 | 0 | seq2.keep_last_bytes(4); |
602 | 0 | } |
603 | | } |
604 | 0 | seq1.dedup(); |
605 | 0 | seq2.dedup(); |
606 | 0 | if seq1 |
607 | 0 | .max_union_len(seq2) |
608 | 0 | .map_or(false, |len| len > self.limit_total) |
609 | 0 | { |
610 | 0 | seq2.make_infinite(); |
611 | 0 | } |
612 | 0 | } |
613 | 0 | seq1.union(seq2); |
614 | 0 | assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); |
615 | 0 | seq1 |
616 | 0 | } |
617 | | |
618 | | /// Applies the literal length limit to the given sequence. If none of the |
619 | | /// literals in the sequence exceed the limit, then this is a no-op. |
620 | 0 | fn enforce_literal_len(&self, seq: &mut Seq) { |
621 | 0 | let len = self.limit_literal_len; |
622 | 0 | match self.kind { |
623 | 0 | ExtractKind::Prefix => seq.keep_first_bytes(len), |
624 | 0 | ExtractKind::Suffix => seq.keep_last_bytes(len), |
625 | | } |
626 | 0 | } |
627 | | } |
628 | | |
629 | | impl Default for Extractor { |
630 | 0 | fn default() -> Extractor { |
631 | 0 | Extractor::new() |
632 | 0 | } |
633 | | } |
634 | | |
635 | | /// The kind of literals to extract from an [`Hir`] expression. |
636 | | /// |
637 | | /// The default extraction kind is `Prefix`. |
638 | | #[non_exhaustive] |
639 | | #[derive(Clone, Debug)] |
640 | | pub enum ExtractKind { |
641 | | /// Extracts only prefix literals from a regex. |
642 | | Prefix, |
643 | | /// Extracts only suffix literals from a regex. |
644 | | /// |
645 | | /// Note that the sequence returned by suffix literals currently may |
646 | | /// not correctly represent leftmost-first or "preference" order match |
647 | | /// semantics. |
648 | | Suffix, |
649 | | } |
650 | | |
651 | | impl ExtractKind { |
652 | | /// Returns true if this kind is the `Prefix` variant. |
653 | 0 | pub fn is_prefix(&self) -> bool { |
654 | 0 | matches!(*self, ExtractKind::Prefix) |
655 | 0 | } |
656 | | |
657 | | /// Returns true if this kind is the `Suffix` variant. |
658 | 0 | pub fn is_suffix(&self) -> bool { |
659 | 0 | matches!(*self, ExtractKind::Suffix) |
660 | 0 | } |
661 | | } |
662 | | |
663 | | impl Default for ExtractKind { |
664 | 0 | fn default() -> ExtractKind { |
665 | 0 | ExtractKind::Prefix |
666 | 0 | } |
667 | | } |
668 | | |
669 | | /// A sequence of literals. |
670 | | /// |
671 | | /// A `Seq` is very much like a set in that it represents a union of its |
672 | | /// members. That is, it corresponds to a set of literals where at least one |
673 | | /// must match in order for a particular [`Hir`] expression to match. (Whether |
674 | | /// this corresponds to the entire `Hir` expression, a prefix of it or a suffix |
675 | | /// of it depends on how the `Seq` was extracted from the `Hir`.) |
676 | | /// |
677 | | /// It is also unlike a set in that multiple identical literals may appear, |
678 | | /// and that the order of the literals in the `Seq` matters. For example, if |
679 | | /// the sequence is `[sam, samwise]` and leftmost-first matching is used, then |
680 | | /// `samwise` can never match and the sequence is equivalent to `[sam]`. |
681 | | /// |
682 | | /// # States of a sequence |
683 | | /// |
684 | | /// A `Seq` has a few different logical states to consider: |
685 | | /// |
686 | | /// * The sequence can represent "any" literal. When this happens, the set does |
687 | | /// not have a finite size. The purpose of this state is to inhibit callers |
688 | | /// from making assumptions about what literals are required in order to match |
689 | | /// a particular [`Hir`] expression. Generally speaking, when a set is in this |
690 | | /// state, literal optimizations are inhibited. A good example of a regex that |
691 | | /// will cause this sort of set to appear is `[A-Za-z]`. The character class |
692 | | /// is just too big (and also too narrow) to be usefully expanded into 52 |
693 | | /// different literals. (Note that the decision for when a seq should become |
694 | | /// infinite is determined by the caller. A seq itself has no hard-coded |
695 | | /// limits.) |
696 | | /// * The sequence can be empty, in which case, it is an affirmative statement |
697 | | /// that there are no literals that can match the corresponding `Hir`. |
698 | | /// Consequently, the `Hir` never matches any input. For example, `[a&&b]`. |
699 | | /// * The sequence can be non-empty, in which case, at least one of the |
700 | | /// literals must match in order for the corresponding `Hir` to match. |
701 | | /// |
702 | | /// # Example |
703 | | /// |
704 | | /// This example shows how literal sequences can be simplified by stripping |
705 | | /// suffixes and minimizing while maintaining preference order. |
706 | | /// |
707 | | /// ``` |
708 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
709 | | /// |
710 | | /// let mut seq = Seq::new(&[ |
711 | | /// "farm", |
712 | | /// "appliance", |
713 | | /// "faraway", |
714 | | /// "apple", |
715 | | /// "fare", |
716 | | /// "gap", |
717 | | /// "applicant", |
718 | | /// "applaud", |
719 | | /// ]); |
720 | | /// seq.keep_first_bytes(3); |
721 | | /// seq.minimize_by_preference(); |
722 | | /// // Notice that 'far' comes before 'app', which matches the order in the |
723 | | /// // original sequence. This guarantees that leftmost-first semantics are |
724 | | /// // not altered by simplifying the set. |
725 | | /// let expected = Seq::from_iter([ |
726 | | /// Literal::inexact("far"), |
727 | | /// Literal::inexact("app"), |
728 | | /// Literal::exact("gap"), |
729 | | /// ]); |
730 | | /// assert_eq!(expected, seq); |
731 | | /// ``` |
#[derive(Clone, Eq, PartialEq)]
pub struct Seq {
    /// The members of this seq.
    ///
    /// When `None`, the seq represents all possible literals. That is, it
    /// prevents one from making assumptions about specific literals in the
    /// seq, and forces one to treat it as if any literal might be in the seq.
    /// An infinite seq is created via [`Seq::infinite`] or
    /// [`Seq::make_infinite`].
    ///
    /// Note that `Some(vec![])` is valid and corresponds to the empty seq of
    /// literals, i.e., a regex that can never match. For example, `[a&&b]`.
    /// It is distinct from `Some(vec![""])`, which corresponds to the seq
    /// containing an empty string, which matches at every position.
    literals: Option<Vec<Literal>>,
}
746 | | |
747 | | impl Seq { |
748 | | /// Returns an empty sequence. |
749 | | /// |
750 | | /// An empty sequence matches zero literals, and thus corresponds to a |
751 | | /// regex that itself can never match. |
752 | | #[inline] |
753 | 0 | pub fn empty() -> Seq { |
754 | 0 | Seq { literals: Some(vec![]) } |
755 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::empty Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::empty |
756 | | |
757 | | /// Returns a sequence of literals without a finite size and may contain |
758 | | /// any literal. |
759 | | /// |
760 | | /// A sequence without finite size does not reveal anything about the |
761 | | /// characteristics of the literals in its set. There are no fixed prefixes |
762 | | /// or suffixes, nor are lower or upper bounds on the length of the literals |
763 | | /// in the set known. |
764 | | /// |
765 | | /// This is useful to represent constructs in a regex that are "too big" |
    /// to usefully represent as a sequence of literals. For example, `[A-Za-z]`.
767 | | /// When sequences get too big, they lose their discriminating nature and |
768 | | /// are more likely to produce false positives, which in turn makes them |
769 | | /// less likely to speed up searches. |
770 | | /// |
771 | | /// More pragmatically, for many regexes, enumerating all possible literals |
772 | | /// is itself not possible or might otherwise use too many resources. So |
773 | | /// constraining the size of sets during extraction is a practical trade |
774 | | /// off to make. |
775 | | #[inline] |
776 | 0 | pub fn infinite() -> Seq { |
777 | 0 | Seq { literals: None } |
778 | 0 | } |
779 | | |
780 | | /// Returns a sequence containing a single literal. |
781 | | #[inline] |
782 | 0 | pub fn singleton(lit: Literal) -> Seq { |
783 | 0 | Seq { literals: Some(vec![lit]) } |
784 | 0 | } |
785 | | |
786 | | /// Returns a sequence of exact literals from the given byte strings. |
787 | | #[inline] |
788 | 0 | pub fn new<I, B>(it: I) -> Seq |
789 | 0 | where |
790 | 0 | I: IntoIterator<Item = B>, |
791 | 0 | B: AsRef<[u8]>, |
792 | 0 | { |
793 | 0 | it.into_iter().map(|b| Literal::exact(b.as_ref())).collect() |
794 | 0 | } |
795 | | |
796 | | /// If this is a finite sequence, return its members as a slice of |
797 | | /// literals. |
798 | | /// |
799 | | /// The slice returned may be empty, in which case, there are no literals |
800 | | /// that can match this sequence. |
801 | | #[inline] |
802 | 0 | pub fn literals(&self) -> Option<&[Literal]> { |
803 | 0 | self.literals.as_deref() |
804 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::literals Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::literals |
805 | | |
806 | | /// Push a literal to the end of this sequence. |
807 | | /// |
808 | | /// If this sequence is not finite, then this is a no-op. |
809 | | /// |
810 | | /// Similarly, if the most recently added item of this sequence is |
811 | | /// equivalent to the literal given, then it is not added. This reflects |
812 | | /// a `Seq`'s "set like" behavior, and represents a practical trade off. |
813 | | /// Namely, there is never any need to have two adjacent and equivalent |
814 | | /// literals in the same sequence, _and_ it is easy to detect in some |
815 | | /// cases. |
816 | | #[inline] |
817 | 0 | pub fn push(&mut self, lit: Literal) { |
818 | 0 | let lits = match self.literals { |
819 | 0 | None => return, |
820 | 0 | Some(ref mut lits) => lits, |
821 | 0 | }; |
822 | 0 | if lits.last().map_or(false, |m| m == &lit) { |
823 | 0 | return; |
824 | 0 | } |
825 | 0 | lits.push(lit); |
826 | 0 | } |
827 | | |
828 | | /// Make all of the literals in this sequence inexact. |
829 | | /// |
830 | | /// This is a no-op if this sequence is not finite. |
831 | | #[inline] |
832 | 0 | pub fn make_inexact(&mut self) { |
833 | 0 | let lits = match self.literals { |
834 | 0 | None => return, |
835 | 0 | Some(ref mut lits) => lits, |
836 | | }; |
837 | 0 | for lit in lits.iter_mut() { |
838 | 0 | lit.make_inexact(); |
839 | 0 | } |
840 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::make_inexact Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::make_inexact |
841 | | |
842 | | /// Converts this sequence to an infinite sequence. |
843 | | /// |
844 | | /// This is a no-op if the sequence is already infinite. |
845 | | #[inline] |
846 | 0 | pub fn make_infinite(&mut self) { |
847 | 0 | self.literals = None; |
848 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::make_infinite Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::make_infinite |
849 | | |
850 | | /// Modify this sequence to contain the cross product between it and the |
851 | | /// sequence given. |
852 | | /// |
853 | | /// The cross product only considers literals in this sequence that are |
854 | | /// exact. That is, inexact literals are not extended. |
855 | | /// |
856 | | /// The literals are always drained from `other`, even if none are used. |
857 | | /// This permits callers to reuse the sequence allocation elsewhere. |
858 | | /// |
859 | | /// If this sequence is infinite, then this is a no-op, regardless of what |
860 | | /// `other` contains (and in this case, the literals are still drained from |
861 | | /// `other`). If `other` is infinite and this sequence is finite, then this |
862 | | /// is a no-op, unless this sequence contains a zero-length literal. In |
863 | | /// which case, the infiniteness of `other` infects this sequence, and this |
864 | | /// sequence is itself made infinite. |
865 | | /// |
866 | | /// Like [`Seq::union`], this may attempt to deduplicate literals. See |
867 | | /// [`Seq::dedup`] for how deduplication deals with exact and inexact |
868 | | /// literals. |
869 | | /// |
870 | | /// # Example |
871 | | /// |
872 | | /// This example shows basic usage and how exact and inexact literals |
873 | | /// interact. |
874 | | /// |
875 | | /// ``` |
876 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
877 | | /// |
878 | | /// let mut seq1 = Seq::from_iter([ |
879 | | /// Literal::exact("foo"), |
880 | | /// Literal::inexact("bar"), |
881 | | /// ]); |
882 | | /// let mut seq2 = Seq::from_iter([ |
883 | | /// Literal::inexact("quux"), |
884 | | /// Literal::exact("baz"), |
885 | | /// ]); |
886 | | /// seq1.cross_forward(&mut seq2); |
887 | | /// |
888 | | /// // The literals are pulled out of seq2. |
889 | | /// assert_eq!(Some(0), seq2.len()); |
890 | | /// |
891 | | /// let expected = Seq::from_iter([ |
892 | | /// Literal::inexact("fooquux"), |
893 | | /// Literal::exact("foobaz"), |
894 | | /// Literal::inexact("bar"), |
895 | | /// ]); |
896 | | /// assert_eq!(expected, seq1); |
897 | | /// ``` |
898 | | /// |
899 | | /// This example shows the behavior of when `other` is an infinite |
900 | | /// sequence. |
901 | | /// |
902 | | /// ``` |
903 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
904 | | /// |
905 | | /// let mut seq1 = Seq::from_iter([ |
906 | | /// Literal::exact("foo"), |
907 | | /// Literal::inexact("bar"), |
908 | | /// ]); |
909 | | /// let mut seq2 = Seq::infinite(); |
910 | | /// seq1.cross_forward(&mut seq2); |
911 | | /// |
912 | | /// // When seq2 is infinite, cross product doesn't add anything, but |
913 | | /// // ensures all members of seq1 are inexact. |
914 | | /// let expected = Seq::from_iter([ |
915 | | /// Literal::inexact("foo"), |
916 | | /// Literal::inexact("bar"), |
917 | | /// ]); |
918 | | /// assert_eq!(expected, seq1); |
919 | | /// ``` |
920 | | /// |
921 | | /// This example is like the one above, but shows what happens when this |
922 | | /// sequence contains an empty string. In this case, an infinite `other` |
923 | | /// sequence infects this sequence (because the empty string means that |
924 | | /// there are no finite prefixes): |
925 | | /// |
926 | | /// ``` |
927 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
928 | | /// |
929 | | /// let mut seq1 = Seq::from_iter([ |
930 | | /// Literal::exact("foo"), |
931 | | /// Literal::exact(""), // inexact provokes same behavior |
932 | | /// Literal::inexact("bar"), |
933 | | /// ]); |
934 | | /// let mut seq2 = Seq::infinite(); |
935 | | /// seq1.cross_forward(&mut seq2); |
936 | | /// |
937 | | /// // seq1 is now infinite! |
938 | | /// assert!(!seq1.is_finite()); |
939 | | /// ``` |
940 | | /// |
    /// This example shows the behavior when this sequence is infinite.
942 | | /// |
943 | | /// ``` |
944 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
945 | | /// |
946 | | /// let mut seq1 = Seq::infinite(); |
947 | | /// let mut seq2 = Seq::from_iter([ |
948 | | /// Literal::exact("foo"), |
949 | | /// Literal::inexact("bar"), |
950 | | /// ]); |
951 | | /// seq1.cross_forward(&mut seq2); |
952 | | /// |
953 | | /// // seq1 remains unchanged. |
954 | | /// assert!(!seq1.is_finite()); |
955 | | /// // Even though the literals in seq2 weren't used, it was still drained. |
956 | | /// assert_eq!(Some(0), seq2.len()); |
957 | | /// ``` |
958 | | #[inline] |
959 | 0 | pub fn cross_forward(&mut self, other: &mut Seq) { |
960 | 0 | let (lits1, lits2) = match self.cross_preamble(other) { |
961 | 0 | None => return, |
962 | 0 | Some((lits1, lits2)) => (lits1, lits2), |
963 | 0 | }; |
964 | 0 | let newcap = lits1.len().saturating_mul(lits2.len()); |
965 | 0 | for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) { |
966 | 0 | if !selflit.is_exact() { |
967 | 0 | lits1.push(selflit); |
968 | 0 | continue; |
969 | 0 | } |
970 | 0 | for otherlit in lits2.iter() { |
971 | 0 | let mut newlit = Literal::exact(Vec::with_capacity( |
972 | 0 | selflit.len() + otherlit.len(), |
973 | 0 | )); |
974 | 0 | newlit.extend(&selflit); |
975 | 0 | newlit.extend(&otherlit); |
976 | 0 | if !otherlit.is_exact() { |
977 | 0 | newlit.make_inexact(); |
978 | 0 | } |
979 | 0 | lits1.push(newlit); |
980 | | } |
981 | | } |
982 | 0 | lits2.drain(..); |
983 | 0 | self.dedup(); |
984 | 0 | } |
985 | | |
986 | | /// Modify this sequence to contain the cross product between it and |
987 | | /// the sequence given, where the sequences are treated as suffixes |
988 | | /// instead of prefixes. Namely, the sequence `other` is *prepended* |
989 | | /// to `self` (as opposed to `other` being *appended* to `self` in |
990 | | /// [`Seq::cross_forward`]). |
991 | | /// |
992 | | /// The cross product only considers literals in this sequence that are |
993 | | /// exact. That is, inexact literals are not extended. |
994 | | /// |
995 | | /// The literals are always drained from `other`, even if none are used. |
996 | | /// This permits callers to reuse the sequence allocation elsewhere. |
997 | | /// |
998 | | /// If this sequence is infinite, then this is a no-op, regardless of what |
999 | | /// `other` contains (and in this case, the literals are still drained from |
1000 | | /// `other`). If `other` is infinite and this sequence is finite, then this |
1001 | | /// is a no-op, unless this sequence contains a zero-length literal. In |
1002 | | /// which case, the infiniteness of `other` infects this sequence, and this |
1003 | | /// sequence is itself made infinite. |
1004 | | /// |
1005 | | /// Like [`Seq::union`], this may attempt to deduplicate literals. See |
1006 | | /// [`Seq::dedup`] for how deduplication deals with exact and inexact |
1007 | | /// literals. |
1008 | | /// |
1009 | | /// # Example |
1010 | | /// |
1011 | | /// This example shows basic usage and how exact and inexact literals |
1012 | | /// interact. |
1013 | | /// |
1014 | | /// ``` |
1015 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1016 | | /// |
1017 | | /// let mut seq1 = Seq::from_iter([ |
1018 | | /// Literal::exact("foo"), |
1019 | | /// Literal::inexact("bar"), |
1020 | | /// ]); |
1021 | | /// let mut seq2 = Seq::from_iter([ |
1022 | | /// Literal::inexact("quux"), |
1023 | | /// Literal::exact("baz"), |
1024 | | /// ]); |
1025 | | /// seq1.cross_reverse(&mut seq2); |
1026 | | /// |
1027 | | /// // The literals are pulled out of seq2. |
1028 | | /// assert_eq!(Some(0), seq2.len()); |
1029 | | /// |
1030 | | /// let expected = Seq::from_iter([ |
1031 | | /// Literal::inexact("quuxfoo"), |
1032 | | /// Literal::inexact("bar"), |
1033 | | /// Literal::exact("bazfoo"), |
1034 | | /// ]); |
1035 | | /// assert_eq!(expected, seq1); |
1036 | | /// ``` |
1037 | | /// |
1038 | | /// This example shows the behavior of when `other` is an infinite |
1039 | | /// sequence. |
1040 | | /// |
1041 | | /// ``` |
1042 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1043 | | /// |
1044 | | /// let mut seq1 = Seq::from_iter([ |
1045 | | /// Literal::exact("foo"), |
1046 | | /// Literal::inexact("bar"), |
1047 | | /// ]); |
1048 | | /// let mut seq2 = Seq::infinite(); |
1049 | | /// seq1.cross_reverse(&mut seq2); |
1050 | | /// |
1051 | | /// // When seq2 is infinite, cross product doesn't add anything, but |
1052 | | /// // ensures all members of seq1 are inexact. |
1053 | | /// let expected = Seq::from_iter([ |
1054 | | /// Literal::inexact("foo"), |
1055 | | /// Literal::inexact("bar"), |
1056 | | /// ]); |
1057 | | /// assert_eq!(expected, seq1); |
1058 | | /// ``` |
1059 | | /// |
1060 | | /// This example is like the one above, but shows what happens when this |
1061 | | /// sequence contains an empty string. In this case, an infinite `other` |
1062 | | /// sequence infects this sequence (because the empty string means that |
1063 | | /// there are no finite suffixes): |
1064 | | /// |
1065 | | /// ``` |
1066 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1067 | | /// |
1068 | | /// let mut seq1 = Seq::from_iter([ |
1069 | | /// Literal::exact("foo"), |
1070 | | /// Literal::exact(""), // inexact provokes same behavior |
1071 | | /// Literal::inexact("bar"), |
1072 | | /// ]); |
1073 | | /// let mut seq2 = Seq::infinite(); |
1074 | | /// seq1.cross_reverse(&mut seq2); |
1075 | | /// |
1076 | | /// // seq1 is now infinite! |
1077 | | /// assert!(!seq1.is_finite()); |
1078 | | /// ``` |
1079 | | /// |
1080 | | /// This example shows the behavior when this sequence is infinite. |
1081 | | /// |
1082 | | /// ``` |
1083 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1084 | | /// |
1085 | | /// let mut seq1 = Seq::infinite(); |
1086 | | /// let mut seq2 = Seq::from_iter([ |
1087 | | /// Literal::exact("foo"), |
1088 | | /// Literal::inexact("bar"), |
1089 | | /// ]); |
1090 | | /// seq1.cross_reverse(&mut seq2); |
1091 | | /// |
1092 | | /// // seq1 remains unchanged. |
1093 | | /// assert!(!seq1.is_finite()); |
1094 | | /// // Even though the literals in seq2 weren't used, it was still drained. |
1095 | | /// assert_eq!(Some(0), seq2.len()); |
1096 | | /// ``` |
1097 | | #[inline] |
1098 | 0 | pub fn cross_reverse(&mut self, other: &mut Seq) { |
1099 | 0 | let (lits1, lits2) = match self.cross_preamble(other) { |
1100 | 0 | None => return, |
1101 | 0 | Some((lits1, lits2)) => (lits1, lits2), |
1102 | 0 | }; |
1103 | 0 | // We basically proceed as we do in 'cross_forward' at this point, |
1104 | 0 | // except that the outer loop is now 'other' and the inner loop is now |
1105 | 0 | // 'self'. That's because 'self' corresponds to suffixes and 'other' |
1106 | 0 | // corresponds to the sequence we want to *prepend* to the suffixes. |
1107 | 0 | let newcap = lits1.len().saturating_mul(lits2.len()); |
1108 | 0 | let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); |
1109 | 0 | for (i, otherlit) in lits2.drain(..).enumerate() { |
1110 | 0 | for selflit in selflits.iter() { |
1111 | 0 | if !selflit.is_exact() { |
1112 | | // If the suffix isn't exact, then we can't prepend |
1113 | | // anything to it. However, we still want to keep it. But |
1114 | | // we only want to keep one of them, to avoid duplication. |
1115 | | // (The duplication is okay from a correctness perspective, |
1116 | | // but wasteful.) |
1117 | 0 | if i == 0 { |
1118 | 0 | lits1.push(selflit.clone()); |
1119 | 0 | } |
1120 | 0 | continue; |
1121 | 0 | } |
1122 | 0 | let mut newlit = Literal::exact(Vec::with_capacity( |
1123 | 0 | otherlit.len() + selflit.len(), |
1124 | 0 | )); |
1125 | 0 | newlit.extend(&otherlit); |
1126 | 0 | newlit.extend(&selflit); |
1127 | 0 | if !otherlit.is_exact() { |
1128 | 0 | newlit.make_inexact(); |
1129 | 0 | } |
1130 | 0 | lits1.push(newlit); |
1131 | | } |
1132 | | } |
1133 | 0 | self.dedup(); |
1134 | 0 | } |
1135 | | |
    /// A helper function that corresponds to the subtle preamble for both
1137 | | /// `cross_forward` and `cross_reverse`. In effect, it handles the cases |
1138 | | /// of infinite sequences for both `self` and `other`, as well as ensuring |
1139 | | /// that literals from `other` are drained even if they aren't used. |
1140 | 0 | fn cross_preamble<'a>( |
1141 | 0 | &'a mut self, |
1142 | 0 | other: &'a mut Seq, |
1143 | 0 | ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> { |
1144 | 0 | let lits2 = match other.literals { |
1145 | | None => { |
1146 | | // If our current seq contains the empty string and the seq |
1147 | | // we're adding matches any literal, then it follows that the |
1148 | | // current seq must now also match any literal. |
1149 | | // |
1150 | | // Otherwise, we just have to make sure everything in this |
1151 | | // sequence is inexact. |
1152 | 0 | if self.min_literal_len() == Some(0) { |
1153 | 0 | *self = Seq::infinite(); |
1154 | 0 | } else { |
1155 | 0 | self.make_inexact(); |
1156 | 0 | } |
1157 | 0 | return None; |
1158 | | } |
1159 | 0 | Some(ref mut lits) => lits, |
1160 | | }; |
1161 | 0 | let lits1 = match self.literals { |
1162 | | None => { |
1163 | | // If we aren't going to make it to the end of this routine |
1164 | | // where lits2 is drained, then we need to do it now. |
1165 | 0 | lits2.drain(..); |
1166 | 0 | return None; |
1167 | | } |
1168 | 0 | Some(ref mut lits) => lits, |
1169 | 0 | }; |
1170 | 0 | Some((lits1, lits2)) |
1171 | 0 | } |
1172 | | |
1173 | | /// Unions the `other` sequence into this one. |
1174 | | /// |
1175 | | /// The literals are always drained out of the given `other` sequence, |
1176 | | /// even if they are being unioned into an infinite sequence. This permits |
1177 | | /// the caller to reuse the `other` sequence in another context. |
1178 | | /// |
1179 | | /// Some literal deduping may be performed. If any deduping happens, |
1180 | | /// any leftmost-first or "preference" order match semantics will be |
1181 | | /// preserved. |
1182 | | /// |
1183 | | /// # Example |
1184 | | /// |
1185 | | /// This example shows basic usage. |
1186 | | /// |
1187 | | /// ``` |
1188 | | /// use regex_syntax::hir::literal::Seq; |
1189 | | /// |
1190 | | /// let mut seq1 = Seq::new(&["foo", "bar"]); |
1191 | | /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); |
1192 | | /// seq1.union(&mut seq2); |
1193 | | /// |
1194 | | /// // The literals are pulled out of seq2. |
1195 | | /// assert_eq!(Some(0), seq2.len()); |
1196 | | /// |
1197 | | /// // Adjacent literals are deduped, but non-adjacent literals may not be. |
1198 | | /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1); |
1199 | | /// ``` |
1200 | | /// |
1201 | | /// This example shows that literals are drained from `other` even when |
1202 | | /// they aren't necessarily used. |
1203 | | /// |
1204 | | /// ``` |
1205 | | /// use regex_syntax::hir::literal::Seq; |
1206 | | /// |
1207 | | /// let mut seq1 = Seq::infinite(); |
1208 | | /// // Infinite sequences have no finite length. |
1209 | | /// assert_eq!(None, seq1.len()); |
1210 | | /// |
1211 | | /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); |
1212 | | /// seq1.union(&mut seq2); |
1213 | | /// |
1214 | | /// // seq1 is still infinite and seq2 has been drained. |
1215 | | /// assert_eq!(None, seq1.len()); |
1216 | | /// assert_eq!(Some(0), seq2.len()); |
1217 | | /// ``` |
1218 | | #[inline] |
1219 | 0 | pub fn union(&mut self, other: &mut Seq) { |
1220 | 0 | let lits2 = match other.literals { |
1221 | | None => { |
1222 | | // Unioning with an infinite sequence always results in an |
1223 | | // infinite sequence. |
1224 | 0 | self.make_infinite(); |
1225 | 0 | return; |
1226 | | } |
1227 | 0 | Some(ref mut lits) => lits.drain(..), |
1228 | | }; |
1229 | 0 | let lits1 = match self.literals { |
1230 | 0 | None => return, |
1231 | 0 | Some(ref mut lits) => lits, |
1232 | 0 | }; |
1233 | 0 | lits1.extend(lits2); |
1234 | 0 | self.dedup(); |
1235 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::union Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::union |
1236 | | |
    /// Unions the `other` sequence into this one by splicing the `other`
    /// sequence at the position of the first zero-length literal.
1239 | | /// |
1240 | | /// This is useful for preserving preference order semantics when combining |
1241 | | /// two literal sequences. For example, in the regex `(a||f)+foo`, the |
1242 | | /// correct preference order prefix sequence is `[a, foo, f]`. |
1243 | | /// |
1244 | | /// The literals are always drained out of the given `other` sequence, |
1245 | | /// even if they are being unioned into an infinite sequence. This permits |
1246 | | /// the caller to reuse the `other` sequence in another context. Note that |
1247 | | /// the literals are drained even if no union is performed as well, i.e., |
1248 | | /// when this sequence does not contain a zero-length literal. |
1249 | | /// |
1250 | | /// Some literal deduping may be performed. If any deduping happens, |
1251 | | /// any leftmost-first or "preference" order match semantics will be |
1252 | | /// preserved. |
1253 | | /// |
1254 | | /// # Example |
1255 | | /// |
1256 | | /// This example shows basic usage. |
1257 | | /// |
1258 | | /// ``` |
1259 | | /// use regex_syntax::hir::literal::Seq; |
1260 | | /// |
1261 | | /// let mut seq1 = Seq::new(&["a", "", "f", ""]); |
1262 | | /// let mut seq2 = Seq::new(&["foo"]); |
1263 | | /// seq1.union_into_empty(&mut seq2); |
1264 | | /// |
1265 | | /// // The literals are pulled out of seq2. |
1266 | | /// assert_eq!(Some(0), seq2.len()); |
1267 | | /// // 'foo' gets spliced into seq1 where the first empty string occurs. |
1268 | | /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1); |
1269 | | /// ``` |
1270 | | /// |
1271 | | /// This example shows that literals are drained from `other` even when |
1272 | | /// they aren't necessarily used. |
1273 | | /// |
1274 | | /// ``` |
1275 | | /// use regex_syntax::hir::literal::Seq; |
1276 | | /// |
1277 | | /// let mut seq1 = Seq::new(&["foo", "bar"]); |
1278 | | /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); |
1279 | | /// seq1.union_into_empty(&mut seq2); |
1280 | | /// |
1281 | | /// // seq1 has no zero length literals, so no splicing happens. |
1282 | | /// assert_eq!(Seq::new(&["foo", "bar"]), seq1); |
1283 | | /// // Even though no splicing happens, seq2 is still drained. |
1284 | | /// assert_eq!(Some(0), seq2.len()); |
1285 | | /// ``` |
1286 | | #[inline] |
1287 | 0 | pub fn union_into_empty(&mut self, other: &mut Seq) { |
1288 | 0 | let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); |
1289 | 0 | let lits1 = match self.literals { |
1290 | 0 | None => return, |
1291 | 0 | Some(ref mut lits) => lits, |
1292 | | }; |
1293 | 0 | let first_empty = match lits1.iter().position(|m| m.is_empty()) { |
1294 | 0 | None => return, |
1295 | 0 | Some(i) => i, |
1296 | | }; |
1297 | 0 | let lits2 = match lits2 { |
1298 | | None => { |
1299 | | // Note that we are only here if we've found an empty literal, |
1300 | | // which implies that an infinite sequence infects this seq and |
1301 | | // also turns it into an infinite sequence. |
1302 | 0 | self.literals = None; |
1303 | 0 | return; |
1304 | | } |
1305 | 0 | Some(lits) => lits, |
1306 | 0 | }; |
1307 | 0 | // Clearing out the empties needs to come before the splice because |
1308 | 0 | // the splice might add more empties that we don't want to get rid |
1309 | 0 | // of. Since we're splicing into the position of the first empty, the |
1310 | 0 | // 'first_empty' position computed above is still correct. |
1311 | 0 | lits1.retain(|m| !m.is_empty()); |
1312 | 0 | lits1.splice(first_empty..first_empty, lits2); |
1313 | 0 | self.dedup(); |
1314 | 0 | } |
1315 | | |
1316 | | /// Deduplicate adjacent equivalent literals in this sequence. |
1317 | | /// |
1318 | | /// If adjacent literals are equivalent strings but one is exact and the |
1319 | | /// other inexact, the inexact literal is kept and the exact one is |
1320 | | /// removed. |
1321 | | /// |
1322 | | /// Deduping an infinite sequence is a no-op. |
1323 | | /// |
1324 | | /// # Example |
1325 | | /// |
1326 | | /// This example shows how literals that are duplicate byte strings but |
1327 | | /// are not equivalent with respect to exactness are resolved. |
1328 | | /// |
1329 | | /// ``` |
1330 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1331 | | /// |
1332 | | /// let mut seq = Seq::from_iter([ |
1333 | | /// Literal::exact("foo"), |
1334 | | /// Literal::inexact("foo"), |
1335 | | /// ]); |
1336 | | /// seq.dedup(); |
1337 | | /// |
1338 | | /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq); |
1339 | | /// ``` |
1340 | | #[inline] |
1341 | 0 | pub fn dedup(&mut self) { |
1342 | 0 | if let Some(ref mut lits) = self.literals { |
1343 | 0 | lits.dedup_by(|lit1, lit2| { |
1344 | 0 | if lit1.as_bytes() != lit2.as_bytes() { |
1345 | 0 | return false; |
1346 | 0 | } |
1347 | 0 | if lit1.is_exact() != lit2.is_exact() { |
1348 | 0 | lit1.make_inexact(); |
1349 | 0 | lit2.make_inexact(); |
1350 | 0 | } |
1351 | 0 | true |
1352 | 0 | }); |
1353 | 0 | } |
1354 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::dedup Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::dedup |
1355 | | |
1356 | | /// Sorts this sequence of literals lexicographically. |
1357 | | /// |
    /// Note that if, before sorting, a literal that is a prefix of another
1359 | | /// literal appears after it, then after sorting, the sequence will not |
1360 | | /// represent the same preference order match semantics. For example, |
1361 | | /// sorting the sequence `[samwise, sam]` yields the sequence `[sam, |
1362 | | /// samwise]`. Under preference order semantics, the latter sequence will |
    /// never match `samwise` whereas the first sequence can.
1364 | | /// |
1365 | | /// # Example |
1366 | | /// |
1367 | | /// This example shows basic usage. |
1368 | | /// |
1369 | | /// ``` |
1370 | | /// use regex_syntax::hir::literal::Seq; |
1371 | | /// |
1372 | | /// let mut seq = Seq::new(&["foo", "quux", "bar"]); |
1373 | | /// seq.sort(); |
1374 | | /// |
1375 | | /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq); |
1376 | | /// ``` |
1377 | | #[inline] |
1378 | 0 | pub fn sort(&mut self) { |
1379 | 0 | if let Some(ref mut lits) = self.literals { |
1380 | 0 | lits.sort(); |
1381 | 0 | } |
1382 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::sort Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::sort |
1383 | | |
1384 | | /// Reverses all of the literals in this sequence. |
1385 | | /// |
1386 | | /// The order of the sequence itself is preserved. |
1387 | | /// |
1388 | | /// # Example |
1389 | | /// |
1390 | | /// This example shows basic usage. |
1391 | | /// |
1392 | | /// ``` |
1393 | | /// use regex_syntax::hir::literal::Seq; |
1394 | | /// |
1395 | | /// let mut seq = Seq::new(&["oof", "rab"]); |
1396 | | /// seq.reverse_literals(); |
1397 | | /// assert_eq!(Seq::new(&["foo", "bar"]), seq); |
1398 | | /// ``` |
1399 | | #[inline] |
1400 | 0 | pub fn reverse_literals(&mut self) { |
1401 | 0 | if let Some(ref mut lits) = self.literals { |
1402 | 0 | for lit in lits.iter_mut() { |
1403 | 0 | lit.reverse(); |
1404 | 0 | } |
1405 | 0 | } |
1406 | 0 | } |
1407 | | |
1408 | | /// Shrinks this seq to its minimal size while respecting the preference |
1409 | | /// order of its literals. |
1410 | | /// |
1411 | | /// While this routine will remove duplicate literals from this seq, it |
1412 | | /// will also remove literals that can never match in a leftmost-first or |
1413 | | /// "preference order" search. Similar to [`Seq::dedup`], if a literal is |
1414 | | /// deduped, then the one that remains is made inexact. |
1415 | | /// |
1416 | | /// This is a no-op on seqs that are empty or not finite. |
1417 | | /// |
1418 | | /// # Example |
1419 | | /// |
1420 | | /// This example shows the difference between `{sam, samwise}` and |
1421 | | /// `{samwise, sam}`. |
1422 | | /// |
1423 | | /// ``` |
1424 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1425 | | /// |
1426 | | /// // If 'sam' comes before 'samwise' and a preference order search is |
1427 | | /// // executed, then 'samwise' can never match. |
1428 | | /// let mut seq = Seq::new(&["sam", "samwise"]); |
1429 | | /// seq.minimize_by_preference(); |
1430 | | /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); |
1431 | | /// |
1432 | | /// // But if they are reversed, then it's possible for 'samwise' to match |
1433 | | /// // since it is given higher preference. |
1434 | | /// let mut seq = Seq::new(&["samwise", "sam"]); |
1435 | | /// seq.minimize_by_preference(); |
1436 | | /// assert_eq!(Seq::new(&["samwise", "sam"]), seq); |
1437 | | /// ``` |
1438 | | /// |
1439 | | /// This example shows that if an empty string is in this seq, then |
1440 | | /// anything that comes after it can never match. |
1441 | | /// |
1442 | | /// ``` |
1443 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1444 | | /// |
1445 | | /// // An empty string is a prefix of all strings, so it automatically |
1446 | | /// // inhibits any subsequent strings from matching. |
1447 | | /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]); |
1448 | | /// seq.minimize_by_preference(); |
1449 | | /// let expected = Seq::from_iter([ |
1450 | | /// Literal::exact("foo"), |
1451 | | /// Literal::exact("bar"), |
1452 | | /// Literal::inexact(""), |
1453 | | /// ]); |
1454 | | /// assert_eq!(expected, seq); |
1455 | | /// |
1456 | | /// // And of course, if it's at the beginning, then it makes it impossible |
1457 | | /// // for anything else to match. |
1458 | | /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]); |
1459 | | /// seq.minimize_by_preference(); |
1460 | | /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); |
1461 | | /// ``` |
1462 | | #[inline] |
1463 | 0 | pub fn minimize_by_preference(&mut self) { |
1464 | 0 | if let Some(ref mut lits) = self.literals { |
1465 | 0 | PreferenceTrie::minimize(lits, false); |
1466 | 0 | } |
1467 | 0 | } |
1468 | | |
1469 | | /// Trims all literals in this seq such that only the first `len` bytes |
1470 | | /// remain. If a literal has less than or equal to `len` bytes, then it |
1471 | | /// remains unchanged. Otherwise, it is trimmed and made inexact. |
1472 | | /// |
1473 | | /// # Example |
1474 | | /// |
1475 | | /// ``` |
1476 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1477 | | /// |
1478 | | /// let mut seq = Seq::new(&["a", "foo", "quux"]); |
1479 | | /// seq.keep_first_bytes(2); |
1480 | | /// |
1481 | | /// let expected = Seq::from_iter([ |
1482 | | /// Literal::exact("a"), |
1483 | | /// Literal::inexact("fo"), |
1484 | | /// Literal::inexact("qu"), |
1485 | | /// ]); |
1486 | | /// assert_eq!(expected, seq); |
1487 | | /// ``` |
1488 | | #[inline] |
1489 | 0 | pub fn keep_first_bytes(&mut self, len: usize) { |
1490 | 0 | if let Some(ref mut lits) = self.literals { |
1491 | 0 | for m in lits.iter_mut() { |
1492 | 0 | m.keep_first_bytes(len); |
1493 | 0 | } |
1494 | 0 | } |
1495 | 0 | } |
1496 | | |
1497 | | /// Trims all literals in this seq such that only the last `len` bytes |
1498 | | /// remain. If a literal has less than or equal to `len` bytes, then it |
1499 | | /// remains unchanged. Otherwise, it is trimmed and made inexact. |
1500 | | /// |
1501 | | /// # Example |
1502 | | /// |
1503 | | /// ``` |
1504 | | /// use regex_syntax::hir::literal::{Literal, Seq}; |
1505 | | /// |
1506 | | /// let mut seq = Seq::new(&["a", "foo", "quux"]); |
1507 | | /// seq.keep_last_bytes(2); |
1508 | | /// |
1509 | | /// let expected = Seq::from_iter([ |
1510 | | /// Literal::exact("a"), |
1511 | | /// Literal::inexact("oo"), |
1512 | | /// Literal::inexact("ux"), |
1513 | | /// ]); |
1514 | | /// assert_eq!(expected, seq); |
1515 | | /// ``` |
1516 | | #[inline] |
1517 | 0 | pub fn keep_last_bytes(&mut self, len: usize) { |
1518 | 0 | if let Some(ref mut lits) = self.literals { |
1519 | 0 | for m in lits.iter_mut() { |
1520 | 0 | m.keep_last_bytes(len); |
1521 | 0 | } |
1522 | 0 | } |
1523 | 0 | } |
1524 | | |
1525 | | /// Returns true if this sequence is finite. |
1526 | | /// |
1527 | | /// When false, this sequence is infinite and must be treated as if it |
1528 | | /// contains every possible literal. |
1529 | | #[inline] |
1530 | 0 | pub fn is_finite(&self) -> bool { |
1531 | 0 | self.literals.is_some() |
1532 | 0 | } |
1533 | | |
1534 | | /// Returns true if and only if this sequence is finite and empty. |
1535 | | /// |
1536 | | /// An empty sequence never matches anything. It can only be produced by |
1537 | | /// literal extraction when the corresponding regex itself cannot match. |
1538 | | #[inline] |
1539 | 0 | pub fn is_empty(&self) -> bool { |
1540 | 0 | self.len() == Some(0) |
1541 | 0 | } |
1542 | | |
1543 | | /// Returns the number of literals in this sequence if the sequence is |
1544 | | /// finite. If the sequence is infinite, then `None` is returned. |
1545 | | #[inline] |
1546 | 0 | pub fn len(&self) -> Option<usize> { |
1547 | 0 | self.literals.as_ref().map(|lits| lits.len()) |
1548 | 0 | } |
1549 | | |
1550 | | /// Returns true if and only if all literals in this sequence are exact. |
1551 | | /// |
1552 | | /// This returns false if the sequence is infinite. |
1553 | | #[inline] |
1554 | 0 | pub fn is_exact(&self) -> bool { |
1555 | 0 | self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact())) |
1556 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::is_exact Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::is_exact |
1557 | | |
1558 | | /// Returns true if and only if all literals in this sequence are inexact. |
1559 | | /// |
1560 | | /// This returns true if the sequence is infinite. |
1561 | | #[inline] |
1562 | 0 | pub fn is_inexact(&self) -> bool { |
1563 | 0 | self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact())) |
1564 | 0 | } |
1565 | | |
1566 | | /// Return the maximum length of the sequence that would result from |
1567 | | /// unioning `self` with `other`. If either set is infinite, then this |
1568 | | /// returns `None`. |
1569 | | #[inline] |
1570 | 0 | pub fn max_union_len(&self, other: &Seq) -> Option<usize> { |
1571 | 0 | let len1 = self.len()?; |
1572 | 0 | let len2 = other.len()?; |
1573 | 0 | Some(len1.saturating_add(len2)) |
1574 | 0 | } |
1575 | | |
1576 | | /// Return the maximum length of the sequence that would result from the |
1577 | | /// cross product of `self` with `other`. If either set is infinite, then |
1578 | | /// this returns `None`. |
1579 | | #[inline] |
1580 | 0 | pub fn max_cross_len(&self, other: &Seq) -> Option<usize> { |
1581 | 0 | let len1 = self.len()?; |
1582 | 0 | let len2 = other.len()?; |
1583 | 0 | Some(len1.saturating_mul(len2)) |
1584 | 0 | } |
1585 | | |
1586 | | /// Returns the length of the shortest literal in this sequence. |
1587 | | /// |
1588 | | /// If the sequence is infinite or empty, then this returns `None`. |
1589 | | #[inline] |
1590 | 0 | pub fn min_literal_len(&self) -> Option<usize> { |
1591 | 0 | self.literals.as_ref()?.iter().map(|x| x.len()).min() |
1592 | 0 | } |
1593 | | |
1594 | | /// Returns the length of the longest literal in this sequence. |
1595 | | /// |
1596 | | /// If the sequence is infinite or empty, then this returns `None`. |
1597 | | #[inline] |
1598 | 0 | pub fn max_literal_len(&self) -> Option<usize> { |
1599 | 0 | self.literals.as_ref()?.iter().map(|x| x.len()).max() |
1600 | 0 | } |
1601 | | |
1602 | | /// Returns the longest common prefix from this seq. |
1603 | | /// |
1604 | | /// If the seq matches any literal or other contains no literals, then |
1605 | | /// there is no meaningful prefix and this returns `None`. |
1606 | | /// |
1607 | | /// # Example |
1608 | | /// |
1609 | | /// This shows some example seqs and their longest common prefix. |
1610 | | /// |
1611 | | /// ``` |
1612 | | /// use regex_syntax::hir::literal::Seq; |
1613 | | /// |
1614 | | /// let seq = Seq::new(&["foo", "foobar", "fo"]); |
1615 | | /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); |
1616 | | /// let seq = Seq::new(&["foo", "foo"]); |
1617 | | /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); |
1618 | | /// let seq = Seq::new(&["foo", "bar"]); |
1619 | | /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); |
1620 | | /// let seq = Seq::new(&[""]); |
1621 | | /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); |
1622 | | /// |
1623 | | /// let seq = Seq::infinite(); |
1624 | | /// assert_eq!(None, seq.longest_common_prefix()); |
1625 | | /// let seq = Seq::empty(); |
1626 | | /// assert_eq!(None, seq.longest_common_prefix()); |
1627 | | /// ``` |
1628 | | #[inline] |
1629 | 0 | pub fn longest_common_prefix(&self) -> Option<&[u8]> { |
1630 | | // If we match everything or match nothing, then there's no meaningful |
1631 | | // longest common prefix. |
1632 | 0 | let lits = match self.literals { |
1633 | 0 | None => return None, |
1634 | 0 | Some(ref lits) => lits, |
1635 | 0 | }; |
1636 | 0 | if lits.len() == 0 { |
1637 | 0 | return None; |
1638 | 0 | } |
1639 | 0 | let base = lits[0].as_bytes(); |
1640 | 0 | let mut len = base.len(); |
1641 | 0 | for m in lits.iter().skip(1) { |
1642 | 0 | len = m |
1643 | 0 | .as_bytes() |
1644 | 0 | .iter() |
1645 | 0 | .zip(base[..len].iter()) |
1646 | 0 | .take_while(|&(a, b)| a == b) |
1647 | 0 | .count(); |
1648 | 0 | if len == 0 { |
1649 | 0 | return Some(&[]); |
1650 | 0 | } |
1651 | | } |
1652 | 0 | Some(&base[..len]) |
1653 | 0 | } |
1654 | | |
1655 | | /// Returns the longest common suffix from this seq. |
1656 | | /// |
1657 | | /// If the seq matches any literal or other contains no literals, then |
1658 | | /// there is no meaningful suffix and this returns `None`. |
1659 | | /// |
1660 | | /// # Example |
1661 | | /// |
1662 | | /// This shows some example seqs and their longest common suffix. |
1663 | | /// |
1664 | | /// ``` |
1665 | | /// use regex_syntax::hir::literal::Seq; |
1666 | | /// |
1667 | | /// let seq = Seq::new(&["oof", "raboof", "of"]); |
1668 | | /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); |
1669 | | /// let seq = Seq::new(&["foo", "foo"]); |
1670 | | /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); |
1671 | | /// let seq = Seq::new(&["foo", "bar"]); |
1672 | | /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); |
1673 | | /// let seq = Seq::new(&[""]); |
1674 | | /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); |
1675 | | /// |
1676 | | /// let seq = Seq::infinite(); |
1677 | | /// assert_eq!(None, seq.longest_common_suffix()); |
1678 | | /// let seq = Seq::empty(); |
1679 | | /// assert_eq!(None, seq.longest_common_suffix()); |
1680 | | /// ``` |
1681 | | #[inline] |
1682 | 0 | pub fn longest_common_suffix(&self) -> Option<&[u8]> { |
1683 | | // If we match everything or match nothing, then there's no meaningful |
1684 | | // longest common suffix. |
1685 | 0 | let lits = match self.literals { |
1686 | 0 | None => return None, |
1687 | 0 | Some(ref lits) => lits, |
1688 | 0 | }; |
1689 | 0 | if lits.len() == 0 { |
1690 | 0 | return None; |
1691 | 0 | } |
1692 | 0 | let base = lits[0].as_bytes(); |
1693 | 0 | let mut len = base.len(); |
1694 | 0 | for m in lits.iter().skip(1) { |
1695 | 0 | len = m |
1696 | 0 | .as_bytes() |
1697 | 0 | .iter() |
1698 | 0 | .rev() |
1699 | 0 | .zip(base[base.len() - len..].iter().rev()) |
1700 | 0 | .take_while(|&(a, b)| a == b) |
1701 | 0 | .count(); |
1702 | 0 | if len == 0 { |
1703 | 0 | return Some(&[]); |
1704 | 0 | } |
1705 | | } |
1706 | 0 | Some(&base[base.len() - len..]) |
1707 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::longest_common_suffix Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::longest_common_suffix |
1708 | | |
1709 | | /// Optimizes this seq while treating its literals as prefixes and |
1710 | | /// respecting the preference order of its literals. |
1711 | | /// |
1712 | | /// The specific way "optimization" works is meant to be an implementation |
1713 | | /// detail, as it essentially represents a set of heuristics. The goal |
1714 | | /// that optimization tries to accomplish is to make the literals in this |
1715 | | /// set reflect inputs that will result in a more effective prefilter. |
1716 | | /// Principally by reducing the false positive rate of candidates found by |
1717 | | /// the literals in this sequence. That is, when a match of a literal is |
1718 | | /// found, we would like it to be a strong predictor of the overall match |
1719 | | /// of the regex. If it isn't, then much time will be spent starting and |
1720 | | /// stopping the prefilter search and attempting to confirm the match only |
1721 | | /// to have it fail. |
1722 | | /// |
1723 | | /// Some of those heuristics might be: |
1724 | | /// |
1725 | | /// * Identifying a common prefix from a larger sequence of literals, and |
1726 | | /// shrinking the sequence down to that single common prefix. |
1727 | | /// * Rejecting the sequence entirely if it is believed to result in very |
1728 | | /// high false positive rate. When this happens, the sequence is made |
1729 | | /// infinite. |
1730 | | /// * Shrinking the sequence to a smaller number of literals representing |
1731 | | /// prefixes, but not shrinking it so much as to make literals too short. |
1732 | | /// (A sequence with very short literals, of 1 or 2 bytes, will typically |
1733 | | /// result in a higher false positive rate.) |
1734 | | /// |
1735 | | /// Optimization should only be run once extraction is complete. Namely, |
1736 | | /// optimization may make assumptions that do not compose with other |
1737 | | /// operations in the middle of extraction. For example, optimization will |
1738 | | /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation |
1739 | | /// is only valid if no other extraction will occur. If other extraction |
1740 | | /// may occur, then the correct transformation would be to `[I(sam)]`. |
1741 | | /// |
1742 | | /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but |
1743 | | /// for suffixes. |
1744 | | /// |
1745 | | /// # Example |
1746 | | /// |
1747 | | /// This shows how optimization might transform a sequence. Note that |
1748 | | /// the specific behavior is not a documented guarantee. The heuristics |
1749 | | /// used are an implementation detail and may change over time in semver |
1750 | | /// compatible releases. |
1751 | | /// |
1752 | | /// ``` |
1753 | | /// use regex_syntax::hir::literal::{Seq, Literal}; |
1754 | | /// |
1755 | | /// let mut seq = Seq::new(&[ |
1756 | | /// "samantha", |
1757 | | /// "sam", |
1758 | | /// "samwise", |
1759 | | /// "frodo", |
1760 | | /// ]); |
1761 | | /// seq.optimize_for_prefix_by_preference(); |
1762 | | /// assert_eq!(Seq::from_iter([ |
1763 | | /// Literal::exact("samantha"), |
1764 | | /// // Kept exact even though 'samwise' got pruned |
1765 | | /// // because optimization assumes literal extraction |
1766 | | /// // has finished. |
1767 | | /// Literal::exact("sam"), |
1768 | | /// Literal::exact("frodo"), |
1769 | | /// ]), seq); |
1770 | | /// ``` |
1771 | | /// |
1772 | | /// # Example: optimization may make the sequence infinite |
1773 | | /// |
1774 | | /// If the heuristics deem that the sequence could cause a very high false |
1775 | | /// positive rate, then it may make the sequence infinite, effectively |
1776 | | /// disabling its use as a prefilter. |
1777 | | /// |
1778 | | /// ``` |
1779 | | /// use regex_syntax::hir::literal::{Seq, Literal}; |
1780 | | /// |
1781 | | /// let mut seq = Seq::new(&[ |
1782 | | /// "samantha", |
1783 | | /// // An empty string matches at every position, |
1784 | | /// // thus rendering the prefilter completely |
1785 | | /// // ineffective. |
1786 | | /// "", |
1787 | | /// "sam", |
1788 | | /// "samwise", |
1789 | | /// "frodo", |
1790 | | /// ]); |
1791 | | /// seq.optimize_for_prefix_by_preference(); |
1792 | | /// assert!(!seq.is_finite()); |
1793 | | /// ``` |
1794 | | /// |
1795 | | /// Do note that just because there is a `" "` in the sequence, that |
1796 | | /// doesn't mean the sequence will always be made infinite after it is |
1797 | | /// optimized. Namely, if the sequence is considered exact (any match |
1798 | | /// corresponds to an overall match of the original regex), then any match |
1799 | | /// is an overall match, and so the false positive rate is always `0`. |
1800 | | /// |
1801 | | /// To demonstrate this, we remove `samwise` from our sequence. This |
1802 | | /// results in no optimization happening and all literals remain exact. |
1803 | | /// Thus the entire sequence is exact, and it is kept as-is, even though |
1804 | | /// one is an ASCII space: |
1805 | | /// |
1806 | | /// ``` |
1807 | | /// use regex_syntax::hir::literal::{Seq, Literal}; |
1808 | | /// |
1809 | | /// let mut seq = Seq::new(&[ |
1810 | | /// "samantha", |
1811 | | /// " ", |
1812 | | /// "sam", |
1813 | | /// "frodo", |
1814 | | /// ]); |
1815 | | /// seq.optimize_for_prefix_by_preference(); |
1816 | | /// assert!(seq.is_finite()); |
1817 | | /// ``` |
1818 | | #[inline] |
1819 | 0 | pub fn optimize_for_prefix_by_preference(&mut self) { |
1820 | 0 | self.optimize_by_preference(true); |
1821 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::optimize_for_prefix_by_preference Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::optimize_for_prefix_by_preference |
1822 | | |
1823 | | /// Optimizes this seq while treating its literals as suffixes and |
1824 | | /// respecting the preference order of its literals. |
1825 | | /// |
1826 | | /// Optimization should only be run once extraction is complete. |
1827 | | /// |
1828 | | /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but |
1829 | | /// for prefixes. See its documentation for more explanation. |
1830 | | #[inline] |
1831 | 0 | pub fn optimize_for_suffix_by_preference(&mut self) { |
1832 | 0 | self.optimize_by_preference(false); |
1833 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::optimize_for_suffix_by_preference Unexecuted instantiation: <regex_syntax::hir::literal::Seq>::optimize_for_suffix_by_preference |
1834 | | |
    /// The shared implementation of `optimize_for_prefix_by_preference`
    /// and `optimize_for_suffix_by_preference`. When `prefix` is true, the
    /// literals are treated as prefixes; otherwise as suffixes.
    ///
    /// Note the overall shape: several heuristics run in order, each of
    /// which may return early, mutate the sequence and fall through, or
    /// make the sequence infinite. A snapshot of an exact sequence is kept
    /// so the later shrinking heuristics can be rolled back if they
    /// produce a worse result.
    fn optimize_by_preference(&mut self, prefix: bool) {
        // Infinite seqs have nothing to optimize.
        let origlen = match self.len() {
            None => return,
            Some(len) => len,
        };
        // Just give up now if our sequence contains an empty string.
        if self.min_literal_len().map_or(false, |len| len == 0) {
            // We squash the sequence so that nobody else gets any bright
            // ideas to try and use it. An empty string implies a match at
            // every position. A prefilter cannot help you here.
            self.make_infinite();
            return;
        }
        // Make sure we start with the smallest sequence possible. We use a
        // special version of preference minimization that retains exactness.
        // This is legal because optimization is only expected to occur once
        // extraction is complete.
        if prefix {
            if let Some(ref mut lits) = self.literals {
                PreferenceTrie::minimize(lits, true);
            }
        }

        // Look for a common prefix (or suffix). If we found one of those and
        // it's long enough, then it's a good bet that it will be our fastest
        // possible prefilter since single-substring search is so fast.
        let fix = if prefix {
            self.longest_common_prefix()
        } else {
            self.longest_common_suffix()
        };
        if let Some(fix) = fix {
            // As a special case, if we have a common prefix and the leading
            // byte of that prefix is one that we think probably occurs rarely,
            // then strip everything down to just that single byte. This should
            // promote the use of memchr.
            //
            // ... we only do this though if our sequence has more than one
            // literal. Otherwise, we'd rather just stick with a single literal
            // scan. That is, using memchr is probably better than looking
            // for 2 or more literals, but probably not as good as a straight
            // memmem search.
            //
            // ... and also only do this when the prefix is short and probably
            // not too discriminatory anyway. If it's longer, then it's
            // probably quite discriminatory and thus is likely to have a low
            // false positive rate.
            if prefix
                && origlen > 1
                && fix.len() >= 1
                && fix.len() <= 3
                && rank(fix[0]) < 200
            {
                self.keep_first_bytes(1);
                self.dedup();
                return;
            }
            // We only strip down to the common prefix/suffix if we think
            // the existing set of literals isn't great, or if the common
            // prefix/suffix is expected to be particularly discriminatory.
            let isfast =
                self.is_exact() && self.len().map_or(false, |len| len <= 16);
            let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast);
            if usefix {
                // If we keep exactly the number of bytes equal to the length
                // of the prefix (or suffix), then by the definition of a
                // prefix, every literal in the sequence will be equivalent.
                // Thus, 'dedup' will leave us with one literal.
                //
                // We do it this way to avoid an alloc, but also to make sure
                // the exactness of literals is kept (or not).
                if prefix {
                    self.keep_first_bytes(fix.len());
                } else {
                    self.keep_last_bytes(fix.len());
                }
                self.dedup();
                assert_eq!(Some(1), self.len());
                // We still fall through here. In particular, we want our
                // longest common prefix to be subject to the poison check.
            }
        }
        // If we have an exact sequence, we *probably* just want to keep it
        // as-is. But there are some cases where we don't. So we save a copy of
        // the exact sequence now, and then try to do some more optimizations
        // below. If those don't work out, we go back to this exact sequence.
        //
        // The specific motivation for this is that we sometimes wind up with
        // an exact sequence with a hefty number of literals. Say, 100. If we
        // stuck with that, it would be too big for Teddy and would result in
        // using Aho-Corasick. Which is fine... but the lazy DFA is plenty
        // suitable in such cases. The real issue is that we will wind up not
        // using a fast prefilter at all. So in cases like this, even though
        // we have an exact sequence, it would be better to try and shrink the
        // sequence (which we do below) and use it as a prefilter that can
        // produce false positive matches.
        //
        // But if the shrinking below results in a sequence that "sucks," then
        // we don't want to use that because we already have an exact sequence
        // in hand.
        let exact: Option<Seq> =
            if self.is_exact() { Some(self.clone()) } else { None };
        // Now we attempt to shorten the sequence. The idea here is that we
        // don't want to look for too many literals, but we want to shorten
        // our sequence enough to improve our odds of using better algorithms
        // downstream (such as Teddy).
        //
        // The pair of numbers in this list corresponds to the maximal prefix
        // (in bytes) to keep for all literals and the length of the sequence
        // at which to do it.
        //
        // So for example, the pair (3, 500) would mean, "if we have more than
        // 500 literals in our sequence, then truncate all of our literals
        // such that they are at most 3 bytes in length and the minimize the
        // sequence."
        const ATTEMPTS: [(usize, usize); 5] =
            [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
        for (keep, limit) in ATTEMPTS {
            // Shortening may have made the seq infinite (it can't here, but
            // be defensive) or small enough; stop once under the limit.
            let len = match self.len() {
                None => break,
                Some(len) => len,
            };
            if len <= limit {
                break;
            }
            if prefix {
                self.keep_first_bytes(keep);
            } else {
                self.keep_last_bytes(keep);
            }
            if prefix {
                if let Some(ref mut lits) = self.literals {
                    PreferenceTrie::minimize(lits, true);
                }
            }
        }
        // Check for a poison literal. A poison literal is one that is short
        // and is believed to have a very high match count. These poisons
        // generally lead to a prefilter with a very high false positive rate,
        // and thus overall worse performance.
        //
        // We do this last because we could have gone from a non-poisonous
        // sequence to a poisonous one. Perhaps we should add some code to
        // prevent such transitions in the first place, but then again, we
        // likely only made the transition in the first place if the sequence
        // was itself huge. And huge sequences are themselves poisonous. So...
        if let Some(lits) = self.literals() {
            if lits.iter().any(|lit| lit.is_poisonous()) {
                self.make_infinite();
            }
        }
        // OK, if we had an exact sequence before attempting more optimizations
        // above and our post-optimized sequence sucks for some reason or
        // another, then we go back to the exact sequence.
        if let Some(exact) = exact {
            // If optimizing resulted in dropping our literals, then certainly
            // backup and use the exact sequence that we had.
            if !self.is_finite() {
                *self = exact;
                return;
            }
            // If our optimized sequence contains a short literal, then it's
            // *probably* not so great. So throw it away and revert to the
            // exact sequence.
            if self.min_literal_len().map_or(true, |len| len <= 2) {
                *self = exact;
                return;
            }
            // Finally, if our optimized sequence is "big" (i.e., can't use
            // Teddy), then also don't use it and rely on the exact sequence.
            if self.len().map_or(true, |len| len > 64) {
                *self = exact;
                return;
            }
        }
    }
2011 | | } |
2012 | | |
2013 | | impl core::fmt::Debug for Seq { |
2014 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2015 | 0 | write!(f, "Seq")?; |
2016 | 0 | if let Some(lits) = self.literals() { |
2017 | 0 | f.debug_list().entries(lits.iter()).finish() |
2018 | | } else { |
2019 | 0 | write!(f, "[∞]") |
2020 | | } |
2021 | 0 | } |
2022 | | } |
2023 | | |
2024 | | impl FromIterator<Literal> for Seq { |
2025 | 0 | fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq { |
2026 | 0 | let mut seq = Seq::empty(); |
2027 | 0 | for literal in it { |
2028 | 0 | seq.push(literal); |
2029 | 0 | } |
2030 | 0 | seq |
2031 | 0 | } |
2032 | | } |
2033 | | |
/// A single literal extracted from an [`Hir`] expression.
///
/// A literal is composed of two things:
///
/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided.
/// In particular, even if the regex a literal is extracted from is UTF-8, the
/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`]
/// limit resulted in trimming a literal in a way that splits a codepoint.)
/// * Whether the literal is "exact" or not. An "exact" literal means that it
/// has not been trimmed, and may continue to be extended. If a literal is
/// "exact" after visiting the entire `Hir` expression, then this implies that
/// the literal leads to a match state. (Although it doesn't necessarily imply
/// all occurrences of the literal correspond to a match of the regex, since
/// literal extraction ignores look-around assertions.)
#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)]
pub struct Literal {
    /// The bytes of this literal. Not guaranteed to be valid UTF-8.
    bytes: Vec<u8>,
    /// Whether this literal is exact, i.e., has not been trimmed and may
    /// still be extended during extraction.
    exact: bool,
}
2053 | | |
2054 | | impl Literal { |
2055 | | /// Returns a new exact literal containing the bytes given. |
2056 | | #[inline] |
2057 | 0 | pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal { |
2058 | 0 | Literal { bytes: bytes.into(), exact: true } |
2059 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Literal>::exact::<alloc::vec::Vec<u8>> Unexecuted instantiation: <regex_syntax::hir::literal::Literal>::exact::<alloc::string::String> |
2060 | | |
2061 | | /// Returns a new inexact literal containing the bytes given. |
2062 | | #[inline] |
2063 | 0 | pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal { |
2064 | 0 | Literal { bytes: bytes.into(), exact: false } |
2065 | 0 | } |
2066 | | |
2067 | | /// Returns the bytes in this literal. |
2068 | | #[inline] |
2069 | 0 | pub fn as_bytes(&self) -> &[u8] { |
2070 | 0 | &self.bytes |
2071 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Literal>::as_bytes Unexecuted instantiation: <regex_syntax::hir::literal::Literal>::as_bytes |
2072 | | |
2073 | | /// Yields ownership of the bytes inside this literal. |
2074 | | /// |
2075 | | /// Note that this throws away whether the literal is "exact" or not. |
2076 | | #[inline] |
2077 | 0 | pub fn into_bytes(self) -> Vec<u8> { |
2078 | 0 | self.bytes |
2079 | 0 | } |
2080 | | |
2081 | | /// Returns the length of this literal in bytes. |
2082 | | #[inline] |
2083 | 0 | pub fn len(&self) -> usize { |
2084 | 0 | self.as_bytes().len() |
2085 | 0 | } |
2086 | | |
2087 | | /// Returns true if and only if this literal has zero bytes. |
2088 | | #[inline] |
2089 | 0 | pub fn is_empty(&self) -> bool { |
2090 | 0 | self.len() == 0 |
2091 | 0 | } |
2092 | | |
2093 | | /// Returns true if and only if this literal is exact. |
2094 | | #[inline] |
2095 | 0 | pub fn is_exact(&self) -> bool { |
2096 | 0 | self.exact |
2097 | 0 | } |
2098 | | |
2099 | | /// Marks this literal as inexact. |
2100 | | /// |
2101 | | /// Inexact literals can never be extended. For example, |
2102 | | /// [`Seq::cross_forward`] will not extend inexact literals. |
2103 | | #[inline] |
2104 | 0 | pub fn make_inexact(&mut self) { |
2105 | 0 | self.exact = false; |
2106 | 0 | } Unexecuted instantiation: <regex_syntax::hir::literal::Literal>::make_inexact Unexecuted instantiation: <regex_syntax::hir::literal::Literal>::make_inexact |
2107 | | |
2108 | | /// Reverse the bytes in this literal. |
2109 | | #[inline] |
2110 | 0 | pub fn reverse(&mut self) { |
2111 | 0 | self.bytes.reverse(); |
2112 | 0 | } |
2113 | | |
2114 | | /// Extend this literal with the literal given. |
2115 | | /// |
2116 | | /// If this literal is inexact, then this is a no-op. |
2117 | | #[inline] |
2118 | 0 | pub fn extend(&mut self, lit: &Literal) { |
2119 | 0 | if !self.is_exact() { |
2120 | 0 | return; |
2121 | 0 | } |
2122 | 0 | self.bytes.extend_from_slice(&lit.bytes); |
2123 | 0 | } |
2124 | | |
2125 | | /// Trims this literal such that only the first `len` bytes remain. If |
2126 | | /// this literal has fewer than `len` bytes, then it remains unchanged. |
2127 | | /// Otherwise, the literal is marked as inexact. |
2128 | | #[inline] |
2129 | 0 | pub fn keep_first_bytes(&mut self, len: usize) { |
2130 | 0 | if len >= self.len() { |
2131 | 0 | return; |
2132 | 0 | } |
2133 | 0 | self.make_inexact(); |
2134 | 0 | self.bytes.truncate(len); |
2135 | 0 | } |
2136 | | |
2137 | | /// Trims this literal such that only the last `len` bytes remain. If this |
2138 | | /// literal has fewer than `len` bytes, then it remains unchanged. |
2139 | | /// Otherwise, the literal is marked as inexact. |
2140 | | #[inline] |
2141 | 0 | pub fn keep_last_bytes(&mut self, len: usize) { |
2142 | 0 | if len >= self.len() { |
2143 | 0 | return; |
2144 | 0 | } |
2145 | 0 | self.make_inexact(); |
2146 | 0 | self.bytes.drain(..self.len() - len); |
2147 | 0 | } |
2148 | | |
2149 | | /// Returns true if it is believe that this literal is likely to match very |
2150 | | /// frequently, and is thus not a good candidate for a prefilter. |
2151 | 0 | fn is_poisonous(&self) -> bool { |
2152 | 0 | self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250) |
2153 | 0 | } |
2154 | | } |
2155 | | |
2156 | | impl From<u8> for Literal { |
2157 | 0 | fn from(byte: u8) -> Literal { |
2158 | 0 | Literal::exact(vec![byte]) |
2159 | 0 | } |
2160 | | } |
2161 | | |
2162 | | impl From<char> for Literal { |
2163 | 0 | fn from(ch: char) -> Literal { |
2164 | | use alloc::string::ToString; |
2165 | 0 | Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string()) |
2166 | 0 | } |
2167 | | } |
2168 | | |
impl AsRef<[u8]> for Literal {
    // Delegates to `Literal::as_bytes` so a `Literal` can be used in generic
    // contexts that accept anything byte-slice-like.
    fn as_ref(&self) -> &[u8] {
        self.as_bytes()
    }
}
2174 | | |
2175 | | impl core::fmt::Debug for Literal { |
2176 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2177 | 0 | let tag = if self.exact { "E" } else { "I" }; |
2178 | 0 | f.debug_tuple(tag) |
2179 | 0 | .field(&crate::debug::Bytes(self.as_bytes())) |
2180 | 0 | .finish() |
2181 | 0 | } |
2182 | | } |
2183 | | |
/// A "preference" trie that rejects literals that will never match when
/// executing a leftmost first or "preference" search.
///
/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be
/// rejected because 'samwise' can never match since 'sam' will always take
/// priority. However, if 'samwise' is inserted first, then inserting 'sam'
/// after it is accepted. In this case, either 'samwise' or 'sam' can match in
/// a "preference" search.
///
/// Note that we only use this trie as a "set." That is, given a sequence of
/// literals, we insert each one in order. An `insert` will reject a literal
/// if a prefix of that literal already exists in the trie. Thus, to rebuild
/// the "minimal" sequence, we simply only keep literals that were successfully
/// inserted. (Since we don't need traversal, one wonders whether we can make
/// some simplifications here, but I haven't given it a ton of thought and I've
/// never seen this show up on a profile. Because of the heuristic limits
/// imposed on literal extractions, the size of the inputs here is usually
/// very small.)
#[derive(Debug)]
struct PreferenceTrie {
    /// The states in this trie. The index of a state in this vector is its
    /// ID. The root state, once created lazily by `root`, is always at
    /// index 0.
    states: Vec<State>,
    /// This vec indicates which states are match states. It always has
    /// the same length as `states` and is indexed by the same state ID.
    /// A state with identifier `sid` is a match state if and only if
    /// `matches[sid].is_some()`. The option contains the index of the literal
    /// corresponding to the match. The index is offset by 1 so that it fits in
    /// a NonZeroUsize.
    matches: Vec<Option<NonZeroUsize>>,
    /// The index to allocate to the next literal added to this trie. Starts at
    /// 1 and increments by 1 for every literal successfully added to the trie.
    next_literal_index: usize,
}
2217 | | |
/// A single state in a trie. Uses a sparse representation for its transitions.
#[derive(Debug, Default)]
struct State {
    /// Sparse representation of the transitions out of this state. Transitions
    /// are sorted by byte. There is at most one such transition for any
    /// particular byte. Keeping the list sorted allows `insert` to locate a
    /// transition (or its insertion point) with a binary search.
    trans: Vec<(u8, usize)>,
}
2226 | | |
impl PreferenceTrie {
    /// Minimizes the given sequence of literals while preserving preference
    /// order semantics.
    ///
    /// When `keep_exact` is true, the exactness of every literal retained is
    /// kept. This is useful when dealing with a fully extracted `Seq` that
    /// only contains exact literals. In that case, we can keep all retained
    /// literals as exact because we know we'll never need to match anything
    /// after them and because any removed literals are guaranteed to never
    /// match.
    fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
        let mut trie = PreferenceTrie {
            states: vec![],
            matches: vec![],
            next_literal_index: 1,
        };
        let mut make_inexact = vec![];
        literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
            Ok(_) => true,
            // `i` is the 1-offset index of the previously inserted literal
            // that blocked this one. Blockers were inserted successfully and
            // are therefore retained, and literal indices are handed out
            // only on success, so `i - 1` is the blocker's position in
            // `literals` once `retain_mut` finishes.
            Err(i) => {
                if !keep_exact {
                    make_inexact.push(i.checked_sub(1).unwrap());
                }
                false
            }
        });
        // A blocked literal had the blocker as a prefix, so unless the
        // caller asked to preserve exactness, the blocker must now be
        // treated as inexact.
        for i in make_inexact {
            literals[i].make_inexact();
        }
    }

    /// Returns `Ok` if the given byte string is accepted into this trie and
    /// `Err` otherwise. The index for the success case corresponds to the
    /// index of the literal added. The index for the error case corresponds to
    /// the index of the literal already in the trie that prevented the given
    /// byte string from being added. (Which implies it is a prefix of the one
    /// given.)
    ///
    /// In short, the byte string given is accepted into the trie if and only
    /// if it is possible for it to match when executing a preference order
    /// search.
    fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
        let mut prev = self.root();
        // A match on the root means the empty literal was inserted earlier,
        // which blocks everything.
        if let Some(idx) = self.matches[prev] {
            return Err(idx.get());
        }
        for &b in bytes.iter() {
            // Transitions are sorted by byte (see `State::trans`), so a
            // binary search yields either the existing transition or the
            // position at which to insert a new one.
            match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
                Ok(i) => {
                    prev = self.states[prev].trans[i].1;
                    // Passing through a match state means a previously
                    // inserted literal is a prefix of `bytes`: reject.
                    if let Some(idx) = self.matches[prev] {
                        return Err(idx.get());
                    }
                }
                Err(i) => {
                    let next = self.create_state();
                    self.states[prev].trans.insert(i, (b, next));
                    prev = next;
                }
            }
        }
        // Accepted: mark the final state as a match, recording this
        // literal's 1-offset index.
        let idx = self.next_literal_index;
        self.next_literal_index += 1;
        self.matches[prev] = NonZeroUsize::new(idx);
        Ok(idx)
    }

    /// Returns the root state ID, and if it doesn't exist, creates it.
    fn root(&mut self) -> usize {
        // The root is created lazily and, once created, is always state 0.
        if !self.states.is_empty() {
            0
        } else {
            self.create_state()
        }
    }

    /// Creates a new empty state and returns its ID.
    fn create_state(&mut self) -> usize {
        // `states` and `matches` grow in lock-step so they share state IDs.
        let id = self.states.len();
        self.states.push(State::default());
        self.matches.push(None);
        id
    }
}
2311 | | |
2312 | | /// Returns the "rank" of the given byte. |
2313 | | /// |
2314 | | /// The minimum rank value is `0` and the maximum rank value is `255`. |
2315 | | /// |
2316 | | /// The rank of a byte is derived from a heuristic background distribution of |
2317 | | /// relative frequencies of bytes. The heuristic says that lower the rank of a |
2318 | | /// byte, the less likely that byte is to appear in any arbitrary haystack. |
2319 | 0 | pub fn rank(byte: u8) -> u8 { |
2320 | 0 | crate::rank::BYTE_FREQUENCIES[usize::from(byte)] |
2321 | 0 | } |
2322 | | |
2323 | | #[cfg(test)] |
2324 | | mod tests { |
2325 | | use super::*; |
2326 | | |
2327 | | fn parse(pattern: &str) -> Hir { |
2328 | | crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() |
2329 | | } |
2330 | | |
2331 | | fn prefixes(pattern: &str) -> Seq { |
2332 | | Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern)) |
2333 | | } |
2334 | | |
2335 | | fn suffixes(pattern: &str) -> Seq { |
2336 | | Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern)) |
2337 | | } |
2338 | | |
2339 | | fn e(pattern: &str) -> (Seq, Seq) { |
2340 | | (prefixes(pattern), suffixes(pattern)) |
2341 | | } |
2342 | | |
2343 | | #[allow(non_snake_case)] |
2344 | | fn E(x: &str) -> Literal { |
2345 | | Literal::exact(x.as_bytes()) |
2346 | | } |
2347 | | |
2348 | | #[allow(non_snake_case)] |
2349 | | fn I(x: &str) -> Literal { |
2350 | | Literal::inexact(x.as_bytes()) |
2351 | | } |
2352 | | |
2353 | | fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq { |
2354 | | Seq::from_iter(it) |
2355 | | } |
2356 | | |
2357 | | fn infinite() -> (Seq, Seq) { |
2358 | | (Seq::infinite(), Seq::infinite()) |
2359 | | } |
2360 | | |
2361 | | fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq) |
2362 | | where |
2363 | | I1: IntoIterator<Item = Literal>, |
2364 | | I2: IntoIterator<Item = Literal>, |
2365 | | { |
2366 | | (Seq::from_iter(it1), Seq::from_iter(it2)) |
2367 | | } |
2368 | | |
2369 | | fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) { |
2370 | | let s1 = Seq::new(it); |
2371 | | let s2 = s1.clone(); |
2372 | | (s1, s2) |
2373 | | } |
2374 | | |
2375 | | fn opt<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) { |
2376 | | let (mut p, mut s) = exact(it); |
2377 | | p.optimize_for_prefix_by_preference(); |
2378 | | s.optimize_for_suffix_by_preference(); |
2379 | | (p, s) |
2380 | | } |
2381 | | |
2382 | | #[test] |
2383 | | fn literal() { |
2384 | | assert_eq!(exact(["a"]), e("a")); |
2385 | | assert_eq!(exact(["aaaaa"]), e("aaaaa")); |
2386 | | assert_eq!(exact(["A", "a"]), e("(?i-u)a")); |
2387 | | assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); |
2388 | | assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); |
2389 | | |
2390 | | assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)")); |
2391 | | |
2392 | | #[cfg(feature = "unicode-case")] |
2393 | | { |
2394 | | assert_eq!(exact(["☃"]), e("☃")); |
2395 | | assert_eq!(exact(["☃"]), e("(?i)☃")); |
2396 | | assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); |
2397 | | |
2398 | | assert_eq!(exact(["Δ"]), e("Δ")); |
2399 | | assert_eq!(exact(["δ"]), e("δ")); |
2400 | | assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); |
2401 | | assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); |
2402 | | |
2403 | | assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); |
2404 | | assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); |
2405 | | assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); |
2406 | | } |
2407 | | |
2408 | | let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; |
2409 | | assert_eq!(exact([letters]), e(letters)); |
2410 | | } |
2411 | | |
2412 | | #[test] |
2413 | | fn class() { |
2414 | | assert_eq!(exact(["a", "b", "c"]), e("[abc]")); |
2415 | | assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); |
2416 | | assert_eq!(exact(["δ", "ε"]), e("[εδ]")); |
2417 | | #[cfg(feature = "unicode-case")] |
2418 | | { |
2419 | | assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); |
2420 | | } |
2421 | | } |
2422 | | |
2423 | | #[test] |
2424 | | fn look() { |
2425 | | assert_eq!(exact(["ab"]), e(r"a\Ab")); |
2426 | | assert_eq!(exact(["ab"]), e(r"a\zb")); |
2427 | | assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); |
2428 | | assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); |
2429 | | assert_eq!(exact(["ab"]), e(r"a\bb")); |
2430 | | assert_eq!(exact(["ab"]), e(r"a\Bb")); |
2431 | | assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); |
2432 | | assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); |
2433 | | |
2434 | | assert_eq!(exact(["ab"]), e(r"^ab")); |
2435 | | assert_eq!(exact(["ab"]), e(r"$ab")); |
2436 | | assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); |
2437 | | assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); |
2438 | | assert_eq!(exact(["ab"]), e(r"\bab")); |
2439 | | assert_eq!(exact(["ab"]), e(r"\Bab")); |
2440 | | assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); |
2441 | | assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); |
2442 | | |
2443 | | assert_eq!(exact(["ab"]), e(r"ab^")); |
2444 | | assert_eq!(exact(["ab"]), e(r"ab$")); |
2445 | | assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); |
2446 | | assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); |
2447 | | assert_eq!(exact(["ab"]), e(r"ab\b")); |
2448 | | assert_eq!(exact(["ab"]), e(r"ab\B")); |
2449 | | assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); |
2450 | | assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); |
2451 | | |
2452 | | let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")])); |
2453 | | assert_eq!(expected, e(r"^aZ*b")); |
2454 | | } |
2455 | | |
2456 | | #[test] |
2457 | | fn repetition() { |
2458 | | assert_eq!(exact(["a", ""]), e(r"a?")); |
2459 | | assert_eq!(exact(["", "a"]), e(r"a??")); |
2460 | | assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*")); |
2461 | | assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?")); |
2462 | | assert_eq!(inexact([I("a")], [I("a")]), e(r"a+")); |
2463 | | assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+")); |
2464 | | |
2465 | | assert_eq!(exact(["ab"]), e(r"aZ{0}b")); |
2466 | | assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); |
2467 | | assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); |
2468 | | assert_eq!( |
2469 | | inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]), |
2470 | | e(r"aZ*b") |
2471 | | ); |
2472 | | assert_eq!( |
2473 | | inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]), |
2474 | | e(r"aZ*?b") |
2475 | | ); |
2476 | | assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b")); |
2477 | | assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b")); |
2478 | | |
2479 | | assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); |
2480 | | assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b")); |
2481 | | |
2482 | | assert_eq!(exact(["abc", ""]), e(r"(abc)?")); |
2483 | | assert_eq!(exact(["", "abc"]), e(r"(abc)??")); |
2484 | | |
2485 | | assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b")); |
2486 | | assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b")); |
2487 | | assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); |
2488 | | assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+")); |
2489 | | |
2490 | | // FIXME: The suffixes for this don't look quite right to me. I think |
2491 | | // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I |
2492 | | // think is that suffixes are computed by iterating over concatenations |
2493 | | // in reverse, and then [bc, ac, c] ordering is indeed correct from |
2494 | | // that perspective. We also test a few more equivalent regexes, and |
2495 | | // we get the same result, so it is consistent at least I suppose. |
2496 | | // |
2497 | | // The reason why this isn't an issue is that it only messes up |
2498 | | // preference order, and currently, suffixes are never used in a |
2499 | | // context where preference order matters. For prefixes it matters |
2500 | | // because we sometimes want to use prefilters without confirmation |
2501 | | // when all of the literals are exact (and there's no look-around). But |
2502 | | // we never do that for suffixes. Any time we use suffixes, we always |
2503 | | // include a confirmation step. If that ever changes, then it's likely |
2504 | | // this bug will need to be fixed, but last time I looked, it appears |
2505 | | // hard to do so. |
2506 | | assert_eq!( |
2507 | | inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), |
2508 | | e(r"a*b*c") |
2509 | | ); |
2510 | | assert_eq!( |
2511 | | inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), |
2512 | | e(r"(a+)?(b+)?c") |
2513 | | ); |
2514 | | assert_eq!( |
2515 | | inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), |
2516 | | e(r"(a+|)(b+|)c") |
2517 | | ); |
2518 | | // A few more similarish but not identical regexes. These may have a |
2519 | | // similar problem as above. |
2520 | | assert_eq!( |
2521 | | inexact( |
2522 | | [I("a"), I("b"), I("c"), E("")], |
2523 | | [I("c"), I("b"), I("a"), E("")] |
2524 | | ), |
2525 | | e(r"a*b*c*") |
2526 | | ); |
2527 | | assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+")); |
2528 | | assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c")); |
2529 | | assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*")); |
2530 | | assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*")); |
2531 | | assert_eq!( |
2532 | | inexact([I("ab"), E("ac")], [I("bc"), E("ac")]), |
2533 | | e(r"ab*c") |
2534 | | ); |
2535 | | assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); |
2536 | | assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c")); |
2537 | | |
2538 | | assert_eq!( |
2539 | | inexact([I("z"), E("azb")], [I("zazb"), E("azb")]), |
2540 | | e(r"z*azb") |
2541 | | ); |
2542 | | |
2543 | | let expected = |
2544 | | exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); |
2545 | | assert_eq!(expected, e(r"[ab]{3}")); |
2546 | | let expected = inexact( |
2547 | | [ |
2548 | | I("aaa"), |
2549 | | I("aab"), |
2550 | | I("aba"), |
2551 | | I("abb"), |
2552 | | I("baa"), |
2553 | | I("bab"), |
2554 | | I("bba"), |
2555 | | I("bbb"), |
2556 | | ], |
2557 | | [ |
2558 | | I("aaa"), |
2559 | | I("aab"), |
2560 | | I("aba"), |
2561 | | I("abb"), |
2562 | | I("baa"), |
2563 | | I("bab"), |
2564 | | I("bba"), |
2565 | | I("bbb"), |
2566 | | ], |
2567 | | ); |
2568 | | assert_eq!(expected, e(r"[ab]{3,4}")); |
2569 | | } |
2570 | | |
2571 | | #[test] |
2572 | | fn concat() { |
2573 | | let empty: [&str; 0] = []; |
2574 | | |
2575 | | assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); |
2576 | | assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); |
2577 | | assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); |
2578 | | assert_eq!(exact(empty), e(r"abc[a&&b]xyz")); |
2579 | | assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); |
2580 | | } |
2581 | | |
2582 | | #[test] |
2583 | | fn alternation() { |
2584 | | assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); |
2585 | | assert_eq!( |
2586 | | inexact( |
2587 | | [E("abc"), I("mZ"), E("mo"), E("xyz")], |
2588 | | [E("abc"), I("Zo"), E("mo"), E("xyz")] |
2589 | | ), |
2590 | | e(r"abc|mZ*o|xyz") |
2591 | | ); |
2592 | | assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); |
2593 | | assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); |
2594 | | |
2595 | | assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa")); |
2596 | | assert_eq!( |
2597 | | inexact( |
2598 | | [I("aaa"), E(""), I("aaaaa"), E("aa")], |
2599 | | [I("aaa"), E(""), E("aa")] |
2600 | | ), |
2601 | | e(r"(?:|aa)(?:aaa)*") |
2602 | | ); |
2603 | | assert_eq!( |
2604 | | inexact( |
2605 | | [E(""), I("aaa"), E("aa"), I("aaaaa")], |
2606 | | [E(""), I("aaa"), E("aa")] |
2607 | | ), |
2608 | | e(r"(?:|aa)(?:aaa)*?") |
2609 | | ); |
2610 | | |
2611 | | assert_eq!( |
2612 | | inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]), |
2613 | | e(r"a|b*") |
2614 | | ); |
2615 | | assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+")); |
2616 | | |
2617 | | assert_eq!( |
2618 | | inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]), |
2619 | | e(r"a*b|c") |
2620 | | ); |
2621 | | |
2622 | | assert_eq!( |
2623 | | inexact( |
2624 | | [E("a"), E("b"), I("c"), E("")], |
2625 | | [E("a"), E("b"), I("c"), E("")] |
2626 | | ), |
2627 | | e(r"a|(?:b|c*)") |
2628 | | ); |
2629 | | |
2630 | | assert_eq!( |
2631 | | inexact( |
2632 | | [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")], |
2633 | | [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")], |
2634 | | ), |
2635 | | e(r"(a|b)*c|(a|ab)*c") |
2636 | | ); |
2637 | | |
2638 | | assert_eq!( |
2639 | | exact(["abef", "abgh", "cdef", "cdgh"]), |
2640 | | e(r"(ab|cd)(ef|gh)") |
2641 | | ); |
2642 | | assert_eq!( |
2643 | | exact([ |
2644 | | "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", |
2645 | | "cdghij", "cdghkl", |
2646 | | ]), |
2647 | | e(r"(ab|cd)(ef|gh)(ij|kl)") |
2648 | | ); |
2649 | | |
2650 | | assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); |
2651 | | |
2652 | | assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); |
2653 | | |
2654 | | assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); |
2655 | | } |
2656 | | |
2657 | | #[test] |
2658 | | fn impossible() { |
2659 | | let empty: [&str; 0] = []; |
2660 | | |
2661 | | assert_eq!(exact(empty), e(r"[a&&b]")); |
2662 | | assert_eq!(exact(empty), e(r"a[a&&b]")); |
2663 | | assert_eq!(exact(empty), e(r"[a&&b]b")); |
2664 | | assert_eq!(exact(empty), e(r"a[a&&b]b")); |
2665 | | assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); |
2666 | | assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); |
2667 | | assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); |
2668 | | assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); |
2669 | | assert_eq!(exact([""]), e(r"[a&&b]*")); |
2670 | | assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); |
2671 | | } |
2672 | | |
2673 | | // This tests patterns that contain something that defeats literal |
2674 | | // detection, usually because it would blow some limit on the total number |
2675 | | // of literals that can be returned. |
2676 | | // |
2677 | | // The main idea is that when literal extraction sees something that |
2678 | | // it knows will blow a limit, it replaces it with a marker that says |
2679 | | // "any literal will match here." While not necessarily true, the |
2680 | | // over-estimation is just fine for the purposes of literal extraction, |
2681 | | // because the imprecision doesn't matter: too big is too big. |
2682 | | // |
2683 | | // This is one of the trickier parts of literal extraction, since we need |
2684 | | // to make sure all of our literal extraction operations correctly compose |
2685 | | // with the markers. |
2686 | | #[test] |
2687 | | fn anything() { |
2688 | | assert_eq!(infinite(), e(r".")); |
2689 | | assert_eq!(infinite(), e(r"(?s).")); |
2690 | | assert_eq!(infinite(), e(r"[A-Za-z]")); |
2691 | | assert_eq!(infinite(), e(r"[A-Z]")); |
2692 | | assert_eq!(exact([""]), e(r"[A-Z]{0}")); |
2693 | | assert_eq!(infinite(), e(r"[A-Z]?")); |
2694 | | assert_eq!(infinite(), e(r"[A-Z]*")); |
2695 | | assert_eq!(infinite(), e(r"[A-Z]+")); |
2696 | | assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]")); |
2697 | | assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2")); |
2698 | | assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123")); |
2699 | | assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+")); |
2700 | | assert_eq!(infinite(), e(r"1|[A-Z]|3")); |
2701 | | assert_eq!( |
2702 | | (seq([E("1"), I("2"), E("3")]), Seq::infinite()), |
2703 | | e(r"1|2[A-Z]|3"), |
2704 | | ); |
2705 | | assert_eq!( |
2706 | | (Seq::infinite(), seq([E("1"), I("2"), E("3")])), |
2707 | | e(r"1|[A-Z]2|3"), |
2708 | | ); |
2709 | | assert_eq!( |
2710 | | (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])), |
2711 | | e(r"1|2[A-Z]3|4"), |
2712 | | ); |
2713 | | assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2")); |
2714 | | assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z")); |
2715 | | } |
2716 | | |
2717 | | // Like the 'anything' test, but it uses smaller limits in order to test |
2718 | | // the logic for effectively aborting literal extraction when the seqs get |
2719 | | // too big. |
2720 | | #[test] |
2721 | | fn anything_small_limits() { |
2722 | | fn prefixes(pattern: &str) -> Seq { |
2723 | | Extractor::new() |
2724 | | .kind(ExtractKind::Prefix) |
2725 | | .limit_total(10) |
2726 | | .extract(&parse(pattern)) |
2727 | | } |
2728 | | |
2729 | | fn suffixes(pattern: &str) -> Seq { |
2730 | | Extractor::new() |
2731 | | .kind(ExtractKind::Suffix) |
2732 | | .limit_total(10) |
2733 | | .extract(&parse(pattern)) |
2734 | | } |
2735 | | |
2736 | | fn e(pattern: &str) -> (Seq, Seq) { |
2737 | | (prefixes(pattern), suffixes(pattern)) |
2738 | | } |
2739 | | |
2740 | | assert_eq!( |
2741 | | ( |
2742 | | seq([ |
2743 | | I("aaa"), |
2744 | | I("aab"), |
2745 | | I("aba"), |
2746 | | I("abb"), |
2747 | | I("baa"), |
2748 | | I("bab"), |
2749 | | I("bba"), |
2750 | | I("bbb") |
2751 | | ]), |
2752 | | seq([ |
2753 | | I("aaa"), |
2754 | | I("aab"), |
2755 | | I("aba"), |
2756 | | I("abb"), |
2757 | | I("baa"), |
2758 | | I("bab"), |
2759 | | I("bba"), |
2760 | | I("bbb") |
2761 | | ]) |
2762 | | ), |
2763 | | e(r"[ab]{3}{3}") |
2764 | | ); |
2765 | | |
2766 | | assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz")); |
2767 | | } |
2768 | | |
2769 | | #[test] |
2770 | | fn empty() { |
2771 | | assert_eq!(exact([""]), e(r"")); |
2772 | | assert_eq!(exact([""]), e(r"^")); |
2773 | | assert_eq!(exact([""]), e(r"$")); |
2774 | | assert_eq!(exact([""]), e(r"(?m:^)")); |
2775 | | assert_eq!(exact([""]), e(r"(?m:$)")); |
2776 | | assert_eq!(exact([""]), e(r"\b")); |
2777 | | assert_eq!(exact([""]), e(r"\B")); |
2778 | | assert_eq!(exact([""]), e(r"(?-u:\b)")); |
2779 | | assert_eq!(exact([""]), e(r"(?-u:\B)")); |
2780 | | } |
2781 | | |
2782 | | #[test] |
2783 | | fn odds_and_ends() { |
2784 | | assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a")); |
2785 | | assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a.")); |
2786 | | assert_eq!(infinite(), e(r"a|.")); |
2787 | | assert_eq!(infinite(), e(r".|a")); |
2788 | | |
2789 | | let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]"; |
2790 | | let expected = inexact( |
2791 | | ["Mo'am", "Moam", "Mu'am", "Muam"].map(I), |
2792 | | [ |
2793 | | "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi", |
2794 | | "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy", |
2795 | | "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi", |
2796 | | "zzafy", "zafi", "zafy", |
2797 | | ] |
2798 | | .map(I), |
2799 | | ); |
2800 | | assert_eq!(expected, e(pat)); |
2801 | | |
2802 | | assert_eq!( |
2803 | | (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()), |
2804 | | e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"), |
2805 | | ); |
2806 | | assert_eq!( |
2807 | | inexact([I("foo")], [I("quux")]), |
2808 | | e(r"foo[A-Z]+bar[A-Z]+quux") |
2809 | | ); |
2810 | | assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+")); |
2811 | | assert_eq!( |
2812 | | exact(["Sherlock Holmes"]), |
2813 | | e(r"(?m)^Sherlock Holmes|Sherlock Holmes$") |
2814 | | ); |
2815 | | |
2816 | | assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])")); |
2817 | | } |
2818 | | |
2819 | | // This tests a specific regex along with some heuristic steps to reduce |
2820 | | // the sequences extracted. This is meant to roughly correspond to the |
2821 | | // types of heuristics used to shrink literal sets in practice. (Shrinking |
2822 | | // is done because you want to balance "spend too much work looking for |
2823 | | // too many literals" and "spend too much work processing false positive |
2824 | | // matches from short literals.") |
2825 | | #[test] |
2826 | | #[cfg(feature = "unicode-case")] |
2827 | | fn holmes() { |
2828 | | let expected = inexact( |
2829 | | ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I), |
2830 | | [ |
2831 | | "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS", |
2832 | | "mes", |
2833 | | ] |
2834 | | .map(I), |
2835 | | ); |
2836 | | let (mut prefixes, mut suffixes) = e(r"(?i)Holmes"); |
2837 | | prefixes.keep_first_bytes(3); |
2838 | | suffixes.keep_last_bytes(3); |
2839 | | prefixes.minimize_by_preference(); |
2840 | | suffixes.minimize_by_preference(); |
2841 | | assert_eq!(expected, (prefixes, suffixes)); |
2842 | | } |
2843 | | |
2844 | | // This tests that we get some kind of literals extracted for a beefier |
2845 | | // alternation with case insensitive mode enabled. At one point during |
2846 | | // development, this returned nothing, and motivated some special case |
2847 | | // code in Extractor::union to try and trim down the literal sequences |
2848 | | // if the union would blow the limits set. |
2849 | | #[test] |
2850 | | #[cfg(feature = "unicode-case")] |
2851 | | fn holmes_alt() { |
2852 | | let mut pre = |
2853 | | prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"); |
2854 | | assert!(pre.len().unwrap() > 0); |
2855 | | pre.optimize_for_prefix_by_preference(); |
2856 | | assert!(pre.len().unwrap() > 0); |
2857 | | } |
2858 | | |
2859 | | // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 |
2860 | | // See: CVE-2022-24713 |
2861 | | // |
2862 | | // We test this here to ensure literal extraction completes in reasonable |
2863 | | // time and isn't materially impacted by these sorts of pathological |
2864 | | // repeats. |
2865 | | #[test] |
2866 | | fn crazy_repeats() { |
2867 | | assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}")); |
2868 | | assert_eq!( |
2869 | | inexact([E("")], [E("")]), |
2870 | | e(r"(?:){64}{64}{64}{64}{64}{64}") |
2871 | | ); |
2872 | | assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}")); |
2873 | | assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}")); |
2874 | | |
2875 | | assert_eq!( |
2876 | | inexact([E("")], [E("")]), |
2877 | | e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") |
2878 | | ); |
2879 | | let repa = "a".repeat(100); |
2880 | | assert_eq!( |
2881 | | inexact([I(&repa)], [I(&repa)]), |
2882 | | e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") |
2883 | | ); |
2884 | | } |
2885 | | |
2886 | | #[test] |
2887 | | fn huge() { |
2888 | | let pat = r#"(?-u) |
2889 | | 2(?: |
2890 | | [45]\d{3}| |
2891 | | 7(?: |
2892 | | 1[0-267]| |
2893 | | 2[0-289]| |
2894 | | 3[0-29]| |
2895 | | 4[01]| |
2896 | | 5[1-3]| |
2897 | | 6[013]| |
2898 | | 7[0178]| |
2899 | | 91 |
2900 | | )| |
2901 | | 8(?: |
2902 | | 0[125]| |
2903 | | [139][1-6]| |
2904 | | 2[0157-9]| |
2905 | | 41| |
2906 | | 6[1-35]| |
2907 | | 7[1-5]| |
2908 | | 8[1-8]| |
2909 | | 90 |
2910 | | )| |
2911 | | 9(?: |
2912 | | 0[0-2]| |
2913 | | 1[0-4]| |
2914 | | 2[568]| |
2915 | | 3[3-6]| |
2916 | | 5[5-7]| |
2917 | | 6[0167]| |
2918 | | 7[15]| |
2919 | | 8[0146-9] |
2920 | | ) |
2921 | | )\d{4}| |
2922 | | 3(?: |
2923 | | 12?[5-7]\d{2}| |
2924 | | 0(?: |
2925 | | 2(?: |
2926 | | [025-79]\d| |
2927 | | [348]\d{1,2} |
2928 | | )| |
2929 | | 3(?: |
2930 | | [2-4]\d| |
2931 | | [56]\d? |
2932 | | ) |
2933 | | )| |
2934 | | 2(?: |
2935 | | 1\d{2}| |
2936 | | 2(?: |
2937 | | [12]\d| |
2938 | | [35]\d{1,2}| |
2939 | | 4\d? |
2940 | | ) |
2941 | | )| |
2942 | | 3(?: |
2943 | | 1\d{2}| |
2944 | | 2(?: |
2945 | | [2356]\d| |
2946 | | 4\d{1,2} |
2947 | | ) |
2948 | | )| |
2949 | | 4(?: |
2950 | | 1\d{2}| |
2951 | | 2(?: |
2952 | | 2\d{1,2}| |
2953 | | [47]| |
2954 | | 5\d{2} |
2955 | | ) |
2956 | | )| |
2957 | | 5(?: |
2958 | | 1\d{2}| |
2959 | | 29 |
2960 | | )| |
2961 | | [67]1\d{2}| |
2962 | | 8(?: |
2963 | | 1\d{2}| |
2964 | | 2(?: |
2965 | | 2\d{2}| |
2966 | | 3| |
2967 | | 4\d |
2968 | | ) |
2969 | | ) |
2970 | | )\d{3}| |
2971 | | 4(?: |
2972 | | 0(?: |
2973 | | 2(?: |
2974 | | [09]\d| |
2975 | | 7 |
2976 | | )| |
2977 | | 33\d{2} |
2978 | | )| |
2979 | | 1\d{3}| |
2980 | | 2(?: |
2981 | | 1\d{2}| |
2982 | | 2(?: |
2983 | | [25]\d?| |
2984 | | [348]\d| |
2985 | | [67]\d{1,2} |
2986 | | ) |
2987 | | )| |
2988 | | 3(?: |
2989 | | 1\d{2}(?: |
2990 | | \d{2} |
2991 | | )?| |
2992 | | 2(?: |
2993 | | [045]\d| |
2994 | | [236-9]\d{1,2} |
2995 | | )| |
2996 | | 32\d{2} |
2997 | | )| |
2998 | | 4(?: |
2999 | | [18]\d{2}| |
3000 | | 2(?: |
3001 | | [2-46]\d{2}| |
3002 | | 3 |
3003 | | )| |
3004 | | 5[25]\d{2} |
3005 | | )| |
3006 | | 5(?: |
3007 | | 1\d{2}| |
3008 | | 2(?: |
3009 | | 3\d| |
3010 | | 5 |
3011 | | ) |
3012 | | )| |
3013 | | 6(?: |
3014 | | [18]\d{2}| |
3015 | | 2(?: |
3016 | | 3(?: |
3017 | | \d{2} |
3018 | | )?| |
3019 | | [46]\d{1,2}| |
3020 | | 5\d{2}| |
3021 | | 7\d |
3022 | | )| |
3023 | | 5(?: |
3024 | | 3\d?| |
3025 | | 4\d| |
3026 | | [57]\d{1,2}| |
3027 | | 6\d{2}| |
3028 | | 8 |
3029 | | ) |
3030 | | )| |
3031 | | 71\d{2}| |
3032 | | 8(?: |
3033 | | [18]\d{2}| |
3034 | | 23\d{2}| |
3035 | | 54\d{2} |
3036 | | )| |
3037 | | 9(?: |
3038 | | [18]\d{2}| |
3039 | | 2[2-5]\d{2}| |
3040 | | 53\d{1,2} |
3041 | | ) |
3042 | | )\d{3}| |
3043 | | 5(?: |
3044 | | 02[03489]\d{2}| |
3045 | | 1\d{2}| |
3046 | | 2(?: |
3047 | | 1\d{2}| |
3048 | | 2(?: |
3049 | | 2(?: |
3050 | | \d{2} |
3051 | | )?| |
3052 | | [457]\d{2} |
3053 | | ) |
3054 | | )| |
3055 | | 3(?: |
3056 | | 1\d{2}| |
3057 | | 2(?: |
3058 | | [37](?: |
3059 | | \d{2} |
3060 | | )?| |
3061 | | [569]\d{2} |
3062 | | ) |
3063 | | )| |
3064 | | 4(?: |
3065 | | 1\d{2}| |
3066 | | 2[46]\d{2} |
3067 | | )| |
3068 | | 5(?: |
3069 | | 1\d{2}| |
3070 | | 26\d{1,2} |
3071 | | )| |
3072 | | 6(?: |
3073 | | [18]\d{2}| |
3074 | | 2| |
3075 | | 53\d{2} |
3076 | | )| |
3077 | | 7(?: |
3078 | | 1| |
3079 | | 24 |
3080 | | )\d{2}| |
3081 | | 8(?: |
3082 | | 1| |
3083 | | 26 |
3084 | | )\d{2}| |
3085 | | 91\d{2} |
3086 | | )\d{3}| |
3087 | | 6(?: |
3088 | | 0(?: |
3089 | | 1\d{2}| |
3090 | | 2(?: |
3091 | | 3\d{2}| |
3092 | | 4\d{1,2} |
3093 | | ) |
3094 | | )| |
3095 | | 2(?: |
3096 | | 2[2-5]\d{2}| |
3097 | | 5(?: |
3098 | | [3-5]\d{2}| |
3099 | | 7 |
3100 | | )| |
3101 | | 8\d{2} |
3102 | | )| |
3103 | | 3(?: |
3104 | | 1| |
3105 | | 2[3478] |
3106 | | )\d{2}| |
3107 | | 4(?: |
3108 | | 1| |
3109 | | 2[34] |
3110 | | )\d{2}| |
3111 | | 5(?: |
3112 | | 1| |
3113 | | 2[47] |
3114 | | )\d{2}| |
3115 | | 6(?: |
3116 | | [18]\d{2}| |
3117 | | 6(?: |
3118 | | 2(?: |
3119 | | 2\d| |
3120 | | [34]\d{2} |
3121 | | )| |
3122 | | 5(?: |
3123 | | [24]\d{2}| |
3124 | | 3\d| |
3125 | | 5\d{1,2} |
3126 | | ) |
3127 | | ) |
3128 | | )| |
3129 | | 72[2-5]\d{2}| |
3130 | | 8(?: |
3131 | | 1\d{2}| |
3132 | | 2[2-5]\d{2} |
3133 | | )| |
3134 | | 9(?: |
3135 | | 1\d{2}| |
3136 | | 2[2-6]\d{2} |
3137 | | ) |
3138 | | )\d{3}| |
3139 | | 7(?: |
3140 | | (?: |
3141 | | 02| |
3142 | | [3-589]1| |
3143 | | 6[12]| |
3144 | | 72[24] |
3145 | | )\d{2}| |
3146 | | 21\d{3}| |
3147 | | 32 |
3148 | | )\d{3}| |
3149 | | 8(?: |
3150 | | (?: |
3151 | | 4[12]| |
3152 | | [5-7]2| |
3153 | | 1\d? |
3154 | | )| |
3155 | | (?: |
3156 | | 0| |
3157 | | 3[12]| |
3158 | | [5-7]1| |
3159 | | 217 |
3160 | | )\d |
3161 | | )\d{4}| |
3162 | | 9(?: |
3163 | | [35]1| |
3164 | | (?: |
3165 | | [024]2| |
3166 | | 81 |
3167 | | )\d| |
3168 | | (?: |
3169 | | 1| |
3170 | | [24]1 |
3171 | | )\d{2} |
3172 | | )\d{3} |
3173 | | "#; |
3174 | | // TODO: This is a good candidate of a seq of literals that could be |
3175 | | // shrunk quite a bit and still be very productive with respect to |
3176 | | // literal optimizations. |
3177 | | let (prefixes, suffixes) = e(pat); |
3178 | | assert!(!suffixes.is_finite()); |
3179 | | assert_eq!(Some(243), prefixes.len()); |
3180 | | } |
3181 | | |
3182 | | #[test] |
3183 | | fn optimize() { |
3184 | | // This gets a common prefix that isn't too short. |
3185 | | let (p, s) = |
3186 | | opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); |
3187 | | assert_eq!(seq([I("foobar")]), p); |
3188 | | assert_eq!(seq([I("foobar")]), s); |
3189 | | |
3190 | | // This also finds a common prefix, but since it's only one byte, it |
3191 | | // prefers the multiple literals. |
3192 | | let (p, s) = opt(["abba", "akka", "abccba"]); |
3193 | | assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); |
3194 | | |
3195 | | let (p, s) = opt(["sam", "samwise"]); |
3196 | | assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); |
3197 | | |
3198 | | // The empty string is poisonous, so our seq becomes infinite, even |
3199 | | // though all literals are exact. |
3200 | | let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); |
3201 | | assert!(!p.is_finite()); |
3202 | | assert!(!s.is_finite()); |
3203 | | |
3204 | | // A space is also poisonous, so our seq becomes infinite. But this |
3205 | | // only gets triggered when we don't have a completely exact sequence. |
3206 | | // When the sequence is exact, spaces are okay, since we presume that |
3207 | | // any prefilter will match a space more quickly than the regex engine. |
3208 | | // (When the sequence is exact, there's a chance of the prefilter being |
3209 | | // used without needing the regex engine at all.) |
3210 | | let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); |
3211 | | p.optimize_for_prefix_by_preference(); |
3212 | | assert!(!p.is_finite()); |
3213 | | } |
3214 | | } |