/src/regex/regex-lite/src/hir/mod.rs
Line | Count | Source |
1 | | use alloc::{boxed::Box, string::String, vec, vec::Vec}; |
2 | | |
3 | | use crate::{error::Error, utf8}; |
4 | | |
5 | | mod parse; |
6 | | |
7 | | /// Escapes all regular expression meta characters in `pattern`. |
8 | | /// |
9 | | /// The string returned may be safely used as a literal in a regular |
10 | | /// expression. |
11 | 0 | pub fn escape(pattern: &str) -> String { |
12 | 0 | let mut buf = String::new(); |
13 | 0 | buf.reserve(pattern.len()); |
14 | 0 | for ch in pattern.chars() { |
15 | 0 | if is_meta_character(ch) { |
16 | 0 | buf.push('\\'); |
17 | 0 | } |
18 | 0 | buf.push(ch); |
19 | | } |
20 | 0 | buf |
21 | 0 | } |
22 | | |
23 | | /// Returns true if the given character has significance in a regex. |
24 | | /// |
25 | | /// Generally speaking, these are the only characters which _must_ be escaped |
26 | | /// in order to match their literal meaning. For example, to match a literal |
27 | | /// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For |
28 | | /// example, `-` is treated as a meta character because of its significance |
29 | | /// for writing ranges inside of character classes, but the regex `-` will |
30 | | /// match a literal `-` because `-` has no special meaning outside of character |
31 | | /// classes. |
32 | | /// |
33 | | /// In order to determine whether a character may be escaped at all, the |
34 | | /// [`is_escapable_character`] routine should be used. The difference between |
35 | | /// `is_meta_character` and `is_escapable_character` is that the latter will |
36 | | /// return true for some characters that are _not_ meta characters. For |
37 | | /// example, `%` and `\%` both match a literal `%` in all contexts. In other |
38 | | /// words, `is_escapable_character` includes "superfluous" escapes. |
39 | | /// |
40 | | /// Note that the set of characters for which this function returns `true` or |
41 | | /// `false` is fixed and won't change in a semver compatible release. (In this |
42 | | /// case, "semver compatible release" actually refers to the `regex` crate |
43 | | /// itself, since reducing or expanding the set of meta characters would be a |
44 | | /// breaking change for not just `regex-syntax` but also `regex` itself.) |
45 | 1.05M | fn is_meta_character(c: char) -> bool { |
46 | 1.05M | match c { |
47 | | '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' |
48 | 850k | | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, |
49 | 203k | _ => false, |
50 | | } |
51 | 1.05M | } |
52 | | |
53 | | /// Returns true if the given character can be escaped in a regex. |
54 | | /// |
55 | | /// This returns true in all cases that `is_meta_character` returns true, but |
56 | | /// also returns true in some cases where `is_meta_character` returns false. |
57 | | /// For example, `%` is not a meta character, but it is escapable. That is, |
58 | | /// `%` and `\%` both match a literal `%` in all contexts. |
59 | | /// |
60 | | /// The purpose of this routine is to provide knowledge about what characters |
61 | | /// may be escaped. Namely, most regex engines permit "superfluous" escapes |
62 | | /// where characters without any special significance may be escaped even |
63 | | /// though there is no actual _need_ to do so. |
64 | | /// |
65 | | /// This will return false for some characters. For example, `e` is not |
66 | | /// escapable. Therefore, `\e` will either result in a parse error (which is |
67 | | /// true today), or it could backwards compatibly evolve into a new construct |
68 | | /// with its own meaning. Indeed, that is the purpose of banning _some_ |
69 | | /// superfluous escapes: it provides a way to evolve the syntax in a compatible |
70 | | /// manner. |
71 | 101k | fn is_escapable_character(c: char) -> bool { |
72 | | // Certainly escapable if it's a meta character. |
73 | 101k | if is_meta_character(c) { |
74 | 0 | return true; |
75 | 101k | } |
76 | | // Any character that isn't ASCII is definitely not escapable. There's |
77 | | // no real need to allow things like \☃ right? |
78 | 101k | if !c.is_ascii() { |
79 | 24 | return false; |
80 | 101k | } |
81 | | // Otherwise, we basically say that everything is escapable unless it's a |
82 | | // letter or digit. Things like \3 are either octal (when enabled) or an |
83 | | // error, and we should keep it that way. Otherwise, letters are reserved |
84 | | // for adding new syntax in a backwards compatible way. |
85 | 101k | match c { |
86 | 44.9k | '0'..='9' | 'A'..='Z' | 'a'..='z' => false, |
87 | | // While not currently supported, we keep these as not escapable to |
88 | | // give us some flexibility with respect to supporting the \< and |
89 | | // \> word boundary assertions in the future. By rejecting them as |
90 | | // escapable, \< and \> will result in a parse error. Thus, we can |
91 | | // turn them into something else in the future without it being a |
92 | | // backwards incompatible change. |
93 | 699 | '<' | '>' => false, |
94 | 63.6k | _ => true, |
95 | | } |
96 | 101k | } |
97 | | |
98 | | /// The configuration for a regex parser. |
99 | | #[derive(Clone, Copy, Debug)] |
100 | | pub(crate) struct Config { |
101 | | /// The maximum number of times we're allowed to recurse. |
102 | | /// |
103 | | /// Note that unlike the regex-syntax parser, we actually use recursion in |
104 | | /// this parser for simplicity. My hope is that by setting a conservative |
105 | | /// default call limit and providing a way to configure it, that we can |
106 | | /// keep this simplification. But if we must, we can re-work the parser to |
107 | | /// put the call stack on the heap like regex-syntax does. |
108 | | pub(crate) nest_limit: u32, |
109 | | /// Various flags that control how a pattern is interpreted. |
110 | | pub(crate) flags: Flags, |
111 | | } |
112 | | |
113 | | impl Default for Config { |
114 | 7.64k | fn default() -> Config { |
115 | 7.64k | Config { nest_limit: 50, flags: Flags::default() } |
116 | 7.64k | } |
117 | | } |
118 | | |
119 | | /// Various flags that control the interpretation of the pattern. |
120 | | /// |
121 | | /// These can be set via explicit configuration in code, or change dynamically |
122 | | /// during parsing via inline flags. For example, `foo(?i:bar)baz` will match |
123 | | /// `foo` and `baz` case sensitively and `bar` case insensitively (assuming a |
124 | | /// default configuration). |
125 | | #[derive(Clone, Copy, Debug, Default)] |
126 | | pub(crate) struct Flags { |
127 | | /// Whether to match case insensitively. |
128 | | /// |
129 | | /// This is the `i` flag. |
130 | | pub(crate) case_insensitive: bool, |
131 | | /// Whether `^` and `$` should be treated as line anchors or not. |
132 | | /// |
133 | | /// This is the `m` flag. |
134 | | pub(crate) multi_line: bool, |
135 | | /// Whether `.` should match line terminators or not. |
136 | | /// |
137 | | /// This is the `s` flag. |
138 | | pub(crate) dot_matches_new_line: bool, |
139 | | /// Whether to swap the meaning of greedy and non-greedy operators. |
140 | | /// |
141 | | /// This is the `U` flag. |
142 | | pub(crate) swap_greed: bool, |
143 | | /// Whether to enable CRLF mode. |
144 | | /// |
145 | | /// This is the `R` flag. |
146 | | pub(crate) crlf: bool, |
147 | | /// Whether to ignore whitespace. i.e., verbose mode. |
148 | | /// |
149 | | /// This is the `x` flag. |
150 | | pub(crate) ignore_whitespace: bool, |
151 | | } |
152 | | |
153 | | #[derive(Clone, Debug, Eq, PartialEq)] |
154 | | pub(crate) struct Hir { |
155 | | kind: HirKind, |
156 | | is_start_anchored: bool, |
157 | | is_match_empty: bool, |
158 | | static_explicit_captures_len: Option<usize>, |
159 | | } |
160 | | |
161 | | #[derive(Clone, Debug, Eq, PartialEq)] |
162 | | pub(crate) enum HirKind { |
163 | | Empty, |
164 | | Char(char), |
165 | | Class(Class), |
166 | | Look(Look), |
167 | | Repetition(Repetition), |
168 | | Capture(Capture), |
169 | | Concat(Vec<Hir>), |
170 | | Alternation(Vec<Hir>), |
171 | | } |
172 | | |
173 | | impl Hir { |
174 | | /// Parses the given pattern string with the given configuration into a |
175 | | /// structured representation. If the pattern is invalid, then an error |
176 | | /// is returned. |
177 | 7.64k | pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> { |
178 | 7.64k | self::parse::Parser::new(config, pattern).parse() |
179 | 7.64k | } |
180 | | |
181 | | /// Returns the underlying kind of this high-level intermediate |
182 | | /// representation. |
183 | | /// |
184 | | /// Note that there is explicitly no way to build an `Hir` directly from |
185 | | /// an `HirKind`. If you need to do that, then you must do case analysis |
186 | | /// on the `HirKind` and call the appropriate smart constructor on `Hir`. |
187 | 67.3M | pub(crate) fn kind(&self) -> &HirKind { |
188 | 67.3M | &self.kind |
189 | 67.3M | } |
190 | | |
191 | | /// Returns true if and only if this Hir expression can only match at the |
192 | | /// beginning of a haystack. |
193 | 5.56k | pub(crate) fn is_start_anchored(&self) -> bool { |
194 | 5.56k | self.is_start_anchored |
195 | 5.56k | } |
196 | | |
197 | | /// Returns true if and only if this Hir expression can match the empty |
198 | | /// string. |
199 | 42.6k | pub(crate) fn is_match_empty(&self) -> bool { |
200 | 42.6k | self.is_match_empty |
201 | 42.6k | } |
202 | | |
203 | | /// If the pattern always reports the same number of matching capture groups |
204 | | /// for every match, then this returns the number of those groups. This |
205 | | /// doesn't include the implicit group found in every pattern. |
206 | 5.56k | pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
207 | 5.56k | self.static_explicit_captures_len |
208 | 5.56k | } |
209 | | |
210 | 0 | fn fail() -> Hir { |
211 | 0 | let kind = HirKind::Class(Class { ranges: vec![] }); |
212 | 0 | Hir { |
213 | 0 | kind, |
214 | 0 | is_start_anchored: false, |
215 | 0 | is_match_empty: false, |
216 | 0 | static_explicit_captures_len: Some(0), |
217 | 0 | } |
218 | 0 | } |
219 | | |
220 | 3.91M | fn empty() -> Hir { |
221 | 3.91M | let kind = HirKind::Empty; |
222 | 3.91M | Hir { |
223 | 3.91M | kind, |
224 | 3.91M | is_start_anchored: false, |
225 | 3.91M | is_match_empty: true, |
226 | 3.91M | static_explicit_captures_len: Some(0), |
227 | 3.91M | } |
228 | 3.91M | } |
229 | | |
230 | 31.2M | fn char(ch: char) -> Hir { |
231 | 31.2M | let kind = HirKind::Char(ch); |
232 | 31.2M | Hir { |
233 | 31.2M | kind, |
234 | 31.2M | is_start_anchored: false, |
235 | 31.2M | is_match_empty: false, |
236 | 31.2M | static_explicit_captures_len: Some(0), |
237 | 31.2M | } |
238 | 31.2M | } |
239 | | |
240 | 24.7M | fn class(class: Class) -> Hir { |
241 | 24.7M | let kind = HirKind::Class(class); |
242 | 24.7M | Hir { |
243 | 24.7M | kind, |
244 | 24.7M | is_start_anchored: false, |
245 | 24.7M | is_match_empty: false, |
246 | 24.7M | static_explicit_captures_len: Some(0), |
247 | 24.7M | } |
248 | 24.7M | } |
249 | | |
250 | 38.2k | fn look(look: Look) -> Hir { |
251 | 38.2k | let kind = HirKind::Look(look); |
252 | | Hir { |
253 | 38.2k | kind, |
254 | 38.2k | is_start_anchored: matches!(look, Look::Start), |
255 | | is_match_empty: true, |
256 | 38.2k | static_explicit_captures_len: Some(0), |
257 | | } |
258 | 38.2k | } |
259 | | |
260 | 350k | fn repetition(rep: Repetition) -> Hir { |
261 | 350k | if rep.min == 0 && rep.max == Some(0) { |
262 | 200k | return Hir::empty(); |
263 | 149k | } else if rep.min == 1 && rep.max == Some(1) { |
264 | 753 | return *rep.sub; |
265 | 148k | } |
266 | 148k | let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored; |
267 | 148k | let is_match_empty = rep.min == 0 || rep.sub.is_match_empty; |
268 | 148k | let mut static_explicit_captures_len = |
269 | 148k | rep.sub.static_explicit_captures_len; |
270 | | // If the static captures len of the sub-expression is not known or |
271 | | // is greater than zero, then it automatically propagates to the |
272 | | // repetition, regardless of the repetition. Otherwise, it might |
273 | | // change, but only when the repetition can match 0 times. |
274 | 148k | if rep.min == 0 |
275 | 101k | && static_explicit_captures_len.map_or(false, |len| len > 0) |
276 | | { |
277 | | // If we require a match 0 times, then our captures len is |
278 | | // guaranteed to be zero. Otherwise, if we *can* match the empty |
279 | | // string, then it's impossible to know how many captures will be |
280 | | // in the resulting match. |
281 | 23.9k | if rep.max == Some(0) { |
282 | 0 | static_explicit_captures_len = Some(0); |
283 | 23.9k | } else { |
284 | 23.9k | static_explicit_captures_len = None; |
285 | 23.9k | } |
286 | 124k | } |
287 | 148k | Hir { |
288 | 148k | kind: HirKind::Repetition(rep), |
289 | 148k | is_start_anchored, |
290 | 148k | is_match_empty, |
291 | 148k | static_explicit_captures_len, |
292 | 148k | } |
293 | 350k | } |
294 | | |
295 | 414k | fn capture(cap: Capture) -> Hir { |
296 | 414k | let is_start_anchored = cap.sub.is_start_anchored; |
297 | 414k | let is_match_empty = cap.sub.is_match_empty; |
298 | 414k | let static_explicit_captures_len = cap |
299 | 414k | .sub |
300 | 414k | .static_explicit_captures_len |
301 | 414k | .map(|len| len.saturating_add(1)); |
302 | 414k | let kind = HirKind::Capture(cap); |
303 | 414k | Hir { |
304 | 414k | kind, |
305 | 414k | is_start_anchored, |
306 | 414k | is_match_empty, |
307 | 414k | static_explicit_captures_len, |
308 | 414k | } |
309 | 414k | } |
310 | | |
311 | 3.47M | fn concat(mut subs: Vec<Hir>) -> Hir { |
312 | 3.47M | if subs.is_empty() { |
313 | 3.35M | Hir::empty() |
314 | 127k | } else if subs.len() == 1 { |
315 | 67.2k | subs.pop().unwrap() |
316 | | } else { |
317 | 59.7k | let is_start_anchored = subs[0].is_start_anchored; |
318 | 59.7k | let mut is_match_empty = true; |
319 | 59.7k | let mut static_explicit_captures_len = Some(0usize); |
320 | 12.2M | for sub in subs.iter() { |
321 | 12.2M | is_match_empty = is_match_empty && sub.is_match_empty; |
322 | 12.2M | static_explicit_captures_len = static_explicit_captures_len |
323 | 12.2M | .and_then(|len1| { |
324 | 12.1M | Some((len1, sub.static_explicit_captures_len?)) |
325 | 12.1M | }) |
326 | 12.2M | .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); |
327 | | } |
328 | 59.7k | Hir { |
329 | 59.7k | kind: HirKind::Concat(subs), |
330 | 59.7k | is_start_anchored, |
331 | 59.7k | is_match_empty, |
332 | 59.7k | static_explicit_captures_len, |
333 | 59.7k | } |
334 | | } |
335 | 3.47M | } |
336 | | |
337 | 421k | fn alternation(mut subs: Vec<Hir>) -> Hir { |
338 | 421k | if subs.is_empty() { |
339 | 0 | Hir::fail() |
340 | 421k | } else if subs.len() == 1 { |
341 | 417k | subs.pop().unwrap() |
342 | | } else { |
343 | 3.24k | let mut it = subs.iter().peekable(); |
344 | 3.24k | let mut is_start_anchored = |
345 | 3.24k | it.peek().map_or(false, |sub| sub.is_start_anchored); |
346 | 3.24k | let mut is_match_empty = |
347 | 3.24k | it.peek().map_or(false, |sub| sub.is_match_empty); |
348 | 3.24k | let mut static_explicit_captures_len = |
349 | 3.24k | it.peek().and_then(|sub| sub.static_explicit_captures_len); |
350 | 2.41M | for sub in it { |
351 | 2.40M | is_start_anchored = is_start_anchored && sub.is_start_anchored; |
352 | 2.40M | is_match_empty = is_match_empty || sub.is_match_empty; |
353 | 2.40M | if static_explicit_captures_len |
354 | 2.40M | != sub.static_explicit_captures_len |
355 | 81.5k | { |
356 | 81.5k | static_explicit_captures_len = None; |
357 | 2.32M | } |
358 | | } |
359 | 3.24k | Hir { |
360 | 3.24k | kind: HirKind::Alternation(subs), |
361 | 3.24k | is_start_anchored, |
362 | 3.24k | is_match_empty, |
363 | 3.24k | static_explicit_captures_len, |
364 | 3.24k | } |
365 | | } |
366 | 421k | } |
367 | | } |
368 | | |
369 | | impl HirKind { |
370 | | /// Returns a slice of this kind's sub-expressions, if any. |
371 | 581k | fn subs(&self) -> &[Hir] { |
372 | | use core::slice::from_ref; |
373 | | |
374 | 581k | match *self { |
375 | | HirKind::Empty |
376 | | | HirKind::Char(_) |
377 | | | HirKind::Class(_) |
378 | 563k | | HirKind::Look(_) => &[], |
379 | 1.41k | HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), |
380 | 7.73k | HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), |
381 | 8.86k | HirKind::Concat(ref subs) => subs, |
382 | 395 | HirKind::Alternation(ref subs) => subs, |
383 | | } |
384 | 581k | } |
385 | | } |
386 | | |
387 | | #[derive(Clone, Debug, Eq, PartialEq)] |
388 | | pub(crate) struct Class { |
389 | | pub(crate) ranges: Vec<ClassRange>, |
390 | | } |
391 | | |
392 | | impl Class { |
393 | | /// Create a new class from the given ranges. The ranges may be provided |
394 | | /// in any order or may even overlap. They will be automatically |
395 | | /// canonicalized. |
396 | 24.8M | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { |
397 | 24.8M | let mut class = Class { ranges: ranges.into_iter().collect() }; |
398 | 24.8M | class.canonicalize(); |
399 | 24.8M | class |
400 | 24.8M | } <regex_lite::hir::Class>::new::<[regex_lite::hir::ClassRange; 1]> Line | Count | Source | 396 | 2.95k | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { | 397 | 2.95k | let mut class = Class { ranges: ranges.into_iter().collect() }; | 398 | 2.95k | class.canonicalize(); | 399 | 2.95k | class | 400 | 2.95k | } |
<regex_lite::hir::Class>::new::<[regex_lite::hir::ClassRange; 2]> Line | Count | Source | 396 | 3.33M | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { | 397 | 3.33M | let mut class = Class { ranges: ranges.into_iter().collect() }; | 398 | 3.33M | class.canonicalize(); | 399 | 3.33M | class | 400 | 3.33M | } |
<regex_lite::hir::Class>::new::<[regex_lite::hir::ClassRange; 3]> Line | Count | Source | 396 | 1.56k | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { | 397 | 1.56k | let mut class = Class { ranges: ranges.into_iter().collect() }; | 398 | 1.56k | class.canonicalize(); | 399 | 1.56k | class | 400 | 1.56k | } |
<regex_lite::hir::Class>::new::<alloc::vec::Vec<regex_lite::hir::ClassRange>> Line | Count | Source | 396 | 147k | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { | 397 | 147k | let mut class = Class { ranges: ranges.into_iter().collect() }; | 398 | 147k | class.canonicalize(); | 399 | 147k | class | 400 | 147k | } |
<regex_lite::hir::Class>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<(u8, u8)>, regex_lite::hir::parse::posix_class::{closure#0}>>Line | Count | Source | 396 | 21.3M | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { | 397 | 21.3M | let mut class = Class { ranges: ranges.into_iter().collect() }; | 398 | 21.3M | class.canonicalize(); | 399 | 21.3M | class | 400 | 21.3M | } |
|
401 | | |
402 | | /// Expand this class such that it matches the ASCII codepoints in this set |
403 | | /// case insensitively. |
404 | 99.2k | fn ascii_case_fold(&mut self) { |
405 | 99.2k | let len = self.ranges.len(); |
406 | 352k | for i in 0..len { |
407 | 352k | if let Some(folded) = self.ranges[i].ascii_case_fold() { |
408 | 130k | self.ranges.push(folded); |
409 | 221k | } |
410 | | } |
411 | 99.2k | self.canonicalize(); |
412 | 99.2k | } |
413 | | |
414 | | /// Negate this set. |
415 | | /// |
416 | | /// For all `x` where `x` is any element, if `x` was in this set, then it |
417 | | /// will not be in this set after negation. |
418 | 21.1M | fn negate(&mut self) { |
419 | | const MIN: char = '\x00'; |
420 | | const MAX: char = char::MAX; |
421 | | |
422 | 21.1M | if self.ranges.is_empty() { |
423 | 0 | self.ranges.push(ClassRange { start: MIN, end: MAX }); |
424 | 0 | return; |
425 | 21.1M | } |
426 | | |
427 | | // There should be a way to do this in-place with constant memory, |
428 | | // but I couldn't figure out a simple way to do it. So just append |
429 | | // the negation to the end of this range, and then drain it before |
430 | | // we're done. |
431 | 21.1M | let drain_end = self.ranges.len(); |
432 | | |
433 | | // If our class doesn't start the minimum possible char, then negation |
434 | | // needs to include all codepoints up to the minimum in this set. |
435 | 21.1M | if self.ranges[0].start > MIN { |
436 | 21.1M | self.ranges.push(ClassRange { |
437 | 21.1M | start: MIN, |
438 | 21.1M | // OK because we know it's bigger than MIN. |
439 | 21.1M | end: prev_char(self.ranges[0].start).unwrap(), |
440 | 21.1M | }); |
441 | 21.1M | } |
442 | 63.3M | for i in 1..drain_end { |
443 | 63.3M | // let lower = self.ranges[i - 1].upper().increment(); |
444 | 63.3M | // let upper = self.ranges[i].lower().decrement(); |
445 | 63.3M | // self.ranges.push(I::create(lower, upper)); |
446 | 63.3M | self.ranges.push(ClassRange { |
447 | 63.3M | // OK because we know i-1 is never the last range and therefore |
448 | 63.3M | // there must be a range greater than it. It therefore follows |
449 | 63.3M | // that 'end' can never be char::MAX, and thus there must be |
450 | 63.3M | // a next char. |
451 | 63.3M | start: next_char(self.ranges[i - 1].end).unwrap(), |
452 | 63.3M | // Since 'i' is guaranteed to never be the first range, it |
453 | 63.3M | // follows that there is always a range before this and thus |
454 | 63.3M | // 'start' can never be '\x00'. Thus, there must be a previous |
455 | 63.3M | // char. |
456 | 63.3M | end: prev_char(self.ranges[i].start).unwrap(), |
457 | 63.3M | }); |
458 | 63.3M | } |
459 | 21.1M | if self.ranges[drain_end - 1].end < MAX { |
460 | 21.1M | // let lower = self.ranges[drain_end - 1].upper().increment(); |
461 | 21.1M | // self.ranges.push(I::create(lower, I::Bound::max_value())); |
462 | 21.1M | self.ranges.push(ClassRange { |
463 | 21.1M | // OK because we know 'end' is less than char::MAX, and thus |
464 | 21.1M | // there is a next char. |
465 | 21.1M | start: next_char(self.ranges[drain_end - 1].end).unwrap(), |
466 | 21.1M | end: MAX, |
467 | 21.1M | }); |
468 | 21.1M | } |
469 | 21.1M | self.ranges.drain(..drain_end); |
470 | | // We don't need to canonicalize because we processed the ranges above |
471 | | // in canonical order and the new ranges we added based on those are |
472 | | // also necessarily in canonical order. |
473 | 21.1M | } |
474 | | |
475 | | /// Converts this set into a canonical ordering. |
476 | 24.9M | fn canonicalize(&mut self) { |
477 | 24.9M | if self.is_canonical() { |
478 | 21.7M | return; |
479 | 3.18M | } |
480 | 3.18M | self.ranges.sort(); |
481 | 3.18M | assert!(!self.ranges.is_empty()); |
482 | | |
483 | | // Is there a way to do this in-place with constant memory? I couldn't |
484 | | // figure out a way to do it. So just append the canonicalization to |
485 | | // the end of this range, and then drain it before we're done. |
486 | 3.18M | let drain_end = self.ranges.len(); |
487 | 133M | for oldi in 0..drain_end { |
488 | | // If we've added at least one new range, then check if we can |
489 | | // merge this range in the previously added range. |
490 | 133M | if self.ranges.len() > drain_end { |
491 | 130M | let (last, rest) = self.ranges.split_last_mut().unwrap(); |
492 | 130M | if let Some(union) = last.union(&rest[oldi]) { |
493 | 126M | *last = union; |
494 | 126M | continue; |
495 | 3.61M | } |
496 | 3.18M | } |
497 | 6.80M | self.ranges.push(self.ranges[oldi]); |
498 | | } |
499 | 3.18M | self.ranges.drain(..drain_end); |
500 | 24.9M | } |
501 | | |
502 | | /// Returns true if and only if this class is in a canonical ordering. |
503 | 24.9M | fn is_canonical(&self) -> bool { |
504 | 67.7M | for pair in self.ranges.windows(2) { |
505 | 67.7M | if pair[0] >= pair[1] { |
506 | 3.14M | return false; |
507 | 64.6M | } |
508 | 64.6M | if pair[0].is_contiguous(&pair[1]) { |
509 | 44.2k | return false; |
510 | 64.6M | } |
511 | | } |
512 | 21.7M | true |
513 | 24.9M | } |
514 | | } |
515 | | |
516 | | #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] |
517 | | pub(crate) struct ClassRange { |
518 | | pub(crate) start: char, |
519 | | pub(crate) end: char, |
520 | | } |
521 | | |
522 | | impl ClassRange { |
523 | | /// Apply simple case folding to this byte range. Only ASCII case mappings |
524 | | /// (for A-Za-z) are applied. |
525 | | /// |
526 | | /// Additional ranges are appended to the given vector. Canonical ordering |
527 | | /// is *not* maintained in the given vector. |
528 | 5.65M | fn ascii_case_fold(&self) -> Option<ClassRange> { |
529 | 5.65M | if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self) { |
530 | 3.08M | let start = core::cmp::max(self.start, 'a'); |
531 | 3.08M | let end = core::cmp::min(self.end, 'z'); |
532 | 3.08M | return Some(ClassRange { |
533 | 3.08M | start: char::try_from(u32::from(start) - 32).unwrap(), |
534 | 3.08M | end: char::try_from(u32::from(end) - 32).unwrap(), |
535 | 3.08M | }); |
536 | 2.57M | } |
537 | 2.57M | if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self) { |
538 | 370k | let start = core::cmp::max(self.start, 'A'); |
539 | 370k | let end = core::cmp::min(self.end, 'Z'); |
540 | 370k | return Some(ClassRange { |
541 | 370k | start: char::try_from(u32::from(start) + 32).unwrap(), |
542 | 370k | end: char::try_from(u32::from(end) + 32).unwrap(), |
543 | 370k | }); |
544 | 2.20M | } |
545 | 2.20M | None |
546 | 5.65M | } |
547 | | |
548 | | /// Union the given overlapping range into this range. |
549 | | /// |
550 | | /// If the two ranges aren't contiguous, then this returns `None`. |
551 | 130M | fn union(&self, other: &ClassRange) -> Option<ClassRange> { |
552 | 130M | if !self.is_contiguous(other) { |
553 | 3.61M | return None; |
554 | 126M | } |
555 | 126M | let start = core::cmp::min(self.start, other.start); |
556 | 126M | let end = core::cmp::max(self.end, other.end); |
557 | 126M | Some(ClassRange { start, end }) |
558 | 130M | } |
559 | | |
560 | | /// Returns true if and only if the two ranges are contiguous. Two ranges |
561 | | /// are contiguous if and only if the ranges are either overlapping or |
562 | | /// adjacent. |
563 | 195M | fn is_contiguous(&self, other: &ClassRange) -> bool { |
564 | 195M | let (s1, e1) = (u32::from(self.start), u32::from(self.end)); |
565 | 195M | let (s2, e2) = (u32::from(other.start), u32::from(other.end)); |
566 | 195M | core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1) |
567 | 195M | } |
568 | | |
569 | | /// Returns true if and only if the intersection of this range and the |
570 | | /// other range is empty. |
571 | 8.22M | fn is_intersection_empty(&self, other: &ClassRange) -> bool { |
572 | 8.22M | let (s1, e1) = (self.start, self.end); |
573 | 8.22M | let (s2, e2) = (other.start, other.end); |
574 | 8.22M | core::cmp::max(s1, s2) > core::cmp::min(e1, e2) |
575 | 8.22M | } |
576 | | } |
577 | | |
578 | | /// The high-level intermediate representation for a look-around assertion. |
579 | | /// |
580 | | /// An assertion match is always zero-length. Also called an "empty match." |
581 | | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
582 | | pub(crate) enum Look { |
583 | | /// Match the beginning of text. Specifically, this matches at the starting |
584 | | /// position of the input. |
585 | | Start = 1 << 0, |
586 | | /// Match the end of text. Specifically, this matches at the ending |
587 | | /// position of the input. |
588 | | End = 1 << 1, |
589 | | /// Match the beginning of a line or the beginning of text. Specifically, |
590 | | /// this matches at the starting position of the input, or at the position |
591 | | /// immediately following a `\n` character. |
592 | | StartLF = 1 << 2, |
593 | | /// Match the end of a line or the end of text. Specifically, this matches |
594 | | /// at the end position of the input, or at the position immediately |
595 | | /// preceding a `\n` character. |
596 | | EndLF = 1 << 3, |
597 | | /// Match the beginning of a line or the beginning of text. Specifically, |
598 | | /// this matches at the starting position of the input, or at the position |
599 | | /// immediately following either a `\r` or `\n` character, but never after |
600 | | /// a `\r` when a `\n` follows. |
601 | | StartCRLF = 1 << 4, |
602 | | /// Match the end of a line or the end of text. Specifically, this matches |
603 | | /// at the end position of the input, or at the position immediately |
604 | | /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` |
605 | | /// precedes it. |
606 | | EndCRLF = 1 << 5, |
607 | | /// Match an ASCII-only word boundary. That is, this matches a position |
608 | | /// where the left adjacent character and right adjacent character |
609 | | /// correspond to a word and non-word or a non-word and word character. |
610 | | Word = 1 << 6, |
611 | | /// Match an ASCII-only negation of a word boundary. |
612 | | WordNegate = 1 << 7, |
613 | | /// Match the start of an ASCII-only word boundary. That is, this matches a |
614 | | /// position at either the beginning of the haystack or where the previous |
615 | | /// character is not a word character and the following character is a word |
616 | | /// character. |
617 | | WordStart = 1 << 8, |
618 | | /// Match the end of an ASCII-only word boundary. That is, this matches |
619 | | /// a position at either the end of the haystack or where the previous |
620 | | /// character is a word character and the following character is not a word |
621 | | /// character. |
622 | | WordEnd = 1 << 9, |
623 | | /// Match the start half of an ASCII-only word boundary. That is, this |
624 | | /// matches a position at either the beginning of the haystack or where the |
625 | | /// previous character is not a word character. |
626 | | WordStartHalf = 1 << 10, |
627 | | /// Match the end half of an ASCII-only word boundary. That is, this |
628 | | /// matches a position at either the end of the haystack or where the |
629 | | /// following character is not a word character. |
630 | | WordEndHalf = 1 << 11, |
631 | | } |
632 | | |
633 | | impl Look { |
634 | | /// Returns true if the given position in the given haystack matches this |
635 | | /// look-around assertion. |
636 | 9.82M | pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool { |
637 | | use self::Look::*; |
638 | | |
639 | 9.82M | match *self { |
640 | 2.23M | Start => at == 0, |
641 | 638k | End => at == haystack.len(), |
642 | 572k | StartLF => at == 0 || haystack[at - 1] == b'\n', |
643 | 573k | EndLF => at == haystack.len() || haystack[at] == b'\n', |
644 | | StartCRLF => { |
645 | 381k | at == 0 |
646 | 378k | || haystack[at - 1] == b'\n' |
647 | 375k | || (haystack[at - 1] == b'\r' |
648 | 2.26k | && (at >= haystack.len() || haystack[at] != b'\n')) |
649 | | } |
650 | | EndCRLF => { |
651 | 377k | at == haystack.len() |
652 | 375k | || haystack[at] == b'\r' |
653 | 374k | || (haystack[at] == b'\n' |
654 | 3.49k | && (at == 0 || haystack[at - 1] != b'\r')) |
655 | | } |
656 | | Word => { |
657 | 4.34M | let word_before = |
658 | 4.34M | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
659 | 4.34M | let word_after = |
660 | 4.34M | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
661 | 4.34M | word_before != word_after |
662 | | } |
663 | | WordNegate => { |
664 | 279k | let word_before = |
665 | 279k | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
666 | 279k | let word_after = |
667 | 279k | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
668 | 279k | word_before == word_after |
669 | | } |
670 | | WordStart => { |
671 | 4.42k | let word_before = |
672 | 4.42k | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
673 | 4.42k | let word_after = |
674 | 4.42k | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
675 | 4.42k | !word_before && word_after |
676 | | } |
677 | | WordEnd => { |
678 | 407k | let word_before = |
679 | 407k | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
680 | 407k | let word_after = |
681 | 407k | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
682 | 407k | word_before && !word_after |
683 | | } |
684 | | WordStartHalf => { |
685 | 1.23k | let word_before = |
686 | 1.23k | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
687 | 1.23k | !word_before |
688 | | } |
689 | | WordEndHalf => { |
690 | 2.56k | let word_after = |
691 | 2.56k | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
692 | 2.56k | !word_after |
693 | | } |
694 | | } |
695 | 9.82M | } |
696 | | } |
697 | | |
698 | | /// The high-level intermediate representation of a repetition operator. |
699 | | /// |
700 | | /// A repetition operator permits the repetition of an arbitrary |
701 | | /// sub-expression. |
702 | | #[derive(Clone, Debug, Eq, PartialEq)] |
703 | | pub(crate) struct Repetition { |
704 | | /// The minimum range of the repetition. |
705 | | /// |
706 | | /// Note that special cases like `?`, `+` and `*` all get translated into |
707 | | /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. |
708 | | /// |
709 | | /// When `min` is zero, this expression can match the empty string |
710 | | /// regardless of what its sub-expression is. |
711 | | pub(crate) min: u32, |
712 | | /// The maximum range of the repetition. |
713 | | /// |
714 | | /// Note that when `max` is `None`, `min` acts as a lower bound but where |
715 | | /// there is no upper bound. For something like `x{5}` where the min and |
716 | | /// max are equivalent, `min` will be set to `5` and `max` will be set to |
717 | | /// `Some(5)`. |
718 | | pub(crate) max: Option<u32>, |
719 | | /// Whether this repetition operator is greedy or not. A greedy operator |
720 | | /// will match as much as it can. A non-greedy operator will match as |
721 | | /// little as it can. |
722 | | /// |
723 | | /// Typically, operators are greedy by default and are only non-greedy when |
724 | | /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is |
725 | | /// not. However, this can be inverted via the `U` "ungreedy" flag. |
726 | | pub(crate) greedy: bool, |
727 | | /// The expression being repeated. |
728 | | pub(crate) sub: Box<Hir>, |
729 | | } |
730 | | |
731 | | /// The high-level intermediate representation for a capturing group. |
732 | | /// |
733 | | /// A capturing group always has an index and a child expression. It may |
734 | | /// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not |
735 | | /// necessary. |
736 | | /// |
737 | | /// Note that there is no explicit representation of a non-capturing group |
738 | | /// in a `Hir`. Instead, non-capturing grouping is handled automatically by |
739 | | /// the recursive structure of the `Hir` itself. |
740 | | #[derive(Clone, Debug, Eq, PartialEq)] |
741 | | pub(crate) struct Capture { |
742 | | /// The capture index of the capture. |
743 | | pub(crate) index: u32, |
744 | | /// The name of the capture, if it exists. |
745 | | pub(crate) name: Option<Box<str>>, |
746 | | /// The expression inside the capturing group, which may be empty. |
747 | | pub(crate) sub: Box<Hir>, |
748 | | } |
749 | | |
750 | 84.4M | fn next_char(ch: char) -> Option<char> { |
751 | | // Skip over the surrogate range. |
752 | 84.4M | if ch == '\u{D7FF}' { |
753 | 2.45k | return Some('\u{E000}'); |
754 | 84.4M | } |
755 | | // OK because char::MAX < u32::MAX and we handle U+D7FF above. |
756 | 84.4M | char::from_u32(u32::from(ch).checked_add(1).unwrap()) |
757 | 84.4M | } |
758 | | |
759 | 84.4M | fn prev_char(ch: char) -> Option<char> { |
760 | | // Skip over the surrogate range. |
761 | 84.4M | if ch == '\u{E000}' { |
762 | 391 | return Some('\u{D7FF}'); |
763 | 84.4M | } |
764 | | // OK because subtracting 1 from any valid scalar value other than 0 |
765 | | // and U+E000 yields a valid scalar value. |
766 | 84.4M | Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) |
767 | 84.4M | } |
768 | | |
769 | | impl Drop for Hir { |
770 | 60.5M | fn drop(&mut self) { |
771 | | use core::mem; |
772 | | |
773 | 60.5M | match *self.kind() { |
774 | | HirKind::Empty |
775 | | | HirKind::Char(_) |
776 | | | HirKind::Class(_) |
777 | 59.9M | | HirKind::Look(_) => return, |
778 | 430k | HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, |
779 | 150k | HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { |
780 | 148k | return |
781 | | } |
782 | 76.3k | HirKind::Concat(ref x) if x.is_empty() => return, |
783 | 3.99k | HirKind::Alternation(ref x) if x.is_empty() => return, |
784 | 35.7k | _ => {} |
785 | | } |
786 | | |
787 | 35.7k | let mut stack = vec![mem::replace(self, Hir::empty())]; |
788 | 15.0M | while let Some(mut expr) = stack.pop() { |
789 | 15.0M | match expr.kind { |
790 | | HirKind::Empty |
791 | | | HirKind::Char(_) |
792 | | | HirKind::Class(_) |
793 | 14.6M | | HirKind::Look(_) => {} |
794 | 180k | HirKind::Capture(ref mut x) => { |
795 | 180k | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
796 | 180k | } |
797 | 145k | HirKind::Repetition(ref mut x) => { |
798 | 145k | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
799 | 145k | } |
800 | 59.7k | HirKind::Concat(ref mut x) => { |
801 | 59.7k | stack.extend(x.drain(..)); |
802 | 59.7k | } |
803 | 3.24k | HirKind::Alternation(ref mut x) => { |
804 | 3.24k | stack.extend(x.drain(..)); |
805 | 3.24k | } |
806 | | } |
807 | | } |
808 | 60.5M | } |
809 | | } |