/src/regex/regex-lite/src/hir/parse.rs
Line | Count | Source |
1 | | use core::cell::{Cell, RefCell}; |
2 | | |
3 | | use alloc::{ |
4 | | boxed::Box, |
5 | | string::{String, ToString}, |
6 | | vec, |
7 | | vec::Vec, |
8 | | }; |
9 | | |
10 | | use crate::{ |
11 | | error::Error, |
12 | | hir::{self, Config, Flags, Hir, HirKind}, |
13 | | }; |
14 | | |
15 | | // These are all of the errors that can occur while parsing a regex. Unlike |
16 | | // regex-syntax, our errors are not particularly great. They are just enough |
17 | | // to get a general sense of what went wrong. But in exchange, the error |
18 | | // reporting mechanism is *much* simpler than what's in regex-syntax. |
19 | | // |
20 | | // By convention, we use each of these messages in exactly one place. That |
21 | | // way, every branch that leads to an error has a unique message. This in turn |
22 | | // means that given a message, one can precisely identify which part of the |
23 | | // parser reported it. |
24 | | // |
25 | | // Finally, we give names to each message so that we can reference them in |
26 | | // tests. |
27 | | const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting"; |
28 | | const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups"; |
29 | | const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name"; |
30 | | const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'"; |
31 | | const ERR_UNCLOSED_GROUP_QUESTION: &str = |
32 | | "expected closing ')', but got end of pattern"; |
33 | | const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('"; |
34 | | const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported"; |
35 | | const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed"; |
36 | | const ERR_MISSING_GROUP_NAME: &str = |
37 | | "expected capture group name, but got end of pattern"; |
38 | | const ERR_INVALID_GROUP_NAME: &str = "invalid group name"; |
39 | | const ERR_UNCLOSED_GROUP_NAME: &str = |
40 | | "expected end of capture group name, but got end of pattern"; |
41 | | const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed"; |
42 | | const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag"; |
43 | | const ERR_FLAG_REPEATED_NEGATION: &str = |
44 | | "inline flag negation cannot be repeated"; |
45 | | const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed"; |
46 | | const ERR_FLAG_UNEXPECTED_EOF: &str = |
47 | | "expected ':' or ')' to end inline flags, but got end of pattern"; |
48 | | const ERR_FLAG_DANGLING_NEGATION: &str = |
49 | | "inline flags cannot end with negation directive"; |
50 | | const ERR_DECIMAL_NO_DIGITS: &str = |
51 | | "expected decimal number, but found no digits"; |
52 | | const ERR_DECIMAL_INVALID: &str = "got invalid decimal number"; |
53 | | const ERR_HEX_BRACE_INVALID_DIGIT: &str = |
54 | | "expected hexadecimal number in braces, but got non-hex digit"; |
55 | | const ERR_HEX_BRACE_UNEXPECTED_EOF: &str = |
56 | | "expected hexadecimal number, but saw end of pattern before closing brace"; |
57 | | const ERR_HEX_BRACE_EMPTY: &str = |
58 | | "expected hexadecimal number in braces, but got no digits"; |
59 | | const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces"; |
60 | | const ERR_HEX_FIXED_UNEXPECTED_EOF: &str = |
61 | | "expected fixed length hexadecimal number, but saw end of pattern first"; |
62 | | const ERR_HEX_FIXED_INVALID_DIGIT: &str = |
63 | | "expected fixed length hexadecimal number, but got non-hex digit"; |
64 | | const ERR_HEX_FIXED_INVALID: &str = |
65 | | "got invalid fixed length hexadecimal number"; |
66 | | const ERR_HEX_UNEXPECTED_EOF: &str = |
67 | | "expected hexadecimal number, but saw end of pattern first"; |
68 | | const ERR_ESCAPE_UNEXPECTED_EOF: &str = |
69 | | "saw start of escape sequence, but saw end of pattern before it finished"; |
70 | | const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported"; |
71 | | const ERR_UNICODE_CLASS_UNSUPPORTED: &str = |
72 | | "Unicode character classes are not supported"; |
73 | | const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence"; |
74 | | const ERR_POSIX_CLASS_UNRECOGNIZED: &str = |
75 | | "unrecognized POSIX character class"; |
76 | | const ERR_UNCOUNTED_REP_SUB_MISSING: &str = |
77 | | "uncounted repetition operator must be applied to a sub-expression"; |
78 | | const ERR_COUNTED_REP_SUB_MISSING: &str = |
79 | | "counted repetition operator must be applied to a sub-expression"; |
80 | | const ERR_COUNTED_REP_UNCLOSED: &str = |
81 | | "found unclosed counted repetition operator"; |
82 | | const ERR_COUNTED_REP_MIN_UNCLOSED: &str = |
83 | | "found incomplete and unclosed counted repetition operator"; |
84 | | const ERR_COUNTED_REP_COMMA_UNCLOSED: &str = |
85 | | "found counted repetition operator with a comma that is unclosed"; |
86 | | const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str = |
87 | | "found counted repetition with min and max that is unclosed"; |
88 | | const ERR_COUNTED_REP_INVALID: &str = |
89 | | "expected closing brace for counted repetition, but got something else"; |
90 | | const ERR_COUNTED_REP_INVALID_RANGE: &str = |
91 | | "found counted repetition with a min bigger than its max"; |
92 | | const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = |
93 | | "non-empty character class has no closing bracket"; |
94 | | const ERR_CLASS_INVALID_RANGE_ITEM: &str = |
95 | | "character class ranges must start and end with a single character"; |
96 | | const ERR_CLASS_INVALID_ITEM: &str = |
97 | | "invalid escape sequence in character class"; |
98 | | const ERR_CLASS_UNCLOSED_AFTER_DASH: &str = |
99 | | "non-empty character class has no closing bracket after dash"; |
100 | | const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = |
101 | | "negated character class has no closing bracket"; |
102 | | const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = |
103 | | "character class begins with literal ']' but has no closing bracket"; |
104 | | const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class"; |
105 | | const ERR_CLASS_UNCLOSED: &str = "found unclosed character class"; |
106 | | const ERR_CLASS_NEST_UNSUPPORTED: &str = |
107 | | "nested character classes are not supported"; |
108 | | const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = |
109 | | "character class intersection is not supported"; |
110 | | const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = |
111 | | "character class difference is not supported"; |
112 | | const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = |
113 | | "character class symmetric difference is not supported"; |
114 | | const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = |
115 | | "special word boundary assertion is unclosed or has an invalid character"; |
116 | | const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = |
117 | | "special word boundary assertion is unrecognized"; |
118 | | const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = |
119 | | "found start of special word boundary or repetition without an end"; |
120 | | |
121 | | /// A regular expression parser. |
122 | | /// |
123 | | /// This parses a string representation of a regular expression into an |
124 | | /// abstract syntax tree. The size of the tree is proportional to the length |
125 | | /// of the regular expression pattern. |
126 | | /// |
127 | | /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. |
128 | | #[derive(Clone, Debug)] |
129 | | pub(super) struct Parser<'a> { |
130 | | /// The configuration of the parser as given by the caller. |
131 | | config: Config, |
132 | | /// The pattern we're parsing as given by the caller. |
133 | | pattern: &'a str, |
134 | | /// The call depth of the parser. This is incremented for each |
135 | | /// sub-expression parsed. Its peak value is the maximum nesting of the |
136 | | /// pattern. |
137 | | depth: Cell<u32>, |
138 | | /// The current position of the parser. |
139 | | pos: Cell<usize>, |
140 | | /// The current codepoint of the parser. The codepoint corresponds to the |
141 | | /// codepoint encoded in `pattern` beginning at `pos`. |
142 | | /// |
143 | | /// This is `None` if and only if `pos == pattern.len()`. |
144 | | char: Cell<Option<char>>, |
145 | | /// The current capture index. |
146 | | capture_index: Cell<u32>, |
147 | | /// The flags that are currently set. |
148 | | flags: RefCell<Flags>, |
149 | | /// A sorted sequence of capture names. This is used to detect duplicate |
150 | | /// capture names and report an error if one is detected. |
151 | | capture_names: RefCell<Vec<String>>, |
152 | | } |
153 | | |
154 | | /// The constructor and a variety of helper routines. |
155 | | impl<'a> Parser<'a> { |
156 | | /// Build a parser from this configuration with the given pattern. |
157 | 7.98k | pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { |
158 | 7.98k | Parser { |
159 | 7.98k | config, |
160 | 7.98k | pattern, |
161 | 7.98k | depth: Cell::new(0), |
162 | 7.98k | pos: Cell::new(0), |
163 | 7.98k | char: Cell::new(pattern.chars().next()), |
164 | 7.98k | capture_index: Cell::new(0), |
165 | 7.98k | flags: RefCell::new(config.flags), |
166 | 7.98k | capture_names: RefCell::new(vec![]), |
167 | 7.98k | } |
168 | 7.98k | } |
169 | | |
170 | | /// Returns the full pattern string that we're parsing. |
171 | 87.7M | fn pattern(&self) -> &str { |
172 | 87.7M | self.pattern |
173 | 87.7M | } |
174 | | |
175 | | /// Return the current byte offset of the parser. |
176 | | /// |
177 | | /// The offset starts at `0` from the beginning of the regular expression |
178 | | /// pattern string. |
179 | 398M | fn pos(&self) -> usize { |
180 | 398M | self.pos.get() |
181 | 398M | } |
182 | | |
183 | | /// Increments the call depth of the parser. |
184 | | /// |
185 | | /// If the call depth would exceed the configured nest limit, then this |
186 | | /// returns an error. |
187 | | /// |
188 | | /// This returns the old depth. |
189 | 444k | fn increment_depth(&self) -> Result<u32, Error> { |
190 | 444k | let old = self.depth.get(); |
191 | 444k | if old > self.config.nest_limit { |
192 | 2 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
193 | 444k | } |
194 | | // OK because our depth starts at 0, and we return an error if it |
195 | | // ever reaches the limit. So the call depth can never exceed u32::MAX. |
196 | 444k | let new = old.checked_add(1).unwrap(); |
197 | 444k | self.depth.set(new); |
198 | 444k | Ok(old) |
199 | 444k | } |
200 | | |
201 | | /// Decrements the call depth of the parser. |
202 | | /// |
203 | | /// This panics if the current depth is 0. |
204 | 440k | fn decrement_depth(&self) { |
205 | 440k | let old = self.depth.get(); |
206 | | // If this fails then the caller has a bug in how they're incrementing |
207 | | // and decrementing the depth of the parser's call stack. |
208 | 440k | let new = old.checked_sub(1).unwrap(); |
209 | 440k | self.depth.set(new); |
210 | 440k | } |
211 | | |
212 | | /// Return the codepoint at the current position of the parser. |
213 | | /// |
214 | | /// This panics if the parser is positioned at the end of the pattern. |
215 | 319M | fn char(&self) -> char { |
216 | 319M | self.char.get().expect("codepoint, but parser is done") |
217 | 319M | } |
218 | | |
219 | | /// Returns true if the next call to `bump` would return false. |
220 | 223M | fn is_done(&self) -> bool { |
221 | 223M | self.pos() == self.pattern.len() |
222 | 223M | } |
223 | | |
224 | | /// Returns the flags that are current set for this regex. |
225 | 114M | fn flags(&self) -> Flags { |
226 | 114M | *self.flags.borrow() |
227 | 114M | } |
228 | | |
229 | | /// Bump the parser to the next Unicode scalar value. |
230 | | /// |
231 | | /// If the end of the input has been reached, then `false` is returned. |
232 | 84.0M | fn bump(&self) -> bool { |
233 | 84.0M | if self.is_done() { |
234 | 0 | return false; |
235 | 84.0M | } |
236 | 84.0M | self.pos.set(self.pos() + self.char().len_utf8()); |
237 | 84.0M | self.char.set(self.pattern()[self.pos()..].chars().next()); |
238 | 84.0M | self.char.get().is_some() |
239 | 84.0M | } |
240 | | |
241 | | /// If the substring starting at the current position of the parser has |
242 | | /// the given prefix, then bump the parser to the character immediately |
243 | | /// following the prefix and return true. Otherwise, don't bump the parser |
244 | | /// and return false. |
245 | 3.07M | fn bump_if(&self, prefix: &str) -> bool { |
246 | 3.07M | if self.pattern()[self.pos()..].starts_with(prefix) { |
247 | 41.5k | for _ in 0..prefix.chars().count() { |
248 | 41.5k | self.bump(); |
249 | 41.5k | } |
250 | 22.4k | true |
251 | | } else { |
252 | 3.04M | false |
253 | | } |
254 | 3.07M | } |
255 | | |
256 | | /// Bump the parser, and if the `x` flag is enabled, bump through any |
257 | | /// subsequent spaces. Return true if and only if the parser is not done. |
258 | 3.05M | fn bump_and_bump_space(&self) -> bool { |
259 | 3.05M | if !self.bump() { |
260 | 1.31k | return false; |
261 | 3.05M | } |
262 | 3.05M | self.bump_space(); |
263 | 3.05M | !self.is_done() |
264 | 3.05M | } |
265 | | |
266 | | /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
267 | | /// comments), then this will advance the parser through all whitespace |
268 | | /// and comments to the next non-whitespace non-comment byte. |
269 | | /// |
270 | | /// If the `x` flag is disabled, then this is a no-op. |
271 | | /// |
272 | | /// This should be used selectively throughout the parser where |
273 | | /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
274 | | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
275 | 99.6M | fn bump_space(&self) { |
276 | 99.6M | if !self.flags().ignore_whitespace { |
277 | 83.0M | return; |
278 | 16.5M | } |
279 | 17.2M | while !self.is_done() { |
280 | 17.2M | if self.char().is_whitespace() { |
281 | 643k | self.bump(); |
282 | 16.5M | } else if self.char() == '#' { |
283 | 739 | self.bump(); |
284 | 1.99M | while !self.is_done() { |
285 | 1.99M | let c = self.char(); |
286 | 1.99M | self.bump(); |
287 | 1.99M | if c == '\n' { |
288 | 686 | break; |
289 | 1.99M | } |
290 | | } |
291 | | } else { |
292 | 16.5M | break; |
293 | | } |
294 | | } |
295 | 99.6M | } |
296 | | |
297 | | /// Peek at the next character in the input without advancing the parser. |
298 | | /// |
299 | | /// If the input has been exhausted, then this returns `None`. |
300 | 203k | fn peek(&self) -> Option<char> { |
301 | 203k | if self.is_done() { |
302 | 0 | return None; |
303 | 203k | } |
304 | 203k | self.pattern()[self.pos() + self.char().len_utf8()..].chars().next() |
305 | 203k | } |
306 | | |
307 | | /// Peeks at the next character in the pattern from the current offset, and |
308 | | /// will ignore spaces when the parser is in whitespace insensitive mode. |
309 | 337k | fn peek_space(&self) -> Option<char> { |
310 | 337k | if !self.flags().ignore_whitespace { |
311 | 127k | return self.peek(); |
312 | 210k | } |
313 | 210k | if self.is_done() { |
314 | 0 | return None; |
315 | 210k | } |
316 | 210k | let mut start = self.pos() + self.char().len_utf8(); |
317 | 210k | let mut in_comment = false; |
318 | 309k | for (i, ch) in self.pattern()[start..].char_indices() { |
319 | 309k | if ch.is_whitespace() { |
320 | 98.9k | continue; |
321 | 210k | } else if !in_comment && ch == '#' { |
322 | 676 | in_comment = true; |
323 | 210k | } else if in_comment && ch == '\n' { |
324 | 0 | in_comment = false; |
325 | 0 | } else { |
326 | 210k | start += i; |
327 | 210k | break; |
328 | | } |
329 | | } |
330 | 210k | self.pattern()[start..].chars().next() |
331 | 337k | } |
332 | | |
333 | | /// Return the next capturing index. Each subsequent call increments the |
334 | | /// internal index. Since the way capture indices are computed is a public |
335 | | /// API guarantee, use of this routine depends on the parser being depth |
336 | | /// first and left-to-right. |
337 | | /// |
338 | | /// If the capture limit is exceeded, then an error is returned. |
339 | 435k | fn next_capture_index(&self) -> Result<u32, Error> { |
340 | 435k | let current = self.capture_index.get(); |
341 | 435k | let next = current |
342 | 435k | .checked_add(1) |
343 | 435k | .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?; |
344 | 435k | self.capture_index.set(next); |
345 | 435k | Ok(next) |
346 | 435k | } |
347 | | |
348 | | /// Adds the given capture name to this parser. If this capture name has |
349 | | /// already been used, then an error is returned. |
350 | 8.96k | fn add_capture_name(&self, name: &str) -> Result<(), Error> { |
351 | 8.96k | let mut names = self.capture_names.borrow_mut(); |
352 | 42.9k | match names.binary_search_by(|n| name.cmp(n)) { |
353 | 7 | Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)), |
354 | 8.96k | Err(i) => { |
355 | 8.96k | names.insert(i, name.to_string()); |
356 | 8.96k | Ok(()) |
357 | | } |
358 | | } |
359 | 8.96k | } |
360 | | |
361 | | /// Returns true if and only if the parser is positioned at a look-around |
362 | | /// prefix. The conditions under which this returns true must always |
363 | | /// correspond to a regular expression that would otherwise be consider |
364 | | /// invalid. |
365 | | /// |
366 | | /// This should only be called immediately after parsing the opening of |
367 | | /// a group or a set of flags. |
368 | 438k | fn is_lookaround_prefix(&self) -> bool { |
369 | 438k | self.bump_if("?=") |
370 | 438k | || self.bump_if("?!") |
371 | 438k | || self.bump_if("?<=") |
372 | 438k | || self.bump_if("?<!") |
373 | 438k | } |
374 | | } |
375 | | |
376 | | /// The actual parser. We try to break out each kind of regex syntax into its |
377 | | /// own routine. |
378 | | impl<'a> Parser<'a> { |
379 | 7.98k | pub(super) fn parse(&self) -> Result<Hir, Error> { |
380 | 7.98k | let hir = self.parse_inner()?; |
381 | | // While we also check nesting during parsing, that only checks the |
382 | | // number of recursive parse calls. It does not necessarily cover |
383 | | // all possible recursive nesting of the Hir itself. For example, |
384 | | // repetition operators don't require recursive parse calls. So one |
385 | | // can stack them arbitrarily without overflowing the stack in the |
386 | | // *parser*. But then if one recurses over the resulting Hir, a stack |
387 | | // overflow is possible. So here we check the Hir nesting level |
388 | | // thoroughly to ensure it isn't nested too deeply. |
389 | | // |
390 | | // Note that we do still need the nesting limit check in the parser as |
391 | | // well, since that will avoid overflowing the stack during parse time |
392 | | // before the complete Hir value is constructed. |
393 | 5.85k | check_hir_nesting(&hir, self.config.nest_limit)?; |
394 | 5.84k | Ok(hir) |
395 | 7.98k | } |
396 | | |
397 | 444k | fn parse_inner(&self) -> Result<Hir, Error> { |
398 | 444k | let depth = self.increment_depth()?; |
399 | 444k | let mut alternates = vec![]; |
400 | 444k | let mut concat = vec![]; |
401 | | loop { |
402 | 18.6M | self.bump_space(); |
403 | 18.6M | if self.is_done() { |
404 | 6.10k | break; |
405 | 18.6M | } |
406 | 18.6M | match self.char() { |
407 | | '(' => { |
408 | | // Save the old flags and reset them only when we close |
409 | | // the group. |
410 | 438k | let oldflags = *self.flags.borrow(); |
411 | 438k | if let Some(sub) = self.parse_group()? { |
412 | 434k | concat.push(sub); |
413 | 434k | // We only reset them here because if 'parse_group' |
414 | 434k | // returns None, then that means it handled a flag |
415 | 434k | // directive, e.g., '(?ism)'. And the whole point is |
416 | 434k | // that those flags remain active until either disabled |
417 | 434k | // or the end of the pattern or current group. |
418 | 434k | *self.flags.borrow_mut() = oldflags; |
419 | 434k | } |
420 | 436k | if self.char.get() != Some(')') { |
421 | 249 | return Err(Error::new(ERR_UNCLOSED_GROUP)); |
422 | 436k | } |
423 | 436k | self.bump(); |
424 | | } |
425 | | ')' => { |
426 | 434k | if depth == 0 { |
427 | 26 | return Err(Error::new(ERR_UNOPENED_GROUP)); |
428 | 434k | } |
429 | 434k | break; |
430 | | } |
431 | 4.24M | '|' => { |
432 | 4.24M | alternates.push(Hir::concat(core::mem::take(&mut concat))); |
433 | 4.24M | self.bump(); |
434 | 4.24M | } |
435 | 151k | '[' => concat.push(self.parse_class()?), |
436 | | '?' | '*' | '+' => { |
437 | 176k | concat = self.parse_uncounted_repetition(concat)?; |
438 | | } |
439 | | '{' => { |
440 | 199k | concat = self.parse_counted_repetition(concat)?; |
441 | | } |
442 | 13.0M | _ => concat.push(self.parse_primitive()?), |
443 | | } |
444 | | } |
445 | 440k | self.decrement_depth(); |
446 | 440k | alternates.push(Hir::concat(concat)); |
447 | | // N.B. This strips off the "alternation" if there's only one branch. |
448 | 440k | Ok(Hir::alternation(alternates)) |
449 | 444k | } |
450 | | |
451 | | /// Parses a "primitive" pattern. A primitive is any expression that does |
452 | | /// not contain any sub-expressions. |
453 | | /// |
454 | | /// This assumes the parser is pointing at the beginning of the primitive. |
455 | 13.0M | fn parse_primitive(&self) -> Result<Hir, Error> { |
456 | 13.0M | let ch = self.char(); |
457 | 13.0M | self.bump(); |
458 | 13.0M | match ch { |
459 | 82.6k | '\\' => self.parse_escape(), |
460 | 27.0k | '.' => Ok(self.hir_dot()), |
461 | 12.2k | '^' => Ok(self.hir_anchor_start()), |
462 | 27.8k | '$' => Ok(self.hir_anchor_end()), |
463 | 12.8M | ch => Ok(self.hir_char(ch)), |
464 | | } |
465 | 13.0M | } |
466 | | |
467 | | /// Parse an escape sequence. This always results in a "primitive" HIR, |
468 | | /// that is, an HIR with no sub-expressions. |
469 | | /// |
470 | | /// This assumes the parser is positioned at the start of the sequence, |
471 | | /// immediately *after* the `\`. It advances the parser to the first |
472 | | /// position immediately following the escape sequence. |
473 | 17.6M | fn parse_escape(&self) -> Result<Hir, Error> { |
474 | 17.6M | if self.is_done() { |
475 | 61 | return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF)); |
476 | 17.6M | } |
477 | 17.6M | let ch = self.char(); |
478 | | // Put some of the more complicated routines into helpers. |
479 | 17.6M | match ch { |
480 | 17.5M | '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)), |
481 | | 'p' | 'P' => { |
482 | 2 | return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED)) |
483 | | } |
484 | 8.74k | 'x' | 'u' | 'U' => return self.parse_hex(), |
485 | | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
486 | 16.8M | return Ok(self.parse_perl_class()); |
487 | | } |
488 | 785k | _ => {} |
489 | | } |
490 | | |
491 | | // Handle all of the one letter sequences inline. |
492 | 785k | self.bump(); |
493 | 785k | if hir::is_meta_character(ch) || hir::is_escapable_character(ch) { |
494 | 741k | return Ok(self.hir_char(ch)); |
495 | 43.8k | } |
496 | 43.8k | let special = |ch| Ok(self.hir_char(ch)); |
497 | 43.8k | match ch { |
498 | 8.07k | 'a' => special('\x07'), |
499 | 23.5k | 'f' => special('\x0C'), |
500 | 457 | 't' => special('\t'), |
501 | 1.22k | 'n' => special('\n'), |
502 | 491 | 'r' => special('\r'), |
503 | 3.80k | 'v' => special('\x0B'), |
504 | 294 | 'A' => Ok(Hir::look(hir::Look::Start)), |
505 | 1.91k | 'z' => Ok(Hir::look(hir::Look::End)), |
506 | | 'b' => { |
507 | 2.86k | let mut hir = Hir::look(hir::Look::Word); |
508 | 2.86k | if !self.is_done() && self.char() == '{' { |
509 | 1.38k | if let Some(special) = |
510 | 2.19k | self.maybe_parse_special_word_boundary()? |
511 | 1.38k | { |
512 | 1.38k | hir = special; |
513 | 1.38k | } |
514 | 665 | } |
515 | 2.71k | Ok(hir) |
516 | | } |
517 | 448 | 'B' => Ok(Hir::look(hir::Look::WordNegate)), |
518 | 436 | '<' => Ok(Hir::look(hir::Look::WordStart)), |
519 | 278 | '>' => Ok(Hir::look(hir::Look::WordEnd)), |
520 | 40 | _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), |
521 | | } |
522 | 17.6M | } |
523 | | |
524 | | /// Attempt to parse a specialty word boundary. That is, `\b{start}`, |
525 | | /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. |
526 | | /// |
527 | | /// This is similar to `maybe_parse_ascii_class` in that, in most cases, |
528 | | /// if it fails it will just return `None` with no error. This is done |
529 | | /// because `\b{5}` is a valid expression and we want to let that be parsed |
530 | | /// by the existing counted repetition parsing code. (I thought about just |
531 | | /// invoking the counted repetition code from here, but it seemed a little |
532 | | /// ham-fisted.) |
533 | | /// |
534 | | /// Unlike `maybe_parse_ascii_class` though, this can return an error. |
535 | | /// Namely, if we definitely know it isn't a counted repetition, then we |
536 | | /// return an error specific to the specialty word boundaries. |
537 | | /// |
538 | | /// This assumes the parser is positioned at a `{` immediately following |
539 | | /// a `\b`. When `None` is returned, the parser is returned to the position |
540 | | /// at which it started: pointing at a `{`. |
541 | | /// |
542 | | /// The position given should correspond to the start of the `\b`. |
543 | 2.19k | fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> { |
544 | 2.19k | assert_eq!(self.char(), '{'); |
545 | | |
546 | 14.7k | let is_valid_char = |c| match c { |
547 | 13.0k | 'A'..='Z' | 'a'..='z' | '-' => true, |
548 | 2.16k | _ => false, |
549 | 14.7k | }; |
550 | 2.19k | let start = self.pos(); |
551 | 2.19k | if !self.bump_and_bump_space() { |
552 | 3 | return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF)); |
553 | 2.19k | } |
554 | | // This is one of the critical bits: if the first non-whitespace |
555 | | // character isn't in [-A-Za-z] (i.e., this can't be a special word |
556 | | // boundary), then we bail and let the counted repetition parser deal |
557 | | // with this. |
558 | 2.19k | if !is_valid_char(self.char()) { |
559 | 663 | self.pos.set(start); |
560 | 663 | self.char.set(Some('{')); |
561 | 663 | return Ok(None); |
562 | 1.52k | } |
563 | | |
564 | | // Now collect up our chars until we see a '}'. |
565 | 1.52k | let mut scratch = String::new(); |
566 | 12.5k | while !self.is_done() && is_valid_char(self.char()) { |
567 | 11.0k | scratch.push(self.char()); |
568 | 11.0k | self.bump_and_bump_space(); |
569 | 11.0k | } |
570 | 1.52k | if self.is_done() || self.char() != '}' { |
571 | 89 | return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED)); |
572 | 1.44k | } |
573 | 1.44k | self.bump(); |
574 | 1.44k | let kind = match scratch.as_str() { |
575 | 1.44k | "start" => hir::Look::WordStart, |
576 | 1.23k | "end" => hir::Look::WordEnd, |
577 | 1.03k | "start-half" => hir::Look::WordStartHalf, |
578 | 811 | "end-half" => hir::Look::WordEndHalf, |
579 | | _ => { |
580 | 55 | return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED)) |
581 | | } |
582 | | }; |
583 | 1.38k | Ok(Some(Hir::look(kind))) |
584 | 2.19k | } |
585 | | |
586 | | /// Parse a hex representation of a Unicode codepoint. This handles both |
587 | | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
588 | | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
589 | | /// the first character immediately following the hexadecimal literal. |
590 | 8.74k | fn parse_hex(&self) -> Result<Hir, Error> { |
591 | 8.74k | let digit_len = match self.char() { |
592 | 3.33k | 'x' => 2, |
593 | 3.55k | 'u' => 4, |
594 | 1.85k | 'U' => 8, |
595 | 0 | unk => unreachable!( |
596 | | "invalid start of fixed length hexadecimal number {unk}" |
597 | | ), |
598 | | }; |
599 | 8.74k | if !self.bump_and_bump_space() { |
600 | 11 | return Err(Error::new(ERR_HEX_UNEXPECTED_EOF)); |
601 | 8.73k | } |
602 | 8.73k | if self.char() == '{' { |
603 | 3.10k | self.parse_hex_brace() |
604 | | } else { |
605 | 5.62k | self.parse_hex_digits(digit_len) |
606 | | } |
607 | 8.74k | } |
608 | | |
609 | | /// Parse an N-digit hex representation of a Unicode codepoint. This |
610 | | /// expects the parser to be positioned at the first digit and will advance |
611 | | /// the parser to the first character immediately following the escape |
612 | | /// sequence. |
613 | | /// |
614 | | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
615 | | /// or 8 (for `\UNNNNNNNN`). |
616 | 5.62k | fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> { |
617 | 5.62k | let mut scratch = String::new(); |
618 | 16.6k | for i in 0..digit_len { |
619 | 16.6k | if i > 0 && !self.bump_and_bump_space() { |
620 | 32 | return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF)); |
621 | 16.6k | } |
622 | 16.6k | if !is_hex(self.char()) { |
623 | 77 | return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT)); |
624 | 16.5k | } |
625 | 16.5k | scratch.push(self.char()); |
626 | | } |
627 | | // The final bump just moves the parser past the literal, which may |
628 | | // be EOF. |
629 | 5.51k | self.bump_and_bump_space(); |
630 | 5.51k | match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
631 | 32 | None => Err(Error::new(ERR_HEX_FIXED_INVALID)), |
632 | 5.48k | Some(ch) => Ok(self.hir_char(ch)), |
633 | | } |
634 | 5.62k | } |
635 | | |
636 | | /// Parse a hex representation of any Unicode scalar value. This expects |
637 | | /// the parser to be positioned at the opening brace `{` and will advance |
638 | | /// the parser to the first character following the closing brace `}`. |
639 | 3.10k | fn parse_hex_brace(&self) -> Result<Hir, Error> { |
640 | 3.10k | let mut scratch = String::new(); |
641 | 341k | while self.bump_and_bump_space() && self.char() != '}' { |
642 | 338k | if !is_hex(self.char()) { |
643 | 47 | return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT)); |
644 | 338k | } |
645 | 338k | scratch.push(self.char()); |
646 | | } |
647 | 3.05k | if self.is_done() { |
648 | 30 | return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF)); |
649 | 3.02k | } |
650 | 3.02k | assert_eq!(self.char(), '}'); |
651 | 3.02k | self.bump_and_bump_space(); |
652 | | |
653 | 3.02k | if scratch.is_empty() { |
654 | 4 | return Err(Error::new(ERR_HEX_BRACE_EMPTY)); |
655 | 3.02k | } |
656 | 3.02k | match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
657 | 52 | None => Err(Error::new(ERR_HEX_BRACE_INVALID)), |
658 | 2.96k | Some(ch) => Ok(self.hir_char(ch)), |
659 | | } |
660 | 3.10k | } |
661 | | |
662 | | /// Parse a decimal number into a u32 while trimming leading and trailing |
663 | | /// whitespace. |
664 | | /// |
665 | | /// This expects the parser to be positioned at the first position where |
666 | | /// a decimal digit could occur. This will advance the parser to the byte |
667 | | /// immediately following the last contiguous decimal digit. |
668 | | /// |
669 | | /// If no decimal digit could be found or if there was a problem parsing |
670 | | /// the complete set of digits into a u32, then an error is returned. |
671 | 200k | fn parse_decimal(&self) -> Result<u32, Error> { |
672 | 200k | let mut scratch = String::new(); |
673 | 201k | while !self.is_done() && self.char().is_whitespace() { |
674 | 1.55k | self.bump(); |
675 | 1.55k | } |
676 | 1.72M | while !self.is_done() && '0' <= self.char() && self.char() <= '9' { |
677 | 1.52M | scratch.push(self.char()); |
678 | 1.52M | self.bump_and_bump_space(); |
679 | 1.52M | } |
680 | 201k | while !self.is_done() && self.char().is_whitespace() { |
681 | 1.57k | self.bump_and_bump_space(); |
682 | 1.57k | } |
683 | 200k | let digits = scratch.as_str(); |
684 | 200k | if digits.is_empty() { |
685 | 212 | return Err(Error::new(ERR_DECIMAL_NO_DIGITS)); |
686 | 200k | } |
687 | 200k | match u32::from_str_radix(digits, 10).ok() { |
688 | 200k | Some(n) => Ok(n), |
689 | 14 | None => Err(Error::new(ERR_DECIMAL_INVALID)), |
690 | | } |
691 | 200k | } |
692 | | |
693 | | /// Parses an uncounted repetition operator. An uncounted repetition |
694 | | /// operator includes `?`, `*` and `+`, but does not include the `{m,n}` |
695 | | /// syntax. The current character should be one of `?`, `*` or `+`. Any |
696 | | /// other character will result in a panic. |
697 | | /// |
698 | | /// This assumes that the parser is currently positioned at the repetition |
699 | | /// operator and advances the parser to the first character after the |
700 | | /// operator. (Note that the operator may include a single additional `?`, |
701 | | /// which makes the operator ungreedy.) |
702 | | /// |
703 | | /// The caller should include the concatenation that is being built. The |
704 | | /// concatenation returned includes the repetition operator applied to the |
705 | | /// last expression in the given concatenation. |
706 | | /// |
707 | | /// If the concatenation is empty, then this returns an error. |
708 | 176k | fn parse_uncounted_repetition( |
709 | 176k | &self, |
710 | 176k | mut concat: Vec<Hir>, |
711 | 176k | ) -> Result<Vec<Hir>, Error> { |
712 | 176k | let sub = match concat.pop() { |
713 | 176k | Some(hir) => Box::new(hir), |
714 | | None => { |
715 | 7 | return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING)); |
716 | | } |
717 | | }; |
718 | 176k | let (min, max) = match self.char() { |
719 | 76.9k | '?' => (0, Some(1)), |
720 | 46.0k | '*' => (0, None), |
721 | 53.8k | '+' => (1, None), |
722 | 0 | unk => unreachable!("unrecognized repetition operator '{unk}'"), |
723 | | }; |
724 | 176k | let mut greedy = true; |
725 | 176k | if self.bump() && self.char() == '?' { |
726 | 33.7k | greedy = false; |
727 | 33.7k | self.bump(); |
728 | 143k | } |
729 | 176k | if self.flags().swap_greed { |
730 | 61.0k | greedy = !greedy; |
731 | 115k | } |
732 | 176k | concat.push(Hir::repetition(hir::Repetition { |
733 | 176k | min, |
734 | 176k | max, |
735 | 176k | greedy, |
736 | 176k | sub, |
737 | 176k | })); |
738 | 176k | Ok(concat) |
739 | 176k | } |
740 | | |
741 | | /// Parses a counted repetition operation. A counted repetition operator |
742 | | /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or |
743 | | /// `+` operators. |
744 | | /// |
745 | | /// This assumes that the parser is currently at the opening `{` and |
746 | | /// advances the parser to the first character after the operator. (Note |
747 | | /// that the operator may include a single additional `?`, which makes the |
748 | | /// operator ungreedy.) |
749 | | /// |
750 | | /// The caller should include the concatenation that is being built. The |
751 | | /// concatenation returned includes the repetition operator applied to the |
752 | | /// last expression in the given concatenation. |
753 | | /// |
754 | | /// If the concatenation is empty, then this returns an error. |
755 | 199k | fn parse_counted_repetition( |
756 | 199k | &self, |
757 | 199k | mut concat: Vec<Hir>, |
758 | 199k | ) -> Result<Vec<Hir>, Error> { |
759 | 199k | assert_eq!(self.char(), '{', "expected opening brace"); |
760 | 199k | let sub = match concat.pop() { |
761 | 199k | Some(hir) => Box::new(hir), |
762 | | None => { |
763 | 5 | return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING)); |
764 | | } |
765 | | }; |
766 | 199k | if !self.bump_and_bump_space() { |
767 | 22 | return Err(Error::new(ERR_COUNTED_REP_UNCLOSED)); |
768 | 199k | } |
769 | 199k | let min = self.parse_decimal()?; |
770 | 198k | let mut max = Some(min); |
771 | 198k | if self.is_done() { |
772 | 43 | return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED)); |
773 | 198k | } |
774 | 198k | if self.char() == ',' { |
775 | 1.84k | if !self.bump_and_bump_space() { |
776 | 4 | return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED)); |
777 | 1.83k | } |
778 | 1.83k | if self.char() != '}' { |
779 | 1.17k | max = Some(self.parse_decimal()?); |
780 | 664 | } else { |
781 | 664 | max = None; |
782 | 664 | } |
783 | 1.80k | if self.is_done() { |
784 | 3 | return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED)); |
785 | 1.80k | } |
786 | 197k | } |
787 | 198k | if self.char() != '}' { |
788 | 69 | return Err(Error::new(ERR_COUNTED_REP_INVALID)); |
789 | 198k | } |
790 | | |
791 | 198k | let mut greedy = true; |
792 | 198k | if self.bump_and_bump_space() && self.char() == '?' { |
793 | 421 | greedy = false; |
794 | 421 | self.bump(); |
795 | 198k | } |
796 | 198k | if self.flags().swap_greed { |
797 | 29.5k | greedy = !greedy; |
798 | 169k | } |
799 | | |
800 | 198k | if max.map_or(false, |max| min > max) { |
801 | 17 | return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE)); |
802 | 198k | } |
803 | 198k | concat.push(Hir::repetition(hir::Repetition { |
804 | 198k | min, |
805 | 198k | max, |
806 | 198k | greedy, |
807 | 198k | sub, |
808 | 198k | })); |
809 | 198k | Ok(concat) |
810 | 199k | } |
811 | | |
812 | | /// Parses the part of a pattern that starts with a `(`. This is usually |
813 | | /// a group sub-expression, but might just be a directive that enables |
814 | | /// (or disables) certain flags. |
815 | | /// |
816 | | /// This assumes the parser is pointing at the opening `(`. |
817 | 438k | fn parse_group(&self) -> Result<Option<Hir>, Error> { |
818 | 438k | assert_eq!(self.char(), '('); |
819 | 438k | self.bump_and_bump_space(); |
820 | 438k | if self.is_lookaround_prefix() { |
821 | 6 | return Err(Error::new(ERR_LOOK_UNSUPPORTED)); |
822 | 438k | } |
823 | 438k | if self.bump_if("?P<") || self.bump_if("?<") { |
824 | 9.12k | let index = self.next_capture_index()?; |
825 | 9.12k | let name = Some(Box::from(self.parse_capture_name()?)); |
826 | 8.96k | let sub = Box::new(self.parse_inner()?); |
827 | 8.77k | let cap = hir::Capture { index, name, sub }; |
828 | 8.77k | Ok(Some(Hir::capture(cap))) |
829 | 429k | } else if self.bump_if("?") { |
830 | 3.46k | if self.is_done() { |
831 | 5 | return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION)); |
832 | 3.46k | } |
833 | 3.46k | let start = self.pos(); |
834 | | // The flags get reset in the top-level 'parse' routine. |
835 | 3.46k | *self.flags.borrow_mut() = self.parse_flags()?; |
836 | 3.38k | let consumed = self.pos() - start; |
837 | 3.38k | if self.char() == ')' { |
838 | | // We don't allow empty flags, e.g., `(?)`. |
839 | 2.04k | if consumed == 0 { |
840 | 2 | return Err(Error::new(ERR_EMPTY_FLAGS)); |
841 | 2.04k | } |
842 | 2.04k | Ok(None) |
843 | | } else { |
844 | 1.33k | assert_eq!(':', self.char()); |
845 | 1.33k | self.bump(); |
846 | 1.33k | self.parse_inner().map(Some) |
847 | | } |
848 | | } else { |
849 | 426k | let index = self.next_capture_index()?; |
850 | 426k | let sub = Box::new(self.parse_inner()?); |
851 | 424k | let cap = hir::Capture { index, name: None, sub }; |
852 | 424k | Ok(Some(Hir::capture(cap))) |
853 | | } |
854 | 438k | } |
855 | | |
856 | | /// Parses a capture group name. Assumes that the parser is positioned at |
857 | | /// the first character in the name following the opening `<` (and may |
858 | | /// possibly be EOF). This advances the parser to the first character |
859 | | /// following the closing `>`. |
860 | 9.12k | fn parse_capture_name(&self) -> Result<&str, Error> { |
861 | 9.12k | if self.is_done() { |
862 | 8 | return Err(Error::new(ERR_MISSING_GROUP_NAME)); |
863 | 9.11k | } |
864 | 9.11k | let start = self.pos(); |
865 | | loop { |
866 | 3.56M | if self.char() == '>' { |
867 | 8.97k | break; |
868 | 3.55M | } |
869 | 3.55M | if !is_capture_char(self.char(), self.pos() == start) { |
870 | 54 | return Err(Error::new(ERR_INVALID_GROUP_NAME)); |
871 | 3.55M | } |
872 | 3.55M | if !self.bump() { |
873 | 89 | break; |
874 | 3.55M | } |
875 | | } |
876 | 9.06k | let end = self.pos(); |
877 | 9.06k | if self.is_done() { |
878 | 89 | return Err(Error::new(ERR_UNCLOSED_GROUP_NAME)); |
879 | 8.97k | } |
880 | 8.97k | assert_eq!(self.char(), '>'); |
881 | 8.97k | self.bump(); |
882 | 8.97k | let name = &self.pattern()[start..end]; |
883 | 8.97k | if name.is_empty() { |
884 | 6 | return Err(Error::new(ERR_EMPTY_GROUP_NAME)); |
885 | 8.96k | } |
886 | 8.96k | self.add_capture_name(name)?; |
887 | 8.96k | Ok(name) |
888 | 9.12k | } |
889 | | |
890 | | /// Parse a sequence of flags starting at the current character. |
891 | | /// |
892 | | /// This advances the parser to the character immediately following the |
893 | | /// flags, which is guaranteed to be either `:` or `)`. |
894 | | /// |
895 | | /// # Errors |
896 | | /// |
897 | | /// If any flags are duplicated, then an error is returned. |
898 | | /// |
899 | | /// If the negation operator is used more than once, then an error is |
900 | | /// returned. |
901 | | /// |
902 | | /// If no flags could be found or if the negation operation is not followed |
903 | | /// by any flags, then an error is returned. |
904 | 3.46k | fn parse_flags(&self) -> Result<Flags, Error> { |
905 | 3.46k | let mut flags = *self.flags.borrow(); |
906 | 3.46k | let mut negate = false; |
907 | | // Keeps track of whether the previous flag item was a '-'. We use this |
908 | | // to detect whether there is a dangling '-', which is invalid. |
909 | 3.46k | let mut last_was_negation = false; |
910 | | // A set to keep track of the flags we've seen. Since all flags are |
911 | | // ASCII, we only need 128 bytes. |
912 | 3.46k | let mut seen = [false; 128]; |
913 | 6.84k | while self.char() != ':' && self.char() != ')' { |
914 | 3.45k | if self.char() == '-' { |
915 | 290 | last_was_negation = true; |
916 | 290 | if negate { |
917 | 1 | return Err(Error::new(ERR_FLAG_REPEATED_NEGATION)); |
918 | 289 | } |
919 | 289 | negate = true; |
920 | | } else { |
921 | 3.16k | last_was_negation = false; |
922 | 3.16k | self.parse_flag(&mut flags, negate)?; |
923 | | // OK because every valid flag is ASCII, and we're only here if |
924 | | // the flag is valid. |
925 | 3.13k | let flag_byte = u8::try_from(self.char()).unwrap(); |
926 | 3.13k | if seen[usize::from(flag_byte)] { |
927 | 28 | return Err(Error::new(ERR_FLAG_DUPLICATE)); |
928 | 3.10k | } |
929 | 3.10k | seen[usize::from(flag_byte)] = true; |
930 | | } |
931 | 3.39k | if !self.bump() { |
932 | 16 | return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF)); |
933 | 3.38k | } |
934 | | } |
935 | 3.38k | if last_was_negation { |
936 | 1 | return Err(Error::new(ERR_FLAG_DANGLING_NEGATION)); |
937 | 3.38k | } |
938 | 3.38k | Ok(flags) |
939 | 3.46k | } |
940 | | |
941 | | /// Parse the current character as a flag. Do not advance the parser. |
942 | | /// |
943 | | /// This sets the appropriate boolean value in place on the set of flags |
944 | | /// given. The boolean is inverted when `negate` is true. |
945 | | /// |
946 | | /// # Errors |
947 | | /// |
948 | | /// If the flag is not recognized, then an error is returned. |
949 | 3.16k | fn parse_flag( |
950 | 3.16k | &self, |
951 | 3.16k | flags: &mut Flags, |
952 | 3.16k | negate: bool, |
953 | 3.16k | ) -> Result<(), Error> { |
954 | 3.16k | let enabled = !negate; |
955 | 3.16k | match self.char() { |
956 | 317 | 'i' => flags.case_insensitive = enabled, |
957 | 523 | 'm' => flags.multi_line = enabled, |
958 | 293 | 's' => flags.dot_matches_new_line = enabled, |
959 | 428 | 'U' => flags.swap_greed = enabled, |
960 | 297 | 'R' => flags.crlf = enabled, |
961 | 975 | 'x' => flags.ignore_whitespace = enabled, |
962 | | // We make a special exception for this flag where we let it |
963 | | // through as a recognized flag, but treat it as a no-op. This in |
964 | | // practice retains some compatibility with the regex crate. It is |
965 | | // a little suspect to do this, but for example, '(?-u:\b).+' in |
966 | | // the regex crate is equivalent to '\b.+' in regex-lite. |
967 | 303 | 'u' => {} |
968 | 33 | _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)), |
969 | | } |
970 | 3.13k | Ok(()) |
971 | 3.16k | } |
972 | | |
973 | | /// Parse a standard character class consisting primarily of characters or |
974 | | /// character ranges. |
975 | | /// |
976 | | /// This assumes the parser is positioned at the opening `[`. If parsing |
977 | | /// is successful, then the parser is advanced to the position immediately |
978 | | /// following the closing `]`. |
979 | 151k | fn parse_class(&self) -> Result<Hir, Error> { |
980 | 151k | assert_eq!(self.char(), '['); |
981 | | |
982 | 151k | let mut union = vec![]; |
983 | 151k | if !self.bump_and_bump_space() { |
984 | 8 | return Err(Error::new(ERR_CLASS_UNCLOSED)); |
985 | 151k | } |
986 | | // Determine whether the class is negated or not. |
987 | 151k | let negate = if self.char() != '^' { |
988 | 140k | false |
989 | | } else { |
990 | 10.8k | if !self.bump_and_bump_space() { |
991 | 4 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION)); |
992 | 10.8k | } |
993 | 10.8k | true |
994 | | }; |
995 | | // Accept any number of `-` as literal `-`. |
996 | 160k | while self.char() == '-' { |
997 | 9.22k | union.push(hir::ClassRange { start: '-', end: '-' }); |
998 | 9.22k | if !self.bump_and_bump_space() { |
999 | 13 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
1000 | 9.21k | } |
1001 | | } |
1002 | | // If `]` is the *first* char in a set, then interpret it as a literal |
1003 | | // `]`. That is, an empty class is impossible to write. |
1004 | 151k | if union.is_empty() && self.char() == ']' { |
1005 | 2.37k | union.push(hir::ClassRange { start: ']', end: ']' }); |
1006 | 2.37k | if !self.bump_and_bump_space() { |
1007 | 5 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING)); |
1008 | 2.37k | } |
1009 | 149k | } |
1010 | | loop { |
1011 | 39.0M | self.bump_space(); |
1012 | 39.0M | if self.is_done() { |
1013 | 132 | return Err(Error::new(ERR_CLASS_UNCLOSED)); |
1014 | 39.0M | } |
1015 | 39.0M | match self.char() { |
1016 | | '[' => { |
1017 | | // Attempt to treat this as the beginning of a POSIX class. |
1018 | | // If POSIX class parsing fails, then the parser backs up |
1019 | | // to `[`. |
1020 | 9.94k | if let Some(class) = self.maybe_parse_posix_class() { |
1021 | 9.78k | union.extend_from_slice(&class.ranges); |
1022 | 9.78k | continue; |
1023 | 163 | } |
1024 | | // ... otherwise we don't support nested classes. |
1025 | 163 | return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED)); |
1026 | | } |
1027 | | ']' => { |
1028 | 151k | self.bump(); |
1029 | 151k | let mut class = hir::Class::new(union); |
1030 | | // Note that we must apply case folding before negation! |
1031 | | // Consider `(?i)[^x]`. If we applied negation first, then |
1032 | | // the result would be the character class that matched any |
1033 | | // Unicode scalar value. |
1034 | 151k | if self.flags().case_insensitive { |
1035 | 48.5k | class.ascii_case_fold(); |
1036 | 102k | } |
1037 | 151k | if negate { |
1038 | 10.8k | class.negate(); |
1039 | 140k | } |
1040 | 151k | return Ok(Hir::class(class)); |
1041 | | } |
1042 | 2.63k | '&' if self.peek() == Some('&') => { |
1043 | 1 | return Err(Error::new( |
1044 | 1 | ERR_CLASS_INTERSECTION_UNSUPPORTED, |
1045 | 1 | )); |
1046 | | } |
1047 | 64.5k | '-' if self.peek() == Some('-') => { |
1048 | 10 | return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED)); |
1049 | | } |
1050 | 8.57k | '~' if self.peek() == Some('~') => { |
1051 | 1 | return Err(Error::new( |
1052 | 1 | ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, |
1053 | 1 | )); |
1054 | | } |
1055 | 38.8M | _ => self.parse_class_range(&mut union)?, |
1056 | | } |
1057 | | } |
1058 | 151k | } |
1059 | | |
1060 | | /// Parse a single primitive item in a character class set. The item to |
1061 | | /// be parsed can either be one of a simple literal character, a range |
1062 | | /// between two simple literal characters or a "primitive" character |
1063 | | /// class like `\w`. |
1064 | | /// |
1065 | | /// If an invalid escape is found, or if a character class is found where |
1066 | | /// a simple literal is expected (e.g., in a range), then an error is |
1067 | | /// returned. |
1068 | | /// |
1069 | | /// Otherwise, the range (or ranges) are appended to the given union of |
1070 | | /// ranges. |
1071 | 38.8M | fn parse_class_range( |
1072 | 38.8M | &self, |
1073 | 38.8M | union: &mut Vec<hir::ClassRange>, |
1074 | 38.8M | ) -> Result<(), Error> { |
1075 | 38.8M | let prim1 = self.parse_class_item()?; |
1076 | 38.8M | self.bump_space(); |
1077 | 38.8M | if self.is_done() { |
1078 | 207 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM)); |
1079 | 38.8M | } |
1080 | | // If the next char isn't a `-`, then we don't have a range. |
1081 | | // There are two exceptions. If the char after a `-` is a `]`, then |
1082 | | // `-` is interpreted as a literal `-`. Alternatively, if the char |
1083 | | // after a `-` is a `-`, then `--` corresponds to a "difference" |
1084 | | // operation. (Which we don't support in regex-lite, but error about |
1085 | | // specifically in an effort to be loud about differences between the |
1086 | | // main regex crate where possible.) |
1087 | 38.8M | if self.char() != '-' |
1088 | 200k | || self.peek_space() == Some(']') |
1089 | 137k | || self.peek_space() == Some('-') |
1090 | | { |
1091 | 38.7M | union.extend_from_slice(&into_class_item_ranges(prim1)?); |
1092 | 38.7M | return Ok(()); |
1093 | 136k | } |
1094 | | // OK, now we're parsing a range, so bump past the `-` and parse the |
1095 | | // second half of the range. |
1096 | 136k | if !self.bump_and_bump_space() { |
1097 | 63 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
1098 | 136k | } |
1099 | 136k | let prim2 = self.parse_class_item()?; |
1100 | 136k | let range = hir::ClassRange { |
1101 | 136k | start: into_class_item_range(prim1)?, |
1102 | 136k | end: into_class_item_range(prim2)?, |
1103 | | }; |
1104 | 136k | if range.start > range.end { |
1105 | 39 | return Err(Error::new(ERR_CLASS_INVALID_RANGE)); |
1106 | 136k | } |
1107 | 136k | union.push(range); |
1108 | 136k | Ok(()) |
1109 | 38.8M | } |
1110 | | |
1111 | | /// Parse a single item in a character class as a primitive, where the |
1112 | | /// primitive either consists of a verbatim literal or a single escape |
1113 | | /// sequence. |
1114 | | /// |
1115 | | /// This assumes the parser is positioned at the beginning of a primitive, |
1116 | | /// and advances the parser to the first position after the primitive if |
1117 | | /// successful. |
1118 | | /// |
1119 | | /// Note that it is the caller's responsibility to report an error if an |
1120 | | /// illegal primitive was parsed. |
1121 | 39.0M | fn parse_class_item(&self) -> Result<Hir, Error> { |
1122 | 39.0M | let ch = self.char(); |
1123 | 39.0M | self.bump(); |
1124 | 39.0M | if ch == '\\' { |
1125 | 17.5M | self.parse_escape() |
1126 | | } else { |
1127 | 21.4M | Ok(Hir::char(ch)) |
1128 | | } |
1129 | 39.0M | } |
1130 | | |
1131 | | /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`. |
1132 | | /// |
1133 | | /// This assumes the parser is positioned at the opening `[`. |
1134 | | /// |
1135 | | /// If no valid POSIX character class could be found, then this does not |
1136 | | /// advance the parser and `None` is returned. Otherwise, the parser is |
1137 | | /// advanced to the first byte following the closing `]` and the |
1138 | | /// corresponding POSIX class is returned. |
1139 | 9.94k | fn maybe_parse_posix_class(&self) -> Option<hir::Class> { |
1140 | | // POSIX character classes are interesting from a parsing perspective |
1141 | | // because parsing cannot fail with any interesting error. For example, |
1142 | | // in order to use an POSIX character class, it must be enclosed in |
1143 | | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
1144 | | // of it as "POSIX character classes have the syntax `[:NAME:]` which |
1145 | | // can only appear within character brackets." This means that things |
1146 | | // like `[[:lower:]A]` are legal constructs. |
1147 | | // |
1148 | | // However, if one types an incorrect POSIX character class, e.g., |
1149 | | // `[[:loower:]]`, then we treat that as if it were normal nested |
1150 | | // character class containing the characters `:elorw`. (Which isn't |
1151 | | // supported and results in an error in regex-lite.) One might argue |
1152 | | // that we should return an error instead since the repeated colons |
1153 | | // give away the intent to write an POSIX class. But what if the user |
1154 | | // typed `[[:lower]]` instead? How can we tell that was intended to be |
1155 | | // a POSIX class and not just a normal nested class? |
1156 | | // |
1157 | | // Reasonable people can probably disagree over this, but for better |
1158 | | // or worse, we implement semantics that never fails at the expense of |
1159 | | // better failure modes. |
1160 | 9.94k | assert_eq!(self.char(), '['); |
1161 | | |
1162 | | // If parsing fails, then we back up the parser to this starting point. |
1163 | 9.94k | let start_pos = self.pos(); |
1164 | 9.94k | let start_char = self.char.get(); |
1165 | 9.94k | let reset = || { |
1166 | 163 | self.pos.set(start_pos); |
1167 | 163 | self.char.set(start_char); |
1168 | 163 | }; |
1169 | | |
1170 | 9.94k | let mut negated = false; |
1171 | 9.94k | if !self.bump() || self.char() != ':' { |
1172 | 51 | reset(); |
1173 | 51 | return None; |
1174 | 9.89k | } |
1175 | 9.89k | if !self.bump() { |
1176 | 4 | reset(); |
1177 | 4 | return None; |
1178 | 9.89k | } |
1179 | 9.89k | if self.char() == '^' { |
1180 | 2.54k | negated = true; |
1181 | 2.54k | if !self.bump() { |
1182 | 1 | reset(); |
1183 | 1 | return None; |
1184 | 2.54k | } |
1185 | 7.34k | } |
1186 | 9.89k | let name_start = self.pos(); |
1187 | 64.1k | while self.char() != ':' && self.bump() {} |
1188 | 9.89k | if self.is_done() { |
1189 | 57 | reset(); |
1190 | 57 | return None; |
1191 | 9.83k | } |
1192 | 9.83k | let name = &self.pattern()[name_start..self.pos()]; |
1193 | 9.83k | if !self.bump_if(":]") { |
1194 | 25 | reset(); |
1195 | 25 | return None; |
1196 | 9.80k | } |
1197 | 9.80k | if let Ok(ranges) = posix_class(name) { |
1198 | 9.78k | let mut class = hir::Class::new(ranges); |
1199 | 9.78k | if negated { |
1200 | 2.53k | class.negate(); |
1201 | 7.24k | } |
1202 | 9.78k | return Some(class); |
1203 | 25 | } |
1204 | 25 | reset(); |
1205 | 25 | None |
1206 | 9.94k | } |
1207 | | |
1208 | | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
1209 | | /// parser is currently at a valid character class name and will be |
1210 | | /// advanced to the character immediately following the class. |
1211 | 16.8M | fn parse_perl_class(&self) -> Hir { |
1212 | 16.8M | let ch = self.char(); |
1213 | 16.8M | self.bump(); |
1214 | 16.8M | let mut class = hir::Class::new(match ch { |
1215 | 2.44k | 'd' | 'D' => posix_class("digit").unwrap(), |
1216 | 10.9k | 's' | 'S' => posix_class("space").unwrap(), |
1217 | 16.8M | 'w' | 'W' => posix_class("word").unwrap(), |
1218 | 0 | unk => unreachable!("invalid Perl class \\{unk}"), |
1219 | | }); |
1220 | 16.8M | if ch.is_ascii_uppercase() { |
1221 | 16.7M | class.negate(); |
1222 | 16.7M | } |
1223 | 16.8M | Hir::class(class) |
1224 | 16.8M | } |
1225 | | |
1226 | 27.0k | fn hir_dot(&self) -> Hir { |
1227 | 27.0k | if self.flags().dot_matches_new_line { |
1228 | 3.11k | Hir::class(hir::Class::new([hir::ClassRange { |
1229 | 3.11k | start: '\x00', |
1230 | 3.11k | end: '\u{10FFFF}', |
1231 | 3.11k | }])) |
1232 | 23.9k | } else if self.flags().crlf { |
1233 | 1.93k | Hir::class(hir::Class::new([ |
1234 | 1.93k | hir::ClassRange { start: '\x00', end: '\x09' }, |
1235 | 1.93k | hir::ClassRange { start: '\x0B', end: '\x0C' }, |
1236 | 1.93k | hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' }, |
1237 | 1.93k | ])) |
1238 | | } else { |
1239 | 21.9k | Hir::class(hir::Class::new([ |
1240 | 21.9k | hir::ClassRange { start: '\x00', end: '\x09' }, |
1241 | 21.9k | hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' }, |
1242 | 21.9k | ])) |
1243 | | } |
1244 | 27.0k | } |
1245 | | |
1246 | 12.2k | fn hir_anchor_start(&self) -> Hir { |
1247 | 12.2k | let look = if self.flags().multi_line { |
1248 | 2.70k | if self.flags().crlf { |
1249 | 1.60k | hir::Look::StartCRLF |
1250 | | } else { |
1251 | 1.10k | hir::Look::StartLF |
1252 | | } |
1253 | | } else { |
1254 | 9.55k | hir::Look::Start |
1255 | | }; |
1256 | 12.2k | Hir::look(look) |
1257 | 12.2k | } |
1258 | | |
1259 | 27.8k | fn hir_anchor_end(&self) -> Hir { |
1260 | 27.8k | let look = if self.flags().multi_line { |
1261 | 10.5k | if self.flags().crlf { |
1262 | 4.21k | hir::Look::EndCRLF |
1263 | | } else { |
1264 | 6.32k | hir::Look::EndLF |
1265 | | } |
1266 | | } else { |
1267 | 17.3k | hir::Look::End |
1268 | | }; |
1269 | 27.8k | Hir::look(look) |
1270 | 27.8k | } |
1271 | | |
1272 | 13.6M | fn hir_char(&self, ch: char) -> Hir { |
1273 | 13.6M | if self.flags().case_insensitive { |
1274 | 5.95M | let this = hir::ClassRange { start: ch, end: ch }; |
1275 | 5.95M | if let Some(folded) = this.ascii_case_fold() { |
1276 | 4.06M | return Hir::class(hir::Class::new([this, folded])); |
1277 | 1.89M | } |
1278 | 7.68M | } |
1279 | 9.57M | Hir::char(ch) |
1280 | 13.6M | } |
1281 | | } |
1282 | | |
1283 | | /// This checks the depth of the given `Hir` value, and if it exceeds the given |
1284 | | /// limit, then an error is returned. |
1285 | 5.85k | fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { |
1286 | 6.60M | fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { |
1287 | 6.60M | if depth > limit { |
1288 | 11 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
1289 | 6.60M | } |
1290 | 6.60M | let Some(next_depth) = depth.checked_add(1) else { |
1291 | 0 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
1292 | | }; |
1293 | 6.60M | match *hir.kind() { |
1294 | | HirKind::Empty |
1295 | | | HirKind::Char(_) |
1296 | | | HirKind::Class(_) |
1297 | 6.51M | | HirKind::Look(_) => Ok(()), |
1298 | 33.4k | HirKind::Repetition(hir::Repetition { ref sub, .. }) => { |
1299 | 33.4k | recurse(sub, limit, next_depth) |
1300 | | } |
1301 | 26.7k | HirKind::Capture(hir::Capture { ref sub, .. }) => { |
1302 | 26.7k | recurse(sub, limit, next_depth) |
1303 | | } |
1304 | 26.6k | HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { |
1305 | 6.53M | for sub in subs.iter() { |
1306 | 6.53M | recurse(sub, limit, next_depth)?; |
1307 | | } |
1308 | 28.2k | Ok(()) |
1309 | | } |
1310 | | } |
1311 | 6.60M | } |
1312 | 5.85k | recurse(hir, limit, 0) |
1313 | 5.85k | } |
1314 | | |
1315 | | /// Converts the given Hir to a literal char if the Hir is just a single |
1316 | | /// character. Otherwise this returns an error. |
1317 | | /// |
1318 | | /// This is useful in contexts where you can only accept a single character, |
1319 | | /// but where it is convenient to parse something more general. For example, |
1320 | | /// parsing a single part of a character class range. It's useful to reuse |
1321 | | /// the literal parsing code, but that code can itself return entire classes |
1322 | | /// which can't be used as the start/end of a class range. |
1323 | 272k | fn into_class_item_range(hir: Hir) -> Result<char, Error> { |
1324 | 272k | match hir.kind { |
1325 | 272k | HirKind::Char(ch) => Ok(ch), |
1326 | 3 | _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)), |
1327 | | } |
1328 | 272k | } |
1329 | | |
1330 | 38.7M | fn into_class_item_ranges( |
1331 | 38.7M | mut hir: Hir, |
1332 | 38.7M | ) -> Result<Vec<hir::ClassRange>, Error> { |
1333 | 38.7M | match core::mem::replace(&mut hir.kind, HirKind::Empty) { |
1334 | 21.9M | HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), |
1335 | 16.8M | HirKind::Class(hir::Class { ranges }) => Ok(ranges), |
1336 | 2 | _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), |
1337 | | } |
1338 | 38.7M | } |
1339 | | |
1340 | | /// Returns an iterator of character class ranges for the given named POSIX |
1341 | | /// character class. If no such character class exists for the name given, then |
1342 | | /// an error is returned. |
1343 | 16.8M | fn posix_class( |
1344 | 16.8M | kind: &str, |
1345 | 16.8M | ) -> Result<impl Iterator<Item = hir::ClassRange>, Error> { |
1346 | 16.8M | let slice: &'static [(u8, u8)] = match kind { |
1347 | 16.8M | "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], |
1348 | 16.8M | "alpha" => &[(b'A', b'Z'), (b'a', b'z')], |
1349 | 16.8M | "ascii" => &[(b'\x00', b'\x7F')], |
1350 | 16.8M | "blank" => &[(b'\t', b'\t'), (b' ', b' ')], |
1351 | 16.8M | "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], |
1352 | 16.8M | "digit" => &[(b'0', b'9')], |
1353 | 16.8M | "graph" => &[(b'!', b'~')], |
1354 | 16.8M | "lower" => &[(b'a', b'z')], |
1355 | 16.8M | "print" => &[(b' ', b'~')], |
1356 | 16.8M | "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], |
1357 | 16.8M | "space" => &[ |
1358 | 10.9k | (b'\t', b'\t'), |
1359 | 10.9k | (b'\n', b'\n'), |
1360 | 10.9k | (b'\x0B', b'\x0B'), |
1361 | 10.9k | (b'\x0C', b'\x0C'), |
1362 | 10.9k | (b'\r', b'\r'), |
1363 | 10.9k | (b' ', b' '), |
1364 | 10.9k | ], |
1365 | 16.8M | "upper" => &[(b'A', b'Z')], |
1366 | 16.8M | "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], |
1367 | 5.07k | "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], |
1368 | 25 | _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)), |
1369 | | }; |
1370 | 16.8M | Ok(slice.iter().map(|&(start, end)| hir::ClassRange { |
1371 | 67.3M | start: char::from(start), |
1372 | 67.3M | end: char::from(end), |
1373 | 67.3M | })) |
1374 | 16.8M | } |
1375 | | |
1376 | | /// Returns true if the given character is a hexadecimal digit. |
1377 | 355k | fn is_hex(c: char) -> bool { |
1378 | 355k | ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') |
1379 | 355k | } |
1380 | | |
1381 | | /// Returns true if the given character is a valid in a capture group name. |
1382 | | /// |
1383 | | /// If `first` is true, then `c` is treated as the first character in the |
1384 | | /// group name (which must be alphabetic or underscore). |
1385 | 3.55M | fn is_capture_char(c: char, first: bool) -> bool { |
1386 | 3.55M | if first { |
1387 | 9.11k | c == '_' || c.is_alphabetic() |
1388 | | } else { |
1389 | 3.54M | c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() |
1390 | | } |
1391 | 3.55M | } |
1392 | | |
1393 | | #[cfg(test)] |
1394 | | mod tests { |
1395 | | use super::*; |
1396 | | |
1397 | | fn p(pattern: &str) -> Hir { |
1398 | | Parser::new(Config::default(), pattern).parse_inner().unwrap() |
1399 | | } |
1400 | | |
1401 | | fn perr(pattern: &str) -> String { |
1402 | | Parser::new(Config::default(), pattern) |
1403 | | .parse_inner() |
1404 | | .unwrap_err() |
1405 | | .to_string() |
1406 | | } |
1407 | | |
1408 | | fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir { |
1409 | | Hir::class(hir::Class::new( |
1410 | | it.into_iter().map(|(start, end)| hir::ClassRange { start, end }), |
1411 | | )) |
1412 | | } |
1413 | | |
1414 | | fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir { |
1415 | | Hir::class(hir::Class::new( |
1416 | | it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }), |
1417 | | )) |
1418 | | } |
1419 | | |
1420 | | fn posix(name: &str) -> Hir { |
1421 | | Hir::class(hir::Class::new(posix_class(name).unwrap())) |
1422 | | } |
1423 | | |
1424 | | fn cap(index: u32, sub: Hir) -> Hir { |
1425 | | Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) }) |
1426 | | } |
1427 | | |
1428 | | fn named_cap(index: u32, name: &str, sub: Hir) -> Hir { |
1429 | | Hir::capture(hir::Capture { |
1430 | | index, |
1431 | | name: Some(Box::from(name)), |
1432 | | sub: Box::new(sub), |
1433 | | }) |
1434 | | } |
1435 | | |
1436 | | #[test] |
1437 | | fn ok_literal() { |
1438 | | assert_eq!(p("a"), Hir::char('a')); |
1439 | | assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')])); |
1440 | | assert_eq!(p("💩"), Hir::char('💩')); |
1441 | | } |
1442 | | |
1443 | | #[test] |
1444 | | fn ok_meta_escapes() { |
1445 | | assert_eq!(p(r"\*"), Hir::char('*')); |
1446 | | assert_eq!(p(r"\+"), Hir::char('+')); |
1447 | | assert_eq!(p(r"\?"), Hir::char('?')); |
1448 | | assert_eq!(p(r"\|"), Hir::char('|')); |
1449 | | assert_eq!(p(r"\("), Hir::char('(')); |
1450 | | assert_eq!(p(r"\)"), Hir::char(')')); |
1451 | | assert_eq!(p(r"\^"), Hir::char('^')); |
1452 | | assert_eq!(p(r"\$"), Hir::char('$')); |
1453 | | assert_eq!(p(r"\["), Hir::char('[')); |
1454 | | assert_eq!(p(r"\]"), Hir::char(']')); |
1455 | | } |
1456 | | |
1457 | | #[test] |
1458 | | fn ok_special_escapes() { |
1459 | | assert_eq!(p(r"\a"), Hir::char('\x07')); |
1460 | | assert_eq!(p(r"\f"), Hir::char('\x0C')); |
1461 | | assert_eq!(p(r"\t"), Hir::char('\t')); |
1462 | | assert_eq!(p(r"\n"), Hir::char('\n')); |
1463 | | assert_eq!(p(r"\r"), Hir::char('\r')); |
1464 | | assert_eq!(p(r"\v"), Hir::char('\x0B')); |
1465 | | assert_eq!(p(r"\A"), Hir::look(hir::Look::Start)); |
1466 | | assert_eq!(p(r"\z"), Hir::look(hir::Look::End)); |
1467 | | assert_eq!(p(r"\b"), Hir::look(hir::Look::Word)); |
1468 | | assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate)); |
1469 | | } |
1470 | | |
1471 | | #[test] |
1472 | | fn ok_hex() { |
1473 | | // fixed length |
1474 | | assert_eq!(p(r"\x41"), Hir::char('A')); |
1475 | | assert_eq!(p(r"\u2603"), Hir::char('☃')); |
1476 | | assert_eq!(p(r"\U0001F4A9"), Hir::char('💩')); |
1477 | | // braces |
1478 | | assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩')); |
1479 | | assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩')); |
1480 | | assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩')); |
1481 | | } |
1482 | | |
1483 | | #[test] |
1484 | | fn ok_perl() { |
1485 | | assert_eq!(p(r"\d"), posix("digit")); |
1486 | | assert_eq!(p(r"\s"), posix("space")); |
1487 | | assert_eq!(p(r"\w"), posix("word")); |
1488 | | |
1489 | | let negated = |name| { |
1490 | | let mut class = hir::Class::new(posix_class(name).unwrap()); |
1491 | | class.negate(); |
1492 | | Hir::class(class) |
1493 | | }; |
1494 | | assert_eq!(p(r"\D"), negated("digit")); |
1495 | | assert_eq!(p(r"\S"), negated("space")); |
1496 | | assert_eq!(p(r"\W"), negated("word")); |
1497 | | } |
1498 | | |
1499 | | #[test] |
1500 | | fn ok_flags_and_primitives() { |
1501 | | assert_eq!(p(r"a"), Hir::char('a')); |
1502 | | assert_eq!(p(r"(?i:a)"), singles(['A', 'a'])); |
1503 | | |
1504 | | assert_eq!(p(r"^"), Hir::look(hir::Look::Start)); |
1505 | | assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF)); |
1506 | | assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF)); |
1507 | | |
1508 | | assert_eq!(p(r"$"), Hir::look(hir::Look::End)); |
1509 | | assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF)); |
1510 | | assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF)); |
1511 | | |
1512 | | assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')])); |
1513 | | assert_eq!( |
1514 | | p(r"(?R:.)"), |
1515 | | class([ |
1516 | | ('\x00', '\x09'), |
1517 | | ('\x0B', '\x0C'), |
1518 | | ('\x0E', '\u{10FFFF}'), |
1519 | | ]) |
1520 | | ); |
1521 | | assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')])); |
1522 | | assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')])); |
1523 | | } |
1524 | | |
1525 | | #[test] |
1526 | | fn ok_alternate() { |
1527 | | assert_eq!( |
1528 | | p(r"a|b"), |
1529 | | Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) |
1530 | | ); |
1531 | | assert_eq!( |
1532 | | p(r"(?:a|b)"), |
1533 | | Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) |
1534 | | ); |
1535 | | |
1536 | | assert_eq!( |
1537 | | p(r"(a|b)"), |
1538 | | cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')])) |
1539 | | ); |
1540 | | assert_eq!( |
1541 | | p(r"(?<foo>a|b)"), |
1542 | | named_cap( |
1543 | | 1, |
1544 | | "foo", |
1545 | | Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) |
1546 | | ) |
1547 | | ); |
1548 | | |
1549 | | assert_eq!( |
1550 | | p(r"a|b|c"), |
1551 | | Hir::alternation(vec![ |
1552 | | Hir::char('a'), |
1553 | | Hir::char('b'), |
1554 | | Hir::char('c') |
1555 | | ]) |
1556 | | ); |
1557 | | |
1558 | | assert_eq!( |
1559 | | p(r"ax|by|cz"), |
1560 | | Hir::alternation(vec![ |
1561 | | Hir::concat(vec![Hir::char('a'), Hir::char('x')]), |
1562 | | Hir::concat(vec![Hir::char('b'), Hir::char('y')]), |
1563 | | Hir::concat(vec![Hir::char('c'), Hir::char('z')]), |
1564 | | ]) |
1565 | | ); |
1566 | | assert_eq!( |
1567 | | p(r"(ax|(by|(cz)))"), |
1568 | | cap( |
1569 | | 1, |
1570 | | Hir::alternation(vec![ |
1571 | | Hir::concat(vec![Hir::char('a'), Hir::char('x')]), |
1572 | | cap( |
1573 | | 2, |
1574 | | Hir::alternation(vec![ |
1575 | | Hir::concat(vec![Hir::char('b'), Hir::char('y')]), |
1576 | | cap( |
1577 | | 3, |
1578 | | Hir::concat(vec![ |
1579 | | Hir::char('c'), |
1580 | | Hir::char('z') |
1581 | | ]) |
1582 | | ), |
1583 | | ]) |
1584 | | ), |
1585 | | ]) |
1586 | | ) |
1587 | | ); |
1588 | | |
1589 | | assert_eq!( |
1590 | | p(r"|"), |
1591 | | Hir::alternation(vec![Hir::empty(), Hir::empty()]) |
1592 | | ); |
1593 | | assert_eq!( |
1594 | | p(r"||"), |
1595 | | Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) |
1596 | | ); |
1597 | | |
1598 | | assert_eq!( |
1599 | | p(r"a|"), |
1600 | | Hir::alternation(vec![Hir::char('a'), Hir::empty()]) |
1601 | | ); |
1602 | | assert_eq!( |
1603 | | p(r"|a"), |
1604 | | Hir::alternation(vec![Hir::empty(), Hir::char('a')]) |
1605 | | ); |
1606 | | |
1607 | | assert_eq!( |
1608 | | p(r"(|)"), |
1609 | | cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) |
1610 | | ); |
1611 | | assert_eq!( |
1612 | | p(r"(a|)"), |
1613 | | cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()])) |
1614 | | ); |
1615 | | assert_eq!( |
1616 | | p(r"(|a)"), |
1617 | | cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')])) |
1618 | | ); |
1619 | | } |
1620 | | |
1621 | | #[test] |
1622 | | fn ok_flag_group() { |
1623 | | assert_eq!( |
1624 | | p("a(?i:b)"), |
1625 | | Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) |
1626 | | ); |
1627 | | } |
1628 | | |
1629 | | #[test] |
1630 | | fn ok_flag_directive() { |
1631 | | assert_eq!(p("(?i)a"), singles(['A', 'a'])); |
1632 | | assert_eq!(p("a(?i)"), Hir::char('a')); |
1633 | | assert_eq!( |
1634 | | p("a(?i)b"), |
1635 | | Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) |
1636 | | ); |
1637 | | assert_eq!( |
1638 | | p("a(?i)a(?-i)a"), |
1639 | | Hir::concat(vec![ |
1640 | | Hir::char('a'), |
1641 | | singles(['A', 'a']), |
1642 | | Hir::char('a'), |
1643 | | ]) |
1644 | | ); |
1645 | | assert_eq!( |
1646 | | p("a(?:(?i)a)a"), |
1647 | | Hir::concat(vec![ |
1648 | | Hir::char('a'), |
1649 | | singles(['A', 'a']), |
1650 | | Hir::char('a'), |
1651 | | ]) |
1652 | | ); |
1653 | | assert_eq!( |
1654 | | p("a((?i)a)a"), |
1655 | | Hir::concat(vec![ |
1656 | | Hir::char('a'), |
1657 | | cap(1, singles(['A', 'a'])), |
1658 | | Hir::char('a'), |
1659 | | ]) |
1660 | | ); |
1661 | | } |
1662 | | |
1663 | | #[test] |
1664 | | fn ok_uncounted_repetition() { |
1665 | | assert_eq!( |
1666 | | p(r"a?"), |
1667 | | Hir::repetition(hir::Repetition { |
1668 | | min: 0, |
1669 | | max: Some(1), |
1670 | | greedy: true, |
1671 | | sub: Box::new(Hir::char('a')), |
1672 | | }), |
1673 | | ); |
1674 | | assert_eq!( |
1675 | | p(r"a*"), |
1676 | | Hir::repetition(hir::Repetition { |
1677 | | min: 0, |
1678 | | max: None, |
1679 | | greedy: true, |
1680 | | sub: Box::new(Hir::char('a')), |
1681 | | }), |
1682 | | ); |
1683 | | assert_eq!( |
1684 | | p(r"a+"), |
1685 | | Hir::repetition(hir::Repetition { |
1686 | | min: 1, |
1687 | | max: None, |
1688 | | greedy: true, |
1689 | | sub: Box::new(Hir::char('a')), |
1690 | | }), |
1691 | | ); |
1692 | | |
1693 | | assert_eq!( |
1694 | | p(r"a??"), |
1695 | | Hir::repetition(hir::Repetition { |
1696 | | min: 0, |
1697 | | max: Some(1), |
1698 | | greedy: false, |
1699 | | sub: Box::new(Hir::char('a')), |
1700 | | }), |
1701 | | ); |
1702 | | assert_eq!( |
1703 | | p(r"a*?"), |
1704 | | Hir::repetition(hir::Repetition { |
1705 | | min: 0, |
1706 | | max: None, |
1707 | | greedy: false, |
1708 | | sub: Box::new(Hir::char('a')), |
1709 | | }), |
1710 | | ); |
1711 | | assert_eq!( |
1712 | | p(r"a+?"), |
1713 | | Hir::repetition(hir::Repetition { |
1714 | | min: 1, |
1715 | | max: None, |
1716 | | greedy: false, |
1717 | | sub: Box::new(Hir::char('a')), |
1718 | | }), |
1719 | | ); |
1720 | | |
1721 | | assert_eq!( |
1722 | | p(r"a?b"), |
1723 | | Hir::concat(vec![ |
1724 | | Hir::repetition(hir::Repetition { |
1725 | | min: 0, |
1726 | | max: Some(1), |
1727 | | greedy: true, |
1728 | | sub: Box::new(Hir::char('a')), |
1729 | | }), |
1730 | | Hir::char('b'), |
1731 | | ]), |
1732 | | ); |
1733 | | |
1734 | | assert_eq!( |
1735 | | p(r"ab?"), |
1736 | | Hir::concat(vec![ |
1737 | | Hir::char('a'), |
1738 | | Hir::repetition(hir::Repetition { |
1739 | | min: 0, |
1740 | | max: Some(1), |
1741 | | greedy: true, |
1742 | | sub: Box::new(Hir::char('b')), |
1743 | | }), |
1744 | | ]), |
1745 | | ); |
1746 | | |
1747 | | assert_eq!( |
1748 | | p(r"(?:ab)?"), |
1749 | | Hir::repetition(hir::Repetition { |
1750 | | min: 0, |
1751 | | max: Some(1), |
1752 | | greedy: true, |
1753 | | sub: Box::new(Hir::concat(vec![ |
1754 | | Hir::char('a'), |
1755 | | Hir::char('b') |
1756 | | ])), |
1757 | | }), |
1758 | | ); |
1759 | | |
1760 | | assert_eq!( |
1761 | | p(r"(ab)?"), |
1762 | | Hir::repetition(hir::Repetition { |
1763 | | min: 0, |
1764 | | max: Some(1), |
1765 | | greedy: true, |
1766 | | sub: Box::new(cap( |
1767 | | 1, |
1768 | | Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
1769 | | )), |
1770 | | }), |
1771 | | ); |
1772 | | |
1773 | | assert_eq!( |
1774 | | p(r"|a?"), |
1775 | | Hir::alternation(vec![ |
1776 | | Hir::empty(), |
1777 | | Hir::repetition(hir::Repetition { |
1778 | | min: 0, |
1779 | | max: Some(1), |
1780 | | greedy: true, |
1781 | | sub: Box::new(Hir::char('a')), |
1782 | | }) |
1783 | | ]), |
1784 | | ); |
1785 | | } |
1786 | | |
1787 | | #[test] |
1788 | | fn ok_counted_repetition() { |
1789 | | assert_eq!( |
1790 | | p(r"a{5}"), |
1791 | | Hir::repetition(hir::Repetition { |
1792 | | min: 5, |
1793 | | max: Some(5), |
1794 | | greedy: true, |
1795 | | sub: Box::new(Hir::char('a')), |
1796 | | }), |
1797 | | ); |
1798 | | assert_eq!( |
1799 | | p(r"a{5}?"), |
1800 | | Hir::repetition(hir::Repetition { |
1801 | | min: 5, |
1802 | | max: Some(5), |
1803 | | greedy: false, |
1804 | | sub: Box::new(Hir::char('a')), |
1805 | | }), |
1806 | | ); |
1807 | | |
1808 | | assert_eq!( |
1809 | | p(r"a{5,}"), |
1810 | | Hir::repetition(hir::Repetition { |
1811 | | min: 5, |
1812 | | max: None, |
1813 | | greedy: true, |
1814 | | sub: Box::new(Hir::char('a')), |
1815 | | }), |
1816 | | ); |
1817 | | |
1818 | | assert_eq!( |
1819 | | p(r"a{5,9}"), |
1820 | | Hir::repetition(hir::Repetition { |
1821 | | min: 5, |
1822 | | max: Some(9), |
1823 | | greedy: true, |
1824 | | sub: Box::new(Hir::char('a')), |
1825 | | }), |
1826 | | ); |
1827 | | |
1828 | | assert_eq!( |
1829 | | p(r"ab{5}c"), |
1830 | | Hir::concat(vec![ |
1831 | | Hir::char('a'), |
1832 | | Hir::repetition(hir::Repetition { |
1833 | | min: 5, |
1834 | | max: Some(5), |
1835 | | greedy: true, |
1836 | | sub: Box::new(Hir::char('b')), |
1837 | | }), |
1838 | | Hir::char('c'), |
1839 | | ]), |
1840 | | ); |
1841 | | |
1842 | | assert_eq!( |
1843 | | p(r"a{ 5 }"), |
1844 | | Hir::repetition(hir::Repetition { |
1845 | | min: 5, |
1846 | | max: Some(5), |
1847 | | greedy: true, |
1848 | | sub: Box::new(Hir::char('a')), |
1849 | | }), |
1850 | | ); |
1851 | | assert_eq!( |
1852 | | p(r"a{ 5 , 9 }"), |
1853 | | Hir::repetition(hir::Repetition { |
1854 | | min: 5, |
1855 | | max: Some(9), |
1856 | | greedy: true, |
1857 | | sub: Box::new(Hir::char('a')), |
1858 | | }), |
1859 | | ); |
1860 | | } |
1861 | | |
1862 | | #[test] |
1863 | | fn ok_group_unnamed() { |
1864 | | assert_eq!(p("(a)"), cap(1, Hir::char('a'))); |
1865 | | assert_eq!( |
1866 | | p("(ab)"), |
1867 | | cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')])) |
1868 | | ); |
1869 | | } |
1870 | | |
1871 | | #[test] |
1872 | | fn ok_group_named() { |
1873 | | assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a'))); |
1874 | | assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a'))); |
1875 | | |
1876 | | assert_eq!( |
1877 | | p("(?P<foo>ab)"), |
1878 | | named_cap( |
1879 | | 1, |
1880 | | "foo", |
1881 | | Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
1882 | | ) |
1883 | | ); |
1884 | | assert_eq!( |
1885 | | p("(?<foo>ab)"), |
1886 | | named_cap( |
1887 | | 1, |
1888 | | "foo", |
1889 | | Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
1890 | | ) |
1891 | | ); |
1892 | | |
1893 | | assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z'))); |
1894 | | assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z'))); |
1895 | | |
1896 | | assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z'))); |
1897 | | assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z'))); |
1898 | | |
1899 | | assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z'))); |
1900 | | assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z'))); |
1901 | | |
1902 | | assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z'))); |
1903 | | assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z'))); |
1904 | | |
1905 | | assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z'))); |
1906 | | assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z'))); |
1907 | | |
1908 | | assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z'))); |
1909 | | assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z'))); |
1910 | | } |
1911 | | |
1912 | | #[test] |
1913 | | fn ok_class() { |
1914 | | assert_eq!(p(r"[a]"), singles(['a'])); |
1915 | | assert_eq!(p(r"[a\]]"), singles(['a', ']'])); |
1916 | | assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z'])); |
1917 | | assert_eq!(p(r"[ab]"), class([('a', 'b')])); |
1918 | | assert_eq!(p(r"[a-]"), singles(['a', '-'])); |
1919 | | assert_eq!(p(r"[-a]"), singles(['a', '-'])); |
1920 | | assert_eq!(p(r"[--a]"), singles(['a', '-'])); |
1921 | | assert_eq!(p(r"[---a]"), singles(['a', '-'])); |
1922 | | assert_eq!(p(r"[[:alnum:]]"), posix("alnum")); |
1923 | | assert_eq!(p(r"[\w]"), posix("word")); |
1924 | | assert_eq!(p(r"[a\wz]"), posix("word")); |
1925 | | assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')])); |
1926 | | assert_eq!(p(r"[^\s\S]"), Hir::fail()); |
1927 | | assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')])); |
1928 | | assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')])); |
1929 | | assert_eq!(p(r"[]]"), singles([']'])); |
1930 | | assert_eq!(p(r"[]a]"), singles([']', 'a'])); |
1931 | | assert_eq!(p(r"[]\[]"), singles(['[', ']'])); |
1932 | | assert_eq!(p(r"[\[]"), singles(['['])); |
1933 | | |
1934 | | assert_eq!(p(r"(?i)[a]"), singles(['A', 'a'])); |
1935 | | assert_eq!(p(r"(?i)[A]"), singles(['A', 'a'])); |
1936 | | assert_eq!(p(r"(?i)[k]"), singles(['K', 'k'])); |
1937 | | assert_eq!(p(r"(?i)[s]"), singles(['S', 's'])); |
1938 | | assert_eq!(p(r"(?i)[β]"), singles(['β'])); |
1939 | | |
1940 | | assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')])); |
1941 | | assert_eq!( |
1942 | | p(r"[^-a]"), |
1943 | | class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')]) |
1944 | | ); |
1945 | | |
1946 | | assert_eq!( |
1947 | | p(r"[-]a]"), |
1948 | | Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')]) |
1949 | | ); |
1950 | | } |
1951 | | |
1952 | | #[test] |
1953 | | fn ok_verbatim() { |
1954 | | assert_eq!( |
1955 | | p(r"(?x)a{5,9} ?"), |
1956 | | Hir::repetition(hir::Repetition { |
1957 | | min: 5, |
1958 | | max: Some(9), |
1959 | | greedy: false, |
1960 | | sub: Box::new(Hir::char('a')), |
1961 | | }) |
1962 | | ); |
1963 | | assert_eq!(p(r"(?x)[ a]"), singles(['a'])); |
1964 | | assert_eq!( |
1965 | | p(r"(?x)[ ^ a]"), |
1966 | | class([('\x00', '`'), ('b', '\u{10FFFF}')]) |
1967 | | ); |
1968 | | assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-'])); |
1969 | | assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a'])); |
1970 | | |
1971 | | assert_eq!( |
1972 | | p(r"(?x)a b"), |
1973 | | Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
1974 | | ); |
1975 | | assert_eq!( |
1976 | | p(r"(?x)a b(?-x)a b"), |
1977 | | Hir::concat(vec![ |
1978 | | Hir::char('a'), |
1979 | | Hir::char('b'), |
1980 | | Hir::char('a'), |
1981 | | Hir::char(' '), |
1982 | | Hir::char('b'), |
1983 | | ]) |
1984 | | ); |
1985 | | assert_eq!( |
1986 | | p(r"a (?x:a )a "), |
1987 | | Hir::concat(vec![ |
1988 | | Hir::char('a'), |
1989 | | Hir::char(' '), |
1990 | | Hir::char('a'), |
1991 | | Hir::char('a'), |
1992 | | Hir::char(' '), |
1993 | | ]) |
1994 | | ); |
1995 | | assert_eq!( |
1996 | | p(r"(?x)( ?P<foo> a )"), |
1997 | | named_cap(1, "foo", Hir::char('a')), |
1998 | | ); |
1999 | | assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a'))); |
2000 | | assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a')); |
2001 | | assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53')); |
2002 | | assert_eq!(p(r"(?x)\ "), Hir::char(' ')); |
2003 | | } |
2004 | | |
2005 | | #[test] |
2006 | | fn ok_comments() { |
2007 | | let pat = "(?x) |
2008 | | # This is comment 1. |
2009 | | foo # This is comment 2. |
2010 | | # This is comment 3. |
2011 | | bar |
2012 | | # This is comment 4."; |
2013 | | assert_eq!( |
2014 | | p(pat), |
2015 | | Hir::concat(vec![ |
2016 | | Hir::char('f'), |
2017 | | Hir::char('o'), |
2018 | | Hir::char('o'), |
2019 | | Hir::char('b'), |
2020 | | Hir::char('a'), |
2021 | | Hir::char('r'), |
2022 | | ]) |
2023 | | ); |
2024 | | } |
2025 | | |
2026 | | #[test] |
2027 | | fn err_standard() { |
2028 | | assert_eq!( |
2029 | | ERR_TOO_MUCH_NESTING, |
2030 | | perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"), |
2031 | | ); |
2032 | | // This one is tricky, because the only way it can happen is if the |
2033 | | // number of captures overflows u32. Perhaps we should allow setting a |
2034 | | // lower limit? |
2035 | | // assert_eq!(ERR_TOO_MANY_CAPTURES, perr("")); |
2036 | | assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)")); |
2037 | | assert_eq!(ERR_UNCLOSED_GROUP, perr("(")); |
2038 | | assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?")); |
2039 | | assert_eq!(ERR_UNOPENED_GROUP, perr(")")); |
2040 | | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)")); |
2041 | | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)")); |
2042 | | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)")); |
2043 | | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)")); |
2044 | | assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)")); |
2045 | | assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<")); |
2046 | | assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<")); |
2047 | | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)")); |
2048 | | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)")); |
2049 | | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)")); |
2050 | | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)")); |
2051 | | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)")); |
2052 | | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)")); |
2053 | | assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo")); |
2054 | | assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo")); |
2055 | | assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)")); |
2056 | | assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)")); |
2057 | | assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)")); |
2058 | | assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)")); |
2059 | | assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)")); |
2060 | | assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)")); |
2061 | | assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is")); |
2062 | | assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)")); |
2063 | | assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}")); |
2064 | | assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{")); |
2065 | | assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A")); |
2066 | | assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}")); |
2067 | | assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}")); |
2068 | | assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA")); |
2069 | | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ")); |
2070 | | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA")); |
2071 | | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ")); |
2072 | | assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800")); |
2073 | | assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF")); |
2074 | | assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x")); |
2075 | | assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\")); |
2076 | | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0")); |
2077 | | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1")); |
2078 | | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8")); |
2079 | | assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); |
2080 | | assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); |
2081 | | assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); |
2082 | | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); |
2083 | | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); |
2084 | | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); |
2085 | | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)")); |
2086 | | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?")); |
2087 | | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?")); |
2088 | | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}")); |
2089 | | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})")); |
2090 | | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}")); |
2091 | | assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{")); |
2092 | | assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5")); |
2093 | | assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,")); |
2094 | | assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6")); |
2095 | | assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z")); |
2096 | | assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}")); |
2097 | | assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}")); |
2098 | | assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}")); |
2099 | | assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}")); |
2100 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a")); |
2101 | | assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]")); |
2102 | | assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]")); |
2103 | | assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]")); |
2104 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-")); |
2105 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^")); |
2106 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]")); |
2107 | | assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]")); |
2108 | | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[")); |
2109 | | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z")); |
2110 | | assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]")); |
2111 | | assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]")); |
2112 | | assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); |
2113 | | assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); |
2114 | | assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); |
2115 | | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); |
2116 | | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); |
2117 | | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); |
2118 | | assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); |
2119 | | assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); |
2120 | | } |
2121 | | |
2122 | | #[test] |
2123 | | fn err_verbatim() { |
2124 | | // See: https://github.com/rust-lang/regex/issues/792 |
2125 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]")); |
2126 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a ")); |
2127 | | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- ")); |
2128 | | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ ")); |
2129 | | } |
2130 | | |
2131 | | // This tests a bug fix where the nest limit checker wasn't decrementing |
2132 | | // its depth during post-traversal, which causes long regexes to trip |
2133 | | // the default limit too aggressively. |
2134 | | #[test] |
2135 | | fn regression_454_nest_too_big() { |
2136 | | let pattern = r#" |
2137 | | 2(?: |
2138 | | [45]\d{3}| |
2139 | | 7(?: |
2140 | | 1[0-267]| |
2141 | | 2[0-289]| |
2142 | | 3[0-29]| |
2143 | | 4[01]| |
2144 | | 5[1-3]| |
2145 | | 6[013]| |
2146 | | 7[0178]| |
2147 | | 91 |
2148 | | )| |
2149 | | 8(?: |
2150 | | 0[125]| |
2151 | | [139][1-6]| |
2152 | | 2[0157-9]| |
2153 | | 41| |
2154 | | 6[1-35]| |
2155 | | 7[1-5]| |
2156 | | 8[1-8]| |
2157 | | 90 |
2158 | | )| |
2159 | | 9(?: |
2160 | | 0[0-2]| |
2161 | | 1[0-4]| |
2162 | | 2[568]| |
2163 | | 3[3-6]| |
2164 | | 5[5-7]| |
2165 | | 6[0167]| |
2166 | | 7[15]| |
2167 | | 8[0146-9] |
2168 | | ) |
2169 | | )\d{4} |
2170 | | "#; |
2171 | | p(pattern); |
2172 | | } |
2173 | | |
2174 | | // This tests that we treat a trailing `-` in a character class as a |
2175 | | // literal `-` even when whitespace mode is enabled and there is whitespace |
2176 | | // after the trailing `-`. |
2177 | | #[test] |
2178 | | fn regression_455_trailing_dash_ignore_whitespace() { |
2179 | | p("(?x)[ / - ]"); |
2180 | | p("(?x)[ a - ]"); |
2181 | | p("(?x)[ |
2182 | | a |
2183 | | - ] |
2184 | | "); |
2185 | | p("(?x)[ |
2186 | | a # wat |
2187 | | - ] |
2188 | | "); |
2189 | | |
2190 | | perr("(?x)[ / -"); |
2191 | | perr("(?x)[ / - "); |
2192 | | perr( |
2193 | | "(?x)[ |
2194 | | / - |
2195 | | ", |
2196 | | ); |
2197 | | perr( |
2198 | | "(?x)[ |
2199 | | / - # wat |
2200 | | ", |
2201 | | ); |
2202 | | } |
2203 | | |
2204 | | #[test] |
2205 | | fn regression_capture_indices() { |
2206 | | let got = p(r"(a|ab|c|bcd){4,10}(d*)"); |
2207 | | assert_eq!( |
2208 | | got, |
2209 | | Hir::concat(vec![ |
2210 | | Hir::repetition(hir::Repetition { |
2211 | | min: 4, |
2212 | | max: Some(10), |
2213 | | greedy: true, |
2214 | | sub: Box::new(cap( |
2215 | | 1, |
2216 | | Hir::alternation(vec![ |
2217 | | Hir::char('a'), |
2218 | | Hir::concat(vec![Hir::char('a'), Hir::char('b')]), |
2219 | | Hir::char('c'), |
2220 | | Hir::concat(vec![ |
2221 | | Hir::char('b'), |
2222 | | Hir::char('c'), |
2223 | | Hir::char('d') |
2224 | | ]), |
2225 | | ]) |
2226 | | )) |
2227 | | }), |
2228 | | cap( |
2229 | | 2, |
2230 | | Hir::repetition(hir::Repetition { |
2231 | | min: 0, |
2232 | | max: None, |
2233 | | greedy: true, |
2234 | | sub: Box::new(Hir::char('d')), |
2235 | | }) |
2236 | | ), |
2237 | | ]) |
2238 | | ); |
2239 | | } |
2240 | | } |