/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/ast/parse.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | This module provides a regular expression parser. |
3 | | */ |
4 | | |
5 | | use core::{ |
6 | | borrow::Borrow, |
7 | | cell::{Cell, RefCell}, |
8 | | mem, |
9 | | }; |
10 | | |
11 | | use alloc::{ |
12 | | boxed::Box, |
13 | | string::{String, ToString}, |
14 | | vec, |
15 | | vec::Vec, |
16 | | }; |
17 | | |
18 | | use crate::{ |
19 | | ast::{self, Ast, Position, Span}, |
20 | | either::Either, |
21 | | is_escapeable_character, is_meta_character, |
22 | | }; |
23 | | |
24 | | type Result<T> = core::result::Result<T, ast::Error>; |
25 | | |
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal. This is the only primitive accepted by
    /// `into_class_literal`, i.e., the only one usable in a class range.
    Literal(ast::Literal),
    /// An assertion. Rejected when converted into a character class item.
    Assertion(ast::Assertion),
    /// A dot, represented only by its span. Rejected when converted into a
    /// character class item.
    Dot(Span),
    /// A Perl character class (`ast::ClassPerl`).
    Perl(ast::ClassPerl),
    /// A Unicode character class (`ast::ClassUnicode`).
    Unicode(ast::ClassUnicode),
}
40 | | |
41 | | impl Primitive { |
42 | | /// Return the span of this primitive. |
43 | 0 | fn span(&self) -> &Span { |
44 | 0 | match *self { |
45 | 0 | Primitive::Literal(ref x) => &x.span, |
46 | 0 | Primitive::Assertion(ref x) => &x.span, |
47 | 0 | Primitive::Dot(ref span) => span, |
48 | 0 | Primitive::Perl(ref x) => &x.span, |
49 | 0 | Primitive::Unicode(ref x) => &x.span, |
50 | | } |
51 | 0 | } |
52 | | |
53 | | /// Convert this primitive into a proper AST. |
54 | 0 | fn into_ast(self) -> Ast { |
55 | 0 | match self { |
56 | 0 | Primitive::Literal(lit) => Ast::literal(lit), |
57 | 0 | Primitive::Assertion(assert) => Ast::assertion(assert), |
58 | 0 | Primitive::Dot(span) => Ast::dot(span), |
59 | 0 | Primitive::Perl(cls) => Ast::class_perl(cls), |
60 | 0 | Primitive::Unicode(cls) => Ast::class_unicode(cls), |
61 | | } |
62 | 0 | } |
63 | | |
64 | | /// Convert this primitive into an item in a character class. |
65 | | /// |
66 | | /// If this primitive is not a legal item (i.e., an assertion or a dot), |
67 | | /// then return an error. |
68 | 0 | fn into_class_set_item<P: Borrow<Parser>>( |
69 | 0 | self, |
70 | 0 | p: &ParserI<'_, P>, |
71 | 0 | ) -> Result<ast::ClassSetItem> { |
72 | | use self::Primitive::*; |
73 | | use crate::ast::ClassSetItem; |
74 | | |
75 | 0 | match self { |
76 | 0 | Literal(lit) => Ok(ClassSetItem::Literal(lit)), |
77 | 0 | Perl(cls) => Ok(ClassSetItem::Perl(cls)), |
78 | 0 | Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), |
79 | 0 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), |
80 | | } |
81 | 0 | } |
82 | | |
83 | | /// Convert this primitive into a literal in a character class. In |
84 | | /// particular, literals are the only valid items that can appear in |
85 | | /// ranges. |
86 | | /// |
87 | | /// If this primitive is not a legal item (i.e., a class, assertion or a |
88 | | /// dot), then return an error. |
89 | 0 | fn into_class_literal<P: Borrow<Parser>>( |
90 | 0 | self, |
91 | 0 | p: &ParserI<'_, P>, |
92 | 0 | ) -> Result<ast::Literal> { |
93 | | use self::Primitive::*; |
94 | | |
95 | 0 | match self { |
96 | 0 | Literal(lit) => Ok(lit), |
97 | 0 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), |
98 | | } |
99 | 0 | } |
100 | | } |
101 | | |
/// Returns true if the given character is an ASCII hexadecimal digit, i.e.,
/// one of `0-9`, `a-f` or `A-F`.
///
/// Non-ASCII digits (for example, fullwidth digits) are never considered
/// hexadecimal.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
106 | | |
/// Returns true if the given character is valid in a capture group name.
///
/// When `first` is true, `c` is judged as the leading character of the
/// name, which must be an underscore or alphabetic. Otherwise `c` may be an
/// underscore, `.`, `[`, `]` or any alphanumeric character.
fn is_capture_char(c: char, first: bool) -> bool {
    // An underscore is accepted in every position.
    if c == '_' {
        return true;
    }
    if first {
        c.is_alphabetic()
    } else {
        matches!(c, '.' | '[' | ']') || c.is_alphanumeric()
    }
}
118 | | |
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
    /// Whether the parser starts out in verbose mode, where insignificant
    /// whitespace and comments are permitted.
    ignore_whitespace: bool,
    /// The nesting limit handed to the parser that is built.
    nest_limit: u32,
    /// Whether octal syntax is supported.
    octal: bool,
    /// Whether `{,n}` is accepted as an equivalent to `{0,n}`.
    empty_min_range: bool,
}
129 | | |
130 | | impl Default for ParserBuilder { |
131 | 0 | fn default() -> ParserBuilder { |
132 | 0 | ParserBuilder::new() |
133 | 0 | } |
134 | | } |
135 | | |
136 | | impl ParserBuilder { |
137 | | /// Create a new parser builder with a default configuration. |
138 | 0 | pub fn new() -> ParserBuilder { |
139 | 0 | ParserBuilder { |
140 | 0 | ignore_whitespace: false, |
141 | 0 | nest_limit: 250, |
142 | 0 | octal: false, |
143 | 0 | empty_min_range: false, |
144 | 0 | } |
145 | 0 | } |
146 | | |
147 | | /// Build a parser from this configuration with the given pattern. |
148 | 0 | pub fn build(&self) -> Parser { |
149 | 0 | Parser { |
150 | 0 | pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), |
151 | 0 | capture_index: Cell::new(0), |
152 | 0 | nest_limit: self.nest_limit, |
153 | 0 | octal: self.octal, |
154 | 0 | empty_min_range: self.empty_min_range, |
155 | 0 | initial_ignore_whitespace: self.ignore_whitespace, |
156 | 0 | ignore_whitespace: Cell::new(self.ignore_whitespace), |
157 | 0 | comments: RefCell::new(vec![]), |
158 | 0 | stack_group: RefCell::new(vec![]), |
159 | 0 | stack_class: RefCell::new(vec![]), |
160 | 0 | capture_names: RefCell::new(vec![]), |
161 | 0 | scratch: RefCell::new(String::new()), |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | | /// Set the nesting limit for this parser. |
166 | | /// |
167 | | /// The nesting limit controls how deep the abstract syntax tree is allowed |
168 | | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
169 | | /// groups), then an error is returned by the parser. |
170 | | /// |
171 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
172 | | /// overflow for consumers that do structural induction on an `Ast` using |
173 | | /// explicit recursion. While this crate never does this (instead using |
174 | | /// constant stack space and moving the call stack to the heap), other |
175 | | /// crates may. |
176 | | /// |
177 | | /// This limit is not checked until the entire AST is parsed. Therefore, |
178 | | /// if callers want to put a limit on the amount of heap space used, then |
179 | | /// they should impose a limit on the length, in bytes, of the concrete |
180 | | /// pattern string. In particular, this is viable since this parser |
181 | | /// implementation will limit itself to heap space proportional to the |
182 | | /// length of the pattern string. |
183 | | /// |
184 | | /// Note that a nest limit of `0` will return a nest limit error for most |
185 | | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
186 | | /// not `ab`, since `ab` requires a concatenation, which results in a nest |
187 | | /// depth of `1`. In general, a nest limit is not something that manifests |
188 | | /// in an obvious way in the concrete syntax, therefore, it should not be |
189 | | /// used in a granular way. |
190 | 0 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { |
191 | 0 | self.nest_limit = limit; |
192 | 0 | self |
193 | 0 | } |
194 | | |
195 | | /// Whether to support octal syntax or not. |
196 | | /// |
197 | | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
198 | | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
199 | | /// `\141` are all equivalent regular expressions, where the last example |
200 | | /// shows octal syntax. |
201 | | /// |
202 | | /// While supporting octal syntax isn't in and of itself a problem, it does |
203 | | /// make good error messages harder. That is, in PCRE based regex engines, |
204 | | /// syntax like `\0` invokes a backreference, which is explicitly |
205 | | /// unsupported in Rust's regex engine. However, many users expect it to |
206 | | /// be supported. Therefore, when octal support is disabled, the error |
207 | | /// message will explicitly mention that backreferences aren't supported. |
208 | | /// |
209 | | /// Octal syntax is disabled by default. |
210 | 0 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { |
211 | 0 | self.octal = yes; |
212 | 0 | self |
213 | 0 | } |
214 | | |
215 | | /// Enable verbose mode in the regular expression. |
216 | | /// |
217 | | /// When enabled, verbose mode permits insignificant whitespace in many |
218 | | /// places in the regular expression, as well as comments. Comments are |
219 | | /// started using `#` and continue until the end of the line. |
220 | | /// |
221 | | /// By default, this is disabled. It may be selectively enabled in the |
222 | | /// regular expression by using the `x` flag regardless of this setting. |
223 | 0 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { |
224 | 0 | self.ignore_whitespace = yes; |
225 | 0 | self |
226 | 0 | } |
227 | | |
228 | | /// Allow using `{,n}` as an equivalent to `{0,n}`. |
229 | | /// |
230 | | /// When enabled, the parser accepts `{,n}` as valid syntax for `{0,n}`. |
231 | | /// Most regular expression engines don't support the `{,n}` syntax, but |
232 | | /// some others do it, namely Python's `re` library. |
233 | | /// |
234 | | /// This is disabled by default. |
235 | 0 | pub fn empty_min_range(&mut self, yes: bool) -> &mut ParserBuilder { |
236 | 0 | self.empty_min_range = yes; |
237 | 0 | self |
238 | 0 | } |
239 | | } |
240 | | |
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
#[derive(Clone, Debug)]
pub struct Parser {
    /// The current position of the parser (offset, line and column).
    pos: Cell<Position>,
    /// The current capture index, i.e., the index most recently handed out
    /// to a capture group. It starts at `0`.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. It is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether the parser supports `{,n}` repetitions as an equivalent to
    /// `{0,n}`.
    empty_min_range: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted. This can change while parsing (e.g., via the `x`
    /// flag), which is why it lives in a `Cell`.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sequence of capture names, kept sorted by name. This is used to
    /// detect duplicate capture names (via binary search) and report an
    /// error if one is detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
284 | | |
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser`'s internal state is not tied to
/// any one pattern, but a `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration. Methods only require that this can
    /// be borrowed as a `Parser` (`P: Borrow<Parser>`).
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
301 | | |
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening
/// parenthesis or an alternation bar `|`.
#[derive(Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// The whitespace-insensitivity (`x` flag) setting that was in
        /// effect before this group was opened. It is restored when the
        /// group is popped.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
322 | | |
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the
    /// stored set becomes the left hand side of the operator.
    Op {
        /// The type of the operation, i.e., `&&`, `--` or `~~`.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
349 | | |
350 | | impl Parser { |
351 | | /// Create a new parser with a default configuration. |
352 | | /// |
353 | | /// The parser can be run with either the `parse` or `parse_with_comments` |
354 | | /// methods. The parse methods return an abstract syntax tree. |
355 | | /// |
356 | | /// To set configuration options on the parser, use [`ParserBuilder`]. |
357 | 0 | pub fn new() -> Parser { |
358 | 0 | ParserBuilder::new().build() |
359 | 0 | } |
360 | | |
361 | | /// Parse the regular expression into an abstract syntax tree. |
362 | 0 | pub fn parse(&mut self, pattern: &str) -> Result<Ast> { |
363 | 0 | ParserI::new(self, pattern).parse() |
364 | 0 | } |
365 | | |
366 | | /// Parse the regular expression and return an abstract syntax tree with |
367 | | /// all of the comments found in the pattern. |
368 | 0 | pub fn parse_with_comments( |
369 | 0 | &mut self, |
370 | 0 | pattern: &str, |
371 | 0 | ) -> Result<ast::WithComments> { |
372 | 0 | ParserI::new(self, pattern).parse_with_comments() |
373 | 0 | } |
374 | | |
375 | | /// Reset the internal state of a parser. |
376 | | /// |
377 | | /// This is called at the beginning of every parse. This prevents the |
378 | | /// parser from running with inconsistent state (say, if a previous |
379 | | /// invocation returned an error and the parser is reused). |
380 | 0 | fn reset(&self) { |
381 | 0 | // These settings should be in line with the construction |
382 | 0 | // in `ParserBuilder::build`. |
383 | 0 | self.pos.set(Position { offset: 0, line: 1, column: 1 }); |
384 | 0 | self.ignore_whitespace.set(self.initial_ignore_whitespace); |
385 | 0 | self.comments.borrow_mut().clear(); |
386 | 0 | self.stack_group.borrow_mut().clear(); |
387 | 0 | self.stack_class.borrow_mut().clear(); |
388 | 0 | } |
389 | | } |
390 | | |
391 | | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { |
392 | | /// Build an internal parser from a parser configuration and a pattern. |
393 | 0 | fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { |
394 | 0 | ParserI { parser, pattern } |
395 | 0 | } |
396 | | |
397 | | /// Return a reference to the parser state. |
398 | 0 | fn parser(&self) -> &Parser { |
399 | 0 | self.parser.borrow() |
400 | 0 | } |
401 | | |
402 | | /// Return a reference to the pattern being parsed. |
403 | 0 | fn pattern(&self) -> &str { |
404 | 0 | self.pattern |
405 | 0 | } |
406 | | |
407 | | /// Create a new error with the given span and error type. |
408 | 0 | fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { |
409 | 0 | ast::Error { kind, pattern: self.pattern().to_string(), span } |
410 | 0 | } |
411 | | |
412 | | /// Return the current offset of the parser. |
413 | | /// |
414 | | /// The offset starts at `0` from the beginning of the regular expression |
415 | | /// pattern string. |
416 | 0 | fn offset(&self) -> usize { |
417 | 0 | self.parser().pos.get().offset |
418 | 0 | } |
419 | | |
420 | | /// Return the current line number of the parser. |
421 | | /// |
422 | | /// The line number starts at `1`. |
423 | 0 | fn line(&self) -> usize { |
424 | 0 | self.parser().pos.get().line |
425 | 0 | } |
426 | | |
427 | | /// Return the current column of the parser. |
428 | | /// |
429 | | /// The column number starts at `1` and is reset whenever a `\n` is seen. |
430 | 0 | fn column(&self) -> usize { |
431 | 0 | self.parser().pos.get().column |
432 | 0 | } |
433 | | |
434 | | /// Return the next capturing index. Each subsequent call increments the |
435 | | /// internal index. |
436 | | /// |
437 | | /// The span given should correspond to the location of the opening |
438 | | /// parenthesis. |
439 | | /// |
440 | | /// If the capture limit is exceeded, then an error is returned. |
441 | 0 | fn next_capture_index(&self, span: Span) -> Result<u32> { |
442 | 0 | let current = self.parser().capture_index.get(); |
443 | 0 | let i = current.checked_add(1).ok_or_else(|| { |
444 | 0 | self.error(span, ast::ErrorKind::CaptureLimitExceeded) |
445 | 0 | })?; |
446 | 0 | self.parser().capture_index.set(i); |
447 | 0 | Ok(i) |
448 | 0 | } |
449 | | |
450 | | /// Adds the given capture name to this parser. If this capture name has |
451 | | /// already been used, then an error is returned. |
452 | 0 | fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { |
453 | 0 | let mut names = self.parser().capture_names.borrow_mut(); |
454 | 0 | match names |
455 | 0 | .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) |
456 | | { |
457 | 0 | Err(i) => { |
458 | 0 | names.insert(i, cap.clone()); |
459 | 0 | Ok(()) |
460 | | } |
461 | 0 | Ok(i) => Err(self.error( |
462 | 0 | cap.span, |
463 | 0 | ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, |
464 | 0 | )), |
465 | | } |
466 | 0 | } |
467 | | |
468 | | /// Return whether the parser should ignore whitespace or not. |
469 | 0 | fn ignore_whitespace(&self) -> bool { |
470 | 0 | self.parser().ignore_whitespace.get() |
471 | 0 | } |
472 | | |
473 | | /// Return the character at the current position of the parser. |
474 | | /// |
475 | | /// This panics if the current position does not point to a valid char. |
476 | 0 | fn char(&self) -> char { |
477 | 0 | self.char_at(self.offset()) |
478 | 0 | } |
479 | | |
480 | | /// Return the character at the given position. |
481 | | /// |
482 | | /// This panics if the given position does not point to a valid char. |
483 | 0 | fn char_at(&self, i: usize) -> char { |
484 | 0 | self.pattern()[i..] |
485 | 0 | .chars() |
486 | 0 | .next() |
487 | 0 | .unwrap_or_else(|| panic!("expected char at offset {}", i)) |
488 | 0 | } |
489 | | |
490 | | /// Bump the parser to the next Unicode scalar value. |
491 | | /// |
492 | | /// If the end of the input has been reached, then `false` is returned. |
493 | 0 | fn bump(&self) -> bool { |
494 | 0 | if self.is_eof() { |
495 | 0 | return false; |
496 | 0 | } |
497 | 0 | let Position { mut offset, mut line, mut column } = self.pos(); |
498 | 0 | if self.char() == '\n' { |
499 | 0 | line = line.checked_add(1).unwrap(); |
500 | 0 | column = 1; |
501 | 0 | } else { |
502 | 0 | column = column.checked_add(1).unwrap(); |
503 | 0 | } |
504 | 0 | offset += self.char().len_utf8(); |
505 | 0 | self.parser().pos.set(Position { offset, line, column }); |
506 | 0 | self.pattern()[self.offset()..].chars().next().is_some() |
507 | 0 | } |
508 | | |
509 | | /// If the substring starting at the current position of the parser has |
510 | | /// the given prefix, then bump the parser to the character immediately |
511 | | /// following the prefix and return true. Otherwise, don't bump the parser |
512 | | /// and return false. |
513 | 0 | fn bump_if(&self, prefix: &str) -> bool { |
514 | 0 | if self.pattern()[self.offset()..].starts_with(prefix) { |
515 | 0 | for _ in 0..prefix.chars().count() { |
516 | 0 | self.bump(); |
517 | 0 | } |
518 | 0 | true |
519 | | } else { |
520 | 0 | false |
521 | | } |
522 | 0 | } |
523 | | |
524 | | /// Returns true if and only if the parser is positioned at a look-around |
525 | | /// prefix. The conditions under which this returns true must always |
526 | | /// correspond to a regular expression that would otherwise be consider |
527 | | /// invalid. |
528 | | /// |
529 | | /// This should only be called immediately after parsing the opening of |
530 | | /// a group or a set of flags. |
531 | 0 | fn is_lookaround_prefix(&self) -> bool { |
532 | 0 | self.bump_if("?=") |
533 | 0 | || self.bump_if("?!") |
534 | 0 | || self.bump_if("?<=") |
535 | 0 | || self.bump_if("?<!") |
536 | 0 | } |
537 | | |
538 | | /// Bump the parser, and if the `x` flag is enabled, bump through any |
539 | | /// subsequent spaces. Return true if and only if the parser is not at |
540 | | /// EOF. |
541 | 0 | fn bump_and_bump_space(&self) -> bool { |
542 | 0 | if !self.bump() { |
543 | 0 | return false; |
544 | 0 | } |
545 | 0 | self.bump_space(); |
546 | 0 | !self.is_eof() |
547 | 0 | } |
548 | | |
549 | | /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
550 | | /// comments), then this will advance the parser through all whitespace |
551 | | /// and comments to the next non-whitespace non-comment byte. |
552 | | /// |
553 | | /// If the `x` flag is disabled, then this is a no-op. |
554 | | /// |
555 | | /// This should be used selectively throughout the parser where |
556 | | /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
557 | | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
558 | 0 | fn bump_space(&self) { |
559 | 0 | if !self.ignore_whitespace() { |
560 | 0 | return; |
561 | 0 | } |
562 | 0 | while !self.is_eof() { |
563 | 0 | if self.char().is_whitespace() { |
564 | 0 | self.bump(); |
565 | 0 | } else if self.char() == '#' { |
566 | 0 | let start = self.pos(); |
567 | 0 | let mut comment_text = String::new(); |
568 | 0 | self.bump(); |
569 | 0 | while !self.is_eof() { |
570 | 0 | let c = self.char(); |
571 | 0 | self.bump(); |
572 | 0 | if c == '\n' { |
573 | 0 | break; |
574 | 0 | } |
575 | 0 | comment_text.push(c); |
576 | | } |
577 | 0 | let comment = ast::Comment { |
578 | 0 | span: Span::new(start, self.pos()), |
579 | 0 | comment: comment_text, |
580 | 0 | }; |
581 | 0 | self.parser().comments.borrow_mut().push(comment); |
582 | | } else { |
583 | 0 | break; |
584 | | } |
585 | | } |
586 | 0 | } |
587 | | |
588 | | /// Peek at the next character in the input without advancing the parser. |
589 | | /// |
590 | | /// If the input has been exhausted, then this returns `None`. |
591 | 0 | fn peek(&self) -> Option<char> { |
592 | 0 | if self.is_eof() { |
593 | 0 | return None; |
594 | 0 | } |
595 | 0 | self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() |
596 | 0 | } |
597 | | |
598 | | /// Like peek, but will ignore spaces when the parser is in whitespace |
599 | | /// insensitive mode. |
600 | 0 | fn peek_space(&self) -> Option<char> { |
601 | 0 | if !self.ignore_whitespace() { |
602 | 0 | return self.peek(); |
603 | 0 | } |
604 | 0 | if self.is_eof() { |
605 | 0 | return None; |
606 | 0 | } |
607 | 0 | let mut start = self.offset() + self.char().len_utf8(); |
608 | 0 | let mut in_comment = false; |
609 | 0 | for (i, c) in self.pattern()[start..].char_indices() { |
610 | 0 | if c.is_whitespace() { |
611 | 0 | continue; |
612 | 0 | } else if !in_comment && c == '#' { |
613 | 0 | in_comment = true; |
614 | 0 | } else if in_comment && c == '\n' { |
615 | 0 | in_comment = false; |
616 | 0 | } else { |
617 | 0 | start += i; |
618 | 0 | break; |
619 | | } |
620 | | } |
621 | 0 | self.pattern()[start..].chars().next() |
622 | 0 | } |
623 | | |
624 | | /// Returns true if the next call to `bump` would return false. |
625 | 0 | fn is_eof(&self) -> bool { |
626 | 0 | self.offset() == self.pattern().len() |
627 | 0 | } |
628 | | |
629 | | /// Return the current position of the parser, which includes the offset, |
630 | | /// line and column. |
631 | 0 | fn pos(&self) -> Position { |
632 | 0 | self.parser().pos.get() |
633 | 0 | } |
634 | | |
635 | | /// Create a span at the current position of the parser. Both the start |
636 | | /// and end of the span are set. |
637 | 0 | fn span(&self) -> Span { |
638 | 0 | Span::splat(self.pos()) |
639 | 0 | } |
640 | | |
641 | | /// Create a span that covers the current character. |
642 | 0 | fn span_char(&self) -> Span { |
643 | 0 | let mut next = Position { |
644 | 0 | offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), |
645 | 0 | line: self.line(), |
646 | 0 | column: self.column().checked_add(1).unwrap(), |
647 | 0 | }; |
648 | 0 | if self.char() == '\n' { |
649 | 0 | next.line += 1; |
650 | 0 | next.column = 1; |
651 | 0 | } |
652 | 0 | Span::new(self.pos(), next) |
653 | 0 | } |
654 | | |
655 | | /// Parse and push a single alternation on to the parser's internal stack. |
656 | | /// If the top of the stack already has an alternation, then add to that |
657 | | /// instead of pushing a new one. |
658 | | /// |
659 | | /// The concatenation given corresponds to a single alternation branch. |
660 | | /// The concatenation returned starts the next branch and is empty. |
661 | | /// |
662 | | /// This assumes the parser is currently positioned at `|` and will advance |
663 | | /// the parser to the character following `|`. |
664 | | #[inline(never)] |
665 | 0 | fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> { |
666 | 0 | assert_eq!(self.char(), '|'); |
667 | 0 | concat.span.end = self.pos(); |
668 | 0 | self.push_or_add_alternation(concat); |
669 | 0 | self.bump(); |
670 | 0 | Ok(ast::Concat { span: self.span(), asts: vec![] }) |
671 | 0 | } |
672 | | |
673 | | /// Pushes or adds the given branch of an alternation to the parser's |
674 | | /// internal stack of state. |
675 | 0 | fn push_or_add_alternation(&self, concat: ast::Concat) { |
676 | | use self::GroupState::*; |
677 | | |
678 | 0 | let mut stack = self.parser().stack_group.borrow_mut(); |
679 | 0 | if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { |
680 | 0 | alts.asts.push(concat.into_ast()); |
681 | 0 | return; |
682 | 0 | } |
683 | 0 | stack.push(Alternation(ast::Alternation { |
684 | 0 | span: Span::new(concat.span.start, self.pos()), |
685 | 0 | asts: vec![concat.into_ast()], |
686 | 0 | })); |
687 | 0 | } |
688 | | |
689 | | /// Parse and push a group AST (and its parent concatenation) on to the |
690 | | /// parser's internal stack. Return a fresh concatenation corresponding |
691 | | /// to the group's sub-AST. |
692 | | /// |
693 | | /// If a set of flags was found (with no group), then the concatenation |
694 | | /// is returned with that set of flags added. |
695 | | /// |
696 | | /// This assumes that the parser is currently positioned on the opening |
697 | | /// parenthesis. It advances the parser to the character at the start |
698 | | /// of the sub-expression (or adjoining expression). |
699 | | /// |
700 | | /// If there was a problem parsing the start of the group, then an error |
701 | | /// is returned. |
702 | | #[inline(never)] |
703 | 0 | fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> { |
704 | 0 | assert_eq!(self.char(), '('); |
705 | 0 | match self.parse_group()? { |
706 | 0 | Either::Left(set) => { |
707 | 0 | let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); |
708 | 0 | if let Some(v) = ignore { |
709 | 0 | self.parser().ignore_whitespace.set(v); |
710 | 0 | } |
711 | | |
712 | 0 | concat.asts.push(Ast::flags(set)); |
713 | 0 | Ok(concat) |
714 | | } |
715 | 0 | Either::Right(group) => { |
716 | 0 | let old_ignore_whitespace = self.ignore_whitespace(); |
717 | 0 | let new_ignore_whitespace = group |
718 | 0 | .flags() |
719 | 0 | .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) |
720 | 0 | .unwrap_or(old_ignore_whitespace); |
721 | 0 | self.parser().stack_group.borrow_mut().push( |
722 | 0 | GroupState::Group { |
723 | 0 | concat, |
724 | 0 | group, |
725 | 0 | ignore_whitespace: old_ignore_whitespace, |
726 | 0 | }, |
727 | 0 | ); |
728 | 0 | self.parser().ignore_whitespace.set(new_ignore_whitespace); |
729 | 0 | Ok(ast::Concat { span: self.span(), asts: vec![] }) |
730 | | } |
731 | | } |
732 | 0 | } |
733 | | |
/// Pop a group AST from the parser's internal stack and set the group's
/// AST to the given concatenation. Return the concatenation containing
/// the group.
///
/// If an alternation sits on top of the group's stack frame, then the
/// given concatenation becomes the alternation's last branch and the
/// alternation becomes the group's AST.
///
/// This assumes that the parser is currently positioned on the closing
/// parenthesis and advances the parser to the character following the `)`.
///
/// If no such group could be popped, then an unopened group error is
/// returned.
#[inline(never)]
fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
    use self::GroupState::*;

    assert_eq!(self.char(), ')');
    let mut stack = self.parser().stack_group.borrow_mut();
    // The top of the stack is either the group frame itself or an
    // alternation that must be immediately followed by the group frame.
    let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
        .pop()
    {
        Some(Group { concat, group, ignore_whitespace }) => {
            (concat, group, ignore_whitespace, None)
        }
        Some(Alternation(alt)) => match stack.pop() {
            Some(Group { concat, group, ignore_whitespace }) => {
                (concat, group, ignore_whitespace, Some(alt))
            }
            // An alternation with no enclosing group frame means that this
            // `)` has no matching `(`.
            None | Some(Alternation(_)) => {
                return Err(self.error(
                    self.span_char(),
                    ast::ErrorKind::GroupUnopened,
                ));
            }
        },
        // An empty stack means that this `)` has no matching `(`.
        None => {
            return Err(self
                .error(self.span_char(), ast::ErrorKind::GroupUnopened));
        }
    };
    // Restore the whitespace mode that was in effect before the group.
    self.parser().ignore_whitespace.set(ignore_whitespace);
    // The group's contents end before the `)`, while the group itself ends
    // after it. Hence the bump between the two assignments.
    group_concat.span.end = self.pos();
    self.bump();
    group.span.end = self.pos();
    match alt {
        Some(mut alt) => {
            alt.span.end = group_concat.span.end;
            alt.asts.push(group_concat.into_ast());
            group.ast = Box::new(alt.into_ast());
        }
        None => {
            group.ast = Box::new(group_concat.into_ast());
        }
    }
    prior_concat.asts.push(Ast::group(group));
    Ok(prior_concat)
}
788 | | |
789 | | /// Pop the last state from the parser's internal stack, if it exists, and |
790 | | /// add the given concatenation to it. There either must be no state or a |
791 | | /// single alternation item on the stack. Any other scenario produces an |
792 | | /// error. |
793 | | /// |
794 | | /// This assumes that the parser has advanced to the end. |
795 | | #[inline(never)] |
796 | 0 | fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> { |
797 | 0 | concat.span.end = self.pos(); |
798 | 0 | let mut stack = self.parser().stack_group.borrow_mut(); |
799 | 0 | let ast = match stack.pop() { |
800 | 0 | None => Ok(concat.into_ast()), |
801 | 0 | Some(GroupState::Alternation(mut alt)) => { |
802 | 0 | alt.span.end = self.pos(); |
803 | 0 | alt.asts.push(concat.into_ast()); |
804 | 0 | Ok(Ast::alternation(alt)) |
805 | | } |
806 | 0 | Some(GroupState::Group { group, .. }) => { |
807 | 0 | return Err( |
808 | 0 | self.error(group.span, ast::ErrorKind::GroupUnclosed) |
809 | 0 | ); |
810 | | } |
811 | | }; |
812 | | // If we try to pop again, there should be nothing. |
813 | 0 | match stack.pop() { |
814 | 0 | None => ast, |
815 | | Some(GroupState::Alternation(_)) => { |
816 | | // This unreachable is unfortunate. This case can't happen |
817 | | // because the only way we can be here is if there were two |
818 | | // `GroupState::Alternation`s adjacent in the parser's stack, |
819 | | // which we guarantee to never happen because we never push a |
820 | | // `GroupState::Alternation` if one is already at the top of |
821 | | // the stack. |
822 | 0 | unreachable!() |
823 | | } |
824 | 0 | Some(GroupState::Group { group, .. }) => { |
825 | 0 | Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) |
826 | | } |
827 | | } |
828 | 0 | } |
829 | | |
830 | | /// Parse the opening of a character class and push the current class |
831 | | /// parsing context onto the parser's stack. This assumes that the parser |
832 | | /// is positioned at an opening `[`. The given union should correspond to |
833 | | /// the union of set items built up before seeing the `[`. |
834 | | /// |
835 | | /// If there was a problem parsing the opening of the class, then an error |
836 | | /// is returned. Otherwise, a new union of set items for the class is |
837 | | /// returned (which may be populated with either a `]` or a `-`). |
838 | | #[inline(never)] |
839 | 0 | fn push_class_open( |
840 | 0 | &self, |
841 | 0 | parent_union: ast::ClassSetUnion, |
842 | 0 | ) -> Result<ast::ClassSetUnion> { |
843 | 0 | assert_eq!(self.char(), '['); |
844 | | |
845 | 0 | let (nested_set, nested_union) = self.parse_set_class_open()?; |
846 | 0 | self.parser() |
847 | 0 | .stack_class |
848 | 0 | .borrow_mut() |
849 | 0 | .push(ClassState::Open { union: parent_union, set: nested_set }); |
850 | 0 | Ok(nested_union) |
851 | 0 | } |
852 | | |
853 | | /// Parse the end of a character class set and pop the character class |
854 | | /// parser stack. The union given corresponds to the last union built |
855 | | /// before seeing the closing `]`. The union returned corresponds to the |
856 | | /// parent character class set with the nested class added to it. |
857 | | /// |
858 | | /// This assumes that the parser is positioned at a `]` and will advance |
859 | | /// the parser to the byte immediately following the `]`. |
860 | | /// |
861 | | /// If the stack is empty after popping, then this returns the final |
862 | | /// "top-level" character class AST (where a "top-level" character class |
863 | | /// is one that is not nested inside any other character class). |
864 | | /// |
865 | | /// If there is no corresponding opening bracket on the parser's stack, |
866 | | /// then an error is returned. |
867 | | #[inline(never)] |
868 | 0 | fn pop_class( |
869 | 0 | &self, |
870 | 0 | nested_union: ast::ClassSetUnion, |
871 | 0 | ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> { |
872 | 0 | assert_eq!(self.char(), ']'); |
873 | | |
874 | 0 | let item = ast::ClassSet::Item(nested_union.into_item()); |
875 | 0 | let prevset = self.pop_class_op(item); |
876 | 0 | let mut stack = self.parser().stack_class.borrow_mut(); |
877 | 0 | match stack.pop() { |
878 | | None => { |
879 | | // We can never observe an empty stack: |
880 | | // |
881 | | // 1) We are guaranteed to start with a non-empty stack since |
882 | | // the character class parser is only initiated when it sees |
883 | | // a `[`. |
884 | | // 2) If we ever observe an empty stack while popping after |
885 | | // seeing a `]`, then we signal the character class parser |
886 | | // to terminate. |
887 | 0 | panic!("unexpected empty character class stack") |
888 | | } |
889 | | Some(ClassState::Op { .. }) => { |
890 | | // This panic is unfortunate, but this case is impossible |
891 | | // since we already popped the Op state if one exists above. |
892 | | // Namely, every push to the class parser stack is guarded by |
893 | | // whether an existing Op is already on the top of the stack. |
894 | | // If it is, the existing Op is modified. That is, the stack |
895 | | // can never have consecutive Op states. |
896 | 0 | panic!("unexpected ClassState::Op") |
897 | | } |
898 | 0 | Some(ClassState::Open { mut union, mut set }) => { |
899 | 0 | self.bump(); |
900 | 0 | set.span.end = self.pos(); |
901 | 0 | set.kind = prevset; |
902 | 0 | if stack.is_empty() { |
903 | 0 | Ok(Either::Right(set)) |
904 | | } else { |
905 | 0 | union.push(ast::ClassSetItem::Bracketed(Box::new(set))); |
906 | 0 | Ok(Either::Left(union)) |
907 | | } |
908 | | } |
909 | | } |
910 | 0 | } |
911 | | |
912 | | /// Return an "unclosed class" error whose span points to the most |
913 | | /// recently opened class. |
914 | | /// |
915 | | /// This should only be called while parsing a character class. |
916 | | #[inline(never)] |
917 | 0 | fn unclosed_class_error(&self) -> ast::Error { |
918 | 0 | for state in self.parser().stack_class.borrow().iter().rev() { |
919 | 0 | if let ClassState::Open { ref set, .. } = *state { |
920 | 0 | return self.error(set.span, ast::ErrorKind::ClassUnclosed); |
921 | 0 | } |
922 | | } |
923 | | // We are guaranteed to have a non-empty stack with at least |
924 | | // one open bracket, so we should never get here. |
925 | 0 | panic!("no open character class found") |
926 | 0 | } |
927 | | |
928 | | /// Push the current set of class items on to the class parser's stack as |
929 | | /// the left hand side of the given operator. |
930 | | /// |
931 | | /// A fresh set union is returned, which should be used to build the right |
932 | | /// hand side of this operator. |
933 | | #[inline(never)] |
934 | 0 | fn push_class_op( |
935 | 0 | &self, |
936 | 0 | next_kind: ast::ClassSetBinaryOpKind, |
937 | 0 | next_union: ast::ClassSetUnion, |
938 | 0 | ) -> ast::ClassSetUnion { |
939 | 0 | let item = ast::ClassSet::Item(next_union.into_item()); |
940 | 0 | let new_lhs = self.pop_class_op(item); |
941 | 0 | self.parser() |
942 | 0 | .stack_class |
943 | 0 | .borrow_mut() |
944 | 0 | .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); |
945 | 0 | ast::ClassSetUnion { span: self.span(), items: vec![] } |
946 | 0 | } |
947 | | |
948 | | /// Pop a character class set from the character class parser stack. If the |
949 | | /// top of the stack is just an item (not an operation), then return the |
950 | | /// given set unchanged. If the top of the stack is an operation, then the |
951 | | /// given set will be used as the rhs of the operation on the top of the |
952 | | /// stack. In that case, the binary operation is returned as a set. |
953 | | #[inline(never)] |
954 | 0 | fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { |
955 | 0 | let mut stack = self.parser().stack_class.borrow_mut(); |
956 | 0 | let (kind, lhs) = match stack.pop() { |
957 | 0 | Some(ClassState::Op { kind, lhs }) => (kind, lhs), |
958 | 0 | Some(state @ ClassState::Open { .. }) => { |
959 | 0 | stack.push(state); |
960 | 0 | return rhs; |
961 | | } |
962 | 0 | None => unreachable!(), |
963 | | }; |
964 | 0 | let span = Span::new(lhs.span().start, rhs.span().end); |
965 | 0 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
966 | 0 | span, |
967 | 0 | kind, |
968 | 0 | lhs: Box::new(lhs), |
969 | 0 | rhs: Box::new(rhs), |
970 | 0 | }) |
971 | 0 | } |
972 | | } |
973 | | |
974 | | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { |
975 | | /// Parse the regular expression into an abstract syntax tree. |
976 | 0 | fn parse(&self) -> Result<Ast> { |
977 | 0 | self.parse_with_comments().map(|astc| astc.ast) |
978 | 0 | } |
979 | | |
980 | | /// Parse the regular expression and return an abstract syntax tree with |
981 | | /// all of the comments found in the pattern. |
982 | 0 | fn parse_with_comments(&self) -> Result<ast::WithComments> { |
983 | 0 | assert_eq!(self.offset(), 0, "parser can only be used once"); |
984 | 0 | self.parser().reset(); |
985 | 0 | let mut concat = ast::Concat { span: self.span(), asts: vec![] }; |
986 | | loop { |
987 | 0 | self.bump_space(); |
988 | 0 | if self.is_eof() { |
989 | 0 | break; |
990 | 0 | } |
991 | 0 | match self.char() { |
992 | 0 | '(' => concat = self.push_group(concat)?, |
993 | 0 | ')' => concat = self.pop_group(concat)?, |
994 | 0 | '|' => concat = self.push_alternate(concat)?, |
995 | | '[' => { |
996 | 0 | let class = self.parse_set_class()?; |
997 | 0 | concat.asts.push(Ast::class_bracketed(class)); |
998 | | } |
999 | | '?' => { |
1000 | 0 | concat = self.parse_uncounted_repetition( |
1001 | 0 | concat, |
1002 | 0 | ast::RepetitionKind::ZeroOrOne, |
1003 | 0 | )?; |
1004 | | } |
1005 | | '*' => { |
1006 | 0 | concat = self.parse_uncounted_repetition( |
1007 | 0 | concat, |
1008 | 0 | ast::RepetitionKind::ZeroOrMore, |
1009 | 0 | )?; |
1010 | | } |
1011 | | '+' => { |
1012 | 0 | concat = self.parse_uncounted_repetition( |
1013 | 0 | concat, |
1014 | 0 | ast::RepetitionKind::OneOrMore, |
1015 | 0 | )?; |
1016 | | } |
1017 | | '{' => { |
1018 | 0 | concat = self.parse_counted_repetition(concat)?; |
1019 | | } |
1020 | 0 | _ => concat.asts.push(self.parse_primitive()?.into_ast()), |
1021 | | } |
1022 | | } |
1023 | 0 | let ast = self.pop_group_end(concat)?; |
1024 | 0 | NestLimiter::new(self).check(&ast)?; |
1025 | 0 | Ok(ast::WithComments { |
1026 | 0 | ast, |
1027 | 0 | comments: mem::replace( |
1028 | 0 | &mut *self.parser().comments.borrow_mut(), |
1029 | 0 | vec![], |
1030 | 0 | ), |
1031 | 0 | }) |
1032 | 0 | } |
1033 | | |
1034 | | /// Parses an uncounted repetition operation. An uncounted repetition |
1035 | | /// operator includes ?, * and +, but does not include the {m,n} syntax. |
1036 | | /// The given `kind` should correspond to the operator observed by the |
1037 | | /// caller. |
1038 | | /// |
1039 | | /// This assumes that the parser is currently positioned at the repetition |
1040 | | /// operator and advances the parser to the first character after the |
1041 | | /// operator. (Note that the operator may include a single additional `?`, |
1042 | | /// which makes the operator ungreedy.) |
1043 | | /// |
1044 | | /// The caller should include the concatenation that is being built. The |
1045 | | /// concatenation returned includes the repetition operator applied to the |
1046 | | /// last expression in the given concatenation. |
1047 | | #[inline(never)] |
1048 | 0 | fn parse_uncounted_repetition( |
1049 | 0 | &self, |
1050 | 0 | mut concat: ast::Concat, |
1051 | 0 | kind: ast::RepetitionKind, |
1052 | 0 | ) -> Result<ast::Concat> { |
1053 | 0 | assert!( |
1054 | 0 | self.char() == '?' || self.char() == '*' || self.char() == '+' |
1055 | | ); |
1056 | 0 | let op_start = self.pos(); |
1057 | 0 | let ast = match concat.asts.pop() { |
1058 | 0 | Some(ast) => ast, |
1059 | | None => { |
1060 | 0 | return Err( |
1061 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1062 | 0 | ) |
1063 | | } |
1064 | | }; |
1065 | 0 | match ast { |
1066 | | Ast::Empty(_) | Ast::Flags(_) => { |
1067 | 0 | return Err( |
1068 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1069 | 0 | ) |
1070 | | } |
1071 | 0 | _ => {} |
1072 | 0 | } |
1073 | 0 | let mut greedy = true; |
1074 | 0 | if self.bump() && self.char() == '?' { |
1075 | 0 | greedy = false; |
1076 | 0 | self.bump(); |
1077 | 0 | } |
1078 | 0 | concat.asts.push(Ast::repetition(ast::Repetition { |
1079 | 0 | span: ast.span().with_end(self.pos()), |
1080 | 0 | op: ast::RepetitionOp { |
1081 | 0 | span: Span::new(op_start, self.pos()), |
1082 | 0 | kind, |
1083 | 0 | }, |
1084 | 0 | greedy, |
1085 | 0 | ast: Box::new(ast), |
1086 | 0 | })); |
1087 | 0 | Ok(concat) |
1088 | 0 | } |
1089 | | |
1090 | | /// Parses a counted repetition operation. A counted repetition operator |
1091 | | /// corresponds to the {m,n} syntax, and does not include the ?, * or + |
1092 | | /// operators. |
1093 | | /// |
1094 | | /// This assumes that the parser is currently positioned at the opening `{` |
1095 | | /// and advances the parser to the first character after the operator. |
1096 | | /// (Note that the operator may include a single additional `?`, which |
1097 | | /// makes the operator ungreedy.) |
1098 | | /// |
1099 | | /// The caller should include the concatenation that is being built. The |
1100 | | /// concatenation returned includes the repetition operator applied to the |
1101 | | /// last expression in the given concatenation. |
1102 | | #[inline(never)] |
1103 | 0 | fn parse_counted_repetition( |
1104 | 0 | &self, |
1105 | 0 | mut concat: ast::Concat, |
1106 | 0 | ) -> Result<ast::Concat> { |
1107 | 0 | assert!(self.char() == '{'); |
1108 | 0 | let start = self.pos(); |
1109 | 0 | let ast = match concat.asts.pop() { |
1110 | 0 | Some(ast) => ast, |
1111 | | None => { |
1112 | 0 | return Err( |
1113 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1114 | 0 | ) |
1115 | | } |
1116 | | }; |
1117 | 0 | match ast { |
1118 | | Ast::Empty(_) | Ast::Flags(_) => { |
1119 | 0 | return Err( |
1120 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1121 | 0 | ) |
1122 | | } |
1123 | 0 | _ => {} |
1124 | 0 | } |
1125 | 0 | if !self.bump_and_bump_space() { |
1126 | 0 | return Err(self.error( |
1127 | 0 | Span::new(start, self.pos()), |
1128 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1129 | 0 | )); |
1130 | 0 | } |
1131 | 0 | let count_start = specialize_err( |
1132 | 0 | self.parse_decimal(), |
1133 | 0 | ast::ErrorKind::DecimalEmpty, |
1134 | 0 | ast::ErrorKind::RepetitionCountDecimalEmpty, |
1135 | 0 | ); |
1136 | 0 | if self.is_eof() { |
1137 | 0 | return Err(self.error( |
1138 | 0 | Span::new(start, self.pos()), |
1139 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1140 | 0 | )); |
1141 | 0 | } |
1142 | 0 | let range = if self.char() == ',' { |
1143 | 0 | if !self.bump_and_bump_space() { |
1144 | 0 | return Err(self.error( |
1145 | 0 | Span::new(start, self.pos()), |
1146 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1147 | 0 | )); |
1148 | 0 | } |
1149 | 0 | if self.char() != '}' { |
1150 | 0 | let count_start = match count_start { |
1151 | 0 | Ok(c) => c, |
1152 | 0 | Err(err) |
1153 | 0 | if err.kind |
1154 | 0 | == ast::ErrorKind::RepetitionCountDecimalEmpty => |
1155 | 0 | { |
1156 | 0 | if self.parser().empty_min_range { |
1157 | 0 | 0 |
1158 | | } else { |
1159 | 0 | return Err(err); |
1160 | | } |
1161 | | } |
1162 | 0 | err => err?, |
1163 | | }; |
1164 | 0 | let count_end = specialize_err( |
1165 | 0 | self.parse_decimal(), |
1166 | 0 | ast::ErrorKind::DecimalEmpty, |
1167 | 0 | ast::ErrorKind::RepetitionCountDecimalEmpty, |
1168 | 0 | )?; |
1169 | 0 | ast::RepetitionRange::Bounded(count_start, count_end) |
1170 | | } else { |
1171 | 0 | ast::RepetitionRange::AtLeast(count_start?) |
1172 | | } |
1173 | | } else { |
1174 | 0 | ast::RepetitionRange::Exactly(count_start?) |
1175 | | }; |
1176 | | |
1177 | 0 | if self.is_eof() || self.char() != '}' { |
1178 | 0 | return Err(self.error( |
1179 | 0 | Span::new(start, self.pos()), |
1180 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1181 | 0 | )); |
1182 | 0 | } |
1183 | 0 |
|
1184 | 0 | let mut greedy = true; |
1185 | 0 | if self.bump_and_bump_space() && self.char() == '?' { |
1186 | 0 | greedy = false; |
1187 | 0 | self.bump(); |
1188 | 0 | } |
1189 | | |
1190 | 0 | let op_span = Span::new(start, self.pos()); |
1191 | 0 | if !range.is_valid() { |
1192 | 0 | return Err( |
1193 | 0 | self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) |
1194 | 0 | ); |
1195 | 0 | } |
1196 | 0 | concat.asts.push(Ast::repetition(ast::Repetition { |
1197 | 0 | span: ast.span().with_end(self.pos()), |
1198 | 0 | op: ast::RepetitionOp { |
1199 | 0 | span: op_span, |
1200 | 0 | kind: ast::RepetitionKind::Range(range), |
1201 | 0 | }, |
1202 | 0 | greedy, |
1203 | 0 | ast: Box::new(ast), |
1204 | 0 | })); |
1205 | 0 | Ok(concat) |
1206 | 0 | } |
1207 | | |
1208 | | /// Parse a group (which contains a sub-expression) or a set of flags. |
1209 | | /// |
1210 | | /// If a group was found, then it is returned with an empty AST. If a set |
1211 | | /// of flags is found, then that set is returned. |
1212 | | /// |
1213 | | /// The parser should be positioned at the opening parenthesis. |
1214 | | /// |
1215 | | /// This advances the parser to the character before the start of the |
1216 | | /// sub-expression (in the case of a group) or to the closing parenthesis |
1217 | | /// immediately following the set of flags. |
1218 | | /// |
1219 | | /// # Errors |
1220 | | /// |
1221 | | /// If flags are given and incorrectly specified, then a corresponding |
1222 | | /// error is returned. |
1223 | | /// |
1224 | | /// If a capture name is given and it is incorrectly specified, then a |
1225 | | /// corresponding error is returned. |
1226 | | #[inline(never)] |
1227 | 0 | fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> { |
1228 | 0 | assert_eq!(self.char(), '('); |
1229 | 0 | let open_span = self.span_char(); |
1230 | 0 | self.bump(); |
1231 | 0 | self.bump_space(); |
1232 | 0 | if self.is_lookaround_prefix() { |
1233 | 0 | return Err(self.error( |
1234 | 0 | Span::new(open_span.start, self.span().end), |
1235 | 0 | ast::ErrorKind::UnsupportedLookAround, |
1236 | 0 | )); |
1237 | 0 | } |
1238 | 0 | let inner_span = self.span(); |
1239 | 0 | let mut starts_with_p = true; |
1240 | 0 | if self.bump_if("?P<") || { |
1241 | 0 | starts_with_p = false; |
1242 | 0 | self.bump_if("?<") |
1243 | | } { |
1244 | 0 | let capture_index = self.next_capture_index(open_span)?; |
1245 | 0 | let name = self.parse_capture_name(capture_index)?; |
1246 | 0 | Ok(Either::Right(ast::Group { |
1247 | 0 | span: open_span, |
1248 | 0 | kind: ast::GroupKind::CaptureName { starts_with_p, name }, |
1249 | 0 | ast: Box::new(Ast::empty(self.span())), |
1250 | 0 | })) |
1251 | 0 | } else if self.bump_if("?") { |
1252 | 0 | if self.is_eof() { |
1253 | 0 | return Err( |
1254 | 0 | self.error(open_span, ast::ErrorKind::GroupUnclosed) |
1255 | 0 | ); |
1256 | 0 | } |
1257 | 0 | let flags = self.parse_flags()?; |
1258 | 0 | let char_end = self.char(); |
1259 | 0 | self.bump(); |
1260 | 0 | if char_end == ')' { |
1261 | | // We don't allow empty flags, e.g., `(?)`. We instead |
1262 | | // interpret it as a repetition operator missing its argument. |
1263 | 0 | if flags.items.is_empty() { |
1264 | 0 | return Err(self.error( |
1265 | 0 | inner_span, |
1266 | 0 | ast::ErrorKind::RepetitionMissing, |
1267 | 0 | )); |
1268 | 0 | } |
1269 | 0 | Ok(Either::Left(ast::SetFlags { |
1270 | 0 | span: Span { end: self.pos(), ..open_span }, |
1271 | 0 | flags, |
1272 | 0 | })) |
1273 | | } else { |
1274 | 0 | assert_eq!(char_end, ':'); |
1275 | 0 | Ok(Either::Right(ast::Group { |
1276 | 0 | span: open_span, |
1277 | 0 | kind: ast::GroupKind::NonCapturing(flags), |
1278 | 0 | ast: Box::new(Ast::empty(self.span())), |
1279 | 0 | })) |
1280 | | } |
1281 | | } else { |
1282 | 0 | let capture_index = self.next_capture_index(open_span)?; |
1283 | 0 | Ok(Either::Right(ast::Group { |
1284 | 0 | span: open_span, |
1285 | 0 | kind: ast::GroupKind::CaptureIndex(capture_index), |
1286 | 0 | ast: Box::new(Ast::empty(self.span())), |
1287 | 0 | })) |
1288 | | } |
1289 | 0 | } |
1290 | | |
1291 | | /// Parses a capture group name. Assumes that the parser is positioned at |
1292 | | /// the first character in the name following the opening `<` (and may |
1293 | | /// possibly be EOF). This advances the parser to the first character |
1294 | | /// following the closing `>`. |
1295 | | /// |
1296 | | /// The caller must provide the capture index of the group for this name. |
1297 | | #[inline(never)] |
1298 | 0 | fn parse_capture_name( |
1299 | 0 | &self, |
1300 | 0 | capture_index: u32, |
1301 | 0 | ) -> Result<ast::CaptureName> { |
1302 | 0 | if self.is_eof() { |
1303 | 0 | return Err(self |
1304 | 0 | .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); |
1305 | 0 | } |
1306 | 0 | let start = self.pos(); |
1307 | | loop { |
1308 | 0 | if self.char() == '>' { |
1309 | 0 | break; |
1310 | 0 | } |
1311 | 0 | if !is_capture_char(self.char(), self.pos() == start) { |
1312 | 0 | return Err(self.error( |
1313 | 0 | self.span_char(), |
1314 | 0 | ast::ErrorKind::GroupNameInvalid, |
1315 | 0 | )); |
1316 | 0 | } |
1317 | 0 | if !self.bump() { |
1318 | 0 | break; |
1319 | 0 | } |
1320 | | } |
1321 | 0 | let end = self.pos(); |
1322 | 0 | if self.is_eof() { |
1323 | 0 | return Err(self |
1324 | 0 | .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); |
1325 | 0 | } |
1326 | 0 | assert_eq!(self.char(), '>'); |
1327 | 0 | self.bump(); |
1328 | 0 | let name = &self.pattern()[start.offset..end.offset]; |
1329 | 0 | if name.is_empty() { |
1330 | 0 | return Err(self.error( |
1331 | 0 | Span::new(start, start), |
1332 | 0 | ast::ErrorKind::GroupNameEmpty, |
1333 | 0 | )); |
1334 | 0 | } |
1335 | 0 | let capname = ast::CaptureName { |
1336 | 0 | span: Span::new(start, end), |
1337 | 0 | name: name.to_string(), |
1338 | 0 | index: capture_index, |
1339 | 0 | }; |
1340 | 0 | self.add_capture_name(&capname)?; |
1341 | 0 | Ok(capname) |
1342 | 0 | } |
1343 | | |
1344 | | /// Parse a sequence of flags starting at the current character. |
1345 | | /// |
1346 | | /// This advances the parser to the character immediately following the |
1347 | | /// flags, which is guaranteed to be either `:` or `)`. |
1348 | | /// |
1349 | | /// # Errors |
1350 | | /// |
1351 | | /// If any flags are duplicated, then an error is returned. |
1352 | | /// |
1353 | | /// If the negation operator is used more than once, then an error is |
1354 | | /// returned. |
1355 | | /// |
1356 | | /// If no flags could be found or if the negation operation is not followed |
1357 | | /// by any flags, then an error is returned. |
1358 | | #[inline(never)] |
1359 | 0 | fn parse_flags(&self) -> Result<ast::Flags> { |
1360 | 0 | let mut flags = ast::Flags { span: self.span(), items: vec![] }; |
1361 | 0 | let mut last_was_negation = None; |
1362 | 0 | while self.char() != ':' && self.char() != ')' { |
1363 | 0 | if self.char() == '-' { |
1364 | 0 | last_was_negation = Some(self.span_char()); |
1365 | 0 | let item = ast::FlagsItem { |
1366 | 0 | span: self.span_char(), |
1367 | 0 | kind: ast::FlagsItemKind::Negation, |
1368 | 0 | }; |
1369 | 0 | if let Some(i) = flags.add_item(item) { |
1370 | 0 | return Err(self.error( |
1371 | 0 | self.span_char(), |
1372 | 0 | ast::ErrorKind::FlagRepeatedNegation { |
1373 | 0 | original: flags.items[i].span, |
1374 | 0 | }, |
1375 | 0 | )); |
1376 | 0 | } |
1377 | | } else { |
1378 | 0 | last_was_negation = None; |
1379 | 0 | let item = ast::FlagsItem { |
1380 | 0 | span: self.span_char(), |
1381 | 0 | kind: ast::FlagsItemKind::Flag(self.parse_flag()?), |
1382 | | }; |
1383 | 0 | if let Some(i) = flags.add_item(item) { |
1384 | 0 | return Err(self.error( |
1385 | 0 | self.span_char(), |
1386 | 0 | ast::ErrorKind::FlagDuplicate { |
1387 | 0 | original: flags.items[i].span, |
1388 | 0 | }, |
1389 | 0 | )); |
1390 | 0 | } |
1391 | | } |
1392 | 0 | if !self.bump() { |
1393 | 0 | return Err( |
1394 | 0 | self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) |
1395 | 0 | ); |
1396 | 0 | } |
1397 | | } |
1398 | 0 | if let Some(span) = last_was_negation { |
1399 | 0 | return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); |
1400 | 0 | } |
1401 | 0 | flags.span.end = self.pos(); |
1402 | 0 | Ok(flags) |
1403 | 0 | } |
1404 | | |
1405 | | /// Parse the current character as a flag. Do not advance the parser. |
1406 | | /// |
1407 | | /// # Errors |
1408 | | /// |
1409 | | /// If the flag is not recognized, then an error is returned. |
1410 | | #[inline(never)] |
1411 | 0 | fn parse_flag(&self) -> Result<ast::Flag> { |
1412 | 0 | match self.char() { |
1413 | 0 | 'i' => Ok(ast::Flag::CaseInsensitive), |
1414 | 0 | 'm' => Ok(ast::Flag::MultiLine), |
1415 | 0 | 's' => Ok(ast::Flag::DotMatchesNewLine), |
1416 | 0 | 'U' => Ok(ast::Flag::SwapGreed), |
1417 | 0 | 'u' => Ok(ast::Flag::Unicode), |
1418 | 0 | 'R' => Ok(ast::Flag::CRLF), |
1419 | 0 | 'x' => Ok(ast::Flag::IgnoreWhitespace), |
1420 | | _ => { |
1421 | 0 | Err(self |
1422 | 0 | .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) |
1423 | | } |
1424 | | } |
1425 | 0 | } |
1426 | | |
1427 | | /// Parse a primitive AST. e.g., A literal, non-set character class or |
1428 | | /// assertion. |
1429 | | /// |
1430 | | /// This assumes that the parser expects a primitive at the current |
1431 | | /// location. i.e., All other non-primitive cases have been handled. |
1432 | | /// For example, if the parser's position is at `|`, then `|` will be |
1433 | | /// treated as a literal (e.g., inside a character class). |
1434 | | /// |
1435 | | /// This advances the parser to the first character immediately following |
1436 | | /// the primitive. |
1437 | 0 | fn parse_primitive(&self) -> Result<Primitive> { |
1438 | 0 | match self.char() { |
1439 | 0 | '\\' => self.parse_escape(), |
1440 | | '.' => { |
1441 | 0 | let ast = Primitive::Dot(self.span_char()); |
1442 | 0 | self.bump(); |
1443 | 0 | Ok(ast) |
1444 | | } |
1445 | | '^' => { |
1446 | 0 | let ast = Primitive::Assertion(ast::Assertion { |
1447 | 0 | span: self.span_char(), |
1448 | 0 | kind: ast::AssertionKind::StartLine, |
1449 | 0 | }); |
1450 | 0 | self.bump(); |
1451 | 0 | Ok(ast) |
1452 | | } |
1453 | | '$' => { |
1454 | 0 | let ast = Primitive::Assertion(ast::Assertion { |
1455 | 0 | span: self.span_char(), |
1456 | 0 | kind: ast::AssertionKind::EndLine, |
1457 | 0 | }); |
1458 | 0 | self.bump(); |
1459 | 0 | Ok(ast) |
1460 | | } |
1461 | 0 | c => { |
1462 | 0 | let ast = Primitive::Literal(ast::Literal { |
1463 | 0 | span: self.span_char(), |
1464 | 0 | kind: ast::LiteralKind::Verbatim, |
1465 | 0 | c, |
1466 | 0 | }); |
1467 | 0 | self.bump(); |
1468 | 0 | Ok(ast) |
1469 | | } |
1470 | | } |
1471 | 0 | } |
1472 | | |
1473 | | /// Parse an escape sequence as a primitive AST. |
1474 | | /// |
1475 | | /// This assumes the parser is positioned at the start of the escape |
1476 | | /// sequence, i.e., `\`. It advances the parser to the first position |
1477 | | /// immediately following the escape sequence. |
1478 | | #[inline(never)] |
1479 | 0 | fn parse_escape(&self) -> Result<Primitive> { |
1480 | 0 | assert_eq!(self.char(), '\\'); |
1481 | 0 | let start = self.pos(); |
1482 | 0 | if !self.bump() { |
1483 | 0 | return Err(self.error( |
1484 | 0 | Span::new(start, self.pos()), |
1485 | 0 | ast::ErrorKind::EscapeUnexpectedEof, |
1486 | 0 | )); |
1487 | 0 | } |
1488 | 0 | let c = self.char(); |
1489 | | // Put some of the more complicated routines into helpers. |
1490 | 0 | match c { |
1491 | 0 | '0'..='7' => { |
1492 | 0 | if !self.parser().octal { |
1493 | 0 | return Err(self.error( |
1494 | 0 | Span::new(start, self.span_char().end), |
1495 | 0 | ast::ErrorKind::UnsupportedBackreference, |
1496 | 0 | )); |
1497 | 0 | } |
1498 | 0 | let mut lit = self.parse_octal(); |
1499 | 0 | lit.span.start = start; |
1500 | 0 | return Ok(Primitive::Literal(lit)); |
1501 | | } |
1502 | 0 | '8'..='9' if !self.parser().octal => { |
1503 | 0 | return Err(self.error( |
1504 | 0 | Span::new(start, self.span_char().end), |
1505 | 0 | ast::ErrorKind::UnsupportedBackreference, |
1506 | 0 | )); |
1507 | | } |
1508 | | 'x' | 'u' | 'U' => { |
1509 | 0 | let mut lit = self.parse_hex()?; |
1510 | 0 | lit.span.start = start; |
1511 | 0 | return Ok(Primitive::Literal(lit)); |
1512 | | } |
1513 | | 'p' | 'P' => { |
1514 | 0 | let mut cls = self.parse_unicode_class()?; |
1515 | 0 | cls.span.start = start; |
1516 | 0 | return Ok(Primitive::Unicode(cls)); |
1517 | | } |
1518 | | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
1519 | 0 | let mut cls = self.parse_perl_class(); |
1520 | 0 | cls.span.start = start; |
1521 | 0 | return Ok(Primitive::Perl(cls)); |
1522 | | } |
1523 | 0 | _ => {} |
1524 | 0 | } |
1525 | 0 |
|
1526 | 0 | // Handle all of the one letter sequences inline. |
1527 | 0 | self.bump(); |
1528 | 0 | let span = Span::new(start, self.pos()); |
1529 | 0 | if is_meta_character(c) { |
1530 | 0 | return Ok(Primitive::Literal(ast::Literal { |
1531 | 0 | span, |
1532 | 0 | kind: ast::LiteralKind::Meta, |
1533 | 0 | c, |
1534 | 0 | })); |
1535 | 0 | } |
1536 | 0 | if is_escapeable_character(c) { |
1537 | 0 | return Ok(Primitive::Literal(ast::Literal { |
1538 | 0 | span, |
1539 | 0 | kind: ast::LiteralKind::Superfluous, |
1540 | 0 | c, |
1541 | 0 | })); |
1542 | 0 | } |
1543 | 0 | let special = |kind, c| { |
1544 | 0 | Ok(Primitive::Literal(ast::Literal { |
1545 | 0 | span, |
1546 | 0 | kind: ast::LiteralKind::Special(kind), |
1547 | 0 | c, |
1548 | 0 | })) |
1549 | 0 | }; |
1550 | 0 | match c { |
1551 | 0 | 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), |
1552 | 0 | 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), |
1553 | 0 | 't' => special(ast::SpecialLiteralKind::Tab, '\t'), |
1554 | 0 | 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), |
1555 | 0 | 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), |
1556 | 0 | 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), |
1557 | 0 | 'A' => Ok(Primitive::Assertion(ast::Assertion { |
1558 | 0 | span, |
1559 | 0 | kind: ast::AssertionKind::StartText, |
1560 | 0 | })), |
1561 | 0 | 'z' => Ok(Primitive::Assertion(ast::Assertion { |
1562 | 0 | span, |
1563 | 0 | kind: ast::AssertionKind::EndText, |
1564 | 0 | })), |
1565 | | 'b' => { |
1566 | 0 | let mut wb = ast::Assertion { |
1567 | 0 | span, |
1568 | 0 | kind: ast::AssertionKind::WordBoundary, |
1569 | 0 | }; |
1570 | 0 | // After a \b, we "try" to parse things like \b{start} for |
1571 | 0 | // special word boundary assertions. |
1572 | 0 | if !self.is_eof() && self.char() == '{' { |
1573 | 0 | if let Some(kind) = |
1574 | 0 | self.maybe_parse_special_word_boundary(start)? |
1575 | 0 | { |
1576 | 0 | wb.kind = kind; |
1577 | 0 | wb.span.end = self.pos(); |
1578 | 0 | } |
1579 | 0 | } |
1580 | 0 | Ok(Primitive::Assertion(wb)) |
1581 | | } |
1582 | 0 | 'B' => Ok(Primitive::Assertion(ast::Assertion { |
1583 | 0 | span, |
1584 | 0 | kind: ast::AssertionKind::NotWordBoundary, |
1585 | 0 | })), |
1586 | 0 | '<' => Ok(Primitive::Assertion(ast::Assertion { |
1587 | 0 | span, |
1588 | 0 | kind: ast::AssertionKind::WordBoundaryStartAngle, |
1589 | 0 | })), |
1590 | 0 | '>' => Ok(Primitive::Assertion(ast::Assertion { |
1591 | 0 | span, |
1592 | 0 | kind: ast::AssertionKind::WordBoundaryEndAngle, |
1593 | 0 | })), |
1594 | 0 | _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), |
1595 | | } |
1596 | 0 | } |
1597 | | |
1598 | | /// Attempt to parse a specialty word boundary. That is, `\b{start}`, |
1599 | | /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. |
1600 | | /// |
1601 | | /// This is similar to `maybe_parse_ascii_class` in that, in most cases, |
1602 | | /// if it fails it will just return `None` with no error. This is done |
1603 | | /// because `\b{5}` is a valid expression and we want to let that be parsed |
1604 | | /// by the existing counted repetition parsing code. (I thought about just |
1605 | | /// invoking the counted repetition code from here, but it seemed a little |
1606 | | /// ham-fisted.) |
1607 | | /// |
1608 | | /// Unlike `maybe_parse_ascii_class` though, this can return an error. |
1609 | | /// Namely, if we definitely know it isn't a counted repetition, then we |
1610 | | /// return an error specific to the specialty word boundaries. |
1611 | | /// |
1612 | | /// This assumes the parser is positioned at a `{` immediately following |
1613 | | /// a `\b`. When `None` is returned, the parser is returned to the position |
1614 | | /// at which it started: pointing at a `{`. |
1615 | | /// |
1616 | | /// The position given should correspond to the start of the `\b`. |
1617 | 0 | fn maybe_parse_special_word_boundary( |
1618 | 0 | &self, |
1619 | 0 | wb_start: Position, |
1620 | 0 | ) -> Result<Option<ast::AssertionKind>> { |
1621 | 0 | assert_eq!(self.char(), '{'); |
1622 | | |
1623 | 0 | let is_valid_char = |c| match c { |
1624 | 0 | 'A'..='Z' | 'a'..='z' | '-' => true, |
1625 | 0 | _ => false, |
1626 | 0 | }; |
1627 | 0 | let start = self.pos(); |
1628 | 0 | if !self.bump_and_bump_space() { |
1629 | 0 | return Err(self.error( |
1630 | 0 | Span::new(wb_start, self.pos()), |
1631 | 0 | ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, |
1632 | 0 | )); |
1633 | 0 | } |
1634 | 0 | let start_contents = self.pos(); |
1635 | 0 | // This is one of the critical bits: if the first non-whitespace |
1636 | 0 | // character isn't in [-A-Za-z] (i.e., this can't be a special word |
1637 | 0 | // boundary), then we bail and let the counted repetition parser deal |
1638 | 0 | // with this. |
1639 | 0 | if !is_valid_char(self.char()) { |
1640 | 0 | self.parser().pos.set(start); |
1641 | 0 | return Ok(None); |
1642 | 0 | } |
1643 | 0 |
|
1644 | 0 | // Now collect up our chars until we see a '}'. |
1645 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1646 | 0 | scratch.clear(); |
1647 | 0 | while !self.is_eof() && is_valid_char(self.char()) { |
1648 | 0 | scratch.push(self.char()); |
1649 | 0 | self.bump_and_bump_space(); |
1650 | 0 | } |
1651 | 0 | if self.is_eof() || self.char() != '}' { |
1652 | 0 | return Err(self.error( |
1653 | 0 | Span::new(start, self.pos()), |
1654 | 0 | ast::ErrorKind::SpecialWordBoundaryUnclosed, |
1655 | 0 | )); |
1656 | 0 | } |
1657 | 0 | let end = self.pos(); |
1658 | 0 | self.bump(); |
1659 | 0 | let kind = match scratch.as_str() { |
1660 | 0 | "start" => ast::AssertionKind::WordBoundaryStart, |
1661 | 0 | "end" => ast::AssertionKind::WordBoundaryEnd, |
1662 | 0 | "start-half" => ast::AssertionKind::WordBoundaryStartHalf, |
1663 | 0 | "end-half" => ast::AssertionKind::WordBoundaryEndHalf, |
1664 | | _ => { |
1665 | 0 | return Err(self.error( |
1666 | 0 | Span::new(start_contents, end), |
1667 | 0 | ast::ErrorKind::SpecialWordBoundaryUnrecognized, |
1668 | 0 | )) |
1669 | | } |
1670 | | }; |
1671 | 0 | Ok(Some(kind)) |
1672 | 0 | } |
1673 | | |
1674 | | /// Parse an octal representation of a Unicode codepoint up to 3 digits |
1675 | | /// long. This expects the parser to be positioned at the first octal |
1676 | | /// digit and advances the parser to the first character immediately |
1677 | | /// following the octal number. This also assumes that parsing octal |
1678 | | /// escapes is enabled. |
1679 | | /// |
1680 | | /// Assuming the preconditions are met, this routine can never fail. |
1681 | | #[inline(never)] |
1682 | 0 | fn parse_octal(&self) -> ast::Literal { |
1683 | 0 | assert!(self.parser().octal); |
1684 | 0 | assert!('0' <= self.char() && self.char() <= '7'); |
1685 | 0 | let start = self.pos(); |
1686 | | // Parse up to two more digits. |
1687 | 0 | while self.bump() |
1688 | 0 | && '0' <= self.char() |
1689 | 0 | && self.char() <= '7' |
1690 | 0 | && self.pos().offset - start.offset <= 2 |
1691 | 0 | {} |
1692 | 0 | let end = self.pos(); |
1693 | 0 | let octal = &self.pattern()[start.offset..end.offset]; |
1694 | 0 | // Parsing the octal should never fail since the above guarantees a |
1695 | 0 | // valid number. |
1696 | 0 | let codepoint = |
1697 | 0 | u32::from_str_radix(octal, 8).expect("valid octal number"); |
1698 | 0 | // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no |
1699 | 0 | // invalid Unicode scalar values. |
1700 | 0 | let c = char::from_u32(codepoint).expect("Unicode scalar value"); |
1701 | 0 | ast::Literal { |
1702 | 0 | span: Span::new(start, end), |
1703 | 0 | kind: ast::LiteralKind::Octal, |
1704 | 0 | c, |
1705 | 0 | } |
1706 | 0 | } |
1707 | | |
1708 | | /// Parse a hex representation of a Unicode codepoint. This handles both |
1709 | | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
1710 | | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
1711 | | /// the first character immediately following the hexadecimal literal. |
1712 | | #[inline(never)] |
1713 | 0 | fn parse_hex(&self) -> Result<ast::Literal> { |
1714 | 0 | assert!( |
1715 | 0 | self.char() == 'x' || self.char() == 'u' || self.char() == 'U' |
1716 | | ); |
1717 | | |
1718 | 0 | let hex_kind = match self.char() { |
1719 | 0 | 'x' => ast::HexLiteralKind::X, |
1720 | 0 | 'u' => ast::HexLiteralKind::UnicodeShort, |
1721 | 0 | _ => ast::HexLiteralKind::UnicodeLong, |
1722 | | }; |
1723 | 0 | if !self.bump_and_bump_space() { |
1724 | 0 | return Err( |
1725 | 0 | self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) |
1726 | 0 | ); |
1727 | 0 | } |
1728 | 0 | if self.char() == '{' { |
1729 | 0 | self.parse_hex_brace(hex_kind) |
1730 | | } else { |
1731 | 0 | self.parse_hex_digits(hex_kind) |
1732 | | } |
1733 | 0 | } |
1734 | | |
1735 | | /// Parse an N-digit hex representation of a Unicode codepoint. This |
1736 | | /// expects the parser to be positioned at the first digit and will advance |
1737 | | /// the parser to the first character immediately following the escape |
1738 | | /// sequence. |
1739 | | /// |
1740 | | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
1741 | | /// or 8 (for `\UNNNNNNNN`). |
1742 | | #[inline(never)] |
1743 | 0 | fn parse_hex_digits( |
1744 | 0 | &self, |
1745 | 0 | kind: ast::HexLiteralKind, |
1746 | 0 | ) -> Result<ast::Literal> { |
1747 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1748 | 0 | scratch.clear(); |
1749 | 0 |
|
1750 | 0 | let start = self.pos(); |
1751 | 0 | for i in 0..kind.digits() { |
1752 | 0 | if i > 0 && !self.bump_and_bump_space() { |
1753 | 0 | return Err(self |
1754 | 0 | .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); |
1755 | 0 | } |
1756 | 0 | if !is_hex(self.char()) { |
1757 | 0 | return Err(self.error( |
1758 | 0 | self.span_char(), |
1759 | 0 | ast::ErrorKind::EscapeHexInvalidDigit, |
1760 | 0 | )); |
1761 | 0 | } |
1762 | 0 | scratch.push(self.char()); |
1763 | | } |
1764 | | // The final bump just moves the parser past the literal, which may |
1765 | | // be EOF. |
1766 | 0 | self.bump_and_bump_space(); |
1767 | 0 | let end = self.pos(); |
1768 | 0 | let hex = scratch.as_str(); |
1769 | 0 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { |
1770 | 0 | None => Err(self.error( |
1771 | 0 | Span::new(start, end), |
1772 | 0 | ast::ErrorKind::EscapeHexInvalid, |
1773 | 0 | )), |
1774 | 0 | Some(c) => Ok(ast::Literal { |
1775 | 0 | span: Span::new(start, end), |
1776 | 0 | kind: ast::LiteralKind::HexFixed(kind), |
1777 | 0 | c, |
1778 | 0 | }), |
1779 | | } |
1780 | 0 | } |
1781 | | |
1782 | | /// Parse a hex representation of any Unicode scalar value. This expects |
1783 | | /// the parser to be positioned at the opening brace `{` and will advance |
1784 | | /// the parser to the first character following the closing brace `}`. |
1785 | | #[inline(never)] |
1786 | 0 | fn parse_hex_brace( |
1787 | 0 | &self, |
1788 | 0 | kind: ast::HexLiteralKind, |
1789 | 0 | ) -> Result<ast::Literal> { |
1790 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1791 | 0 | scratch.clear(); |
1792 | 0 |
|
1793 | 0 | let brace_pos = self.pos(); |
1794 | 0 | let start = self.span_char().end; |
1795 | 0 | while self.bump_and_bump_space() && self.char() != '}' { |
1796 | 0 | if !is_hex(self.char()) { |
1797 | 0 | return Err(self.error( |
1798 | 0 | self.span_char(), |
1799 | 0 | ast::ErrorKind::EscapeHexInvalidDigit, |
1800 | 0 | )); |
1801 | 0 | } |
1802 | 0 | scratch.push(self.char()); |
1803 | | } |
1804 | 0 | if self.is_eof() { |
1805 | 0 | return Err(self.error( |
1806 | 0 | Span::new(brace_pos, self.pos()), |
1807 | 0 | ast::ErrorKind::EscapeUnexpectedEof, |
1808 | 0 | )); |
1809 | 0 | } |
1810 | 0 | let end = self.pos(); |
1811 | 0 | let hex = scratch.as_str(); |
1812 | 0 | assert_eq!(self.char(), '}'); |
1813 | 0 | self.bump_and_bump_space(); |
1814 | 0 |
|
1815 | 0 | if hex.is_empty() { |
1816 | 0 | return Err(self.error( |
1817 | 0 | Span::new(brace_pos, self.pos()), |
1818 | 0 | ast::ErrorKind::EscapeHexEmpty, |
1819 | 0 | )); |
1820 | 0 | } |
1821 | 0 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { |
1822 | 0 | None => Err(self.error( |
1823 | 0 | Span::new(start, end), |
1824 | 0 | ast::ErrorKind::EscapeHexInvalid, |
1825 | 0 | )), |
1826 | 0 | Some(c) => Ok(ast::Literal { |
1827 | 0 | span: Span::new(start, self.pos()), |
1828 | 0 | kind: ast::LiteralKind::HexBrace(kind), |
1829 | 0 | c, |
1830 | 0 | }), |
1831 | | } |
1832 | 0 | } |
1833 | | |
1834 | | /// Parse a decimal number into a u32 while trimming leading and trailing |
1835 | | /// whitespace. |
1836 | | /// |
1837 | | /// This expects the parser to be positioned at the first position where |
1838 | | /// a decimal digit could occur. This will advance the parser to the byte |
1839 | | /// immediately following the last contiguous decimal digit. |
1840 | | /// |
1841 | | /// If no decimal digit could be found or if there was a problem parsing |
1842 | | /// the complete set of digits into a u32, then an error is returned. |
1843 | 0 | fn parse_decimal(&self) -> Result<u32> { |
1844 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1845 | 0 | scratch.clear(); |
1846 | | |
1847 | 0 | while !self.is_eof() && self.char().is_whitespace() { |
1848 | 0 | self.bump(); |
1849 | 0 | } |
1850 | 0 | let start = self.pos(); |
1851 | 0 | while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { |
1852 | 0 | scratch.push(self.char()); |
1853 | 0 | self.bump_and_bump_space(); |
1854 | 0 | } |
1855 | 0 | let span = Span::new(start, self.pos()); |
1856 | 0 | while !self.is_eof() && self.char().is_whitespace() { |
1857 | 0 | self.bump_and_bump_space(); |
1858 | 0 | } |
1859 | 0 | let digits = scratch.as_str(); |
1860 | 0 | if digits.is_empty() { |
1861 | 0 | return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); |
1862 | 0 | } |
1863 | 0 | match u32::from_str_radix(digits, 10).ok() { |
1864 | 0 | Some(n) => Ok(n), |
1865 | 0 | None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), |
1866 | | } |
1867 | 0 | } |
1868 | | |
1869 | | /// Parse a standard character class consisting primarily of characters or |
1870 | | /// character ranges, but can also contain nested character classes of |
1871 | | /// any type (sans `.`). |
1872 | | /// |
1873 | | /// This assumes the parser is positioned at the opening `[`. If parsing |
1874 | | /// is successful, then the parser is advanced to the position immediately |
1875 | | /// following the closing `]`. |
1876 | | #[inline(never)] |
1877 | 0 | fn parse_set_class(&self) -> Result<ast::ClassBracketed> { |
1878 | 0 | assert_eq!(self.char(), '['); |
1879 | | |
1880 | 0 | let mut union = |
1881 | 0 | ast::ClassSetUnion { span: self.span(), items: vec![] }; |
1882 | | loop { |
1883 | 0 | self.bump_space(); |
1884 | 0 | if self.is_eof() { |
1885 | 0 | return Err(self.unclosed_class_error()); |
1886 | 0 | } |
1887 | 0 | match self.char() { |
1888 | | '[' => { |
1889 | | // If we've already parsed the opening bracket, then |
1890 | | // attempt to treat this as the beginning of an ASCII |
1891 | | // class. If ASCII class parsing fails, then the parser |
1892 | | // backs up to `[`. |
1893 | 0 | if !self.parser().stack_class.borrow().is_empty() { |
1894 | 0 | if let Some(cls) = self.maybe_parse_ascii_class() { |
1895 | 0 | union.push(ast::ClassSetItem::Ascii(cls)); |
1896 | 0 | continue; |
1897 | 0 | } |
1898 | 0 | } |
1899 | 0 | union = self.push_class_open(union)?; |
1900 | | } |
1901 | 0 | ']' => match self.pop_class(union)? { |
1902 | 0 | Either::Left(nested_union) => { |
1903 | 0 | union = nested_union; |
1904 | 0 | } |
1905 | 0 | Either::Right(class) => return Ok(class), |
1906 | | }, |
1907 | 0 | '&' if self.peek() == Some('&') => { |
1908 | 0 | assert!(self.bump_if("&&")); |
1909 | 0 | union = self.push_class_op( |
1910 | 0 | ast::ClassSetBinaryOpKind::Intersection, |
1911 | 0 | union, |
1912 | 0 | ); |
1913 | | } |
1914 | 0 | '-' if self.peek() == Some('-') => { |
1915 | 0 | assert!(self.bump_if("--")); |
1916 | 0 | union = self.push_class_op( |
1917 | 0 | ast::ClassSetBinaryOpKind::Difference, |
1918 | 0 | union, |
1919 | 0 | ); |
1920 | | } |
1921 | 0 | '~' if self.peek() == Some('~') => { |
1922 | 0 | assert!(self.bump_if("~~")); |
1923 | 0 | union = self.push_class_op( |
1924 | 0 | ast::ClassSetBinaryOpKind::SymmetricDifference, |
1925 | 0 | union, |
1926 | 0 | ); |
1927 | | } |
1928 | | _ => { |
1929 | 0 | union.push(self.parse_set_class_range()?); |
1930 | | } |
1931 | | } |
1932 | | } |
1933 | 0 | } |
1934 | | |
1935 | | /// Parse a single primitive item in a character class set. The item to |
1936 | | /// be parsed can either be one of a simple literal character, a range |
1937 | | /// between two simple literal characters or a "primitive" character |
1938 | | /// class like \w or \p{Greek}. |
1939 | | /// |
1940 | | /// If an invalid escape is found, or if a character class is found where |
1941 | | /// a simple literal is expected (e.g., in a range), then an error is |
1942 | | /// returned. |
1943 | | #[inline(never)] |
1944 | 0 | fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> { |
1945 | 0 | let prim1 = self.parse_set_class_item()?; |
1946 | 0 | self.bump_space(); |
1947 | 0 | if self.is_eof() { |
1948 | 0 | return Err(self.unclosed_class_error()); |
1949 | 0 | } |
1950 | 0 | // If the next char isn't a `-`, then we don't have a range. |
1951 | 0 | // There are two exceptions. If the char after a `-` is a `]`, then |
1952 | 0 | // `-` is interpreted as a literal `-`. Alternatively, if the char |
1953 | 0 | // after a `-` is a `-`, then `--` corresponds to a "difference" |
1954 | 0 | // operation. |
1955 | 0 | if self.char() != '-' |
1956 | 0 | || self.peek_space() == Some(']') |
1957 | 0 | || self.peek_space() == Some('-') |
1958 | | { |
1959 | 0 | return prim1.into_class_set_item(self); |
1960 | 0 | } |
1961 | 0 | // OK, now we're parsing a range, so bump past the `-` and parse the |
1962 | 0 | // second half of the range. |
1963 | 0 | if !self.bump_and_bump_space() { |
1964 | 0 | return Err(self.unclosed_class_error()); |
1965 | 0 | } |
1966 | 0 | let prim2 = self.parse_set_class_item()?; |
1967 | 0 | let range = ast::ClassSetRange { |
1968 | 0 | span: Span::new(prim1.span().start, prim2.span().end), |
1969 | 0 | start: prim1.into_class_literal(self)?, |
1970 | 0 | end: prim2.into_class_literal(self)?, |
1971 | | }; |
1972 | 0 | if !range.is_valid() { |
1973 | 0 | return Err( |
1974 | 0 | self.error(range.span, ast::ErrorKind::ClassRangeInvalid) |
1975 | 0 | ); |
1976 | 0 | } |
1977 | 0 | Ok(ast::ClassSetItem::Range(range)) |
1978 | 0 | } |
1979 | | |
1980 | | /// Parse a single item in a character class as a primitive, where the |
1981 | | /// primitive either consists of a verbatim literal or a single escape |
1982 | | /// sequence. |
1983 | | /// |
1984 | | /// This assumes the parser is positioned at the beginning of a primitive, |
1985 | | /// and advances the parser to the first position after the primitive if |
1986 | | /// successful. |
1987 | | /// |
1988 | | /// Note that it is the caller's responsibility to report an error if an |
1989 | | /// illegal primitive was parsed. |
1990 | | #[inline(never)] |
1991 | 0 | fn parse_set_class_item(&self) -> Result<Primitive> { |
1992 | 0 | if self.char() == '\\' { |
1993 | 0 | self.parse_escape() |
1994 | | } else { |
1995 | 0 | let x = Primitive::Literal(ast::Literal { |
1996 | 0 | span: self.span_char(), |
1997 | 0 | kind: ast::LiteralKind::Verbatim, |
1998 | 0 | c: self.char(), |
1999 | 0 | }); |
2000 | 0 | self.bump(); |
2001 | 0 | Ok(x) |
2002 | | } |
2003 | 0 | } |
2004 | | |
2005 | | /// Parses the opening of a character class set. This includes the opening |
2006 | | /// bracket along with `^` if present to indicate negation. This also |
2007 | | /// starts parsing the opening set of unioned items if applicable, since |
2008 | | /// there are special rules applied to certain characters in the opening |
2009 | | /// of a character class. For example, `[^]]` is the class of all |
2010 | | /// characters not equal to `]`. (`]` would need to be escaped in any other |
2011 | | /// position.) Similarly for `-`. |
2012 | | /// |
2013 | | /// In all cases, the op inside the returned `ast::ClassBracketed` is an |
2014 | | /// empty union. This empty union should be replaced with the actual item |
2015 | | /// when it is popped from the parser's stack. |
2016 | | /// |
2017 | | /// This assumes the parser is positioned at the opening `[` and advances |
2018 | | /// the parser to the first non-special byte of the character class. |
2019 | | /// |
2020 | | /// An error is returned if EOF is found. |
2021 | | #[inline(never)] |
2022 | 0 | fn parse_set_class_open( |
2023 | 0 | &self, |
2024 | 0 | ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { |
2025 | 0 | assert_eq!(self.char(), '['); |
2026 | 0 | let start = self.pos(); |
2027 | 0 | if !self.bump_and_bump_space() { |
2028 | 0 | return Err(self.error( |
2029 | 0 | Span::new(start, self.pos()), |
2030 | 0 | ast::ErrorKind::ClassUnclosed, |
2031 | 0 | )); |
2032 | 0 | } |
2033 | | |
2034 | 0 | let negated = if self.char() != '^' { |
2035 | 0 | false |
2036 | | } else { |
2037 | 0 | if !self.bump_and_bump_space() { |
2038 | 0 | return Err(self.error( |
2039 | 0 | Span::new(start, self.pos()), |
2040 | 0 | ast::ErrorKind::ClassUnclosed, |
2041 | 0 | )); |
2042 | 0 | } |
2043 | 0 | true |
2044 | | }; |
2045 | | // Accept any number of `-` as literal `-`. |
2046 | 0 | let mut union = |
2047 | 0 | ast::ClassSetUnion { span: self.span(), items: vec![] }; |
2048 | 0 | while self.char() == '-' { |
2049 | 0 | union.push(ast::ClassSetItem::Literal(ast::Literal { |
2050 | 0 | span: self.span_char(), |
2051 | 0 | kind: ast::LiteralKind::Verbatim, |
2052 | 0 | c: '-', |
2053 | 0 | })); |
2054 | 0 | if !self.bump_and_bump_space() { |
2055 | 0 | return Err(self.error( |
2056 | 0 | Span::new(start, start), |
2057 | 0 | ast::ErrorKind::ClassUnclosed, |
2058 | 0 | )); |
2059 | 0 | } |
2060 | | } |
2061 | | // If `]` is the *first* char in a set, then interpret it as a literal |
2062 | | // `]`. That is, an empty class is impossible to write. |
2063 | 0 | if union.items.is_empty() && self.char() == ']' { |
2064 | 0 | union.push(ast::ClassSetItem::Literal(ast::Literal { |
2065 | 0 | span: self.span_char(), |
2066 | 0 | kind: ast::LiteralKind::Verbatim, |
2067 | 0 | c: ']', |
2068 | 0 | })); |
2069 | 0 | if !self.bump_and_bump_space() { |
2070 | 0 | return Err(self.error( |
2071 | 0 | Span::new(start, self.pos()), |
2072 | 0 | ast::ErrorKind::ClassUnclosed, |
2073 | 0 | )); |
2074 | 0 | } |
2075 | 0 | } |
2076 | 0 | let set = ast::ClassBracketed { |
2077 | 0 | span: Span::new(start, self.pos()), |
2078 | 0 | negated, |
2079 | 0 | kind: ast::ClassSet::union(ast::ClassSetUnion { |
2080 | 0 | span: Span::new(union.span.start, union.span.start), |
2081 | 0 | items: vec![], |
2082 | 0 | }), |
2083 | 0 | }; |
2084 | 0 | Ok((set, union)) |
2085 | 0 | } |
2086 | | |
2087 | | /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. |
2088 | | /// |
2089 | | /// This assumes the parser is positioned at the opening `[`. |
2090 | | /// |
2091 | | /// If no valid ASCII character class could be found, then this does not |
2092 | | /// advance the parser and `None` is returned. Otherwise, the parser is |
2093 | | /// advanced to the first byte following the closing `]` and the |
2094 | | /// corresponding ASCII class is returned. |
2095 | | #[inline(never)] |
2096 | 0 | fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> { |
2097 | 0 | // ASCII character classes are interesting from a parsing perspective |
2098 | 0 | // because parsing cannot fail with any interesting error. For example, |
2099 | 0 | // in order to use an ASCII character class, it must be enclosed in |
2100 | 0 | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
2101 | 0 | // of it as "ASCII character classes have the syntax `[:NAME:]` which |
2102 | 0 | // can only appear within character brackets." This means that things |
2103 | 0 | // like `[[:lower:]A]` are legal constructs. |
2104 | 0 | // |
2105 | 0 | // However, if one types an incorrect ASCII character class, e.g., |
2106 | 0 | // `[[:loower:]]`, then we treat that as a normal nested character |
2107 | 0 | // class containing the characters `:elorw`. One might argue that we |
2108 | 0 | // should return an error instead since the repeated colons give away |
2109 | 0 | // the intent to write an ASCII class. But what if the user typed |
2110 | 0 | // `[[:lower]]` instead? How can we tell that was intended to be an |
2111 | 0 | // ASCII class and not just a normal nested class? |
2112 | 0 | // |
2113 | 0 | // Reasonable people can probably disagree over this, but for better |
2114 | 0 | // or worse, we implement semantics that never fails at the expense |
2115 | 0 | // of better failure modes. |
2116 | 0 | assert_eq!(self.char(), '['); |
2117 | | // If parsing fails, then we back up the parser to this starting point. |
2118 | 0 | let start = self.pos(); |
2119 | 0 | let mut negated = false; |
2120 | 0 | if !self.bump() || self.char() != ':' { |
2121 | 0 | self.parser().pos.set(start); |
2122 | 0 | return None; |
2123 | 0 | } |
2124 | 0 | if !self.bump() { |
2125 | 0 | self.parser().pos.set(start); |
2126 | 0 | return None; |
2127 | 0 | } |
2128 | 0 | if self.char() == '^' { |
2129 | 0 | negated = true; |
2130 | 0 | if !self.bump() { |
2131 | 0 | self.parser().pos.set(start); |
2132 | 0 | return None; |
2133 | 0 | } |
2134 | 0 | } |
2135 | 0 | let name_start = self.offset(); |
2136 | 0 | while self.char() != ':' && self.bump() {} |
2137 | 0 | if self.is_eof() { |
2138 | 0 | self.parser().pos.set(start); |
2139 | 0 | return None; |
2140 | 0 | } |
2141 | 0 | let name = &self.pattern()[name_start..self.offset()]; |
2142 | 0 | if !self.bump_if(":]") { |
2143 | 0 | self.parser().pos.set(start); |
2144 | 0 | return None; |
2145 | 0 | } |
2146 | 0 | let kind = match ast::ClassAsciiKind::from_name(name) { |
2147 | 0 | Some(kind) => kind, |
2148 | | None => { |
2149 | 0 | self.parser().pos.set(start); |
2150 | 0 | return None; |
2151 | | } |
2152 | | }; |
2153 | 0 | Some(ast::ClassAscii { |
2154 | 0 | span: Span::new(start, self.pos()), |
2155 | 0 | kind, |
2156 | 0 | negated, |
2157 | 0 | }) |
2158 | 0 | } |
2159 | | |
2160 | | /// Parse a Unicode class in either the single character notation, `\pN` |
2161 | | /// or the multi-character bracketed notation, `\p{Greek}`. This assumes |
2162 | | /// the parser is positioned at the `p` (or `P` for negation) and will |
2163 | | /// advance the parser to the character immediately following the class. |
2164 | | /// |
2165 | | /// Note that this does not check whether the class name is valid or not. |
2166 | | #[inline(never)] |
2167 | 0 | fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> { |
2168 | 0 | assert!(self.char() == 'p' || self.char() == 'P'); |
2169 | | |
2170 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
2171 | 0 | scratch.clear(); |
2172 | 0 |
|
2173 | 0 | let negated = self.char() == 'P'; |
2174 | 0 | if !self.bump_and_bump_space() { |
2175 | 0 | return Err( |
2176 | 0 | self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) |
2177 | 0 | ); |
2178 | 0 | } |
2179 | 0 | let (start, kind) = if self.char() == '{' { |
2180 | 0 | let start = self.span_char().end; |
2181 | 0 | while self.bump_and_bump_space() && self.char() != '}' { |
2182 | 0 | scratch.push(self.char()); |
2183 | 0 | } |
2184 | 0 | if self.is_eof() { |
2185 | 0 | return Err(self |
2186 | 0 | .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); |
2187 | 0 | } |
2188 | 0 | assert_eq!(self.char(), '}'); |
2189 | 0 | self.bump(); |
2190 | 0 |
|
2191 | 0 | let name = scratch.as_str(); |
2192 | 0 | if let Some(i) = name.find("!=") { |
2193 | 0 | ( |
2194 | 0 | start, |
2195 | 0 | ast::ClassUnicodeKind::NamedValue { |
2196 | 0 | op: ast::ClassUnicodeOpKind::NotEqual, |
2197 | 0 | name: name[..i].to_string(), |
2198 | 0 | value: name[i + 2..].to_string(), |
2199 | 0 | }, |
2200 | 0 | ) |
2201 | 0 | } else if let Some(i) = name.find(':') { |
2202 | 0 | ( |
2203 | 0 | start, |
2204 | 0 | ast::ClassUnicodeKind::NamedValue { |
2205 | 0 | op: ast::ClassUnicodeOpKind::Colon, |
2206 | 0 | name: name[..i].to_string(), |
2207 | 0 | value: name[i + 1..].to_string(), |
2208 | 0 | }, |
2209 | 0 | ) |
2210 | 0 | } else if let Some(i) = name.find('=') { |
2211 | 0 | ( |
2212 | 0 | start, |
2213 | 0 | ast::ClassUnicodeKind::NamedValue { |
2214 | 0 | op: ast::ClassUnicodeOpKind::Equal, |
2215 | 0 | name: name[..i].to_string(), |
2216 | 0 | value: name[i + 1..].to_string(), |
2217 | 0 | }, |
2218 | 0 | ) |
2219 | | } else { |
2220 | 0 | (start, ast::ClassUnicodeKind::Named(name.to_string())) |
2221 | | } |
2222 | | } else { |
2223 | 0 | let start = self.pos(); |
2224 | 0 | let c = self.char(); |
2225 | 0 | if c == '\\' { |
2226 | 0 | return Err(self.error( |
2227 | 0 | self.span_char(), |
2228 | 0 | ast::ErrorKind::UnicodeClassInvalid, |
2229 | 0 | )); |
2230 | 0 | } |
2231 | 0 | self.bump_and_bump_space(); |
2232 | 0 | let kind = ast::ClassUnicodeKind::OneLetter(c); |
2233 | 0 | (start, kind) |
2234 | | }; |
2235 | 0 | Ok(ast::ClassUnicode { |
2236 | 0 | span: Span::new(start, self.pos()), |
2237 | 0 | negated, |
2238 | 0 | kind, |
2239 | 0 | }) |
2240 | 0 | } |
2241 | | |
2242 | | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
2243 | | /// parser is currently at a valid character class name and will be |
2244 | | /// advanced to the character immediately following the class. |
2245 | | #[inline(never)] |
2246 | 0 | fn parse_perl_class(&self) -> ast::ClassPerl { |
2247 | 0 | let c = self.char(); |
2248 | 0 | let span = self.span_char(); |
2249 | 0 | self.bump(); |
2250 | 0 | let (negated, kind) = match c { |
2251 | 0 | 'd' => (false, ast::ClassPerlKind::Digit), |
2252 | 0 | 'D' => (true, ast::ClassPerlKind::Digit), |
2253 | 0 | 's' => (false, ast::ClassPerlKind::Space), |
2254 | 0 | 'S' => (true, ast::ClassPerlKind::Space), |
2255 | 0 | 'w' => (false, ast::ClassPerlKind::Word), |
2256 | 0 | 'W' => (true, ast::ClassPerlKind::Word), |
2257 | 0 | c => panic!("expected valid Perl class but got '{}'", c), |
2258 | | }; |
2259 | 0 | ast::ClassPerl { span, kind, negated } |
2260 | 0 | } |
2261 | | } |
2262 | | |
2263 | | /// A type that traverses a fully parsed Ast and checks whether its depth |
2264 | | /// exceeds the specified nesting limit. If it does, then an error is returned. |
2265 | | #[derive(Debug)] |
2266 | | struct NestLimiter<'p, 's, P> { |
2267 | | /// The parser that is checking the nest limit. |
2268 | | p: &'p ParserI<'s, P>, |
2269 | | /// The current depth while walking an Ast. |
2270 | | depth: u32, |
2271 | | } |
2272 | | |
2273 | | impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> { |
2274 | 0 | fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { |
2275 | 0 | NestLimiter { p, depth: 0 } |
2276 | 0 | } |
2277 | | |
2278 | | #[inline(never)] |
2279 | 0 | fn check(self, ast: &Ast) -> Result<()> { |
2280 | 0 | ast::visit(ast, self) |
2281 | 0 | } |
2282 | | |
2283 | 0 | fn increment_depth(&mut self, span: &Span) -> Result<()> { |
2284 | 0 | let new = self.depth.checked_add(1).ok_or_else(|| { |
2285 | 0 | self.p.error( |
2286 | 0 | span.clone(), |
2287 | 0 | ast::ErrorKind::NestLimitExceeded(u32::MAX), |
2288 | 0 | ) |
2289 | 0 | })?; |
2290 | 0 | let limit = self.p.parser().nest_limit; |
2291 | 0 | if new > limit { |
2292 | 0 | return Err(self.p.error( |
2293 | 0 | span.clone(), |
2294 | 0 | ast::ErrorKind::NestLimitExceeded(limit), |
2295 | 0 | )); |
2296 | 0 | } |
2297 | 0 | self.depth = new; |
2298 | 0 | Ok(()) |
2299 | 0 | } |
2300 | | |
2301 | 0 | fn decrement_depth(&mut self) { |
2302 | 0 | // Assuming the correctness of the visitor, this should never drop |
2303 | 0 | // below 0. |
2304 | 0 | self.depth = self.depth.checked_sub(1).unwrap(); |
2305 | 0 | } |
2306 | | } |
2307 | | |
2308 | | impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> { |
2309 | | type Output = (); |
2310 | | type Err = ast::Error; |
2311 | | |
2312 | 0 | fn finish(self) -> Result<()> { |
2313 | 0 | Ok(()) |
2314 | 0 | } |
2315 | | |
2316 | 0 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { |
2317 | 0 | let span = match *ast { |
2318 | | Ast::Empty(_) |
2319 | | | Ast::Flags(_) |
2320 | | | Ast::Literal(_) |
2321 | | | Ast::Dot(_) |
2322 | | | Ast::Assertion(_) |
2323 | | | Ast::ClassUnicode(_) |
2324 | | | Ast::ClassPerl(_) => { |
2325 | | // These are all base cases, so we don't increment depth. |
2326 | 0 | return Ok(()); |
2327 | | } |
2328 | 0 | Ast::ClassBracketed(ref x) => &x.span, |
2329 | 0 | Ast::Repetition(ref x) => &x.span, |
2330 | 0 | Ast::Group(ref x) => &x.span, |
2331 | 0 | Ast::Alternation(ref x) => &x.span, |
2332 | 0 | Ast::Concat(ref x) => &x.span, |
2333 | | }; |
2334 | 0 | self.increment_depth(span) |
2335 | 0 | } |
2336 | | |
2337 | 0 | fn visit_post(&mut self, ast: &Ast) -> Result<()> { |
2338 | 0 | match *ast { |
2339 | | Ast::Empty(_) |
2340 | | | Ast::Flags(_) |
2341 | | | Ast::Literal(_) |
2342 | | | Ast::Dot(_) |
2343 | | | Ast::Assertion(_) |
2344 | | | Ast::ClassUnicode(_) |
2345 | | | Ast::ClassPerl(_) => { |
2346 | | // These are all base cases, so we don't decrement depth. |
2347 | 0 | Ok(()) |
2348 | | } |
2349 | | Ast::ClassBracketed(_) |
2350 | | | Ast::Repetition(_) |
2351 | | | Ast::Group(_) |
2352 | | | Ast::Alternation(_) |
2353 | | | Ast::Concat(_) => { |
2354 | 0 | self.decrement_depth(); |
2355 | 0 | Ok(()) |
2356 | | } |
2357 | | } |
2358 | 0 | } |
2359 | | |
2360 | 0 | fn visit_class_set_item_pre( |
2361 | 0 | &mut self, |
2362 | 0 | ast: &ast::ClassSetItem, |
2363 | 0 | ) -> Result<()> { |
2364 | 0 | let span = match *ast { |
2365 | | ast::ClassSetItem::Empty(_) |
2366 | | | ast::ClassSetItem::Literal(_) |
2367 | | | ast::ClassSetItem::Range(_) |
2368 | | | ast::ClassSetItem::Ascii(_) |
2369 | | | ast::ClassSetItem::Unicode(_) |
2370 | | | ast::ClassSetItem::Perl(_) => { |
2371 | | // These are all base cases, so we don't increment depth. |
2372 | 0 | return Ok(()); |
2373 | | } |
2374 | 0 | ast::ClassSetItem::Bracketed(ref x) => &x.span, |
2375 | 0 | ast::ClassSetItem::Union(ref x) => &x.span, |
2376 | | }; |
2377 | 0 | self.increment_depth(span) |
2378 | 0 | } |
2379 | | |
2380 | 0 | fn visit_class_set_item_post( |
2381 | 0 | &mut self, |
2382 | 0 | ast: &ast::ClassSetItem, |
2383 | 0 | ) -> Result<()> { |
2384 | 0 | match *ast { |
2385 | | ast::ClassSetItem::Empty(_) |
2386 | | | ast::ClassSetItem::Literal(_) |
2387 | | | ast::ClassSetItem::Range(_) |
2388 | | | ast::ClassSetItem::Ascii(_) |
2389 | | | ast::ClassSetItem::Unicode(_) |
2390 | | | ast::ClassSetItem::Perl(_) => { |
2391 | | // These are all base cases, so we don't decrement depth. |
2392 | 0 | Ok(()) |
2393 | | } |
2394 | | ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => { |
2395 | 0 | self.decrement_depth(); |
2396 | 0 | Ok(()) |
2397 | | } |
2398 | | } |
2399 | 0 | } |
2400 | | |
2401 | 0 | fn visit_class_set_binary_op_pre( |
2402 | 0 | &mut self, |
2403 | 0 | ast: &ast::ClassSetBinaryOp, |
2404 | 0 | ) -> Result<()> { |
2405 | 0 | self.increment_depth(&ast.span) |
2406 | 0 | } |
2407 | | |
2408 | 0 | fn visit_class_set_binary_op_post( |
2409 | 0 | &mut self, |
2410 | 0 | _ast: &ast::ClassSetBinaryOp, |
2411 | 0 | ) -> Result<()> { |
2412 | 0 | self.decrement_depth(); |
2413 | 0 | Ok(()) |
2414 | 0 | } |
2415 | | } |
2416 | | |
2417 | | /// When the result is an error, transforms the ast::ErrorKind from the source |
2418 | | /// Result into another one. This function is used to return clearer error |
2419 | | /// messages when possible. |
2420 | 0 | fn specialize_err<T>( |
2421 | 0 | result: Result<T>, |
2422 | 0 | from: ast::ErrorKind, |
2423 | 0 | to: ast::ErrorKind, |
2424 | 0 | ) -> Result<T> { |
2425 | 0 | if let Err(e) = result { |
2426 | 0 | if e.kind == from { |
2427 | 0 | Err(ast::Error { kind: to, pattern: e.pattern, span: e.span }) |
2428 | | } else { |
2429 | 0 | Err(e) |
2430 | | } |
2431 | | } else { |
2432 | 0 | result |
2433 | | } |
2434 | 0 | } |
2435 | | |
2436 | | #[cfg(test)] |
2437 | | mod tests { |
2438 | | use core::ops::Range; |
2439 | | |
2440 | | use alloc::format; |
2441 | | |
2442 | | use super::*; |
2443 | | |
// A module-local replacement for `assert_eq!` that prints each operand on
// its own line, which is easier to read for large AST values.
//
// The operands are borrowed inside a `match` scrutinee so that temporaries
// created by either expression stay alive for the whole comparison.
macro_rules! assert_eq {
    ($left:expr, $right:expr) => {{
        match (&$left, &$right) {
            (lhs, rhs) if *lhs == *rhs => {}
            (lhs, rhs) => panic!(
                "assertion failed: `(left == right)`\n\n\
                 left: `{:?}`\nright: `{:?}`\n\n",
                lhs, rhs
            ),
        }
    }};
}
2461 | | |
2462 | | // We create these errors to compare with real ast::Errors in the tests. |
2463 | | // We define equality between TestError and ast::Error to disregard the |
2464 | | // pattern string in ast::Error, which is annoying to provide in tests. |
2465 | | #[derive(Clone, Debug)] |
2466 | | struct TestError { |
2467 | | span: Span, |
2468 | | kind: ast::ErrorKind, |
2469 | | } |
2470 | | |
2471 | | impl PartialEq<ast::Error> for TestError { |
2472 | | fn eq(&self, other: &ast::Error) -> bool { |
2473 | | self.span == other.span && self.kind == other.kind |
2474 | | } |
2475 | | } |
2476 | | |
2477 | | impl PartialEq<TestError> for ast::Error { |
2478 | | fn eq(&self, other: &TestError) -> bool { |
2479 | | self.span == other.span && self.kind == other.kind |
2480 | | } |
2481 | | } |
2482 | | |
/// Short alias for turning a string slice into an owned `String`.
fn s(str: &str) -> String {
    String::from(str)
}
2486 | | |
2487 | | fn parser(pattern: &str) -> ParserI<'_, Parser> { |
2488 | | ParserI::new(Parser::new(), pattern) |
2489 | | } |
2490 | | |
2491 | | fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { |
2492 | | let parser = ParserBuilder::new().octal(true).build(); |
2493 | | ParserI::new(parser, pattern) |
2494 | | } |
2495 | | |
2496 | | fn parser_empty_min_range(pattern: &str) -> ParserI<'_, Parser> { |
2497 | | let parser = ParserBuilder::new().empty_min_range(true).build(); |
2498 | | ParserI::new(parser, pattern) |
2499 | | } |
2500 | | |
2501 | | fn parser_nest_limit( |
2502 | | pattern: &str, |
2503 | | nest_limit: u32, |
2504 | | ) -> ParserI<'_, Parser> { |
2505 | | let p = ParserBuilder::new().nest_limit(nest_limit).build(); |
2506 | | ParserI::new(p, pattern) |
2507 | | } |
2508 | | |
2509 | | fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { |
2510 | | let p = ParserBuilder::new().ignore_whitespace(true).build(); |
2511 | | ParserI::new(p, pattern) |
2512 | | } |
2513 | | |
2514 | | /// Short alias for creating a new span. |
2515 | | fn nspan(start: Position, end: Position) -> Span { |
2516 | | Span::new(start, end) |
2517 | | } |
2518 | | |
2519 | | /// Short alias for creating a new position. |
2520 | | fn npos(offset: usize, line: usize, column: usize) -> Position { |
2521 | | Position::new(offset, line, column) |
2522 | | } |
2523 | | |
2524 | | /// Create a new span from the given offset range. This assumes a single |
2525 | | /// line and sets the columns based on the offsets. i.e., This only works |
2526 | | /// out of the box for ASCII, which is fine for most tests. |
2527 | | fn span(range: Range<usize>) -> Span { |
2528 | | let start = Position::new(range.start, 1, range.start + 1); |
2529 | | let end = Position::new(range.end, 1, range.end + 1); |
2530 | | Span::new(start, end) |
2531 | | } |
2532 | | |
2533 | | /// Create a new span for the corresponding byte range in the given string. |
2534 | | fn span_range(subject: &str, range: Range<usize>) -> Span { |
2535 | | let start = Position { |
2536 | | offset: range.start, |
2537 | | line: 1 + subject[..range.start].matches('\n').count(), |
2538 | | column: 1 + subject[..range.start] |
2539 | | .chars() |
2540 | | .rev() |
2541 | | .position(|c| c == '\n') |
2542 | | .unwrap_or(subject[..range.start].chars().count()), |
2543 | | }; |
2544 | | let end = Position { |
2545 | | offset: range.end, |
2546 | | line: 1 + subject[..range.end].matches('\n').count(), |
2547 | | column: 1 + subject[..range.end] |
2548 | | .chars() |
2549 | | .rev() |
2550 | | .position(|c| c == '\n') |
2551 | | .unwrap_or(subject[..range.end].chars().count()), |
2552 | | }; |
2553 | | Span::new(start, end) |
2554 | | } |
2555 | | |
2556 | | /// Create a verbatim literal starting at the given position. |
2557 | | fn lit(c: char, start: usize) -> Ast { |
2558 | | lit_with(c, span(start..start + c.len_utf8())) |
2559 | | } |
2560 | | |
2561 | | /// Create a meta literal starting at the given position. |
2562 | | fn meta_lit(c: char, span: Span) -> Ast { |
2563 | | Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) |
2564 | | } |
2565 | | |
2566 | | /// Create a verbatim literal with the given span. |
2567 | | fn lit_with(c: char, span: Span) -> Ast { |
2568 | | Ast::literal(ast::Literal { |
2569 | | span, |
2570 | | kind: ast::LiteralKind::Verbatim, |
2571 | | c, |
2572 | | }) |
2573 | | } |
2574 | | |
2575 | | /// Create a concatenation with the given range. |
2576 | | fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast { |
2577 | | concat_with(span(range), asts) |
2578 | | } |
2579 | | |
2580 | | /// Create a concatenation with the given span. |
2581 | | fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { |
2582 | | Ast::concat(ast::Concat { span, asts }) |
2583 | | } |
2584 | | |
2585 | | /// Create an alternation with the given span. |
2586 | | fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { |
2587 | | Ast::alternation(ast::Alternation { span: span(range), asts }) |
2588 | | } |
2589 | | |
2590 | | /// Create a capturing group with the given span. |
2591 | | fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { |
2592 | | Ast::group(ast::Group { |
2593 | | span: span(range), |
2594 | | kind: ast::GroupKind::CaptureIndex(index), |
2595 | | ast: Box::new(ast), |
2596 | | }) |
2597 | | } |
2598 | | |
2599 | | /// Create an ast::SetFlags. |
2600 | | /// |
2601 | | /// The given pattern should be the full pattern string. The range given |
2602 | | /// should correspond to the byte offsets where the flag set occurs. |
2603 | | /// |
2604 | | /// If negated is true, then the set is interpreted as beginning with a |
2605 | | /// negation. |
2606 | | fn flag_set( |
2607 | | pat: &str, |
2608 | | range: Range<usize>, |
2609 | | flag: ast::Flag, |
2610 | | negated: bool, |
2611 | | ) -> Ast { |
2612 | | let mut items = vec![ast::FlagsItem { |
2613 | | span: span_range(pat, (range.end - 2)..(range.end - 1)), |
2614 | | kind: ast::FlagsItemKind::Flag(flag), |
2615 | | }]; |
2616 | | if negated { |
2617 | | items.insert( |
2618 | | 0, |
2619 | | ast::FlagsItem { |
2620 | | span: span_range(pat, (range.start + 2)..(range.end - 2)), |
2621 | | kind: ast::FlagsItemKind::Negation, |
2622 | | }, |
2623 | | ); |
2624 | | } |
2625 | | Ast::flags(ast::SetFlags { |
2626 | | span: span_range(pat, range.clone()), |
2627 | | flags: ast::Flags { |
2628 | | span: span_range(pat, (range.start + 2)..(range.end - 1)), |
2629 | | items, |
2630 | | }, |
2631 | | }) |
2632 | | } |
2633 | | |
#[test]
fn parse_nest_limit() {
    // Asserts that parsing `pat` under `limit` fails with
    // `NestLimitExceeded(limit)` reported at the offset range `at`.
    fn assert_exceeded(pat: &str, limit: u32, at: Range<usize>) {
        assert_eq!(
            parser_nest_limit(pat, limit).parse().unwrap_err(),
            TestError {
                span: span(at),
                kind: ast::ErrorKind::NestLimitExceeded(limit),
            }
        );
    }

    // Builds a greedy repetition of `inner`; `whole` is the span of the
    // repetition and `op_at` the span of its operator.
    fn greedy(
        whole: Range<usize>,
        op_at: Range<usize>,
        kind: ast::RepetitionKind,
        inner: Ast,
    ) -> Ast {
        Ast::repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op_at), kind },
            greedy: true,
            ast: Box::new(inner),
        })
    }

    // A nest limit of 0 still allows some types of regexes.
    assert_eq!(
        parser_nest_limit("", 0).parse(),
        Ok(Ast::empty(span(0..0)))
    );
    assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));

    // Test repetition operations, which require one level of nesting.
    assert_exceeded("a+", 0, 0..2);
    assert_eq!(
        parser_nest_limit("a+", 1).parse(),
        Ok(greedy(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            lit('a', 0)
        ))
    );
    assert_exceeded("(a)+", 1, 0..3);
    assert_exceeded("a+*", 1, 0..2);
    assert_eq!(
        parser_nest_limit("a+*", 2).parse(),
        Ok(greedy(
            0..3,
            2..3,
            ast::RepetitionKind::ZeroOrMore,
            greedy(0..2, 1..2, ast::RepetitionKind::OneOrMore, lit('a', 0))
        ))
    );

    // Test concatenations. A concatenation requires one level of nesting.
    assert_exceeded("ab", 0, 0..2);
    assert_eq!(
        parser_nest_limit("ab", 1).parse(),
        Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
    );
    assert_eq!(
        parser_nest_limit("abc", 1).parse(),
        Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
    );

    // Test alternations. An alternation requires one level of nesting.
    assert_exceeded("a|b", 0, 0..3);
    assert_eq!(
        parser_nest_limit("a|b", 1).parse(),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser_nest_limit("a|b|c", 1).parse(),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
    );

    // Test character classes. Classes form their own mini-recursive
    // syntax!
    assert_exceeded("[a]", 0, 0..3);
    let single_item = ast::ClassSetItem::Literal(ast::Literal {
        span: span(1..2),
        kind: ast::LiteralKind::Verbatim,
        c: 'a',
    });
    assert_eq!(
        parser_nest_limit("[a]", 1).parse(),
        Ok(Ast::class_bracketed(ast::ClassBracketed {
            span: span(0..3),
            negated: false,
            kind: ast::ClassSet::Item(single_item),
        }))
    );
    assert_exceeded("[ab]", 1, 1..3);
    assert_exceeded("[ab[cd]]", 2, 3..7);
    assert_exceeded("[ab[cd]]", 3, 4..6);
    assert_exceeded("[a--b]", 1, 1..5);
    assert_exceeded("[a--bc]", 2, 4..6);
}
2791 | | |
#[test]
fn parse_comments() {
    // The asserted byte offsets depend on the exact whitespace of the
    // pattern, so every newline and every leading space is spelled out with
    // an escape. Each trailing `\` joins the source lines and drops the
    // indentation that follows it.
    //
    // Comment 3 is the only line preceded by whitespace: comment 2 ends at
    // byte 51 and comment 3 starts at byte 53, so two spaces sit before `#`.
    let pat = "(?x)\n\
               # This is comment 1.\n\
               foo # This is comment 2.\n\
               \x20\x20# This is comment 3.\n\
               bar\n\
               # This is comment 4.";
    let astc = parser(pat).parse_with_comments().unwrap();

    // Comments and the whitespace around them produce no AST nodes: only
    // the flag directive and the six literals remain.
    assert_eq!(
        astc.ast,
        concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                lit_with('f', span_range(pat, 26..27)),
                lit_with('o', span_range(pat, 27..28)),
                lit_with('o', span_range(pat, 28..29)),
                lit_with('b', span_range(pat, 74..75)),
                lit_with('a', span_range(pat, 75..76)),
                lit_with('r', span_range(pat, 76..77)),
            ]
        )
    );

    // A comment span starts at `#` and includes the terminating newline
    // when there is one; the stored text excludes both. Comment 4 ends the
    // pattern, so its span is one byte shorter than the others.
    assert_eq!(
        astc.comments,
        vec![
            ast::Comment {
                span: span_range(pat, 5..26),
                comment: s(" This is comment 1."),
            },
            ast::Comment {
                span: span_range(pat, 30..51),
                comment: s(" This is comment 2."),
            },
            ast::Comment {
                span: span_range(pat, 53..74),
                comment: s(" This is comment 3."),
            },
            ast::Comment {
                span: span_range(pat, 78..98),
                comment: s(" This is comment 4."),
            },
        ]
    );
}
2838 | | |
#[test]
fn parse_holistic() {
    // A lone closing bracket is an ordinary literal.
    assert_eq!(parser("]").parse(), Ok(lit(']', 0)));

    // Every escaped meta character parses to a `Meta` literal spanning the
    // backslash and the character, i.e. two bytes each, in pattern order.
    let escaped = [
        '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^',
        '$', '#', '&', '-', '~',
    ];
    let expected: Vec<Ast> = escaped
        .iter()
        .enumerate()
        .map(|(i, &c)| meta_lit(c, span(2 * i..2 * i + 2)))
        .collect();
    assert_eq!(
        parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
        Ok(concat(0..36, expected))
    );
}
2869 | | |
#[test]
fn parse_ignore_whitespace() {
    // Expected AST for the `(?x)` directive that opens most patterns below.
    fn x_on(pat: &str) -> Ast {
        flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false)
    }

    // NOTE: the asserted spans are byte offsets, so the number of spaces in
    // each pattern literal is significant. Patterns that rely on doubled
    // spaces are annotated.

    // Test that basic whitespace insensitivity works.
    let pat = "(?x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            nspan(npos(0, 1, 1), npos(7, 1, 8)),
            vec![
                x_on(pat),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
            ]
        ))
    );

    // Test that we can toggle whitespace insensitivity.
    let pat = "(?x)a b(?-x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            nspan(npos(0, 1, 1), npos(15, 1, 16)),
            vec![
                x_on(pat),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
                flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
                lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
                lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
                lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
            ]
        ))
    );

    // Test that nesting whitespace insensitive flags works.
    let pat = "a (?x:a )a ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..11),
            vec![
                lit_with('a', span_range(pat, 0..1)),
                lit_with(' ', span_range(pat, 1..2)),
                Ast::group(ast::Group {
                    span: span_range(pat, 2..9),
                    kind: ast::GroupKind::NonCapturing(ast::Flags {
                        span: span_range(pat, 4..5),
                        items: vec![ast::FlagsItem {
                            span: span_range(pat, 4..5),
                            kind: ast::FlagsItemKind::Flag(
                                ast::Flag::IgnoreWhitespace
                            ),
                        }],
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 6..7))),
                }),
                lit_with('a', span_range(pat, 9..10)),
                lit_with(' ', span_range(pat, 10..11)),
            ]
        ))
    );

    // Test that whitespace after an opening paren is insignificant.
    let pat = "(?x)( ?P<foo> a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::CaptureName {
                        starts_with_p: true,
                        name: ast::CaptureName {
                            span: span_range(pat, 9..12),
                            name: s("foo"),
                            index: 1,
                        }
                    },
                    ast: Box::new(lit_with('a', span_range(pat, 14..15))),
                }),
            ]
        ))
    );
    // Two spaces follow the opening paren, which puts `a` at byte 7.
    let pat = "(?x)(  a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::CaptureIndex(1),
                    ast: Box::new(lit_with('a', span_range(pat, 7..8))),
                }),
            ]
        ))
    );
    // Two spaces before `?:` and two after it: the empty flag list sits at
    // byte 8 and `a` at byte 11.
    let pat = "(?x)(  ?:  a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::NonCapturing(ast::Flags {
                        span: span_range(pat, 8..8),
                        items: vec![],
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 11..12))),
                }),
            ]
        ))
    );
    // Whitespace inside a braced hex escape is skipped as well.
    let pat = r"(?x)\x { 53 }";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::literal(ast::Literal {
                    span: span(4..13),
                    kind: ast::LiteralKind::HexBrace(
                        ast::HexLiteralKind::X
                    ),
                    c: 'S',
                }),
            ]
        ))
    );

    // Test that whitespace after an escape is OK.
    let pat = r"(?x)\ ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::literal(ast::Literal {
                    span: span_range(pat, 4..6),
                    kind: ast::LiteralKind::Superfluous,
                    c: ' ',
                }),
            ]
        ))
    );
}
3023 | | |
#[test]
fn parse_newlines() {
    // A newline between two dots is an ordinary literal.
    let pat = ".\n.";
    let expected = vec![
        Ast::dot(span_range(pat, 0..1)),
        lit_with('\n', span_range(pat, 1..2)),
        Ast::dot(span_range(pat, 2..3)),
    ];
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..3), expected))
    );

    // Each row is (char, start, end), with positions written as
    // (offset, line, column). A newline literal ends at column 1 of the
    // following line.
    let pat = "foobar\nbaz\nquux\n";
    let rows = [
        ('f', (0, 1, 1), (1, 1, 2)),
        ('o', (1, 1, 2), (2, 1, 3)),
        ('o', (2, 1, 3), (3, 1, 4)),
        ('b', (3, 1, 4), (4, 1, 5)),
        ('a', (4, 1, 5), (5, 1, 6)),
        ('r', (5, 1, 6), (6, 1, 7)),
        ('\n', (6, 1, 7), (7, 2, 1)),
        ('b', (7, 2, 1), (8, 2, 2)),
        ('a', (8, 2, 2), (9, 2, 3)),
        ('z', (9, 2, 3), (10, 2, 4)),
        ('\n', (10, 2, 4), (11, 3, 1)),
        ('q', (11, 3, 1), (12, 3, 2)),
        ('u', (12, 3, 2), (13, 3, 3)),
        ('u', (13, 3, 3), (14, 3, 4)),
        ('x', (14, 3, 4), (15, 3, 5)),
        ('\n', (15, 3, 5), (16, 4, 1)),
    ];
    let expected: Vec<Ast> = rows
        .iter()
        .map(|&(c, (so, sl, sc), (eo, el, ec))| {
            lit_with(c, nspan(npos(so, sl, sc), npos(eo, el, ec)))
        })
        .collect();
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), expected))
    );
}
3065 | | |
#[test]
fn parse_uncounted_repetition() {
    // Builds a repetition node; `whole` is the span of the repetition and
    // `op_at` the span of its operator, including a trailing `?` if lazy.
    fn rep(
        whole: Range<usize>,
        op_at: Range<usize>,
        kind: ast::RepetitionKind,
        greedy: bool,
        inner: Ast,
    ) -> Ast {
        Ast::repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op_at), kind },
            greedy,
            ast: Box::new(inner),
        })
    }

    // Builds the greedy `x?` repetition that most cases below use.
    fn optional(whole: Range<usize>, op_at: Range<usize>, inner: Ast) -> Ast {
        rep(whole, op_at, ast::RepetitionKind::ZeroOrOne, true, inner)
    }

    // Builds the lazy `x??` repetition.
    fn lazy_optional(
        whole: Range<usize>,
        op_at: Range<usize>,
        inner: Ast,
    ) -> Ast {
        rep(whole, op_at, ast::RepetitionKind::ZeroOrOne, false, inner)
    }

    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::ZeroOrMore,
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            true,
            lit('a', 0)
        ))
    );

    assert_eq!(
        parser(r"a?").parse(),
        Ok(optional(0..2, 1..2, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a??").parse(),
        Ok(lazy_optional(0..3, 1..3, lit('a', 0)))
    );
    // Same case as two assertions above; kept as in the original test.
    assert_eq!(
        parser(r"a?").parse(),
        Ok(optional(0..2, 1..2, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(
            0..3,
            vec![optional(0..2, 1..2, lit('a', 0)), lit('b', 2)]
        ))
    );
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(
            0..4,
            vec![lazy_optional(0..3, 1..3, lit('a', 0)), lit('b', 3)]
        ))
    );
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(
            0..3,
            vec![lit('a', 0), optional(1..3, 2..3, lit('b', 1))]
        ))
    );
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(optional(
            0..5,
            4..5,
            group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)]))
        ))
    );
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(
            0..3,
            vec![
                Ast::empty(span(0..0)),
                optional(1..3, 2..3, lit('a', 1)),
            ]
        ))
    );

    // A repetition operator with nothing to repeat is an error, reported
    // as an empty span at the listed offset.
    let missing_operand = [
        (r"*", 0),
        (r"(?i)*", 4),
        (r"(*)", 1),
        (r"(?:?)", 3),
        (r"+", 0),
        (r"?", 0),
        (r"(?)", 1),
        (r"|*", 1),
        (r"|+", 1),
        (r"|?", 1),
    ];
    for &(pat, at) in missing_operand.iter() {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::RepetitionMissing,
            }
        );
    }
}
3289 | | |
3290 | | #[test] |
3291 | | fn parse_counted_repetition() { |
3292 | | assert_eq!( |
3293 | | parser(r"a{5}").parse(), |
3294 | | Ok(Ast::repetition(ast::Repetition { |
3295 | | span: span(0..4), |
3296 | | op: ast::RepetitionOp { |
3297 | | span: span(1..4), |
3298 | | kind: ast::RepetitionKind::Range( |
3299 | | ast::RepetitionRange::Exactly(5) |
3300 | | ), |
3301 | | }, |
3302 | | greedy: true, |
3303 | | ast: Box::new(lit('a', 0)), |
3304 | | })) |
3305 | | ); |
3306 | | assert_eq!( |
3307 | | parser(r"a{5,}").parse(), |
3308 | | Ok(Ast::repetition(ast::Repetition { |
3309 | | span: span(0..5), |
3310 | | op: ast::RepetitionOp { |
3311 | | span: span(1..5), |
3312 | | kind: ast::RepetitionKind::Range( |
3313 | | ast::RepetitionRange::AtLeast(5) |
3314 | | ), |
3315 | | }, |
3316 | | greedy: true, |
3317 | | ast: Box::new(lit('a', 0)), |
3318 | | })) |
3319 | | ); |
3320 | | assert_eq!( |
3321 | | parser(r"a{5,9}").parse(), |
3322 | | Ok(Ast::repetition(ast::Repetition { |
3323 | | span: span(0..6), |
3324 | | op: ast::RepetitionOp { |
3325 | | span: span(1..6), |
3326 | | kind: ast::RepetitionKind::Range( |
3327 | | ast::RepetitionRange::Bounded(5, 9) |
3328 | | ), |
3329 | | }, |
3330 | | greedy: true, |
3331 | | ast: Box::new(lit('a', 0)), |
3332 | | })) |
3333 | | ); |
3334 | | assert_eq!( |
3335 | | parser(r"a{5}?").parse(), |
3336 | | Ok(Ast::repetition(ast::Repetition { |
3337 | | span: span(0..5), |
3338 | | op: ast::RepetitionOp { |
3339 | | span: span(1..5), |
3340 | | kind: ast::RepetitionKind::Range( |
3341 | | ast::RepetitionRange::Exactly(5) |
3342 | | ), |
3343 | | }, |
3344 | | greedy: false, |
3345 | | ast: Box::new(lit('a', 0)), |
3346 | | })) |
3347 | | ); |
3348 | | assert_eq!( |
3349 | | parser(r"ab{5}").parse(), |
3350 | | Ok(concat( |
3351 | | 0..5, |
3352 | | vec![ |
3353 | | lit('a', 0), |
3354 | | Ast::repetition(ast::Repetition { |
3355 | | span: span(1..5), |
3356 | | op: ast::RepetitionOp { |
3357 | | span: span(2..5), |
3358 | | kind: ast::RepetitionKind::Range( |
3359 | | ast::RepetitionRange::Exactly(5) |
3360 | | ), |
3361 | | }, |
3362 | | greedy: true, |
3363 | | ast: Box::new(lit('b', 1)), |
3364 | | }), |
3365 | | ] |
3366 | | )) |
3367 | | ); |
3368 | | assert_eq!( |
3369 | | parser(r"ab{5}c").parse(), |
3370 | | Ok(concat( |
3371 | | 0..6, |
3372 | | vec![ |
3373 | | lit('a', 0), |
3374 | | Ast::repetition(ast::Repetition { |
3375 | | span: span(1..5), |
3376 | | op: ast::RepetitionOp { |
3377 | | span: span(2..5), |
3378 | | kind: ast::RepetitionKind::Range( |
3379 | | ast::RepetitionRange::Exactly(5) |
3380 | | ), |
3381 | | }, |
3382 | | greedy: true, |
3383 | | ast: Box::new(lit('b', 1)), |
3384 | | }), |
3385 | | lit('c', 5), |
3386 | | ] |
3387 | | )) |
3388 | | ); |
3389 | | |
3390 | | assert_eq!( |
3391 | | parser(r"a{ 5 }").parse(), |
3392 | | Ok(Ast::repetition(ast::Repetition { |
3393 | | span: span(0..6), |
3394 | | op: ast::RepetitionOp { |
3395 | | span: span(1..6), |
3396 | | kind: ast::RepetitionKind::Range( |
3397 | | ast::RepetitionRange::Exactly(5) |
3398 | | ), |
3399 | | }, |
3400 | | greedy: true, |
3401 | | ast: Box::new(lit('a', 0)), |
3402 | | })) |
3403 | | ); |
3404 | | assert_eq!( |
3405 | | parser(r"a{ 5 , 9 }").parse(), |
3406 | | Ok(Ast::repetition(ast::Repetition { |
3407 | | span: span(0..10), |
3408 | | op: ast::RepetitionOp { |
3409 | | span: span(1..10), |
3410 | | kind: ast::RepetitionKind::Range( |
3411 | | ast::RepetitionRange::Bounded(5, 9) |
3412 | | ), |
3413 | | }, |
3414 | | greedy: true, |
3415 | | ast: Box::new(lit('a', 0)), |
3416 | | })) |
3417 | | ); |
3418 | | assert_eq!( |
3419 | | parser_empty_min_range(r"a{,9}").parse(), |
3420 | | Ok(Ast::repetition(ast::Repetition { |
3421 | | span: span(0..5), |
3422 | | op: ast::RepetitionOp { |
3423 | | span: span(1..5), |
3424 | | kind: ast::RepetitionKind::Range( |
3425 | | ast::RepetitionRange::Bounded(0, 9) |
3426 | | ), |
3427 | | }, |
3428 | | greedy: true, |
3429 | | ast: Box::new(lit('a', 0)), |
3430 | | })) |
3431 | | ); |
3432 | | assert_eq!( |
3433 | | parser_ignore_whitespace(r"a{5,9} ?").parse(), |
3434 | | Ok(Ast::repetition(ast::Repetition { |
3435 | | span: span(0..8), |
3436 | | op: ast::RepetitionOp { |
3437 | | span: span(1..8), |
3438 | | kind: ast::RepetitionKind::Range( |
3439 | | ast::RepetitionRange::Bounded(5, 9) |
3440 | | ), |
3441 | | }, |
3442 | | greedy: false, |
3443 | | ast: Box::new(lit('a', 0)), |
3444 | | })) |
3445 | | ); |
3446 | | assert_eq!( |
3447 | | parser(r"\b{5,9}").parse(), |
3448 | | Ok(Ast::repetition(ast::Repetition { |
3449 | | span: span(0..7), |
3450 | | op: ast::RepetitionOp { |
3451 | | span: span(2..7), |
3452 | | kind: ast::RepetitionKind::Range( |
3453 | | ast::RepetitionRange::Bounded(5, 9) |
3454 | | ), |
3455 | | }, |
3456 | | greedy: true, |
3457 | | ast: Box::new(Ast::assertion(ast::Assertion { |
3458 | | span: span(0..2), |
3459 | | kind: ast::AssertionKind::WordBoundary, |
3460 | | })), |
3461 | | })) |
3462 | | ); |
3463 | | |
3464 | | assert_eq!( |
3465 | | parser(r"(?i){0}").parse().unwrap_err(), |
3466 | | TestError { |
3467 | | span: span(4..4), |
3468 | | kind: ast::ErrorKind::RepetitionMissing, |
3469 | | } |
3470 | | ); |
3471 | | assert_eq!( |
3472 | | parser(r"(?m){1,1}").parse().unwrap_err(), |
3473 | | TestError { |
3474 | | span: span(4..4), |
3475 | | kind: ast::ErrorKind::RepetitionMissing, |
3476 | | } |
3477 | | ); |
3478 | | assert_eq!( |
3479 | | parser(r"a{]}").parse().unwrap_err(), |
3480 | | TestError { |
3481 | | span: span(2..2), |
3482 | | kind: ast::ErrorKind::RepetitionCountDecimalEmpty, |
3483 | | } |
3484 | | ); |
3485 | | assert_eq!( |
3486 | | parser(r"a{1,]}").parse().unwrap_err(), |
3487 | | TestError { |
3488 | | span: span(4..4), |
3489 | | kind: ast::ErrorKind::RepetitionCountDecimalEmpty, |
3490 | | } |
3491 | | ); |
3492 | | assert_eq!( |
3493 | | parser(r"a{").parse().unwrap_err(), |
3494 | | TestError { |
3495 | | span: span(1..2), |
3496 | | kind: ast::ErrorKind::RepetitionCountUnclosed, |
3497 | | } |
3498 | | ); |
3499 | | assert_eq!( |
3500 | | parser(r"a{}").parse().unwrap_err(), |
3501 | | TestError { |
3502 | | span: span(2..2), |
3503 | | kind: ast::ErrorKind::RepetitionCountDecimalEmpty, |
3504 | | } |
3505 | | ); |
3506 | | assert_eq!( |
3507 | | parser(r"a{a").parse().unwrap_err(), |
3508 | | TestError { |
3509 | | span: span(2..2), |
3510 | | kind: ast::ErrorKind::RepetitionCountDecimalEmpty, |
3511 | | } |
3512 | | ); |
3513 | | assert_eq!( |
3514 | | parser(r"a{9999999999}").parse().unwrap_err(), |
3515 | | TestError { |
3516 | | span: span(2..12), |
3517 | | kind: ast::ErrorKind::DecimalInvalid, |
3518 | | } |
3519 | | ); |
3520 | | assert_eq!( |
3521 | | parser(r"a{9").parse().unwrap_err(), |
3522 | | TestError { |
3523 | | span: span(1..3), |
3524 | | kind: ast::ErrorKind::RepetitionCountUnclosed, |
3525 | | } |
3526 | | ); |
3527 | | assert_eq!( |
3528 | | parser(r"a{9,a").parse().unwrap_err(), |
3529 | | TestError { |
3530 | | span: span(4..4), |
3531 | | kind: ast::ErrorKind::RepetitionCountDecimalEmpty, |
3532 | | } |
3533 | | ); |
3534 | | assert_eq!( |
3535 | | parser(r"a{9,9999999999}").parse().unwrap_err(), |
3536 | | TestError { |
3537 | | span: span(4..14), |
3538 | | kind: ast::ErrorKind::DecimalInvalid, |
3539 | | } |
3540 | | ); |
3541 | | assert_eq!( |
3542 | | parser(r"a{9,").parse().unwrap_err(), |
3543 | | TestError { |
3544 | | span: span(1..4), |
3545 | | kind: ast::ErrorKind::RepetitionCountUnclosed, |
3546 | | } |
3547 | | ); |
3548 | | assert_eq!( |
3549 | | parser(r"a{9,11").parse().unwrap_err(), |
3550 | | TestError { |
3551 | | span: span(1..6), |
3552 | | kind: ast::ErrorKind::RepetitionCountUnclosed, |
3553 | | } |
3554 | | ); |
3555 | | assert_eq!( |
3556 | | parser(r"a{2,1}").parse().unwrap_err(), |
3557 | | TestError { |
3558 | | span: span(1..6), |
3559 | | kind: ast::ErrorKind::RepetitionCountInvalid, |
3560 | | } |
3561 | | ); |
3562 | | assert_eq!( |
3563 | | parser(r"{5}").parse().unwrap_err(), |
3564 | | TestError { |
3565 | | span: span(0..0), |
3566 | | kind: ast::ErrorKind::RepetitionMissing, |
3567 | | } |
3568 | | ); |
3569 | | assert_eq!( |
3570 | | parser(r"|{5}").parse().unwrap_err(), |
3571 | | TestError { |
3572 | | span: span(1..1), |
3573 | | kind: ast::ErrorKind::RepetitionMissing, |
3574 | | } |
3575 | | ); |
3576 | | } |
3577 | | |
3578 | | #[test] |
3579 | | fn parse_alternate() { |
3580 | | assert_eq!( |
3581 | | parser(r"a|b").parse(), |
3582 | | Ok(Ast::alternation(ast::Alternation { |
3583 | | span: span(0..3), |
3584 | | asts: vec![lit('a', 0), lit('b', 2)], |
3585 | | })) |
3586 | | ); |
3587 | | assert_eq!( |
3588 | | parser(r"(a|b)").parse(), |
3589 | | Ok(group( |
3590 | | 0..5, |
3591 | | 1, |
3592 | | Ast::alternation(ast::Alternation { |
3593 | | span: span(1..4), |
3594 | | asts: vec![lit('a', 1), lit('b', 3)], |
3595 | | }) |
3596 | | )) |
3597 | | ); |
3598 | | |
3599 | | assert_eq!( |
3600 | | parser(r"a|b|c").parse(), |
3601 | | Ok(Ast::alternation(ast::Alternation { |
3602 | | span: span(0..5), |
3603 | | asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], |
3604 | | })) |
3605 | | ); |
3606 | | assert_eq!( |
3607 | | parser(r"ax|by|cz").parse(), |
3608 | | Ok(Ast::alternation(ast::Alternation { |
3609 | | span: span(0..8), |
3610 | | asts: vec![ |
3611 | | concat(0..2, vec![lit('a', 0), lit('x', 1)]), |
3612 | | concat(3..5, vec![lit('b', 3), lit('y', 4)]), |
3613 | | concat(6..8, vec![lit('c', 6), lit('z', 7)]), |
3614 | | ], |
3615 | | })) |
3616 | | ); |
3617 | | assert_eq!( |
3618 | | parser(r"(ax|by|cz)").parse(), |
3619 | | Ok(group( |
3620 | | 0..10, |
3621 | | 1, |
3622 | | Ast::alternation(ast::Alternation { |
3623 | | span: span(1..9), |
3624 | | asts: vec![ |
3625 | | concat(1..3, vec![lit('a', 1), lit('x', 2)]), |
3626 | | concat(4..6, vec![lit('b', 4), lit('y', 5)]), |
3627 | | concat(7..9, vec![lit('c', 7), lit('z', 8)]), |
3628 | | ], |
3629 | | }) |
3630 | | )) |
3631 | | ); |
3632 | | assert_eq!( |
3633 | | parser(r"(ax|(by|(cz)))").parse(), |
3634 | | Ok(group( |
3635 | | 0..14, |
3636 | | 1, |
3637 | | alt( |
3638 | | 1..13, |
3639 | | vec![ |
3640 | | concat(1..3, vec![lit('a', 1), lit('x', 2)]), |
3641 | | group( |
3642 | | 4..13, |
3643 | | 2, |
3644 | | alt( |
3645 | | 5..12, |
3646 | | vec![ |
3647 | | concat( |
3648 | | 5..7, |
3649 | | vec![lit('b', 5), lit('y', 6)] |
3650 | | ), |
3651 | | group( |
3652 | | 8..12, |
3653 | | 3, |
3654 | | concat( |
3655 | | 9..11, |
3656 | | vec![lit('c', 9), lit('z', 10),] |
3657 | | ) |
3658 | | ), |
3659 | | ] |
3660 | | ) |
3661 | | ), |
3662 | | ] |
3663 | | ) |
3664 | | )) |
3665 | | ); |
3666 | | |
3667 | | assert_eq!( |
3668 | | parser(r"|").parse(), |
3669 | | Ok(alt( |
3670 | | 0..1, |
3671 | | vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] |
3672 | | )) |
3673 | | ); |
3674 | | assert_eq!( |
3675 | | parser(r"||").parse(), |
3676 | | Ok(alt( |
3677 | | 0..2, |
3678 | | vec![ |
3679 | | Ast::empty(span(0..0)), |
3680 | | Ast::empty(span(1..1)), |
3681 | | Ast::empty(span(2..2)), |
3682 | | ] |
3683 | | )) |
3684 | | ); |
3685 | | assert_eq!( |
3686 | | parser(r"a|").parse(), |
3687 | | Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) |
3688 | | ); |
3689 | | assert_eq!( |
3690 | | parser(r"|a").parse(), |
3691 | | Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) |
3692 | | ); |
3693 | | |
3694 | | assert_eq!( |
3695 | | parser(r"(|)").parse(), |
3696 | | Ok(group( |
3697 | | 0..3, |
3698 | | 1, |
3699 | | alt( |
3700 | | 1..2, |
3701 | | vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] |
3702 | | ) |
3703 | | )) |
3704 | | ); |
3705 | | assert_eq!( |
3706 | | parser(r"(a|)").parse(), |
3707 | | Ok(group( |
3708 | | 0..4, |
3709 | | 1, |
3710 | | alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) |
3711 | | )) |
3712 | | ); |
3713 | | assert_eq!( |
3714 | | parser(r"(|a)").parse(), |
3715 | | Ok(group( |
3716 | | 0..4, |
3717 | | 1, |
3718 | | alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) |
3719 | | )) |
3720 | | ); |
3721 | | |
3722 | | assert_eq!( |
3723 | | parser(r"a|b)").parse().unwrap_err(), |
3724 | | TestError { |
3725 | | span: span(3..4), |
3726 | | kind: ast::ErrorKind::GroupUnopened, |
3727 | | } |
3728 | | ); |
3729 | | assert_eq!( |
3730 | | parser(r"(a|b").parse().unwrap_err(), |
3731 | | TestError { |
3732 | | span: span(0..1), |
3733 | | kind: ast::ErrorKind::GroupUnclosed, |
3734 | | } |
3735 | | ); |
3736 | | } |
3737 | | |
3738 | | #[test] |
3739 | | fn parse_unsupported_lookaround() { |
3740 | | assert_eq!( |
3741 | | parser(r"(?=a)").parse().unwrap_err(), |
3742 | | TestError { |
3743 | | span: span(0..3), |
3744 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3745 | | } |
3746 | | ); |
3747 | | assert_eq!( |
3748 | | parser(r"(?!a)").parse().unwrap_err(), |
3749 | | TestError { |
3750 | | span: span(0..3), |
3751 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3752 | | } |
3753 | | ); |
3754 | | assert_eq!( |
3755 | | parser(r"(?<=a)").parse().unwrap_err(), |
3756 | | TestError { |
3757 | | span: span(0..4), |
3758 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3759 | | } |
3760 | | ); |
3761 | | assert_eq!( |
3762 | | parser(r"(?<!a)").parse().unwrap_err(), |
3763 | | TestError { |
3764 | | span: span(0..4), |
3765 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3766 | | } |
3767 | | ); |
3768 | | } |
3769 | | |
3770 | | #[test] |
3771 | | fn parse_group() { |
3772 | | assert_eq!( |
3773 | | parser("(?i)").parse(), |
3774 | | Ok(Ast::flags(ast::SetFlags { |
3775 | | span: span(0..4), |
3776 | | flags: ast::Flags { |
3777 | | span: span(2..3), |
3778 | | items: vec![ast::FlagsItem { |
3779 | | span: span(2..3), |
3780 | | kind: ast::FlagsItemKind::Flag( |
3781 | | ast::Flag::CaseInsensitive |
3782 | | ), |
3783 | | }], |
3784 | | }, |
3785 | | })) |
3786 | | ); |
3787 | | assert_eq!( |
3788 | | parser("(?iU)").parse(), |
3789 | | Ok(Ast::flags(ast::SetFlags { |
3790 | | span: span(0..5), |
3791 | | flags: ast::Flags { |
3792 | | span: span(2..4), |
3793 | | items: vec![ |
3794 | | ast::FlagsItem { |
3795 | | span: span(2..3), |
3796 | | kind: ast::FlagsItemKind::Flag( |
3797 | | ast::Flag::CaseInsensitive |
3798 | | ), |
3799 | | }, |
3800 | | ast::FlagsItem { |
3801 | | span: span(3..4), |
3802 | | kind: ast::FlagsItemKind::Flag( |
3803 | | ast::Flag::SwapGreed |
3804 | | ), |
3805 | | }, |
3806 | | ], |
3807 | | }, |
3808 | | })) |
3809 | | ); |
3810 | | assert_eq!( |
3811 | | parser("(?i-U)").parse(), |
3812 | | Ok(Ast::flags(ast::SetFlags { |
3813 | | span: span(0..6), |
3814 | | flags: ast::Flags { |
3815 | | span: span(2..5), |
3816 | | items: vec![ |
3817 | | ast::FlagsItem { |
3818 | | span: span(2..3), |
3819 | | kind: ast::FlagsItemKind::Flag( |
3820 | | ast::Flag::CaseInsensitive |
3821 | | ), |
3822 | | }, |
3823 | | ast::FlagsItem { |
3824 | | span: span(3..4), |
3825 | | kind: ast::FlagsItemKind::Negation, |
3826 | | }, |
3827 | | ast::FlagsItem { |
3828 | | span: span(4..5), |
3829 | | kind: ast::FlagsItemKind::Flag( |
3830 | | ast::Flag::SwapGreed |
3831 | | ), |
3832 | | }, |
3833 | | ], |
3834 | | }, |
3835 | | })) |
3836 | | ); |
3837 | | |
3838 | | assert_eq!( |
3839 | | parser("()").parse(), |
3840 | | Ok(Ast::group(ast::Group { |
3841 | | span: span(0..2), |
3842 | | kind: ast::GroupKind::CaptureIndex(1), |
3843 | | ast: Box::new(Ast::empty(span(1..1))), |
3844 | | })) |
3845 | | ); |
3846 | | assert_eq!( |
3847 | | parser("(a)").parse(), |
3848 | | Ok(Ast::group(ast::Group { |
3849 | | span: span(0..3), |
3850 | | kind: ast::GroupKind::CaptureIndex(1), |
3851 | | ast: Box::new(lit('a', 1)), |
3852 | | })) |
3853 | | ); |
3854 | | assert_eq!( |
3855 | | parser("(())").parse(), |
3856 | | Ok(Ast::group(ast::Group { |
3857 | | span: span(0..4), |
3858 | | kind: ast::GroupKind::CaptureIndex(1), |
3859 | | ast: Box::new(Ast::group(ast::Group { |
3860 | | span: span(1..3), |
3861 | | kind: ast::GroupKind::CaptureIndex(2), |
3862 | | ast: Box::new(Ast::empty(span(2..2))), |
3863 | | })), |
3864 | | })) |
3865 | | ); |
3866 | | |
3867 | | assert_eq!( |
3868 | | parser("(?:a)").parse(), |
3869 | | Ok(Ast::group(ast::Group { |
3870 | | span: span(0..5), |
3871 | | kind: ast::GroupKind::NonCapturing(ast::Flags { |
3872 | | span: span(2..2), |
3873 | | items: vec![], |
3874 | | }), |
3875 | | ast: Box::new(lit('a', 3)), |
3876 | | })) |
3877 | | ); |
3878 | | |
3879 | | assert_eq!( |
3880 | | parser("(?i:a)").parse(), |
3881 | | Ok(Ast::group(ast::Group { |
3882 | | span: span(0..6), |
3883 | | kind: ast::GroupKind::NonCapturing(ast::Flags { |
3884 | | span: span(2..3), |
3885 | | items: vec![ast::FlagsItem { |
3886 | | span: span(2..3), |
3887 | | kind: ast::FlagsItemKind::Flag( |
3888 | | ast::Flag::CaseInsensitive |
3889 | | ), |
3890 | | },], |
3891 | | }), |
3892 | | ast: Box::new(lit('a', 4)), |
3893 | | })) |
3894 | | ); |
3895 | | assert_eq!( |
3896 | | parser("(?i-U:a)").parse(), |
3897 | | Ok(Ast::group(ast::Group { |
3898 | | span: span(0..8), |
3899 | | kind: ast::GroupKind::NonCapturing(ast::Flags { |
3900 | | span: span(2..5), |
3901 | | items: vec![ |
3902 | | ast::FlagsItem { |
3903 | | span: span(2..3), |
3904 | | kind: ast::FlagsItemKind::Flag( |
3905 | | ast::Flag::CaseInsensitive |
3906 | | ), |
3907 | | }, |
3908 | | ast::FlagsItem { |
3909 | | span: span(3..4), |
3910 | | kind: ast::FlagsItemKind::Negation, |
3911 | | }, |
3912 | | ast::FlagsItem { |
3913 | | span: span(4..5), |
3914 | | kind: ast::FlagsItemKind::Flag( |
3915 | | ast::Flag::SwapGreed |
3916 | | ), |
3917 | | }, |
3918 | | ], |
3919 | | }), |
3920 | | ast: Box::new(lit('a', 6)), |
3921 | | })) |
3922 | | ); |
3923 | | |
3924 | | assert_eq!( |
3925 | | parser("(").parse().unwrap_err(), |
3926 | | TestError { |
3927 | | span: span(0..1), |
3928 | | kind: ast::ErrorKind::GroupUnclosed, |
3929 | | } |
3930 | | ); |
3931 | | assert_eq!( |
3932 | | parser("(?").parse().unwrap_err(), |
3933 | | TestError { |
3934 | | span: span(0..1), |
3935 | | kind: ast::ErrorKind::GroupUnclosed, |
3936 | | } |
3937 | | ); |
3938 | | assert_eq!( |
3939 | | parser("(?P").parse().unwrap_err(), |
3940 | | TestError { |
3941 | | span: span(2..3), |
3942 | | kind: ast::ErrorKind::FlagUnrecognized, |
3943 | | } |
3944 | | ); |
3945 | | assert_eq!( |
3946 | | parser("(?P<").parse().unwrap_err(), |
3947 | | TestError { |
3948 | | span: span(4..4), |
3949 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
3950 | | } |
3951 | | ); |
3952 | | assert_eq!( |
3953 | | parser("(a").parse().unwrap_err(), |
3954 | | TestError { |
3955 | | span: span(0..1), |
3956 | | kind: ast::ErrorKind::GroupUnclosed, |
3957 | | } |
3958 | | ); |
3959 | | assert_eq!( |
3960 | | parser("(()").parse().unwrap_err(), |
3961 | | TestError { |
3962 | | span: span(0..1), |
3963 | | kind: ast::ErrorKind::GroupUnclosed, |
3964 | | } |
3965 | | ); |
3966 | | assert_eq!( |
3967 | | parser(")").parse().unwrap_err(), |
3968 | | TestError { |
3969 | | span: span(0..1), |
3970 | | kind: ast::ErrorKind::GroupUnopened, |
3971 | | } |
3972 | | ); |
3973 | | assert_eq!( |
3974 | | parser("a)").parse().unwrap_err(), |
3975 | | TestError { |
3976 | | span: span(1..2), |
3977 | | kind: ast::ErrorKind::GroupUnopened, |
3978 | | } |
3979 | | ); |
3980 | | } |
3981 | | |
3982 | | #[test] |
3983 | | fn parse_capture_name() { |
3984 | | assert_eq!( |
3985 | | parser("(?<a>z)").parse(), |
3986 | | Ok(Ast::group(ast::Group { |
3987 | | span: span(0..7), |
3988 | | kind: ast::GroupKind::CaptureName { |
3989 | | starts_with_p: false, |
3990 | | name: ast::CaptureName { |
3991 | | span: span(3..4), |
3992 | | name: s("a"), |
3993 | | index: 1, |
3994 | | } |
3995 | | }, |
3996 | | ast: Box::new(lit('z', 5)), |
3997 | | })) |
3998 | | ); |
3999 | | assert_eq!( |
4000 | | parser("(?P<a>z)").parse(), |
4001 | | Ok(Ast::group(ast::Group { |
4002 | | span: span(0..8), |
4003 | | kind: ast::GroupKind::CaptureName { |
4004 | | starts_with_p: true, |
4005 | | name: ast::CaptureName { |
4006 | | span: span(4..5), |
4007 | | name: s("a"), |
4008 | | index: 1, |
4009 | | } |
4010 | | }, |
4011 | | ast: Box::new(lit('z', 6)), |
4012 | | })) |
4013 | | ); |
4014 | | assert_eq!( |
4015 | | parser("(?P<abc>z)").parse(), |
4016 | | Ok(Ast::group(ast::Group { |
4017 | | span: span(0..10), |
4018 | | kind: ast::GroupKind::CaptureName { |
4019 | | starts_with_p: true, |
4020 | | name: ast::CaptureName { |
4021 | | span: span(4..7), |
4022 | | name: s("abc"), |
4023 | | index: 1, |
4024 | | } |
4025 | | }, |
4026 | | ast: Box::new(lit('z', 8)), |
4027 | | })) |
4028 | | ); |
4029 | | |
4030 | | assert_eq!( |
4031 | | parser("(?P<a_1>z)").parse(), |
4032 | | Ok(Ast::group(ast::Group { |
4033 | | span: span(0..10), |
4034 | | kind: ast::GroupKind::CaptureName { |
4035 | | starts_with_p: true, |
4036 | | name: ast::CaptureName { |
4037 | | span: span(4..7), |
4038 | | name: s("a_1"), |
4039 | | index: 1, |
4040 | | } |
4041 | | }, |
4042 | | ast: Box::new(lit('z', 8)), |
4043 | | })) |
4044 | | ); |
4045 | | |
4046 | | assert_eq!( |
4047 | | parser("(?P<a.1>z)").parse(), |
4048 | | Ok(Ast::group(ast::Group { |
4049 | | span: span(0..10), |
4050 | | kind: ast::GroupKind::CaptureName { |
4051 | | starts_with_p: true, |
4052 | | name: ast::CaptureName { |
4053 | | span: span(4..7), |
4054 | | name: s("a.1"), |
4055 | | index: 1, |
4056 | | } |
4057 | | }, |
4058 | | ast: Box::new(lit('z', 8)), |
4059 | | })) |
4060 | | ); |
4061 | | |
4062 | | assert_eq!( |
4063 | | parser("(?P<a[1]>z)").parse(), |
4064 | | Ok(Ast::group(ast::Group { |
4065 | | span: span(0..11), |
4066 | | kind: ast::GroupKind::CaptureName { |
4067 | | starts_with_p: true, |
4068 | | name: ast::CaptureName { |
4069 | | span: span(4..8), |
4070 | | name: s("a[1]"), |
4071 | | index: 1, |
4072 | | } |
4073 | | }, |
4074 | | ast: Box::new(lit('z', 9)), |
4075 | | })) |
4076 | | ); |
4077 | | |
4078 | | assert_eq!( |
4079 | | parser("(?P<a¾>)").parse(), |
4080 | | Ok(Ast::group(ast::Group { |
4081 | | span: Span::new( |
4082 | | Position::new(0, 1, 1), |
4083 | | Position::new(9, 1, 9), |
4084 | | ), |
4085 | | kind: ast::GroupKind::CaptureName { |
4086 | | starts_with_p: true, |
4087 | | name: ast::CaptureName { |
4088 | | span: Span::new( |
4089 | | Position::new(4, 1, 5), |
4090 | | Position::new(7, 1, 7), |
4091 | | ), |
4092 | | name: s("a¾"), |
4093 | | index: 1, |
4094 | | } |
4095 | | }, |
4096 | | ast: Box::new(Ast::empty(Span::new( |
4097 | | Position::new(8, 1, 8), |
4098 | | Position::new(8, 1, 8), |
4099 | | ))), |
4100 | | })) |
4101 | | ); |
4102 | | assert_eq!( |
4103 | | parser("(?P<名字>)").parse(), |
4104 | | Ok(Ast::group(ast::Group { |
4105 | | span: Span::new( |
4106 | | Position::new(0, 1, 1), |
4107 | | Position::new(12, 1, 9), |
4108 | | ), |
4109 | | kind: ast::GroupKind::CaptureName { |
4110 | | starts_with_p: true, |
4111 | | name: ast::CaptureName { |
4112 | | span: Span::new( |
4113 | | Position::new(4, 1, 5), |
4114 | | Position::new(10, 1, 7), |
4115 | | ), |
4116 | | name: s("名字"), |
4117 | | index: 1, |
4118 | | } |
4119 | | }, |
4120 | | ast: Box::new(Ast::empty(Span::new( |
4121 | | Position::new(11, 1, 8), |
4122 | | Position::new(11, 1, 8), |
4123 | | ))), |
4124 | | })) |
4125 | | ); |
4126 | | |
4127 | | assert_eq!( |
4128 | | parser("(?P<").parse().unwrap_err(), |
4129 | | TestError { |
4130 | | span: span(4..4), |
4131 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
4132 | | } |
4133 | | ); |
4134 | | assert_eq!( |
4135 | | parser("(?P<>z)").parse().unwrap_err(), |
4136 | | TestError { |
4137 | | span: span(4..4), |
4138 | | kind: ast::ErrorKind::GroupNameEmpty, |
4139 | | } |
4140 | | ); |
4141 | | assert_eq!( |
4142 | | parser("(?P<a").parse().unwrap_err(), |
4143 | | TestError { |
4144 | | span: span(5..5), |
4145 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
4146 | | } |
4147 | | ); |
4148 | | assert_eq!( |
4149 | | parser("(?P<ab").parse().unwrap_err(), |
4150 | | TestError { |
4151 | | span: span(6..6), |
4152 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
4153 | | } |
4154 | | ); |
4155 | | assert_eq!( |
4156 | | parser("(?P<0a").parse().unwrap_err(), |
4157 | | TestError { |
4158 | | span: span(4..5), |
4159 | | kind: ast::ErrorKind::GroupNameInvalid, |
4160 | | } |
4161 | | ); |
4162 | | assert_eq!( |
4163 | | parser("(?P<~").parse().unwrap_err(), |
4164 | | TestError { |
4165 | | span: span(4..5), |
4166 | | kind: ast::ErrorKind::GroupNameInvalid, |
4167 | | } |
4168 | | ); |
4169 | | assert_eq!( |
4170 | | parser("(?P<abc~").parse().unwrap_err(), |
4171 | | TestError { |
4172 | | span: span(7..8), |
4173 | | kind: ast::ErrorKind::GroupNameInvalid, |
4174 | | } |
4175 | | ); |
4176 | | assert_eq!( |
4177 | | parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(), |
4178 | | TestError { |
4179 | | span: span(12..13), |
4180 | | kind: ast::ErrorKind::GroupNameDuplicate { |
4181 | | original: span(4..5), |
4182 | | }, |
4183 | | } |
4184 | | ); |
4185 | | assert_eq!( |
4186 | | parser("(?P<5>)").parse().unwrap_err(), |
4187 | | TestError { |
4188 | | span: span(4..5), |
4189 | | kind: ast::ErrorKind::GroupNameInvalid, |
4190 | | } |
4191 | | ); |
4192 | | assert_eq!( |
4193 | | parser("(?P<5a>)").parse().unwrap_err(), |
4194 | | TestError { |
4195 | | span: span(4..5), |
4196 | | kind: ast::ErrorKind::GroupNameInvalid, |
4197 | | } |
4198 | | ); |
4199 | | assert_eq!( |
4200 | | parser("(?P<¾>)").parse().unwrap_err(), |
4201 | | TestError { |
4202 | | span: Span::new( |
4203 | | Position::new(4, 1, 5), |
4204 | | Position::new(6, 1, 6), |
4205 | | ), |
4206 | | kind: ast::ErrorKind::GroupNameInvalid, |
4207 | | } |
4208 | | ); |
4209 | | assert_eq!( |
4210 | | parser("(?P<¾a>)").parse().unwrap_err(), |
4211 | | TestError { |
4212 | | span: Span::new( |
4213 | | Position::new(4, 1, 5), |
4214 | | Position::new(6, 1, 6), |
4215 | | ), |
4216 | | kind: ast::ErrorKind::GroupNameInvalid, |
4217 | | } |
4218 | | ); |
4219 | | assert_eq!( |
4220 | | parser("(?P<☃>)").parse().unwrap_err(), |
4221 | | TestError { |
4222 | | span: Span::new( |
4223 | | Position::new(4, 1, 5), |
4224 | | Position::new(7, 1, 6), |
4225 | | ), |
4226 | | kind: ast::ErrorKind::GroupNameInvalid, |
4227 | | } |
4228 | | ); |
4229 | | assert_eq!( |
4230 | | parser("(?P<a☃>)").parse().unwrap_err(), |
4231 | | TestError { |
4232 | | span: Span::new( |
4233 | | Position::new(5, 1, 6), |
4234 | | Position::new(8, 1, 7), |
4235 | | ), |
4236 | | kind: ast::ErrorKind::GroupNameInvalid, |
4237 | | } |
4238 | | ); |
4239 | | } |
4240 | | |
4241 | | #[test] |
4242 | | fn parse_flags() { |
4243 | | assert_eq!( |
4244 | | parser("i:").parse_flags(), |
4245 | | Ok(ast::Flags { |
4246 | | span: span(0..1), |
4247 | | items: vec![ast::FlagsItem { |
4248 | | span: span(0..1), |
4249 | | kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), |
4250 | | }], |
4251 | | }) |
4252 | | ); |
4253 | | assert_eq!( |
4254 | | parser("i)").parse_flags(), |
4255 | | Ok(ast::Flags { |
4256 | | span: span(0..1), |
4257 | | items: vec![ast::FlagsItem { |
4258 | | span: span(0..1), |
4259 | | kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), |
4260 | | }], |
4261 | | }) |
4262 | | ); |
4263 | | |
4264 | | assert_eq!( |
4265 | | parser("isU:").parse_flags(), |
4266 | | Ok(ast::Flags { |
4267 | | span: span(0..3), |
4268 | | items: vec![ |
4269 | | ast::FlagsItem { |
4270 | | span: span(0..1), |
4271 | | kind: ast::FlagsItemKind::Flag( |
4272 | | ast::Flag::CaseInsensitive |
4273 | | ), |
4274 | | }, |
4275 | | ast::FlagsItem { |
4276 | | span: span(1..2), |
4277 | | kind: ast::FlagsItemKind::Flag( |
4278 | | ast::Flag::DotMatchesNewLine |
4279 | | ), |
4280 | | }, |
4281 | | ast::FlagsItem { |
4282 | | span: span(2..3), |
4283 | | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), |
4284 | | }, |
4285 | | ], |
4286 | | }) |
4287 | | ); |
4288 | | |
4289 | | assert_eq!( |
4290 | | parser("-isU:").parse_flags(), |
4291 | | Ok(ast::Flags { |
4292 | | span: span(0..4), |
4293 | | items: vec![ |
4294 | | ast::FlagsItem { |
4295 | | span: span(0..1), |
4296 | | kind: ast::FlagsItemKind::Negation, |
4297 | | }, |
4298 | | ast::FlagsItem { |
4299 | | span: span(1..2), |
4300 | | kind: ast::FlagsItemKind::Flag( |
4301 | | ast::Flag::CaseInsensitive |
4302 | | ), |
4303 | | }, |
4304 | | ast::FlagsItem { |
4305 | | span: span(2..3), |
4306 | | kind: ast::FlagsItemKind::Flag( |
4307 | | ast::Flag::DotMatchesNewLine |
4308 | | ), |
4309 | | }, |
4310 | | ast::FlagsItem { |
4311 | | span: span(3..4), |
4312 | | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), |
4313 | | }, |
4314 | | ], |
4315 | | }) |
4316 | | ); |
4317 | | assert_eq!( |
4318 | | parser("i-sU:").parse_flags(), |
4319 | | Ok(ast::Flags { |
4320 | | span: span(0..4), |
4321 | | items: vec![ |
4322 | | ast::FlagsItem { |
4323 | | span: span(0..1), |
4324 | | kind: ast::FlagsItemKind::Flag( |
4325 | | ast::Flag::CaseInsensitive |
4326 | | ), |
4327 | | }, |
4328 | | ast::FlagsItem { |
4329 | | span: span(1..2), |
4330 | | kind: ast::FlagsItemKind::Negation, |
4331 | | }, |
4332 | | ast::FlagsItem { |
4333 | | span: span(2..3), |
4334 | | kind: ast::FlagsItemKind::Flag( |
4335 | | ast::Flag::DotMatchesNewLine |
4336 | | ), |
4337 | | }, |
4338 | | ast::FlagsItem { |
4339 | | span: span(3..4), |
4340 | | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), |
4341 | | }, |
4342 | | ], |
4343 | | }) |
4344 | | ); |
4345 | | assert_eq!( |
4346 | | parser("i-sR:").parse_flags(), |
4347 | | Ok(ast::Flags { |
4348 | | span: span(0..4), |
4349 | | items: vec![ |
4350 | | ast::FlagsItem { |
4351 | | span: span(0..1), |
4352 | | kind: ast::FlagsItemKind::Flag( |
4353 | | ast::Flag::CaseInsensitive |
4354 | | ), |
4355 | | }, |
4356 | | ast::FlagsItem { |
4357 | | span: span(1..2), |
4358 | | kind: ast::FlagsItemKind::Negation, |
4359 | | }, |
4360 | | ast::FlagsItem { |
4361 | | span: span(2..3), |
4362 | | kind: ast::FlagsItemKind::Flag( |
4363 | | ast::Flag::DotMatchesNewLine |
4364 | | ), |
4365 | | }, |
4366 | | ast::FlagsItem { |
4367 | | span: span(3..4), |
4368 | | kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), |
4369 | | }, |
4370 | | ], |
4371 | | }) |
4372 | | ); |
4373 | | |
4374 | | assert_eq!( |
4375 | | parser("isU").parse_flags().unwrap_err(), |
4376 | | TestError { |
4377 | | span: span(3..3), |
4378 | | kind: ast::ErrorKind::FlagUnexpectedEof, |
4379 | | } |
4380 | | ); |
4381 | | assert_eq!( |
4382 | | parser("isUa:").parse_flags().unwrap_err(), |
4383 | | TestError { |
4384 | | span: span(3..4), |
4385 | | kind: ast::ErrorKind::FlagUnrecognized, |
4386 | | } |
4387 | | ); |
4388 | | assert_eq!( |
4389 | | parser("isUi:").parse_flags().unwrap_err(), |
4390 | | TestError { |
4391 | | span: span(3..4), |
4392 | | kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, |
4393 | | } |
4394 | | ); |
4395 | | assert_eq!( |
4396 | | parser("i-sU-i:").parse_flags().unwrap_err(), |
4397 | | TestError { |
4398 | | span: span(4..5), |
4399 | | kind: ast::ErrorKind::FlagRepeatedNegation { |
4400 | | original: span(1..2), |
4401 | | }, |
4402 | | } |
4403 | | ); |
4404 | | assert_eq!( |
4405 | | parser("-)").parse_flags().unwrap_err(), |
4406 | | TestError { |
4407 | | span: span(0..1), |
4408 | | kind: ast::ErrorKind::FlagDanglingNegation, |
4409 | | } |
4410 | | ); |
4411 | | assert_eq!( |
4412 | | parser("i-)").parse_flags().unwrap_err(), |
4413 | | TestError { |
4414 | | span: span(1..2), |
4415 | | kind: ast::ErrorKind::FlagDanglingNegation, |
4416 | | } |
4417 | | ); |
4418 | | assert_eq!( |
4419 | | parser("iU-)").parse_flags().unwrap_err(), |
4420 | | TestError { |
4421 | | span: span(2..3), |
4422 | | kind: ast::ErrorKind::FlagDanglingNegation, |
4423 | | } |
4424 | | ); |
4425 | | } |
4426 | | |
4427 | | #[test] |
4428 | | fn parse_flag() { |
4429 | | assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); |
4430 | | assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); |
4431 | | assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); |
4432 | | assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); |
4433 | | assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); |
4434 | | assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); |
4435 | | assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); |
4436 | | |
4437 | | assert_eq!( |
4438 | | parser("a").parse_flag().unwrap_err(), |
4439 | | TestError { |
4440 | | span: span(0..1), |
4441 | | kind: ast::ErrorKind::FlagUnrecognized, |
4442 | | } |
4443 | | ); |
4444 | | assert_eq!( |
4445 | | parser("☃").parse_flag().unwrap_err(), |
4446 | | TestError { |
4447 | | span: span_range("☃", 0..3), |
4448 | | kind: ast::ErrorKind::FlagUnrecognized, |
4449 | | } |
4450 | | ); |
4451 | | } |
4452 | | |
4453 | | #[test] |
4454 | | fn parse_primitive_non_escape() { |
4455 | | assert_eq!( |
4456 | | parser(r".").parse_primitive(), |
4457 | | Ok(Primitive::Dot(span(0..1))) |
4458 | | ); |
4459 | | assert_eq!( |
4460 | | parser(r"^").parse_primitive(), |
4461 | | Ok(Primitive::Assertion(ast::Assertion { |
4462 | | span: span(0..1), |
4463 | | kind: ast::AssertionKind::StartLine, |
4464 | | })) |
4465 | | ); |
4466 | | assert_eq!( |
4467 | | parser(r"$").parse_primitive(), |
4468 | | Ok(Primitive::Assertion(ast::Assertion { |
4469 | | span: span(0..1), |
4470 | | kind: ast::AssertionKind::EndLine, |
4471 | | })) |
4472 | | ); |
4473 | | |
4474 | | assert_eq!( |
4475 | | parser(r"a").parse_primitive(), |
4476 | | Ok(Primitive::Literal(ast::Literal { |
4477 | | span: span(0..1), |
4478 | | kind: ast::LiteralKind::Verbatim, |
4479 | | c: 'a', |
4480 | | })) |
4481 | | ); |
4482 | | assert_eq!( |
4483 | | parser(r"|").parse_primitive(), |
4484 | | Ok(Primitive::Literal(ast::Literal { |
4485 | | span: span(0..1), |
4486 | | kind: ast::LiteralKind::Verbatim, |
4487 | | c: '|', |
4488 | | })) |
4489 | | ); |
4490 | | assert_eq!( |
4491 | | parser(r"☃").parse_primitive(), |
4492 | | Ok(Primitive::Literal(ast::Literal { |
4493 | | span: span_range("☃", 0..3), |
4494 | | kind: ast::LiteralKind::Verbatim, |
4495 | | c: '☃', |
4496 | | })) |
4497 | | ); |
4498 | | } |
4499 | | |
#[test]
fn parse_escape() {
    // Expected result for a two-character escape that parses as a literal.
    let escaped = |kind: ast::LiteralKind, c: char| {
        Primitive::Literal(ast::Literal { span: span(0..2), kind, c })
    };
    // Runs `parse_escape` on a pattern that is expected to be rejected.
    let escape_err = |pat: &str| parser(pat).parse_escape().unwrap_err();

    // An escaped meta character.
    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(escaped(ast::LiteralKind::Meta, '|'))
    );

    // Escapes that stand for one specific control character.
    for (pat, c, special) in [
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ] {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(escaped(ast::LiteralKind::Special(special), c))
        );
    }

    // Escapes that parse as assertions: (pattern, end of span, kind).
    for (pat, end, kind) in [
        (r"\A", 2, ast::AssertionKind::StartText),
        (r"\z", 2, ast::AssertionKind::EndText),
        (r"\b", 2, ast::AssertionKind::WordBoundary),
        (r"\b{start}", 9, ast::AssertionKind::WordBoundaryStart),
        (r"\b{end}", 7, ast::AssertionKind::WordBoundaryEnd),
        (r"\b{start-half}", 14, ast::AssertionKind::WordBoundaryStartHalf),
        (r"\b{end-half}", 12, ast::AssertionKind::WordBoundaryEndHalf),
        (r"\<", 2, ast::AssertionKind::WordBoundaryStartAngle),
        (r"\>", 2, ast::AssertionKind::WordBoundaryEndAngle),
        (r"\B", 2, ast::AssertionKind::NotWordBoundary),
    ] {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Assertion(ast::Assertion {
                span: span(0..end),
                kind,
            }))
        );
    }

    // Escaping a character that needs no escape is accepted for these
    // characters and reported as a superfluous literal.
    for c in "!@%\"'/ ".chars() {
        let pat = format!(r"\{}", c);
        assert_eq!(
            parser(&pat).parse_primitive(),
            Ok(escaped(ast::LiteralKind::Superfluous, c))
        );
    }

    for (pat, range, kind) in [
        // Superfluous escapes of [0-9A-Za-z] remain errors, which leaves
        // room to give them a meaning in the future.
        (r"\e", 0..2, ast::ErrorKind::EscapeUnrecognized),
        (r"\y", 0..2, ast::ErrorKind::EscapeUnrecognized),
        // Input ends right after the opening brace, so it is ambiguous
        // whether a counted repetition (unlikely) or a special word
        // boundary assertion was intended.
        (
            r"\b{",
            0..3,
            ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
        ),
    ] {
        assert_eq!(escape_err(pat), TestError { span: span(range), kind });
    }

    // The same ambiguity arises when only whitespace follows the brace and
    // the parser is ignoring whitespace.
    assert_eq!(
        parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
        TestError {
            span: span(0..4),
            kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
        }
    );
    // Without 'x' enabled, the space is a non-[-A-Za-z] character, so the
    // brace is treated as the start of a counted repetition instead.
    assert_eq!(
        parser(r"\b{ ").parse().unwrap_err(),
        TestError {
            span: span(2..4),
            kind: ast::ErrorKind::RepetitionCountUnclosed,
        }
    );

    for (pat, range, kind) in [
        // Valid name characters follow the brace, so this looks like a
        // special word boundary assertion, but the brace is never closed.
        (r"\b{foo", 2..6, ast::ErrorKind::SpecialWordBoundaryUnclosed),
        // Same error, this time triggered by a character known to be
        // invalid appearing before any closing brace.
        (r"\b{foo!}", 2..6, ast::ErrorKind::SpecialWordBoundaryUnclosed),
        // Syntactically complete, but the name is not one of the
        // recognized word boundary spellings.
        (r"\b{foo}", 3..6, ast::ErrorKind::SpecialWordBoundaryUnrecognized),
        // A lone backslash is an unfinished escape.
        (r"\", 0..1, ast::ErrorKind::EscapeUnexpectedEof),
    ] {
        assert_eq!(escape_err(pat), TestError { span: span(range), kind });
    }
}
4694 | | |
#[test]
fn parse_unsupported_backreference() {
    // With the default test parser, a backslash followed by a digit is
    // reported as an unsupported backreference. The lowest and highest
    // digits are checked.
    for pat in [r"\0", r"\9"] {
        let err = parser(pat).parse_escape().unwrap_err();
        assert_eq!(
            err,
            TestError {
                span: span(0..2),
                kind: ast::ErrorKind::UnsupportedBackreference,
            }
        );
    }
}
4712 | | |
#[test]
fn parse_octal() {
    // Exhaustively check every escape made of one to three octal digits.
    // The range is inclusive so that the largest such escape, `\777`
    // (U+01FF), is covered as well.
    for i in 0..=0o777 {
        let pat = format!(r"\{:o}", i);
        assert_eq!(
            parser_octal(&pat).parse_escape(),
            Ok(Primitive::Literal(ast::Literal {
                span: span(0..pat.len()),
                kind: ast::LiteralKind::Octal,
                c: char::from_u32(i).unwrap(),
            }))
        );
    }

    // `8` is not an octal digit, so the escape ends after `\77`.
    assert_eq!(
        parser_octal(r"\778").parse_escape(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..3),
            kind: ast::LiteralKind::Octal,
            c: '?',
        }))
    );
    // Only three digits belong to the escape; the fourth `7` is left over.
    assert_eq!(
        parser_octal(r"\7777").parse_escape(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..4),
            kind: ast::LiteralKind::Octal,
            c: '\u{01FF}',
        }))
    );

    // When the whole pattern is parsed, whatever follows the octal escape
    // becomes an ordinary verbatim literal.
    assert_eq!(
        parser_octal(r"\778").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::literal(ast::Literal {
                    span: span(0..3),
                    kind: ast::LiteralKind::Octal,
                    c: '?',
                }),
                Ast::literal(ast::Literal {
                    span: span(3..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: '8',
                }),
            ],
        }))
    );
    assert_eq!(
        parser_octal(r"\7777").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..5),
            asts: vec![
                Ast::literal(ast::Literal {
                    span: span(0..4),
                    kind: ast::LiteralKind::Octal,
                    c: '\u{01FF}',
                }),
                Ast::literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: '7',
                }),
            ],
        }))
    );

    // A non-octal digit directly after the backslash is not a valid escape.
    assert_eq!(
        parser_octal(r"\8").parse_escape().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        }
    );
}
4787 | | |
#[test]
fn parse_hex_two() {
    // Every value expressible as `\x` followed by exactly two hex digits.
    for code in 0u32..=0xFF {
        let pat = format!(r"\x{:02x}", code);
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
            c: char::from_u32(code).unwrap(),
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Malformed escapes: (pattern, error span, error kind).
    for (pat, range, kind) in [
        // Input ends before the second digit.
        (r"\xF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        // A non-hex character in the first or second digit position.
        (r"\xG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\xFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
    ] {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
4824 | | |
#[test]
fn parse_hex_four() {
    // Every code point up to U+FFFF, written as `\u` followed by four hex
    // digits. Values that are not valid `char`s are skipped.
    for c in (0..=0xFFFF).filter_map(char::from_u32) {
        let pat = format!(r"\u{:04x}", u32::from(c));
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeShort,
            ),
            c,
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Malformed escapes: (pattern, error span, error kind).
    for (pat, range, kind) in [
        // Input ends before all four digits were read.
        (r"\uF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        // A non-hex character in each of the four digit positions.
        (r"\uG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFG", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFFG", 5..6, ast::ErrorKind::EscapeHexInvalidDigit),
        // Four valid digits whose value is rejected.
        (r"\uD800", 2..6, ast::ErrorKind::EscapeHexInvalid),
    ] {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
4888 | | |
#[test]
fn parse_hex_eight() {
    // Code points up to U+FFFF, written as `\U` followed by eight hex
    // digits. Values that are not valid `char`s are skipped.
    for c in (0..=0xFFFF).filter_map(char::from_u32) {
        let pat = format!(r"\U{:08x}", u32::from(c));
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeLong,
            ),
            c,
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Input ends before all eight digits were read.
    assert_eq!(
        parser(r"\UF").parse_escape().unwrap_err(),
        TestError {
            span: span(3..3),
            kind: ast::ErrorKind::EscapeUnexpectedEof,
        }
    );

    // A non-hex character in each of the eight digit positions, i.e. the
    // patterns `\UG`, `\UFG`, ..., `\UFFFFFFFG`. The error span covers
    // exactly the offending character.
    for valid_digits in 0..8 {
        let pat = format!(r"\U{}G", "F".repeat(valid_digits));
        let bad = 2 + valid_digits;
        assert_eq!(
            parser(&pat).parse_escape().unwrap_err(),
            TestError {
                span: span(bad..bad + 1),
                kind: ast::ErrorKind::EscapeHexInvalidDigit,
            }
        );
    }
}
4973 | | |
#[test]
fn parse_hex_brace() {
    // Well-formed braced escapes:
    // (pattern, end of span, hex flavor, expected character).
    for (pat, end, flavor, c) in [
        (r"\u{26c4}", 8, ast::HexLiteralKind::UnicodeShort, '⛄'),
        (r"\U{26c4}", 8, ast::HexLiteralKind::UnicodeLong, '⛄'),
        (r"\x{26c4}", 8, ast::HexLiteralKind::X, '⛄'),
        // Upper-case and mixed-case digits are accepted too.
        (r"\x{26C4}", 8, ast::HexLiteralKind::X, '⛄'),
        (r"\x{10fFfF}", 10, ast::HexLiteralKind::X, '\u{10FFFF}'),
    ] {
        assert_eq!(
            parser(pat).parse_escape(),
            Ok(Primitive::Literal(ast::Literal {
                span: span(0..end),
                kind: ast::LiteralKind::HexBrace(flavor),
                c,
            }))
        );
    }

    // Malformed escapes: (pattern, error span, error kind).
    for (pat, range, kind) in [
        // Input ends before the escape is complete.
        (r"\x", 2..2, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{", 2..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{FF", 2..5, ast::ErrorKind::EscapeUnexpectedEof),
        // Braces with nothing between them.
        (r"\x{}", 2..4, ast::ErrorKind::EscapeHexEmpty),
        // A non-hex character between the braces.
        (r"\x{FGF}", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        // Well-formed digits whose value is rejected.
        (r"\x{FFFFFF}", 3..9, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{D800}", 3..7, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{FFFFFFFFF}", 3..12, ast::ErrorKind::EscapeHexInvalid),
    ] {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
5078 | | |
#[test]
fn parse_decimal() {
    // Inputs that parse successfully; a leading zero is accepted.
    for (pat, value) in [("123", 123), ("0", 0), ("01", 1)] {
        assert_eq!(parser(pat).parse_decimal(), Ok(value));
    }

    // Rejected inputs: (pattern, error span, error kind).
    for (pat, range, kind) in [
        // No digit at the current position.
        ("-1", 0..0, ast::ErrorKind::DecimalEmpty),
        ("", 0..0, ast::ErrorKind::DecimalEmpty),
        // All digits, but the value is rejected as invalid.
        ("9999999999", 0..10, ast::ErrorKind::DecimalInvalid),
    ] {
        assert_eq!(
            parser(pat).parse_decimal().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
5101 | | |
5102 | | #[test] |
5103 | | fn parse_set_class() { |
5104 | | fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet { |
5105 | | ast::ClassSet::union(ast::ClassSetUnion { span, items }) |
5106 | | } |
5107 | | |
5108 | | fn intersection( |
5109 | | span: Span, |
5110 | | lhs: ast::ClassSet, |
5111 | | rhs: ast::ClassSet, |
5112 | | ) -> ast::ClassSet { |
5113 | | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
5114 | | span, |
5115 | | kind: ast::ClassSetBinaryOpKind::Intersection, |
5116 | | lhs: Box::new(lhs), |
5117 | | rhs: Box::new(rhs), |
5118 | | }) |
5119 | | } |
5120 | | |
5121 | | fn difference( |
5122 | | span: Span, |
5123 | | lhs: ast::ClassSet, |
5124 | | rhs: ast::ClassSet, |
5125 | | ) -> ast::ClassSet { |
5126 | | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
5127 | | span, |
5128 | | kind: ast::ClassSetBinaryOpKind::Difference, |
5129 | | lhs: Box::new(lhs), |
5130 | | rhs: Box::new(rhs), |
5131 | | }) |
5132 | | } |
5133 | | |
5134 | | fn symdifference( |
5135 | | span: Span, |
5136 | | lhs: ast::ClassSet, |
5137 | | rhs: ast::ClassSet, |
5138 | | ) -> ast::ClassSet { |
5139 | | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
5140 | | span, |
5141 | | kind: ast::ClassSetBinaryOpKind::SymmetricDifference, |
5142 | | lhs: Box::new(lhs), |
5143 | | rhs: Box::new(rhs), |
5144 | | }) |
5145 | | } |
5146 | | |
5147 | | fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { |
5148 | | ast::ClassSet::Item(item) |
5149 | | } |
5150 | | |
5151 | | fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { |
5152 | | ast::ClassSetItem::Ascii(cls) |
5153 | | } |
5154 | | |
5155 | | fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { |
5156 | | ast::ClassSetItem::Unicode(cls) |
5157 | | } |
5158 | | |
5159 | | fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { |
5160 | | ast::ClassSetItem::Perl(cls) |
5161 | | } |
5162 | | |
5163 | | fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { |
5164 | | ast::ClassSetItem::Bracketed(Box::new(cls)) |
5165 | | } |
5166 | | |
5167 | | fn lit(span: Span, c: char) -> ast::ClassSetItem { |
5168 | | ast::ClassSetItem::Literal(ast::Literal { |
5169 | | span, |
5170 | | kind: ast::LiteralKind::Verbatim, |
5171 | | c, |
5172 | | }) |
5173 | | } |
5174 | | |
5175 | | fn empty(span: Span) -> ast::ClassSetItem { |
5176 | | ast::ClassSetItem::Empty(span) |
5177 | | } |
5178 | | |
5179 | | fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { |
5180 | | let pos1 = Position { |
5181 | | offset: span.start.offset + start.len_utf8(), |
5182 | | column: span.start.column + 1, |
5183 | | ..span.start |
5184 | | }; |
5185 | | let pos2 = Position { |
5186 | | offset: span.end.offset - end.len_utf8(), |
5187 | | column: span.end.column - 1, |
5188 | | ..span.end |
5189 | | }; |
5190 | | ast::ClassSetItem::Range(ast::ClassSetRange { |
5191 | | span, |
5192 | | start: ast::Literal { |
5193 | | span: Span { end: pos1, ..span }, |
5194 | | kind: ast::LiteralKind::Verbatim, |
5195 | | c: start, |
5196 | | }, |
5197 | | end: ast::Literal { |
5198 | | span: Span { start: pos2, ..span }, |
5199 | | kind: ast::LiteralKind::Verbatim, |
5200 | | c: end, |
5201 | | }, |
5202 | | }) |
5203 | | } |
5204 | | |
5205 | | fn alnum(span: Span, negated: bool) -> ast::ClassAscii { |
5206 | | ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated } |
5207 | | } |
5208 | | |
5209 | | fn lower(span: Span, negated: bool) -> ast::ClassAscii { |
5210 | | ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated } |
5211 | | } |
5212 | | |
5213 | | assert_eq!( |
5214 | | parser("[[:alnum:]]").parse(), |
5215 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5216 | | span: span(0..11), |
5217 | | negated: false, |
5218 | | kind: itemset(item_ascii(alnum(span(1..10), false))), |
5219 | | })) |
5220 | | ); |
5221 | | assert_eq!( |
5222 | | parser("[[[:alnum:]]]").parse(), |
5223 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5224 | | span: span(0..13), |
5225 | | negated: false, |
5226 | | kind: itemset(item_bracket(ast::ClassBracketed { |
5227 | | span: span(1..12), |
5228 | | negated: false, |
5229 | | kind: itemset(item_ascii(alnum(span(2..11), false))), |
5230 | | })), |
5231 | | })) |
5232 | | ); |
5233 | | assert_eq!( |
5234 | | parser("[[:alnum:]&&[:lower:]]").parse(), |
5235 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5236 | | span: span(0..22), |
5237 | | negated: false, |
5238 | | kind: intersection( |
5239 | | span(1..21), |
5240 | | itemset(item_ascii(alnum(span(1..10), false))), |
5241 | | itemset(item_ascii(lower(span(12..21), false))), |
5242 | | ), |
5243 | | })) |
5244 | | ); |
5245 | | assert_eq!( |
5246 | | parser("[[:alnum:]--[:lower:]]").parse(), |
5247 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5248 | | span: span(0..22), |
5249 | | negated: false, |
5250 | | kind: difference( |
5251 | | span(1..21), |
5252 | | itemset(item_ascii(alnum(span(1..10), false))), |
5253 | | itemset(item_ascii(lower(span(12..21), false))), |
5254 | | ), |
5255 | | })) |
5256 | | ); |
5257 | | assert_eq!( |
5258 | | parser("[[:alnum:]~~[:lower:]]").parse(), |
5259 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5260 | | span: span(0..22), |
5261 | | negated: false, |
5262 | | kind: symdifference( |
5263 | | span(1..21), |
5264 | | itemset(item_ascii(alnum(span(1..10), false))), |
5265 | | itemset(item_ascii(lower(span(12..21), false))), |
5266 | | ), |
5267 | | })) |
5268 | | ); |
5269 | | |
5270 | | assert_eq!( |
5271 | | parser("[a]").parse(), |
5272 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5273 | | span: span(0..3), |
5274 | | negated: false, |
5275 | | kind: itemset(lit(span(1..2), 'a')), |
5276 | | })) |
5277 | | ); |
5278 | | assert_eq!( |
5279 | | parser(r"[a\]]").parse(), |
5280 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5281 | | span: span(0..5), |
5282 | | negated: false, |
5283 | | kind: union( |
5284 | | span(1..4), |
5285 | | vec![ |
5286 | | lit(span(1..2), 'a'), |
5287 | | ast::ClassSetItem::Literal(ast::Literal { |
5288 | | span: span(2..4), |
5289 | | kind: ast::LiteralKind::Meta, |
5290 | | c: ']', |
5291 | | }), |
5292 | | ] |
5293 | | ), |
5294 | | })) |
5295 | | ); |
5296 | | assert_eq!( |
5297 | | parser(r"[a\-z]").parse(), |
5298 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5299 | | span: span(0..6), |
5300 | | negated: false, |
5301 | | kind: union( |
5302 | | span(1..5), |
5303 | | vec![ |
5304 | | lit(span(1..2), 'a'), |
5305 | | ast::ClassSetItem::Literal(ast::Literal { |
5306 | | span: span(2..4), |
5307 | | kind: ast::LiteralKind::Meta, |
5308 | | c: '-', |
5309 | | }), |
5310 | | lit(span(4..5), 'z'), |
5311 | | ] |
5312 | | ), |
5313 | | })) |
5314 | | ); |
5315 | | assert_eq!( |
5316 | | parser("[ab]").parse(), |
5317 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5318 | | span: span(0..4), |
5319 | | negated: false, |
5320 | | kind: union( |
5321 | | span(1..3), |
5322 | | vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] |
5323 | | ), |
5324 | | })) |
5325 | | ); |
5326 | | assert_eq!( |
5327 | | parser("[a-]").parse(), |
5328 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5329 | | span: span(0..4), |
5330 | | negated: false, |
5331 | | kind: union( |
5332 | | span(1..3), |
5333 | | vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] |
5334 | | ), |
5335 | | })) |
5336 | | ); |
5337 | | assert_eq!( |
5338 | | parser("[-a]").parse(), |
5339 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5340 | | span: span(0..4), |
5341 | | negated: false, |
5342 | | kind: union( |
5343 | | span(1..3), |
5344 | | vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] |
5345 | | ), |
5346 | | })) |
5347 | | ); |
5348 | | assert_eq!( |
5349 | | parser(r"[\pL]").parse(), |
5350 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5351 | | span: span(0..5), |
5352 | | negated: false, |
5353 | | kind: itemset(item_unicode(ast::ClassUnicode { |
5354 | | span: span(1..4), |
5355 | | negated: false, |
5356 | | kind: ast::ClassUnicodeKind::OneLetter('L'), |
5357 | | })), |
5358 | | })) |
5359 | | ); |
5360 | | assert_eq!( |
5361 | | parser(r"[\w]").parse(), |
5362 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5363 | | span: span(0..4), |
5364 | | negated: false, |
5365 | | kind: itemset(item_perl(ast::ClassPerl { |
5366 | | span: span(1..3), |
5367 | | kind: ast::ClassPerlKind::Word, |
5368 | | negated: false, |
5369 | | })), |
5370 | | })) |
5371 | | ); |
5372 | | assert_eq!( |
5373 | | parser(r"[a\wz]").parse(), |
5374 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5375 | | span: span(0..6), |
5376 | | negated: false, |
5377 | | kind: union( |
5378 | | span(1..5), |
5379 | | vec![ |
5380 | | lit(span(1..2), 'a'), |
5381 | | item_perl(ast::ClassPerl { |
5382 | | span: span(2..4), |
5383 | | kind: ast::ClassPerlKind::Word, |
5384 | | negated: false, |
5385 | | }), |
5386 | | lit(span(4..5), 'z'), |
5387 | | ] |
5388 | | ), |
5389 | | })) |
5390 | | ); |
5391 | | |
5392 | | assert_eq!( |
5393 | | parser("[a-z]").parse(), |
5394 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5395 | | span: span(0..5), |
5396 | | negated: false, |
5397 | | kind: itemset(range(span(1..4), 'a', 'z')), |
5398 | | })) |
5399 | | ); |
5400 | | assert_eq!( |
5401 | | parser("[a-cx-z]").parse(), |
5402 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5403 | | span: span(0..8), |
5404 | | negated: false, |
5405 | | kind: union( |
5406 | | span(1..7), |
5407 | | vec![ |
5408 | | range(span(1..4), 'a', 'c'), |
5409 | | range(span(4..7), 'x', 'z'), |
5410 | | ] |
5411 | | ), |
5412 | | })) |
5413 | | ); |
5414 | | assert_eq!( |
5415 | | parser(r"[\w&&a-cx-z]").parse(), |
5416 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5417 | | span: span(0..12), |
5418 | | negated: false, |
5419 | | kind: intersection( |
5420 | | span(1..11), |
5421 | | itemset(item_perl(ast::ClassPerl { |
5422 | | span: span(1..3), |
5423 | | kind: ast::ClassPerlKind::Word, |
5424 | | negated: false, |
5425 | | })), |
5426 | | union( |
5427 | | span(5..11), |
5428 | | vec![ |
5429 | | range(span(5..8), 'a', 'c'), |
5430 | | range(span(8..11), 'x', 'z'), |
5431 | | ] |
5432 | | ), |
5433 | | ), |
5434 | | })) |
5435 | | ); |
5436 | | assert_eq!( |
5437 | | parser(r"[a-cx-z&&\w]").parse(), |
5438 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5439 | | span: span(0..12), |
5440 | | negated: false, |
5441 | | kind: intersection( |
5442 | | span(1..11), |
5443 | | union( |
5444 | | span(1..7), |
5445 | | vec![ |
5446 | | range(span(1..4), 'a', 'c'), |
5447 | | range(span(4..7), 'x', 'z'), |
5448 | | ] |
5449 | | ), |
5450 | | itemset(item_perl(ast::ClassPerl { |
5451 | | span: span(9..11), |
5452 | | kind: ast::ClassPerlKind::Word, |
5453 | | negated: false, |
5454 | | })), |
5455 | | ), |
5456 | | })) |
5457 | | ); |
5458 | | assert_eq!( |
5459 | | parser(r"[a--b--c]").parse(), |
5460 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5461 | | span: span(0..9), |
5462 | | negated: false, |
5463 | | kind: difference( |
5464 | | span(1..8), |
5465 | | difference( |
5466 | | span(1..5), |
5467 | | itemset(lit(span(1..2), 'a')), |
5468 | | itemset(lit(span(4..5), 'b')), |
5469 | | ), |
5470 | | itemset(lit(span(7..8), 'c')), |
5471 | | ), |
5472 | | })) |
5473 | | ); |
5474 | | assert_eq!( |
5475 | | parser(r"[a~~b~~c]").parse(), |
5476 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5477 | | span: span(0..9), |
5478 | | negated: false, |
5479 | | kind: symdifference( |
5480 | | span(1..8), |
5481 | | symdifference( |
5482 | | span(1..5), |
5483 | | itemset(lit(span(1..2), 'a')), |
5484 | | itemset(lit(span(4..5), 'b')), |
5485 | | ), |
5486 | | itemset(lit(span(7..8), 'c')), |
5487 | | ), |
5488 | | })) |
5489 | | ); |
5490 | | assert_eq!( |
5491 | | parser(r"[\^&&^]").parse(), |
5492 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5493 | | span: span(0..7), |
5494 | | negated: false, |
5495 | | kind: intersection( |
5496 | | span(1..6), |
5497 | | itemset(ast::ClassSetItem::Literal(ast::Literal { |
5498 | | span: span(1..3), |
5499 | | kind: ast::LiteralKind::Meta, |
5500 | | c: '^', |
5501 | | })), |
5502 | | itemset(lit(span(5..6), '^')), |
5503 | | ), |
5504 | | })) |
5505 | | ); |
5506 | | assert_eq!( |
5507 | | parser(r"[\&&&&]").parse(), |
5508 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5509 | | span: span(0..7), |
5510 | | negated: false, |
5511 | | kind: intersection( |
5512 | | span(1..6), |
5513 | | itemset(ast::ClassSetItem::Literal(ast::Literal { |
5514 | | span: span(1..3), |
5515 | | kind: ast::LiteralKind::Meta, |
5516 | | c: '&', |
5517 | | })), |
5518 | | itemset(lit(span(5..6), '&')), |
5519 | | ), |
5520 | | })) |
5521 | | ); |
5522 | | assert_eq!( |
5523 | | parser(r"[&&&&]").parse(), |
5524 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5525 | | span: span(0..6), |
5526 | | negated: false, |
5527 | | kind: intersection( |
5528 | | span(1..5), |
5529 | | intersection( |
5530 | | span(1..3), |
5531 | | itemset(empty(span(1..1))), |
5532 | | itemset(empty(span(3..3))), |
5533 | | ), |
5534 | | itemset(empty(span(5..5))), |
5535 | | ), |
5536 | | })) |
5537 | | ); |
5538 | | |
5539 | | let pat = "[☃-⛄]"; |
5540 | | assert_eq!( |
5541 | | parser(pat).parse(), |
5542 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5543 | | span: span_range(pat, 0..9), |
5544 | | negated: false, |
5545 | | kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { |
5546 | | span: span_range(pat, 1..8), |
5547 | | start: ast::Literal { |
5548 | | span: span_range(pat, 1..4), |
5549 | | kind: ast::LiteralKind::Verbatim, |
5550 | | c: '☃', |
5551 | | }, |
5552 | | end: ast::Literal { |
5553 | | span: span_range(pat, 5..8), |
5554 | | kind: ast::LiteralKind::Verbatim, |
5555 | | c: '⛄', |
5556 | | }, |
5557 | | })), |
5558 | | })) |
5559 | | ); |
5560 | | |
5561 | | assert_eq!( |
5562 | | parser(r"[]]").parse(), |
5563 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5564 | | span: span(0..3), |
5565 | | negated: false, |
5566 | | kind: itemset(lit(span(1..2), ']')), |
5567 | | })) |
5568 | | ); |
5569 | | assert_eq!( |
5570 | | parser(r"[]\[]").parse(), |
5571 | | Ok(Ast::class_bracketed(ast::ClassBracketed { |
5572 | | span: span(0..5), |
5573 | | negated: false, |
5574 | | kind: union( |
5575 | | span(1..4), |
5576 | | vec![ |
5577 | | lit(span(1..2), ']'), |
5578 | | ast::ClassSetItem::Literal(ast::Literal { |
5579 | | span: span(2..4), |
5580 | | kind: ast::LiteralKind::Meta, |
5581 | | c: '[', |
5582 | | }), |
5583 | | ] |
5584 | | ), |
5585 | | })) |
5586 | | ); |
5587 | | assert_eq!( |
5588 | | parser(r"[\[]]").parse(), |
5589 | | Ok(concat( |
5590 | | 0..5, |
5591 | | vec![ |
5592 | | Ast::class_bracketed(ast::ClassBracketed { |
5593 | | span: span(0..4), |
5594 | | negated: false, |
5595 | | kind: itemset(ast::ClassSetItem::Literal( |
5596 | | ast::Literal { |
5597 | | span: span(1..3), |
5598 | | kind: ast::LiteralKind::Meta, |
5599 | | c: '[', |
5600 | | } |
5601 | | )), |
5602 | | }), |
5603 | | Ast::literal(ast::Literal { |
5604 | | span: span(4..5), |
5605 | | kind: ast::LiteralKind::Verbatim, |
5606 | | c: ']', |
5607 | | }), |
5608 | | ] |
5609 | | )) |
5610 | | ); |
5611 | | |
5612 | | assert_eq!( |
5613 | | parser("[").parse().unwrap_err(), |
5614 | | TestError { |
5615 | | span: span(0..1), |
5616 | | kind: ast::ErrorKind::ClassUnclosed, |
5617 | | } |
5618 | | ); |
5619 | | assert_eq!( |
5620 | | parser("[[").parse().unwrap_err(), |
5621 | | TestError { |
5622 | | span: span(1..2), |
5623 | | kind: ast::ErrorKind::ClassUnclosed, |
5624 | | } |
5625 | | ); |
5626 | | assert_eq!( |
5627 | | parser("[[-]").parse().unwrap_err(), |
5628 | | TestError { |
5629 | | span: span(0..1), |
5630 | | kind: ast::ErrorKind::ClassUnclosed, |
5631 | | } |
5632 | | ); |
5633 | | assert_eq!( |
5634 | | parser("[[[:alnum:]").parse().unwrap_err(), |
5635 | | TestError { |
5636 | | span: span(1..2), |
5637 | | kind: ast::ErrorKind::ClassUnclosed, |
5638 | | } |
5639 | | ); |
5640 | | assert_eq!( |
5641 | | parser(r"[\b]").parse().unwrap_err(), |
5642 | | TestError { |
5643 | | span: span(1..3), |
5644 | | kind: ast::ErrorKind::ClassEscapeInvalid, |
5645 | | } |
5646 | | ); |
5647 | | assert_eq!( |
5648 | | parser(r"[\w-a]").parse().unwrap_err(), |
5649 | | TestError { |
5650 | | span: span(1..3), |
5651 | | kind: ast::ErrorKind::ClassRangeLiteral, |
5652 | | } |
5653 | | ); |
5654 | | assert_eq!( |
5655 | | parser(r"[a-\w]").parse().unwrap_err(), |
5656 | | TestError { |
5657 | | span: span(3..5), |
5658 | | kind: ast::ErrorKind::ClassRangeLiteral, |
5659 | | } |
5660 | | ); |
5661 | | assert_eq!( |
5662 | | parser(r"[z-a]").parse().unwrap_err(), |
5663 | | TestError { |
5664 | | span: span(1..4), |
5665 | | kind: ast::ErrorKind::ClassRangeInvalid, |
5666 | | } |
5667 | | ); |
5668 | | |
5669 | | assert_eq!( |
5670 | | parser_ignore_whitespace("[a ").parse().unwrap_err(), |
5671 | | TestError { |
5672 | | span: span(0..1), |
5673 | | kind: ast::ErrorKind::ClassUnclosed, |
5674 | | } |
5675 | | ); |
5676 | | assert_eq!( |
5677 | | parser_ignore_whitespace("[a- ").parse().unwrap_err(), |
5678 | | TestError { |
5679 | | span: span(0..1), |
5680 | | kind: ast::ErrorKind::ClassUnclosed, |
5681 | | } |
5682 | | ); |
5683 | | } |
5684 | | |
5685 | | #[test] |
5686 | | fn parse_set_class_open() { |
5687 | | assert_eq!(parser("[a]").parse_set_class_open(), { |
5688 | | let set = ast::ClassBracketed { |
5689 | | span: span(0..1), |
5690 | | negated: false, |
5691 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5692 | | span: span(1..1), |
5693 | | items: vec![], |
5694 | | }), |
5695 | | }; |
5696 | | let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; |
5697 | | Ok((set, union)) |
5698 | | }); |
5699 | | assert_eq!( |
5700 | | parser_ignore_whitespace("[ a]").parse_set_class_open(), |
5701 | | { |
5702 | | let set = ast::ClassBracketed { |
5703 | | span: span(0..4), |
5704 | | negated: false, |
5705 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5706 | | span: span(4..4), |
5707 | | items: vec![], |
5708 | | }), |
5709 | | }; |
5710 | | let union = |
5711 | | ast::ClassSetUnion { span: span(4..4), items: vec![] }; |
5712 | | Ok((set, union)) |
5713 | | } |
5714 | | ); |
5715 | | assert_eq!(parser("[^a]").parse_set_class_open(), { |
5716 | | let set = ast::ClassBracketed { |
5717 | | span: span(0..2), |
5718 | | negated: true, |
5719 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5720 | | span: span(2..2), |
5721 | | items: vec![], |
5722 | | }), |
5723 | | }; |
5724 | | let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; |
5725 | | Ok((set, union)) |
5726 | | }); |
5727 | | assert_eq!( |
5728 | | parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), |
5729 | | { |
5730 | | let set = ast::ClassBracketed { |
5731 | | span: span(0..4), |
5732 | | negated: true, |
5733 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5734 | | span: span(4..4), |
5735 | | items: vec![], |
5736 | | }), |
5737 | | }; |
5738 | | let union = |
5739 | | ast::ClassSetUnion { span: span(4..4), items: vec![] }; |
5740 | | Ok((set, union)) |
5741 | | } |
5742 | | ); |
5743 | | assert_eq!(parser("[-a]").parse_set_class_open(), { |
5744 | | let set = ast::ClassBracketed { |
5745 | | span: span(0..2), |
5746 | | negated: false, |
5747 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5748 | | span: span(1..1), |
5749 | | items: vec![], |
5750 | | }), |
5751 | | }; |
5752 | | let union = ast::ClassSetUnion { |
5753 | | span: span(1..2), |
5754 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5755 | | span: span(1..2), |
5756 | | kind: ast::LiteralKind::Verbatim, |
5757 | | c: '-', |
5758 | | })], |
5759 | | }; |
5760 | | Ok((set, union)) |
5761 | | }); |
5762 | | assert_eq!( |
5763 | | parser_ignore_whitespace("[ - a]").parse_set_class_open(), |
5764 | | { |
5765 | | let set = ast::ClassBracketed { |
5766 | | span: span(0..4), |
5767 | | negated: false, |
5768 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5769 | | span: span(2..2), |
5770 | | items: vec![], |
5771 | | }), |
5772 | | }; |
5773 | | let union = ast::ClassSetUnion { |
5774 | | span: span(2..3), |
5775 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5776 | | span: span(2..3), |
5777 | | kind: ast::LiteralKind::Verbatim, |
5778 | | c: '-', |
5779 | | })], |
5780 | | }; |
5781 | | Ok((set, union)) |
5782 | | } |
5783 | | ); |
5784 | | assert_eq!(parser("[^-a]").parse_set_class_open(), { |
5785 | | let set = ast::ClassBracketed { |
5786 | | span: span(0..3), |
5787 | | negated: true, |
5788 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5789 | | span: span(2..2), |
5790 | | items: vec![], |
5791 | | }), |
5792 | | }; |
5793 | | let union = ast::ClassSetUnion { |
5794 | | span: span(2..3), |
5795 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5796 | | span: span(2..3), |
5797 | | kind: ast::LiteralKind::Verbatim, |
5798 | | c: '-', |
5799 | | })], |
5800 | | }; |
5801 | | Ok((set, union)) |
5802 | | }); |
5803 | | assert_eq!(parser("[--a]").parse_set_class_open(), { |
5804 | | let set = ast::ClassBracketed { |
5805 | | span: span(0..3), |
5806 | | negated: false, |
5807 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5808 | | span: span(1..1), |
5809 | | items: vec![], |
5810 | | }), |
5811 | | }; |
5812 | | let union = ast::ClassSetUnion { |
5813 | | span: span(1..3), |
5814 | | items: vec![ |
5815 | | ast::ClassSetItem::Literal(ast::Literal { |
5816 | | span: span(1..2), |
5817 | | kind: ast::LiteralKind::Verbatim, |
5818 | | c: '-', |
5819 | | }), |
5820 | | ast::ClassSetItem::Literal(ast::Literal { |
5821 | | span: span(2..3), |
5822 | | kind: ast::LiteralKind::Verbatim, |
5823 | | c: '-', |
5824 | | }), |
5825 | | ], |
5826 | | }; |
5827 | | Ok((set, union)) |
5828 | | }); |
5829 | | assert_eq!(parser("[]a]").parse_set_class_open(), { |
5830 | | let set = ast::ClassBracketed { |
5831 | | span: span(0..2), |
5832 | | negated: false, |
5833 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5834 | | span: span(1..1), |
5835 | | items: vec![], |
5836 | | }), |
5837 | | }; |
5838 | | let union = ast::ClassSetUnion { |
5839 | | span: span(1..2), |
5840 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5841 | | span: span(1..2), |
5842 | | kind: ast::LiteralKind::Verbatim, |
5843 | | c: ']', |
5844 | | })], |
5845 | | }; |
5846 | | Ok((set, union)) |
5847 | | }); |
5848 | | assert_eq!( |
5849 | | parser_ignore_whitespace("[ ] a]").parse_set_class_open(), |
5850 | | { |
5851 | | let set = ast::ClassBracketed { |
5852 | | span: span(0..4), |
5853 | | negated: false, |
5854 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5855 | | span: span(2..2), |
5856 | | items: vec![], |
5857 | | }), |
5858 | | }; |
5859 | | let union = ast::ClassSetUnion { |
5860 | | span: span(2..3), |
5861 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5862 | | span: span(2..3), |
5863 | | kind: ast::LiteralKind::Verbatim, |
5864 | | c: ']', |
5865 | | })], |
5866 | | }; |
5867 | | Ok((set, union)) |
5868 | | } |
5869 | | ); |
5870 | | assert_eq!(parser("[^]a]").parse_set_class_open(), { |
5871 | | let set = ast::ClassBracketed { |
5872 | | span: span(0..3), |
5873 | | negated: true, |
5874 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5875 | | span: span(2..2), |
5876 | | items: vec![], |
5877 | | }), |
5878 | | }; |
5879 | | let union = ast::ClassSetUnion { |
5880 | | span: span(2..3), |
5881 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5882 | | span: span(2..3), |
5883 | | kind: ast::LiteralKind::Verbatim, |
5884 | | c: ']', |
5885 | | })], |
5886 | | }; |
5887 | | Ok((set, union)) |
5888 | | }); |
5889 | | assert_eq!(parser("[-]a]").parse_set_class_open(), { |
5890 | | let set = ast::ClassBracketed { |
5891 | | span: span(0..2), |
5892 | | negated: false, |
5893 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5894 | | span: span(1..1), |
5895 | | items: vec![], |
5896 | | }), |
5897 | | }; |
5898 | | let union = ast::ClassSetUnion { |
5899 | | span: span(1..2), |
5900 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5901 | | span: span(1..2), |
5902 | | kind: ast::LiteralKind::Verbatim, |
5903 | | c: '-', |
5904 | | })], |
5905 | | }; |
5906 | | Ok((set, union)) |
5907 | | }); |
5908 | | |
5909 | | assert_eq!( |
5910 | | parser("[").parse_set_class_open().unwrap_err(), |
5911 | | TestError { |
5912 | | span: span(0..1), |
5913 | | kind: ast::ErrorKind::ClassUnclosed, |
5914 | | } |
5915 | | ); |
5916 | | assert_eq!( |
5917 | | parser_ignore_whitespace("[ ") |
5918 | | .parse_set_class_open() |
5919 | | .unwrap_err(), |
5920 | | TestError { |
5921 | | span: span(0..5), |
5922 | | kind: ast::ErrorKind::ClassUnclosed, |
5923 | | } |
5924 | | ); |
5925 | | assert_eq!( |
5926 | | parser("[^").parse_set_class_open().unwrap_err(), |
5927 | | TestError { |
5928 | | span: span(0..2), |
5929 | | kind: ast::ErrorKind::ClassUnclosed, |
5930 | | } |
5931 | | ); |
5932 | | assert_eq!( |
5933 | | parser("[]").parse_set_class_open().unwrap_err(), |
5934 | | TestError { |
5935 | | span: span(0..2), |
5936 | | kind: ast::ErrorKind::ClassUnclosed, |
5937 | | } |
5938 | | ); |
5939 | | assert_eq!( |
5940 | | parser("[-").parse_set_class_open().unwrap_err(), |
5941 | | TestError { |
5942 | | span: span(0..0), |
5943 | | kind: ast::ErrorKind::ClassUnclosed, |
5944 | | } |
5945 | | ); |
5946 | | assert_eq!( |
5947 | | parser("[--").parse_set_class_open().unwrap_err(), |
5948 | | TestError { |
5949 | | span: span(0..0), |
5950 | | kind: ast::ErrorKind::ClassUnclosed, |
5951 | | } |
5952 | | ); |
5953 | | |
5954 | | // See: https://github.com/rust-lang/regex/issues/792 |
5955 | | assert_eq!( |
5956 | | parser("(?x)[-#]").parse_with_comments().unwrap_err(), |
5957 | | TestError { |
5958 | | span: span(4..4), |
5959 | | kind: ast::ErrorKind::ClassUnclosed, |
5960 | | } |
5961 | | ); |
5962 | | } |
5963 | | |
5964 | | #[test] |
5965 | | fn maybe_parse_ascii_class() { |
5966 | | assert_eq!( |
5967 | | parser(r"[:alnum:]").maybe_parse_ascii_class(), |
5968 | | Some(ast::ClassAscii { |
5969 | | span: span(0..9), |
5970 | | kind: ast::ClassAsciiKind::Alnum, |
5971 | | negated: false, |
5972 | | }) |
5973 | | ); |
5974 | | assert_eq!( |
5975 | | parser(r"[:alnum:]A").maybe_parse_ascii_class(), |
5976 | | Some(ast::ClassAscii { |
5977 | | span: span(0..9), |
5978 | | kind: ast::ClassAsciiKind::Alnum, |
5979 | | negated: false, |
5980 | | }) |
5981 | | ); |
5982 | | assert_eq!( |
5983 | | parser(r"[:^alnum:]").maybe_parse_ascii_class(), |
5984 | | Some(ast::ClassAscii { |
5985 | | span: span(0..10), |
5986 | | kind: ast::ClassAsciiKind::Alnum, |
5987 | | negated: true, |
5988 | | }) |
5989 | | ); |
5990 | | |
5991 | | let p = parser(r"[:"); |
5992 | | assert_eq!(p.maybe_parse_ascii_class(), None); |
5993 | | assert_eq!(p.offset(), 0); |
5994 | | |
5995 | | let p = parser(r"[:^"); |
5996 | | assert_eq!(p.maybe_parse_ascii_class(), None); |
5997 | | assert_eq!(p.offset(), 0); |
5998 | | |
5999 | | let p = parser(r"[^:alnum:]"); |
6000 | | assert_eq!(p.maybe_parse_ascii_class(), None); |
6001 | | assert_eq!(p.offset(), 0); |
6002 | | |
6003 | | let p = parser(r"[:alnnum:]"); |
6004 | | assert_eq!(p.maybe_parse_ascii_class(), None); |
6005 | | assert_eq!(p.offset(), 0); |
6006 | | |
6007 | | let p = parser(r"[:alnum]"); |
6008 | | assert_eq!(p.maybe_parse_ascii_class(), None); |
6009 | | assert_eq!(p.offset(), 0); |
6010 | | |
6011 | | let p = parser(r"[:alnum:"); |
6012 | | assert_eq!(p.maybe_parse_ascii_class(), None); |
6013 | | assert_eq!(p.offset(), 0); |
6014 | | } |
6015 | | |
6016 | | #[test] |
6017 | | fn parse_unicode_class() { |
6018 | | assert_eq!( |
6019 | | parser(r"\pN").parse_escape(), |
6020 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6021 | | span: span(0..3), |
6022 | | negated: false, |
6023 | | kind: ast::ClassUnicodeKind::OneLetter('N'), |
6024 | | })) |
6025 | | ); |
6026 | | assert_eq!( |
6027 | | parser(r"\PN").parse_escape(), |
6028 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6029 | | span: span(0..3), |
6030 | | negated: true, |
6031 | | kind: ast::ClassUnicodeKind::OneLetter('N'), |
6032 | | })) |
6033 | | ); |
6034 | | assert_eq!( |
6035 | | parser(r"\p{N}").parse_escape(), |
6036 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6037 | | span: span(0..5), |
6038 | | negated: false, |
6039 | | kind: ast::ClassUnicodeKind::Named(s("N")), |
6040 | | })) |
6041 | | ); |
6042 | | assert_eq!( |
6043 | | parser(r"\P{N}").parse_escape(), |
6044 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6045 | | span: span(0..5), |
6046 | | negated: true, |
6047 | | kind: ast::ClassUnicodeKind::Named(s("N")), |
6048 | | })) |
6049 | | ); |
6050 | | assert_eq!( |
6051 | | parser(r"\p{Greek}").parse_escape(), |
6052 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6053 | | span: span(0..9), |
6054 | | negated: false, |
6055 | | kind: ast::ClassUnicodeKind::Named(s("Greek")), |
6056 | | })) |
6057 | | ); |
6058 | | |
6059 | | assert_eq!( |
6060 | | parser(r"\p{scx:Katakana}").parse_escape(), |
6061 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6062 | | span: span(0..16), |
6063 | | negated: false, |
6064 | | kind: ast::ClassUnicodeKind::NamedValue { |
6065 | | op: ast::ClassUnicodeOpKind::Colon, |
6066 | | name: s("scx"), |
6067 | | value: s("Katakana"), |
6068 | | }, |
6069 | | })) |
6070 | | ); |
6071 | | assert_eq!( |
6072 | | parser(r"\p{scx=Katakana}").parse_escape(), |
6073 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6074 | | span: span(0..16), |
6075 | | negated: false, |
6076 | | kind: ast::ClassUnicodeKind::NamedValue { |
6077 | | op: ast::ClassUnicodeOpKind::Equal, |
6078 | | name: s("scx"), |
6079 | | value: s("Katakana"), |
6080 | | }, |
6081 | | })) |
6082 | | ); |
6083 | | assert_eq!( |
6084 | | parser(r"\p{scx!=Katakana}").parse_escape(), |
6085 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6086 | | span: span(0..17), |
6087 | | negated: false, |
6088 | | kind: ast::ClassUnicodeKind::NamedValue { |
6089 | | op: ast::ClassUnicodeOpKind::NotEqual, |
6090 | | name: s("scx"), |
6091 | | value: s("Katakana"), |
6092 | | }, |
6093 | | })) |
6094 | | ); |
6095 | | |
6096 | | assert_eq!( |
6097 | | parser(r"\p{:}").parse_escape(), |
6098 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6099 | | span: span(0..5), |
6100 | | negated: false, |
6101 | | kind: ast::ClassUnicodeKind::NamedValue { |
6102 | | op: ast::ClassUnicodeOpKind::Colon, |
6103 | | name: s(""), |
6104 | | value: s(""), |
6105 | | }, |
6106 | | })) |
6107 | | ); |
6108 | | assert_eq!( |
6109 | | parser(r"\p{=}").parse_escape(), |
6110 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6111 | | span: span(0..5), |
6112 | | negated: false, |
6113 | | kind: ast::ClassUnicodeKind::NamedValue { |
6114 | | op: ast::ClassUnicodeOpKind::Equal, |
6115 | | name: s(""), |
6116 | | value: s(""), |
6117 | | }, |
6118 | | })) |
6119 | | ); |
6120 | | assert_eq!( |
6121 | | parser(r"\p{!=}").parse_escape(), |
6122 | | Ok(Primitive::Unicode(ast::ClassUnicode { |
6123 | | span: span(0..6), |
6124 | | negated: false, |
6125 | | kind: ast::ClassUnicodeKind::NamedValue { |
6126 | | op: ast::ClassUnicodeOpKind::NotEqual, |
6127 | | name: s(""), |
6128 | | value: s(""), |
6129 | | }, |
6130 | | })) |
6131 | | ); |
6132 | | |
6133 | | assert_eq!( |
6134 | | parser(r"\p").parse_escape().unwrap_err(), |
6135 | | TestError { |
6136 | | span: span(2..2), |
6137 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
6138 | | } |
6139 | | ); |
6140 | | assert_eq!( |
6141 | | parser(r"\p{").parse_escape().unwrap_err(), |
6142 | | TestError { |
6143 | | span: span(3..3), |
6144 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
6145 | | } |
6146 | | ); |
6147 | | assert_eq!( |
6148 | | parser(r"\p{N").parse_escape().unwrap_err(), |
6149 | | TestError { |
6150 | | span: span(4..4), |
6151 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
6152 | | } |
6153 | | ); |
6154 | | assert_eq!( |
6155 | | parser(r"\p{Greek").parse_escape().unwrap_err(), |
6156 | | TestError { |
6157 | | span: span(8..8), |
6158 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
6159 | | } |
6160 | | ); |
6161 | | |
6162 | | assert_eq!( |
6163 | | parser(r"\pNz").parse(), |
6164 | | Ok(Ast::concat(ast::Concat { |
6165 | | span: span(0..4), |
6166 | | asts: vec![ |
6167 | | Ast::class_unicode(ast::ClassUnicode { |
6168 | | span: span(0..3), |
6169 | | negated: false, |
6170 | | kind: ast::ClassUnicodeKind::OneLetter('N'), |
6171 | | }), |
6172 | | Ast::literal(ast::Literal { |
6173 | | span: span(3..4), |
6174 | | kind: ast::LiteralKind::Verbatim, |
6175 | | c: 'z', |
6176 | | }), |
6177 | | ], |
6178 | | })) |
6179 | | ); |
6180 | | assert_eq!( |
6181 | | parser(r"\p{Greek}z").parse(), |
6182 | | Ok(Ast::concat(ast::Concat { |
6183 | | span: span(0..10), |
6184 | | asts: vec![ |
6185 | | Ast::class_unicode(ast::ClassUnicode { |
6186 | | span: span(0..9), |
6187 | | negated: false, |
6188 | | kind: ast::ClassUnicodeKind::Named(s("Greek")), |
6189 | | }), |
6190 | | Ast::literal(ast::Literal { |
6191 | | span: span(9..10), |
6192 | | kind: ast::LiteralKind::Verbatim, |
6193 | | c: 'z', |
6194 | | }), |
6195 | | ], |
6196 | | })) |
6197 | | ); |
6198 | | assert_eq!( |
6199 | | parser(r"\p\{").parse().unwrap_err(), |
6200 | | TestError { |
6201 | | span: span(2..3), |
6202 | | kind: ast::ErrorKind::UnicodeClassInvalid, |
6203 | | } |
6204 | | ); |
6205 | | assert_eq!( |
6206 | | parser(r"\P\{").parse().unwrap_err(), |
6207 | | TestError { |
6208 | | span: span(2..3), |
6209 | | kind: ast::ErrorKind::UnicodeClassInvalid, |
6210 | | } |
6211 | | ); |
6212 | | } |
6213 | | |
6214 | | #[test] |
6215 | | fn parse_perl_class() { |
6216 | | assert_eq!( |
6217 | | parser(r"\d").parse_escape(), |
6218 | | Ok(Primitive::Perl(ast::ClassPerl { |
6219 | | span: span(0..2), |
6220 | | kind: ast::ClassPerlKind::Digit, |
6221 | | negated: false, |
6222 | | })) |
6223 | | ); |
6224 | | assert_eq!( |
6225 | | parser(r"\D").parse_escape(), |
6226 | | Ok(Primitive::Perl(ast::ClassPerl { |
6227 | | span: span(0..2), |
6228 | | kind: ast::ClassPerlKind::Digit, |
6229 | | negated: true, |
6230 | | })) |
6231 | | ); |
6232 | | assert_eq!( |
6233 | | parser(r"\s").parse_escape(), |
6234 | | Ok(Primitive::Perl(ast::ClassPerl { |
6235 | | span: span(0..2), |
6236 | | kind: ast::ClassPerlKind::Space, |
6237 | | negated: false, |
6238 | | })) |
6239 | | ); |
6240 | | assert_eq!( |
6241 | | parser(r"\S").parse_escape(), |
6242 | | Ok(Primitive::Perl(ast::ClassPerl { |
6243 | | span: span(0..2), |
6244 | | kind: ast::ClassPerlKind::Space, |
6245 | | negated: true, |
6246 | | })) |
6247 | | ); |
6248 | | assert_eq!( |
6249 | | parser(r"\w").parse_escape(), |
6250 | | Ok(Primitive::Perl(ast::ClassPerl { |
6251 | | span: span(0..2), |
6252 | | kind: ast::ClassPerlKind::Word, |
6253 | | negated: false, |
6254 | | })) |
6255 | | ); |
6256 | | assert_eq!( |
6257 | | parser(r"\W").parse_escape(), |
6258 | | Ok(Primitive::Perl(ast::ClassPerl { |
6259 | | span: span(0..2), |
6260 | | kind: ast::ClassPerlKind::Word, |
6261 | | negated: true, |
6262 | | })) |
6263 | | ); |
6264 | | |
6265 | | assert_eq!( |
6266 | | parser(r"\d").parse(), |
6267 | | Ok(Ast::class_perl(ast::ClassPerl { |
6268 | | span: span(0..2), |
6269 | | kind: ast::ClassPerlKind::Digit, |
6270 | | negated: false, |
6271 | | })) |
6272 | | ); |
6273 | | assert_eq!( |
6274 | | parser(r"\dz").parse(), |
6275 | | Ok(Ast::concat(ast::Concat { |
6276 | | span: span(0..3), |
6277 | | asts: vec![ |
6278 | | Ast::class_perl(ast::ClassPerl { |
6279 | | span: span(0..2), |
6280 | | kind: ast::ClassPerlKind::Digit, |
6281 | | negated: false, |
6282 | | }), |
6283 | | Ast::literal(ast::Literal { |
6284 | | span: span(2..3), |
6285 | | kind: ast::LiteralKind::Verbatim, |
6286 | | c: 'z', |
6287 | | }), |
6288 | | ], |
6289 | | })) |
6290 | | ); |
6291 | | } |
6292 | | |
6293 | | // This tests a bug fix where the nest limit checker wasn't decrementing |
6294 | | // its depth during post-traversal, which causes long regexes to trip |
6295 | | // the default limit too aggressively. |
6296 | | #[test] |
6297 | | fn regression_454_nest_too_big() { |
6298 | | let pattern = r#" |
6299 | | 2(?: |
6300 | | [45]\d{3}| |
6301 | | 7(?: |
6302 | | 1[0-267]| |
6303 | | 2[0-289]| |
6304 | | 3[0-29]| |
6305 | | 4[01]| |
6306 | | 5[1-3]| |
6307 | | 6[013]| |
6308 | | 7[0178]| |
6309 | | 91 |
6310 | | )| |
6311 | | 8(?: |
6312 | | 0[125]| |
6313 | | [139][1-6]| |
6314 | | 2[0157-9]| |
6315 | | 41| |
6316 | | 6[1-35]| |
6317 | | 7[1-5]| |
6318 | | 8[1-8]| |
6319 | | 90 |
6320 | | )| |
6321 | | 9(?: |
6322 | | 0[0-2]| |
6323 | | 1[0-4]| |
6324 | | 2[568]| |
6325 | | 3[3-6]| |
6326 | | 5[5-7]| |
6327 | | 6[0167]| |
6328 | | 7[15]| |
6329 | | 8[0146-9] |
6330 | | ) |
6331 | | )\d{4} |
6332 | | "#; |
6333 | | assert!(parser_nest_limit(pattern, 50).parse().is_ok()); |
6334 | | } |
6335 | | |
6336 | | // This tests that we treat a trailing `-` in a character class as a |
6337 | | // literal `-` even when whitespace mode is enabled and there is whitespace |
6338 | | // after the trailing `-`. |
6339 | | #[test] |
6340 | | fn regression_455_trailing_dash_ignore_whitespace() { |
6341 | | assert!(parser("(?x)[ / - ]").parse().is_ok()); |
6342 | | assert!(parser("(?x)[ a - ]").parse().is_ok()); |
6343 | | assert!(parser( |
6344 | | "(?x)[ |
6345 | | a |
6346 | | - ] |
6347 | | " |
6348 | | ) |
6349 | | .parse() |
6350 | | .is_ok()); |
6351 | | assert!(parser( |
6352 | | "(?x)[ |
6353 | | a # wat |
6354 | | - ] |
6355 | | " |
6356 | | ) |
6357 | | .parse() |
6358 | | .is_ok()); |
6359 | | |
6360 | | assert!(parser("(?x)[ / -").parse().is_err()); |
6361 | | assert!(parser("(?x)[ / - ").parse().is_err()); |
6362 | | assert!(parser( |
6363 | | "(?x)[ |
6364 | | / - |
6365 | | " |
6366 | | ) |
6367 | | .parse() |
6368 | | .is_err()); |
6369 | | assert!(parser( |
6370 | | "(?x)[ |
6371 | | / - # wat |
6372 | | " |
6373 | | ) |
6374 | | .parse() |
6375 | | .is_err()); |
6376 | | } |
6377 | | } |