/rust/registry/src/github.com-1ecc6299db9ec823/regex-syntax-0.6.25/src/ast/parse.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | This module provides a regular expression parser. |
3 | | */ |
4 | | |
5 | | use std::borrow::Borrow; |
6 | | use std::cell::{Cell, RefCell}; |
7 | | use std::mem; |
8 | | use std::result; |
9 | | |
10 | | use crate::ast::{self, Ast, Position, Span}; |
11 | | use crate::either::Either; |
12 | | |
13 | | use crate::is_meta_character; |
14 | | |
15 | | type Result<T> = result::Result<T, ast::Error>; |
16 | | |
17 | | /// A primitive is an expression with no sub-expressions. This includes |
18 | | /// literals, assertions and non-set character classes. This representation |
19 | | /// is used as intermediate state in the parser. |
20 | | /// |
21 | | /// This does not include ASCII character classes, since they can only appear |
22 | | /// within a set character class. |
23 | 0 | #[derive(Clone, Debug, Eq, PartialEq)] Unexecuted instantiation: <regex_syntax::ast::parse::Primitive as core::cmp::PartialEq>::eq Unexecuted instantiation: <regex_syntax::ast::parse::Primitive as core::cmp::PartialEq>::ne |
24 | | enum Primitive { |
25 | | Literal(ast::Literal), |
26 | | Assertion(ast::Assertion), |
27 | | Dot(Span), |
28 | | Perl(ast::ClassPerl), |
29 | | Unicode(ast::ClassUnicode), |
30 | | } |
31 | | |
32 | | impl Primitive { |
33 | | /// Return the span of this primitive. |
34 | 8 | fn span(&self) -> &Span { |
35 | 8 | match *self { |
36 | 8 | Primitive::Literal(ref x) => &x.span, |
37 | 0 | Primitive::Assertion(ref x) => &x.span, |
38 | 0 | Primitive::Dot(ref span) => span, |
39 | 0 | Primitive::Perl(ref x) => &x.span, |
40 | 0 | Primitive::Unicode(ref x) => &x.span, |
41 | | } |
42 | 8 | } |
43 | | |
44 | | /// Convert this primitive into a proper AST. |
45 | 200 | fn into_ast(self) -> Ast { |
46 | 200 | match self { |
47 | 186 | Primitive::Literal(lit) => Ast::Literal(lit), |
48 | 12 | Primitive::Assertion(assert) => Ast::Assertion(assert), |
49 | 2 | Primitive::Dot(span) => Ast::Dot(span), |
50 | 0 | Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), |
51 | 0 | Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), |
52 | | } |
53 | 200 | } |
54 | | |
55 | | /// Convert this primitive into an item in a character class. |
56 | | /// |
57 | | /// If this primitive is not a legal item (i.e., an assertion or a dot), |
58 | | /// then return an error. |
59 | 12 | fn into_class_set_item<P: Borrow<Parser>>( |
60 | 12 | self, |
61 | 12 | p: &ParserI<'_, P>, |
62 | 12 | ) -> Result<ast::ClassSetItem> { |
63 | 12 | use self::Primitive::*; |
64 | 12 | use crate::ast::ClassSetItem; |
65 | 12 | |
66 | 12 | match self { |
67 | 12 | Literal(lit) => Ok(ClassSetItem::Literal(lit)), |
68 | 0 | Perl(cls) => Ok(ClassSetItem::Perl(cls)), |
69 | 0 | Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), |
70 | 0 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), |
71 | | } |
72 | 12 | } |
73 | | |
74 | | /// Convert this primitive into a literal in a character class. In |
75 | | /// particular, literals are the only valid items that can appear in |
76 | | /// ranges. |
77 | | /// |
78 | | /// If this primitive is not a legal item (i.e., a class, assertion or a |
79 | | /// dot), then return an error. |
80 | 8 | fn into_class_literal<P: Borrow<Parser>>( |
81 | 8 | self, |
82 | 8 | p: &ParserI<'_, P>, |
83 | 8 | ) -> Result<ast::Literal> { |
84 | 8 | use self::Primitive::*; |
85 | 8 | |
86 | 8 | match self { |
87 | 8 | Literal(lit) => Ok(lit), |
88 | 0 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), |
89 | | } |
90 | 8 | } |
91 | | } |
92 | | |
93 | | /// Returns true if the given character is a hexadecimal digit. |
94 | 0 | fn is_hex(c: char) -> bool { |
95 | 0 | ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') |
96 | 0 | } |
97 | | |
98 | | /// Returns true if the given character is valid in a capture group name. |
99 | | /// |
100 | | /// If `first` is true, then `c` is treated as the first character in the |
101 | | /// group name (which must be alphabetic or underscore). |
102 | 0 | fn is_capture_char(c: char, first: bool) -> bool { |
103 | 0 | c == '_' |
104 | 0 | || (!first |
105 | 0 | && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']')) |
106 | 0 | || ('A' <= c && c <= 'Z') |
107 | 0 | || ('a' <= c && c <= 'z') |
108 | 0 | } |
109 | | |
110 | | /// A builder for a regular expression parser. |
111 | | /// |
112 | | /// This builder permits modifying configuration options for the parser. |
113 | 0 | #[derive(Clone, Debug)] |
114 | | pub struct ParserBuilder { |
115 | | ignore_whitespace: bool, |
116 | | nest_limit: u32, |
117 | | octal: bool, |
118 | | } |
119 | | |
120 | | impl Default for ParserBuilder { |
121 | 6 | fn default() -> ParserBuilder { |
122 | 6 | ParserBuilder::new() |
123 | 6 | } |
124 | | } |
125 | | |
126 | | impl ParserBuilder { |
127 | | /// Create a new parser builder with a default configuration. |
128 | 6 | pub fn new() -> ParserBuilder { |
129 | 6 | ParserBuilder { |
130 | 6 | ignore_whitespace: false, |
131 | 6 | nest_limit: 250, |
132 | 6 | octal: false, |
133 | 6 | } |
134 | 6 | } |
135 | | |
136 | | /// Build a parser from this configuration with the given pattern. |
137 | 6 | pub fn build(&self) -> Parser { |
138 | 6 | Parser { |
139 | 6 | pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), |
140 | 6 | capture_index: Cell::new(0), |
141 | 6 | nest_limit: self.nest_limit, |
142 | 6 | octal: self.octal, |
143 | 6 | initial_ignore_whitespace: self.ignore_whitespace, |
144 | 6 | ignore_whitespace: Cell::new(self.ignore_whitespace), |
145 | 6 | comments: RefCell::new(vec![]), |
146 | 6 | stack_group: RefCell::new(vec![]), |
147 | 6 | stack_class: RefCell::new(vec![]), |
148 | 6 | capture_names: RefCell::new(vec![]), |
149 | 6 | scratch: RefCell::new(String::new()), |
150 | 6 | } |
151 | 6 | } |
152 | | |
153 | | /// Set the nesting limit for this parser. |
154 | | /// |
155 | | /// The nesting limit controls how deep the abstract syntax tree is allowed |
156 | | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
157 | | /// groups), then an error is returned by the parser. |
158 | | /// |
159 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
160 | | /// overflow for consumers that do structural induction on an `Ast` using |
161 | | /// explicit recursion. While this crate never does this (instead using |
162 | | /// constant stack space and moving the call stack to the heap), other |
163 | | /// crates may. |
164 | | /// |
165 | | /// This limit is not checked until the entire Ast is parsed. Therefore, |
166 | | /// if callers want to put a limit on the amount of heap space used, then |
167 | | /// they should impose a limit on the length, in bytes, of the concrete |
168 | | /// pattern string. In particular, this is viable since this parser |
169 | | /// implementation will limit itself to heap space proportional to the |
170 | | /// length of the pattern string. |
171 | | /// |
172 | | /// Note that a nest limit of `0` will return a nest limit error for most |
173 | | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
174 | | /// not `ab`, since `ab` requires a concatenation, which results in a nest |
175 | | /// depth of `1`. In general, a nest limit is not something that manifests |
176 | | /// in an obvious way in the concrete syntax, therefore, it should not be |
177 | | /// used in a granular way. |
178 | 6 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { |
179 | 6 | self.nest_limit = limit; |
180 | 6 | self |
181 | 6 | } |
182 | | |
183 | | /// Whether to support octal syntax or not. |
184 | | /// |
185 | | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
186 | | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
187 | | /// `\141` are all equivalent regular expressions, where the last example |
188 | | /// shows octal syntax. |
189 | | /// |
190 | | /// While supporting octal syntax isn't in and of itself a problem, it does |
191 | | /// make good error messages harder. That is, in PCRE based regex engines, |
192 | | /// syntax like `\0` invokes a backreference, which is explicitly |
193 | | /// unsupported in Rust's regex engine. However, many users expect it to |
194 | | /// be supported. Therefore, when octal support is disabled, the error |
195 | | /// message will explicitly mention that backreferences aren't supported. |
196 | | /// |
197 | | /// Octal syntax is disabled by default. |
198 | 6 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { |
199 | 6 | self.octal = yes; |
200 | 6 | self |
201 | 6 | } |
202 | | |
203 | | /// Enable verbose mode in the regular expression. |
204 | | /// |
205 | | /// When enabled, verbose mode permits insignificant whitespace in many |
206 | | /// places in the regular expression, as well as comments. Comments are |
207 | | /// started using `#` and continue until the end of the line. |
208 | | /// |
209 | | /// By default, this is disabled. It may be selectively enabled in the |
210 | | /// regular expression by using the `x` flag regardless of this setting. |
211 | 6 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { |
212 | 6 | self.ignore_whitespace = yes; |
213 | 6 | self |
214 | 6 | } |
215 | | } |
216 | | |
217 | | /// A regular expression parser. |
218 | | /// |
219 | | /// This parses a string representation of a regular expression into an |
220 | | /// abstract syntax tree. The size of the tree is proportional to the length |
221 | | /// of the regular expression pattern. |
222 | | /// |
223 | | /// A `Parser` can be configured in more detail via a |
224 | | /// [`ParserBuilder`](struct.ParserBuilder.html). |
225 | 0 | #[derive(Clone, Debug)] |
226 | | pub struct Parser { |
227 | | /// The current position of the parser. |
228 | | pos: Cell<Position>, |
229 | | /// The current capture index. |
230 | | capture_index: Cell<u32>, |
231 | | /// The maximum number of open parens/brackets allowed. If the parser |
232 | | /// exceeds this number, then an error is returned. |
233 | | nest_limit: u32, |
234 | | /// Whether to support octal syntax or not. When `false`, the parser will |
235 | | /// return an error helpfully pointing out that backreferences are not |
236 | | /// supported. |
237 | | octal: bool, |
238 | | /// The initial setting for `ignore_whitespace` as provided by |
239 | | /// `ParserBuilder`. This is used when resetting the parser's state. |
240 | | initial_ignore_whitespace: bool, |
241 | | /// Whether whitespace should be ignored. When enabled, comments are |
242 | | /// also permitted. |
243 | | ignore_whitespace: Cell<bool>, |
244 | | /// A list of comments, in order of appearance. |
245 | | comments: RefCell<Vec<ast::Comment>>, |
246 | | /// A stack of grouped sub-expressions, including alternations. |
247 | | stack_group: RefCell<Vec<GroupState>>, |
248 | | /// A stack of nested character classes. This is only non-empty when |
249 | | /// parsing a class. |
250 | | stack_class: RefCell<Vec<ClassState>>, |
251 | | /// A sorted sequence of capture names. This is used to detect duplicate |
252 | | /// capture names and report an error if one is detected. |
253 | | capture_names: RefCell<Vec<ast::CaptureName>>, |
254 | | /// A scratch buffer used in various places. Mostly this is used to |
255 | | /// accumulate relevant characters from parts of a pattern. |
256 | | scratch: RefCell<String>, |
257 | | } |
258 | | |
259 | | /// ParserI is the internal parser implementation. |
260 | | /// |
261 | | /// We use this separate type so that we can carry the provided pattern string |
262 | | /// along with us. In particular, a `Parser` internal state is not tied to any |
263 | | /// one pattern, but `ParserI` is. |
264 | | /// |
265 | | /// This type also lets us use `ParserI<&Parser>` in production code while |
266 | | /// retaining the convenience of `ParserI<Parser>` for tests, which sometimes |
267 | | /// work against the internal interface of the parser. |
268 | 0 | #[derive(Clone, Debug)] |
269 | | struct ParserI<'s, P> { |
270 | | /// The parser state/configuration. |
271 | | parser: P, |
272 | | /// The full regular expression provided by the user. |
273 | | pattern: &'s str, |
274 | | } |
275 | | |
276 | | /// GroupState represents a single stack frame while parsing nested groups |
277 | | /// and alternations. Each frame records the state up to an opening parenthesis |
278 | | /// or an alternation operator `|`. |
279 | 0 | #[derive(Clone, Debug)] |
280 | | enum GroupState { |
281 | | /// This state is pushed whenever an opening group is found. |
282 | | Group { |
283 | | /// The concatenation immediately preceding the opening group. |
284 | | concat: ast::Concat, |
285 | | /// The group that has been opened. Its sub-AST is always empty. |
286 | | group: ast::Group, |
287 | | /// Whether this group has the `x` flag enabled or not. |
288 | | ignore_whitespace: bool, |
289 | | }, |
290 | | /// This state is pushed whenever a new alternation branch is found. If |
291 | | /// an alternation branch is found and this state is at the top of the |
292 | | /// stack, then this state should be modified to include the new |
293 | | /// alternation. |
294 | | Alternation(ast::Alternation), |
295 | | } |
296 | | |
297 | | /// ClassState represents a single stack frame while parsing character classes. |
298 | | /// Each frame records the state up to an intersection, difference, symmetric |
299 | | /// difference or nested class. |
300 | | /// |
301 | | /// Note that a parser's character class stack is only non-empty when parsing |
302 | | /// a character class. In all other cases, it is empty. |
303 | 0 | #[derive(Clone, Debug)] |
304 | | enum ClassState { |
305 | | /// This state is pushed whenever an opening bracket is found. |
306 | | Open { |
307 | | /// The union of class items immediately preceding this class. |
308 | | union: ast::ClassSetUnion, |
309 | | /// The class that has been opened. Typically this just corresponds |
310 | | /// to the `[`, but it can also include `[^` since `^` indicates |
311 | | /// negation of the class. |
312 | | set: ast::ClassBracketed, |
313 | | }, |
314 | | /// This state is pushed when an operator is seen. When popped, the stored |
315 | | /// set becomes the left hand side of the operator. |
316 | | Op { |
317 | | /// The type of the operation, i.e., &&, -- or ~~. |
318 | | kind: ast::ClassSetBinaryOpKind, |
319 | | /// The left-hand side of the operator. |
320 | | lhs: ast::ClassSet, |
321 | | }, |
322 | | } |
323 | | |
324 | | impl Parser { |
325 | | /// Create a new parser with a default configuration. |
326 | | /// |
327 | | /// The parser can be run with either the `parse` or `parse_with_comments` |
328 | | /// methods. The parse methods return an abstract syntax tree. |
329 | | /// |
330 | | /// To set configuration options on the parser, use |
331 | | /// [`ParserBuilder`](struct.ParserBuilder.html). |
332 | 0 | pub fn new() -> Parser { |
333 | 0 | ParserBuilder::new().build() |
334 | 0 | } |
335 | | |
336 | | /// Parse the regular expression into an abstract syntax tree. |
337 | 6 | pub fn parse(&mut self, pattern: &str) -> Result<Ast> { |
338 | 6 | ParserI::new(self, pattern).parse() |
339 | 6 | } |
340 | | |
341 | | /// Parse the regular expression and return an abstract syntax tree with |
342 | | /// all of the comments found in the pattern. |
343 | 0 | pub fn parse_with_comments( |
344 | 0 | &mut self, |
345 | 0 | pattern: &str, |
346 | 0 | ) -> Result<ast::WithComments> { |
347 | 0 | ParserI::new(self, pattern).parse_with_comments() |
348 | 0 | } |
349 | | |
350 | | /// Reset the internal state of a parser. |
351 | | /// |
352 | | /// This is called at the beginning of every parse. This prevents the |
353 | | /// parser from running with inconsistent state (say, if a previous |
354 | | /// invocation returned an error and the parser is reused). |
355 | 6 | fn reset(&self) { |
356 | 6 | // These settings should be in line with the construction |
357 | 6 | // in `ParserBuilder::build`. |
358 | 6 | self.pos.set(Position { offset: 0, line: 1, column: 1 }); |
359 | 6 | self.ignore_whitespace.set(self.initial_ignore_whitespace); |
360 | 6 | self.comments.borrow_mut().clear(); |
361 | 6 | self.stack_group.borrow_mut().clear(); |
362 | 6 | self.stack_class.borrow_mut().clear(); |
363 | 6 | } |
364 | | } |
365 | | |
366 | | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { |
367 | | /// Build an internal parser from a parser configuration and a pattern. |
368 | 6 | fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { |
369 | 6 | ParserI { parser: parser, pattern: pattern } |
370 | 6 | } |
371 | | |
372 | | /// Return a reference to the parser state. |
373 | 14.4k | fn parser(&self) -> &Parser { |
374 | 14.4k | self.parser.borrow() |
375 | 14.4k | } |
376 | | |
377 | | /// Return a reference to the pattern being parsed. |
378 | 8.49k | fn pattern(&self) -> &str { |
379 | 8.49k | self.pattern.borrow() |
380 | 8.49k | } |
381 | | |
382 | | /// Create a new error with the given span and error type. |
383 | 0 | fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { |
384 | 0 | ast::Error { |
385 | 0 | kind: kind, |
386 | 0 | pattern: self.pattern().to_string(), |
387 | 0 | span: span, |
388 | 0 | } |
389 | 0 | } |
390 | | |
391 | | /// Return the current offset of the parser. |
392 | | /// |
393 | | /// The offset starts at `0` from the beginning of the regular expression |
394 | | /// pattern string. |
395 | 8.81k | fn offset(&self) -> usize { |
396 | 8.81k | self.parser().pos.get().offset |
397 | 8.81k | } |
398 | | |
399 | | /// Return the current line number of the parser. |
400 | | /// |
401 | | /// The line number starts at `1`. |
402 | 256 | fn line(&self) -> usize { |
403 | 256 | self.parser().pos.get().line |
404 | 256 | } |
405 | | |
406 | | /// Return the current column of the parser. |
407 | | /// |
408 | | /// The column number starts at `1` and is reset whenever a `\n` is seen. |
409 | 256 | fn column(&self) -> usize { |
410 | 256 | self.parser().pos.get().column |
411 | 256 | } |
412 | | |
413 | | /// Return the next capturing index. Each subsequent call increments the |
414 | | /// internal index. |
415 | | /// |
416 | | /// The span given should correspond to the location of the opening |
417 | | /// parenthesis. |
418 | | /// |
419 | | /// If the capture limit is exceeded, then an error is returned. |
420 | 22 | fn next_capture_index(&self, span: Span) -> Result<u32> { |
421 | 22 | let current = self.parser().capture_index.get(); |
422 | 22 | let i = current.checked_add(1).ok_or_else(|| { |
423 | 0 | self.error(span, ast::ErrorKind::CaptureLimitExceeded) |
424 | 22 | })?; |
425 | 22 | self.parser().capture_index.set(i); |
426 | 22 | Ok(i) |
427 | 22 | } |
428 | | |
429 | | /// Adds the given capture name to this parser. If this capture name has |
430 | | /// already been used, then an error is returned. |
431 | 0 | fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { |
432 | 0 | let mut names = self.parser().capture_names.borrow_mut(); |
433 | 0 | match names |
434 | 0 | .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) |
435 | | { |
436 | 0 | Err(i) => { |
437 | 0 | names.insert(i, cap.clone()); |
438 | 0 | Ok(()) |
439 | | } |
440 | 0 | Ok(i) => Err(self.error( |
441 | 0 | cap.span, |
442 | 0 | ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, |
443 | 0 | )), |
444 | | } |
445 | 0 | } |
446 | | |
447 | | /// Return whether the parser should ignore whitespace or not. |
448 | 834 | fn ignore_whitespace(&self) -> bool { |
449 | 834 | self.parser().ignore_whitespace.get() |
450 | 834 | } |
451 | | |
452 | | /// Return the character at the current position of the parser. |
453 | | /// |
454 | | /// This panics if the current position does not point to a valid char. |
455 | 5.10k | fn char(&self) -> char { |
456 | 5.10k | self.char_at(self.offset()) |
457 | 5.10k | } |
458 | | |
459 | | /// Return the character at the given position. |
460 | | /// |
461 | | /// This panics if the given position does not point to a valid char. |
462 | 5.10k | fn char_at(&self, i: usize) -> char { |
463 | 5.10k | self.pattern()[i..] |
464 | 5.10k | .chars() |
465 | 5.10k | .next() |
466 | 5.10k | .unwrap_or_else(|| panic!("expected char at offset {}", i)) |
467 | 5.10k | } |
468 | | |
469 | | /// Bump the parser to the next Unicode scalar value. |
470 | | /// |
471 | | /// If the end of the input has been reached, then `false` is returned. |
472 | 1.10k | fn bump(&self) -> bool { |
473 | 1.10k | if self.is_eof() { |
474 | 0 | return false; |
475 | 1.10k | } |
476 | 1.10k | let Position { mut offset, mut line, mut column } = self.pos(); |
477 | 1.10k | if self.char() == '\n' { |
478 | 0 | line = line.checked_add(1).unwrap(); |
479 | 0 | column = 1; |
480 | 1.10k | } else { |
481 | 1.10k | column = column.checked_add(1).unwrap(); |
482 | 1.10k | } |
483 | 1.10k | offset += self.char().len_utf8(); |
484 | 1.10k | self.parser().pos.set(Position { |
485 | 1.10k | offset: offset, |
486 | 1.10k | line: line, |
487 | 1.10k | column: column, |
488 | 1.10k | }); |
489 | 1.10k | self.pattern()[self.offset()..].chars().next().is_some() |
490 | 1.10k | } |
491 | | |
492 | | /// If the substring starting at the current position of the parser has |
493 | | /// the given prefix, then bump the parser to the character immediately |
494 | | /// following the prefix and return true. Otherwise, don't bump the parser |
495 | | /// and return false. |
496 | 276 | fn bump_if(&self, prefix: &str) -> bool { |
497 | 276 | if self.pattern()[self.offset()..].starts_with(prefix) { |
498 | 134 | for _ in 0..prefix.chars().count() { |
499 | 134 | self.bump(); |
500 | 134 | } |
501 | 74 | true |
502 | | } else { |
503 | 202 | false |
504 | | } |
505 | 276 | } |
506 | | |
507 | | /// Returns true if and only if the parser is positioned at a look-around |
508 | | /// prefix. The conditions under which this returns true must always |
509 | | /// correspond to a regular expression that would otherwise be considered |
510 | | /// invalid. |
511 | | /// |
512 | | /// This should only be called immediately after parsing the opening of |
513 | | /// a group or a set of flags. |
514 | 36 | fn is_lookaround_prefix(&self) -> bool { |
515 | 36 | self.bump_if("?=") |
516 | 36 | || self.bump_if("?!") |
517 | 36 | || self.bump_if("?<=") |
518 | 36 | || self.bump_if("?<!") |
519 | 36 | } |
520 | | |
521 | | /// Bump the parser, and if the `x` flag is enabled, bump through any |
522 | | /// subsequent spaces. Return true if and only if the parser is not at |
523 | | /// EOF. |
524 | 72 | fn bump_and_bump_space(&self) -> bool { |
525 | 72 | if !self.bump() { |
526 | 0 | return false; |
527 | 72 | } |
528 | 72 | self.bump_space(); |
529 | 72 | !self.is_eof() |
530 | 72 | } |
531 | | |
532 | | /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
533 | | /// comments), then this will advance the parser through all whitespace |
534 | | /// and comments to the next non-whitespace non-comment byte. |
535 | | /// |
536 | | /// If the `x` flag is disabled, then this is a no-op. |
537 | | /// |
538 | | /// This should be used selectively throughout the parser where |
539 | | /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
540 | | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
541 | 790 | fn bump_space(&self) { |
542 | 790 | if !self.ignore_whitespace() { |
543 | 790 | return; |
544 | 0 | } |
545 | 0 | while !self.is_eof() { |
546 | 0 | if self.char().is_whitespace() { |
547 | 0 | self.bump(); |
548 | 0 | } else if self.char() == '#' { |
549 | 0 | let start = self.pos(); |
550 | 0 | let mut comment_text = String::new(); |
551 | 0 | self.bump(); |
552 | 0 | while !self.is_eof() { |
553 | 0 | let c = self.char(); |
554 | 0 | self.bump(); |
555 | 0 | if c == '\n' { |
556 | 0 | break; |
557 | 0 | } |
558 | 0 | comment_text.push(c); |
559 | | } |
560 | 0 | let comment = ast::Comment { |
561 | 0 | span: Span::new(start, self.pos()), |
562 | 0 | comment: comment_text, |
563 | 0 | }; |
564 | 0 | self.parser().comments.borrow_mut().push(comment); |
565 | | } else { |
566 | 0 | break; |
567 | | } |
568 | | } |
569 | 790 | } |
570 | | |
571 | | /// Peek at the next character in the input without advancing the parser. |
572 | | /// |
573 | | /// If the input has been exhausted, then this returns `None`. |
574 | 8 | fn peek(&self) -> Option<char> { |
575 | 8 | if self.is_eof() { |
576 | 0 | return None; |
577 | 8 | } |
578 | 8 | self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() |
579 | 8 | } |
580 | | |
581 | | /// Like peek, but will ignore spaces when the parser is in whitespace |
582 | | /// insensitive mode. |
583 | 8 | fn peek_space(&self) -> Option<char> { |
584 | 8 | if !self.ignore_whitespace() { |
585 | 8 | return self.peek(); |
586 | 0 | } |
587 | 0 | if self.is_eof() { |
588 | 0 | return None; |
589 | 0 | } |
590 | 0 | let mut start = self.offset() + self.char().len_utf8(); |
591 | 0 | let mut in_comment = false; |
592 | 0 | for (i, c) in self.pattern()[start..].char_indices() { |
593 | 0 | if c.is_whitespace() { |
594 | 0 | continue; |
595 | 0 | } else if !in_comment && c == '#' { |
596 | 0 | in_comment = true; |
597 | 0 | } else if in_comment && c == '\n' { |
598 | 0 | in_comment = false; |
599 | 0 | } else { |
600 | 0 | start += i; |
601 | 0 | break; |
602 | | } |
603 | | } |
604 | 0 | self.pattern()[start..].chars().next() |
605 | 8 | } |
606 | | |
607 | | /// Returns true if the next call to `bump` would return false. |
608 | 1.94k | fn is_eof(&self) -> bool { |
609 | 1.94k | self.offset() == self.pattern().len() |
610 | 1.94k | } |
611 | | |
612 | | /// Return the current position of the parser, which includes the offset, |
613 | | /// line and column. |
614 | 2.36k | fn pos(&self) -> Position { |
615 | 2.36k | self.parser().pos.get() |
616 | 2.36k | } |
617 | | |
618 | | /// Create a span at the current position of the parser. Both the start |
619 | | /// and end of the span are set. |
620 | 272 | fn span(&self) -> Span { |
621 | 272 | Span::splat(self.pos()) |
622 | 272 | } |
623 | | |
624 | | /// Create a span that covers the current character. |
625 | 256 | fn span_char(&self) -> Span { |
626 | 256 | let mut next = Position { |
627 | 256 | offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), |
628 | 256 | line: self.line(), |
629 | 256 | column: self.column().checked_add(1).unwrap(), |
630 | 256 | }; |
631 | 256 | if self.char() == '\n' { |
632 | 0 | next.line += 1; |
633 | 0 | next.column = 1; |
634 | 256 | } |
635 | 256 | Span::new(self.pos(), next) |
636 | 256 | } |
637 | | |
638 | | /// Parse and push a single alternation on to the parser's internal stack. |
639 | | /// If the top of the stack already has an alternation, then add to that |
640 | | /// instead of pushing a new one. |
641 | | /// |
642 | | /// The concatenation given corresponds to a single alternation branch. |
643 | | /// The concatenation returned starts the next branch and is empty. |
644 | | /// |
645 | | /// This assumes the parser is currently positioned at `|` and will advance |
646 | | /// the parser to the character following `|`. |
647 | | #[inline(never)] |
648 | 8 | fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> { |
649 | 8 | assert_eq!(self.char(), '|'); |
650 | 8 | concat.span.end = self.pos(); |
651 | 8 | self.push_or_add_alternation(concat); |
652 | 8 | self.bump(); |
653 | 8 | Ok(ast::Concat { span: self.span(), asts: vec![] }) |
654 | 8 | } |
655 | | |
656 | | /// Pushes or adds the given branch of an alternation to the parser's |
657 | | /// internal stack of state. |
658 | 8 | fn push_or_add_alternation(&self, concat: ast::Concat) { |
659 | 8 | use self::GroupState::*; |
660 | 8 | |
661 | 8 | let mut stack = self.parser().stack_group.borrow_mut(); |
662 | 8 | if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { |
663 | 4 | alts.asts.push(concat.into_ast()); |
664 | 4 | return; |
665 | 4 | } |
666 | 4 | stack.push(Alternation(ast::Alternation { |
667 | 4 | span: Span::new(concat.span.start, self.pos()), |
668 | 4 | asts: vec![concat.into_ast()], |
669 | 4 | })); |
670 | 8 | } |
671 | | |
672 | | /// Parse and push a group AST (and its parent concatenation) on to the |
673 | | /// parser's internal stack. Return a fresh concatenation corresponding |
674 | | /// to the group's sub-AST. |
675 | | /// |
676 | | /// If a set of flags was found (with no group), then the concatenation |
677 | | /// is returned with that set of flags added. |
678 | | /// |
679 | | /// This assumes that the parser is currently positioned on the opening |
680 | | /// parenthesis. It advances the parser to the character at the start |
681 | | /// of the sub-expression (or adjoining expression). |
682 | | /// |
683 | | /// If there was a problem parsing the start of the group, then an error |
684 | | /// is returned. |
685 | | #[inline(never)] |
686 | 36 | fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> { |
687 | 36 | assert_eq!(self.char(), '('); |
688 | 36 | match self.parse_group()? { |
689 | 0 | Either::Left(set) => { |
690 | 0 | let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); |
691 | 0 | if let Some(v) = ignore { |
692 | 0 | self.parser().ignore_whitespace.set(v); |
693 | 0 | } |
694 | | |
695 | 0 | concat.asts.push(Ast::Flags(set)); |
696 | 0 | Ok(concat) |
697 | | } |
698 | 36 | Either::Right(group) => { |
699 | 36 | let old_ignore_whitespace = self.ignore_whitespace(); |
700 | 36 | let new_ignore_whitespace = group |
701 | 36 | .flags() |
702 | 36 | .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) |
703 | 36 | .unwrap_or(old_ignore_whitespace); |
704 | 36 | self.parser().stack_group.borrow_mut().push( |
705 | 36 | GroupState::Group { |
706 | 36 | concat: concat, |
707 | 36 | group: group, |
708 | 36 | ignore_whitespace: old_ignore_whitespace, |
709 | 36 | }, |
710 | 36 | ); |
711 | 36 | self.parser().ignore_whitespace.set(new_ignore_whitespace); |
712 | 36 | Ok(ast::Concat { span: self.span(), asts: vec![] }) |
713 | | } |
714 | | } |
715 | 36 | } |
716 | | |
717 | | /// Pop a group AST from the parser's internal stack and set the group's |
718 | | /// AST to the given concatenation. Return the concatenation containing |
719 | | /// the group. |
720 | | /// |
721 | | /// This assumes that the parser is currently positioned on the closing |
722 | | /// parenthesis and advances the parser to the character following the `)`. |
723 | | /// |
724 | | /// If no such group could be popped, then an unopened group error is |
725 | | /// returned. |
726 | | #[inline(never)] |
727 | 36 | fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> { |
728 | 36 | use self::GroupState::*; |
729 | 36 | |
730 | 36 | assert_eq!(self.char(), ')'); |
731 | 36 | let mut stack = self.parser().stack_group.borrow_mut(); |
732 | 36 | let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack |
733 | 36 | .pop() |
734 | | { |
735 | 32 | Some(Group { concat, group, ignore_whitespace }) => { |
736 | 32 | (concat, group, ignore_whitespace, None) |
737 | | } |
738 | 4 | Some(Alternation(alt)) => match stack.pop() { |
739 | 4 | Some(Group { concat, group, ignore_whitespace }) => { |
740 | 4 | (concat, group, ignore_whitespace, Some(alt)) |
741 | | } |
742 | | None | Some(Alternation(_)) => { |
743 | 0 | return Err(self.error( |
744 | 0 | self.span_char(), |
745 | 0 | ast::ErrorKind::GroupUnopened, |
746 | 0 | )); |
747 | | } |
748 | | }, |
749 | | None => { |
750 | 0 | return Err(self |
751 | 0 | .error(self.span_char(), ast::ErrorKind::GroupUnopened)); |
752 | | } |
753 | | }; |
754 | 36 | self.parser().ignore_whitespace.set(ignore_whitespace); |
755 | 36 | group_concat.span.end = self.pos(); |
756 | 36 | self.bump(); |
757 | 36 | group.span.end = self.pos(); |
758 | 36 | match alt { |
759 | 4 | Some(mut alt) => { |
760 | 4 | alt.span.end = group_concat.span.end; |
761 | 4 | alt.asts.push(group_concat.into_ast()); |
762 | 4 | group.ast = Box::new(alt.into_ast()); |
763 | 4 | } |
764 | 32 | None => { |
765 | 32 | group.ast = Box::new(group_concat.into_ast()); |
766 | 32 | } |
767 | | } |
768 | 36 | prior_concat.asts.push(Ast::Group(group)); |
769 | 36 | Ok(prior_concat) |
770 | 36 | } |
771 | | |
772 | | /// Pop the last state from the parser's internal stack, if it exists, and |
773 | | /// add the given concatenation to it. There either must be no state or a |
774 | | /// single alternation item on the stack. Any other scenario produces an |
775 | | /// error. |
776 | | /// |
777 | | /// This assumes that the parser has advanced to the end. |
778 | | #[inline(never)] |
779 | 6 | fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> { |
780 | 6 | concat.span.end = self.pos(); |
781 | 6 | let mut stack = self.parser().stack_group.borrow_mut(); |
782 | 6 | let ast = match stack.pop() { |
783 | 6 | None => Ok(concat.into_ast()), |
784 | 0 | Some(GroupState::Alternation(mut alt)) => { |
785 | 0 | alt.span.end = self.pos(); |
786 | 0 | alt.asts.push(concat.into_ast()); |
787 | 0 | Ok(Ast::Alternation(alt)) |
788 | | } |
789 | 0 | Some(GroupState::Group { group, .. }) => { |
790 | 0 | return Err( |
791 | 0 | self.error(group.span, ast::ErrorKind::GroupUnclosed) |
792 | 0 | ); |
793 | | } |
794 | | }; |
795 | | // If we try to pop again, there should be nothing. |
796 | 6 | match stack.pop() { |
797 | 6 | None => ast, |
798 | | Some(GroupState::Alternation(_)) => { |
799 | | // This unreachable is unfortunate. This case can't happen |
800 | | // because the only way we can be here is if there were two |
801 | | // `GroupState::Alternation`s adjacent in the parser's stack, |
802 | | // which we guarantee to never happen because we never push a |
803 | | // `GroupState::Alternation` if one is already at the top of |
804 | | // the stack. |
805 | 0 | unreachable!() |
806 | | } |
807 | 0 | Some(GroupState::Group { group, .. }) => { |
808 | 0 | Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) |
809 | | } |
810 | | } |
811 | 6 | } |
812 | | |
813 | | /// Parse the opening of a character class and push the current class |
814 | | /// parsing context onto the parser's stack. This assumes that the parser |
815 | | /// is positioned at an opening `[`. The given union should correspond to |
816 | | /// the union of set items built up before seeing the `[`. |
817 | | /// |
818 | | /// If there was a problem parsing the opening of the class, then an error |
819 | | /// is returned. Otherwise, a new union of set items for the class is |
820 | | /// returned (which may be populated with either a `]` or a `-`). |
821 | | #[inline(never)] |
822 | 68 | fn push_class_open( |
823 | 68 | &self, |
824 | 68 | parent_union: ast::ClassSetUnion, |
825 | 68 | ) -> Result<ast::ClassSetUnion> { |
826 | 68 | assert_eq!(self.char(), '['); |
827 | | |
828 | 68 | let (nested_set, nested_union) = self.parse_set_class_open()?; |
829 | 68 | self.parser() |
830 | 68 | .stack_class |
831 | 68 | .borrow_mut() |
832 | 68 | .push(ClassState::Open { union: parent_union, set: nested_set }); |
833 | 68 | Ok(nested_union) |
834 | 68 | } |
835 | | |
836 | | /// Parse the end of a character class set and pop the character class |
837 | | /// parser stack. The union given corresponds to the last union built |
838 | | /// before seeing the closing `]`. The union returned corresponds to the |
839 | | /// parent character class set with the nested class added to it. |
840 | | /// |
841 | | /// This assumes that the parser is positioned at a `]` and will advance |
842 | | /// the parser to the byte immediately following the `]`. |
843 | | /// |
844 | | /// If the stack is empty after popping, then this returns the final |
845 | | /// "top-level" character class AST (where a "top-level" character class |
846 | | /// is one that is not nested inside any other character class). |
847 | | /// |
848 | | /// If there is no corresponding opening bracket on the parser's stack, |
849 | | /// then an error is returned. |
850 | | #[inline(never)] |
851 | 68 | fn pop_class( |
852 | 68 | &self, |
853 | 68 | nested_union: ast::ClassSetUnion, |
854 | 68 | ) -> Result<Either<ast::ClassSetUnion, ast::Class>> { |
855 | 68 | assert_eq!(self.char(), ']'); |
856 | | |
857 | 68 | let item = ast::ClassSet::Item(nested_union.into_item()); |
858 | 68 | let prevset = self.pop_class_op(item); |
859 | 68 | let mut stack = self.parser().stack_class.borrow_mut(); |
860 | 68 | match stack.pop() { |
861 | | None => { |
862 | | // We can never observe an empty stack: |
863 | | // |
864 | | // 1) We are guaranteed to start with a non-empty stack since |
865 | | // the character class parser is only initiated when it sees |
866 | | // a `[`. |
867 | | // 2) If we ever observe an empty stack while popping after |
868 | | // seeing a `]`, then we signal the character class parser |
869 | | // to terminate. |
870 | 0 | panic!("unexpected empty character class stack") |
871 | | } |
872 | | Some(ClassState::Op { .. }) => { |
873 | | // This panic is unfortunate, but this case is impossible |
874 | | // since we already popped the Op state if one exists above. |
875 | | // Namely, every push to the class parser stack is guarded by |
876 | | // whether an existing Op is already on the top of the stack. |
877 | | // If it is, the existing Op is modified. That is, the stack |
878 | | // can never have consecutive Op states. |
879 | 0 | panic!("unexpected ClassState::Op") |
880 | | } |
881 | 68 | Some(ClassState::Open { mut union, mut set }) => { |
882 | 68 | self.bump(); |
883 | 68 | set.span.end = self.pos(); |
884 | 68 | set.kind = prevset; |
885 | 68 | if stack.is_empty() { |
886 | 68 | Ok(Either::Right(ast::Class::Bracketed(set))) |
887 | | } else { |
888 | 0 | union.push(ast::ClassSetItem::Bracketed(Box::new(set))); |
889 | 0 | Ok(Either::Left(union)) |
890 | | } |
891 | | } |
892 | | } |
893 | 68 | } |
894 | | |
895 | | /// Return an "unclosed class" error whose span points to the most |
896 | | /// recently opened class. |
897 | | /// |
898 | | /// This should only be called while parsing a character class. |
899 | | #[inline(never)] |
900 | 0 | fn unclosed_class_error(&self) -> ast::Error { |
901 | 0 | for state in self.parser().stack_class.borrow().iter().rev() { |
902 | 0 | match *state { |
903 | 0 | ClassState::Open { ref set, .. } => { |
904 | 0 | return self |
905 | 0 | .error(set.span, ast::ErrorKind::ClassUnclosed); |
906 | | } |
907 | 0 | _ => {} |
908 | | } |
909 | | } |
910 | | // We are guaranteed to have a non-empty stack with at least |
911 | | // one open bracket, so we should never get here. |
912 | 0 | panic!("no open character class found") |
913 | 0 | } |
914 | | |
915 | | /// Push the current set of class items on to the class parser's stack as |
916 | | /// the left hand side of the given operator. |
917 | | /// |
918 | | /// A fresh set union is returned, which should be used to build the right |
919 | | /// hand side of this operator. |
920 | | #[inline(never)] |
921 | 0 | fn push_class_op( |
922 | 0 | &self, |
923 | 0 | next_kind: ast::ClassSetBinaryOpKind, |
924 | 0 | next_union: ast::ClassSetUnion, |
925 | 0 | ) -> ast::ClassSetUnion { |
926 | 0 | let item = ast::ClassSet::Item(next_union.into_item()); |
927 | 0 | let new_lhs = self.pop_class_op(item); |
928 | 0 | self.parser() |
929 | 0 | .stack_class |
930 | 0 | .borrow_mut() |
931 | 0 | .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); |
932 | 0 | ast::ClassSetUnion { span: self.span(), items: vec![] } |
933 | 0 | } |
934 | | |
935 | | /// Pop a character class set from the character class parser stack. If the |
936 | | /// top of the stack is just an item (not an operation), then return the |
937 | | /// given set unchanged. If the top of the stack is an operation, then the |
938 | | /// given set will be used as the rhs of the operation on the top of the |
939 | | /// stack. In that case, the binary operation is returned as a set. |
940 | | #[inline(never)] |
941 | 68 | fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { |
942 | 68 | let mut stack = self.parser().stack_class.borrow_mut(); |
943 | 68 | let (kind, lhs) = match stack.pop() { |
944 | 0 | Some(ClassState::Op { kind, lhs }) => (kind, lhs), |
945 | 68 | Some(state @ ClassState::Open { .. }) => { |
946 | 68 | stack.push(state); |
947 | 68 | return rhs; |
948 | | } |
949 | 0 | None => unreachable!(), |
950 | | }; |
951 | 0 | let span = Span::new(lhs.span().start, rhs.span().end); |
952 | 0 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
953 | 0 | span: span, |
954 | 0 | kind: kind, |
955 | 0 | lhs: Box::new(lhs), |
956 | 0 | rhs: Box::new(rhs), |
957 | 0 | }) |
958 | 68 | } |
959 | | } |
960 | | |
961 | | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { |
962 | | /// Parse the regular expression into an abstract syntax tree. |
963 | 6 | fn parse(&self) -> Result<Ast> { |
964 | 6 | self.parse_with_comments().map(|astc| astc.ast) |
965 | 6 | } |
966 | | |
967 | | /// Parse the regular expression and return an abstract syntax tree with |
968 | | /// all of the comments found in the pattern. |
969 | 6 | fn parse_with_comments(&self) -> Result<ast::WithComments> { |
970 | 6 | assert_eq!(self.offset(), 0, "parser can only be used once"); |
971 | 6 | self.parser().reset(); |
972 | 6 | let mut concat = ast::Concat { span: self.span(), asts: vec![] }; |
973 | 454 | loop { |
974 | 454 | self.bump_space(); |
975 | 454 | if self.is_eof() { |
976 | 6 | break; |
977 | 448 | } |
978 | 448 | match self.char() { |
979 | 36 | '(' => concat = self.push_group(concat)?, |
980 | 36 | ')' => concat = self.pop_group(concat)?, |
981 | 8 | '|' => concat = self.push_alternate(concat)?, |
982 | 68 | '[' => { |
983 | 68 | let class = self.parse_set_class()?; |
984 | 68 | concat.asts.push(Ast::Class(class)); |
985 | | } |
986 | | '?' => { |
987 | 36 | concat = self.parse_uncounted_repetition( |
988 | 36 | concat, |
989 | 36 | ast::RepetitionKind::ZeroOrOne, |
990 | 36 | )?; |
991 | | } |
992 | | '*' => { |
993 | 40 | concat = self.parse_uncounted_repetition( |
994 | 40 | concat, |
995 | 40 | ast::RepetitionKind::ZeroOrMore, |
996 | 40 | )?; |
997 | | } |
998 | | '+' => { |
999 | 24 | concat = self.parse_uncounted_repetition( |
1000 | 24 | concat, |
1001 | 24 | ast::RepetitionKind::OneOrMore, |
1002 | 24 | )?; |
1003 | | } |
1004 | | '{' => { |
1005 | 0 | concat = self.parse_counted_repetition(concat)?; |
1006 | | } |
1007 | 200 | _ => concat.asts.push(self.parse_primitive()?.into_ast()), |
1008 | | } |
1009 | | } |
1010 | 6 | let ast = self.pop_group_end(concat)?; |
1011 | 6 | NestLimiter::new(self).check(&ast)?; |
1012 | 6 | Ok(ast::WithComments { |
1013 | 6 | ast: ast, |
1014 | 6 | comments: mem::replace( |
1015 | 6 | &mut *self.parser().comments.borrow_mut(), |
1016 | 6 | vec![], |
1017 | 6 | ), |
1018 | 6 | }) |
1019 | 6 | } |
1020 | | |
1021 | | /// Parses an uncounted repetition operation. An uncounted repetition |
1022 | | /// operator includes ?, * and +, but does not include the {m,n} syntax. |
1023 | | /// The given `kind` should correspond to the operator observed by the |
1024 | | /// caller. |
1025 | | /// |
1026 | | /// This assumes that the parser is currently positioned at the repetition |
1027 | | /// operator and advances the parser to the first character after the |
1028 | | /// operator. (Note that the operator may include a single additional `?`, |
1029 | | /// which makes the operator ungreedy.) |
1030 | | /// |
1031 | | /// The caller should include the concatenation that is being built. The |
1032 | | /// concatenation returned includes the repetition operator applied to the |
1033 | | /// last expression in the given concatenation. |
1034 | | #[inline(never)] |
1035 | 100 | fn parse_uncounted_repetition( |
1036 | 100 | &self, |
1037 | 100 | mut concat: ast::Concat, |
1038 | 100 | kind: ast::RepetitionKind, |
1039 | 100 | ) -> Result<ast::Concat> { |
1040 | 100 | assert!( |
1041 | 100 | self.char() == '?' || self.char() == '*' || self.char() == '+' |
1042 | | ); |
1043 | 100 | let op_start = self.pos(); |
1044 | 100 | let ast = match concat.asts.pop() { |
1045 | 100 | Some(ast) => ast, |
1046 | | None => { |
1047 | 0 | return Err( |
1048 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1049 | 0 | ) |
1050 | | } |
1051 | | }; |
1052 | 100 | match ast { |
1053 | | Ast::Empty(_) | Ast::Flags(_) => { |
1054 | 0 | return Err( |
1055 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1056 | 0 | ) |
1057 | | } |
1058 | 100 | _ => {} |
1059 | 100 | } |
1060 | 100 | let mut greedy = true; |
1061 | 100 | if self.bump() && self.char() == '?' { |
1062 | 0 | greedy = false; |
1063 | 0 | self.bump(); |
1064 | 100 | } |
1065 | 100 | concat.asts.push(Ast::Repetition(ast::Repetition { |
1066 | 100 | span: ast.span().with_end(self.pos()), |
1067 | 100 | op: ast::RepetitionOp { |
1068 | 100 | span: Span::new(op_start, self.pos()), |
1069 | 100 | kind: kind, |
1070 | 100 | }, |
1071 | 100 | greedy: greedy, |
1072 | 100 | ast: Box::new(ast), |
1073 | 100 | })); |
1074 | 100 | Ok(concat) |
1075 | 100 | } |
1076 | | |
1077 | | /// Parses a counted repetition operation. A counted repetition operator |
1078 | | /// corresponds to the {m,n} syntax, and does not include the ?, * or + |
1079 | | /// operators. |
1080 | | /// |
1081 | | /// This assumes that the parser is currently positioned at the opening `{` |
1082 | | /// and advances the parser to the first character after the operator. |
1083 | | /// (Note that the operator may include a single additional `?`, which |
1084 | | /// makes the operator ungreedy.) |
1085 | | /// |
1086 | | /// The caller should include the concatenation that is being built. The |
1087 | | /// concatenation returned includes the repetition operator applied to the |
1088 | | /// last expression in the given concatenation. |
1089 | | #[inline(never)] |
1090 | 0 | fn parse_counted_repetition( |
1091 | 0 | &self, |
1092 | 0 | mut concat: ast::Concat, |
1093 | 0 | ) -> Result<ast::Concat> { |
1094 | 0 | assert!(self.char() == '{'); |
1095 | 0 | let start = self.pos(); |
1096 | 0 | let ast = match concat.asts.pop() { |
1097 | 0 | Some(ast) => ast, |
1098 | | None => { |
1099 | 0 | return Err( |
1100 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1101 | 0 | ) |
1102 | | } |
1103 | | }; |
1104 | 0 | match ast { |
1105 | | Ast::Empty(_) | Ast::Flags(_) => { |
1106 | 0 | return Err( |
1107 | 0 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1108 | 0 | ) |
1109 | | } |
1110 | 0 | _ => {} |
1111 | 0 | } |
1112 | 0 | if !self.bump_and_bump_space() { |
1113 | 0 | return Err(self.error( |
1114 | 0 | Span::new(start, self.pos()), |
1115 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1116 | 0 | )); |
1117 | 0 | } |
1118 | 0 | let count_start = specialize_err( |
1119 | 0 | self.parse_decimal(), |
1120 | 0 | ast::ErrorKind::DecimalEmpty, |
1121 | 0 | ast::ErrorKind::RepetitionCountDecimalEmpty, |
1122 | 0 | )?; |
1123 | 0 | let mut range = ast::RepetitionRange::Exactly(count_start); |
1124 | 0 | if self.is_eof() { |
1125 | 0 | return Err(self.error( |
1126 | 0 | Span::new(start, self.pos()), |
1127 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1128 | 0 | )); |
1129 | 0 | } |
1130 | 0 | if self.char() == ',' { |
1131 | 0 | if !self.bump_and_bump_space() { |
1132 | 0 | return Err(self.error( |
1133 | 0 | Span::new(start, self.pos()), |
1134 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1135 | 0 | )); |
1136 | 0 | } |
1137 | 0 | if self.char() != '}' { |
1138 | 0 | let count_end = specialize_err( |
1139 | 0 | self.parse_decimal(), |
1140 | 0 | ast::ErrorKind::DecimalEmpty, |
1141 | 0 | ast::ErrorKind::RepetitionCountDecimalEmpty, |
1142 | 0 | )?; |
1143 | 0 | range = ast::RepetitionRange::Bounded(count_start, count_end); |
1144 | 0 | } else { |
1145 | 0 | range = ast::RepetitionRange::AtLeast(count_start); |
1146 | 0 | } |
1147 | 0 | } |
1148 | 0 | if self.is_eof() || self.char() != '}' { |
1149 | 0 | return Err(self.error( |
1150 | 0 | Span::new(start, self.pos()), |
1151 | 0 | ast::ErrorKind::RepetitionCountUnclosed, |
1152 | 0 | )); |
1153 | 0 | } |
1154 | 0 |
|
1155 | 0 | let mut greedy = true; |
1156 | 0 | if self.bump_and_bump_space() && self.char() == '?' { |
1157 | 0 | greedy = false; |
1158 | 0 | self.bump(); |
1159 | 0 | } |
1160 | | |
1161 | 0 | let op_span = Span::new(start, self.pos()); |
1162 | 0 | if !range.is_valid() { |
1163 | 0 | return Err( |
1164 | 0 | self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) |
1165 | 0 | ); |
1166 | 0 | } |
1167 | 0 | concat.asts.push(Ast::Repetition(ast::Repetition { |
1168 | 0 | span: ast.span().with_end(self.pos()), |
1169 | 0 | op: ast::RepetitionOp { |
1170 | 0 | span: op_span, |
1171 | 0 | kind: ast::RepetitionKind::Range(range), |
1172 | 0 | }, |
1173 | 0 | greedy: greedy, |
1174 | 0 | ast: Box::new(ast), |
1175 | 0 | })); |
1176 | 0 | Ok(concat) |
1177 | 0 | } |
1178 | | |
1179 | | /// Parse a group (which contains a sub-expression) or a set of flags. |
1180 | | /// |
1181 | | /// If a group was found, then it is returned with an empty AST. If a set |
1182 | | /// of flags is found, then that set is returned. |
1183 | | /// |
1184 | | /// The parser should be positioned at the opening parenthesis. |
1185 | | /// |
1186 | | /// This advances the parser to the first character of the sub-expression |
1187 | | /// (in the case of a group) or to the character immediately following the |
1188 | | /// closing parenthesis of the set of flags. |
1189 | | /// |
1190 | | /// # Errors |
1191 | | /// |
1192 | | /// If flags are given and incorrectly specified, then a corresponding |
1193 | | /// error is returned. |
1194 | | /// |
1195 | | /// If a capture name is given and it is incorrectly specified, then a |
1196 | | /// corresponding error is returned. |
1197 | | #[inline(never)] |
1198 | 36 | fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> { |
1199 | 36 | assert_eq!(self.char(), '('); |
1200 | 36 | let open_span = self.span_char(); |
1201 | 36 | self.bump(); |
1202 | 36 | self.bump_space(); |
1203 | 36 | if self.is_lookaround_prefix() { |
1204 | 0 | return Err(self.error( |
1205 | 0 | Span::new(open_span.start, self.span().end), |
1206 | 0 | ast::ErrorKind::UnsupportedLookAround, |
1207 | 0 | )); |
1208 | 36 | } |
1209 | 36 | let inner_span = self.span(); |
1210 | 36 | if self.bump_if("?P<") { |
1211 | 0 | let capture_index = self.next_capture_index(open_span)?; |
1212 | 0 | let cap = self.parse_capture_name(capture_index)?; |
1213 | 0 | Ok(Either::Right(ast::Group { |
1214 | 0 | span: open_span, |
1215 | 0 | kind: ast::GroupKind::CaptureName(cap), |
1216 | 0 | ast: Box::new(Ast::Empty(self.span())), |
1217 | 0 | })) |
1218 | 36 | } else if self.bump_if("?") { |
1219 | 14 | if self.is_eof() { |
1220 | 0 | return Err( |
1221 | 0 | self.error(open_span, ast::ErrorKind::GroupUnclosed) |
1222 | 0 | ); |
1223 | 14 | } |
1224 | 14 | let flags = self.parse_flags()?; |
1225 | 14 | let char_end = self.char(); |
1226 | 14 | self.bump(); |
1227 | 14 | if char_end == ')' { |
1228 | | // We don't allow empty flags, e.g., `(?)`. We instead |
1229 | | // interpret it as a repetition operator missing its argument. |
1230 | 0 | if flags.items.is_empty() { |
1231 | 0 | return Err(self.error( |
1232 | 0 | inner_span, |
1233 | 0 | ast::ErrorKind::RepetitionMissing, |
1234 | 0 | )); |
1235 | 0 | } |
1236 | 0 | Ok(Either::Left(ast::SetFlags { |
1237 | 0 | span: Span { end: self.pos(), ..open_span }, |
1238 | 0 | flags: flags, |
1239 | 0 | })) |
1240 | | } else { |
1241 | 14 | assert_eq!(char_end, ':'); |
1242 | 14 | Ok(Either::Right(ast::Group { |
1243 | 14 | span: open_span, |
1244 | 14 | kind: ast::GroupKind::NonCapturing(flags), |
1245 | 14 | ast: Box::new(Ast::Empty(self.span())), |
1246 | 14 | })) |
1247 | | } |
1248 | | } else { |
1249 | 22 | let capture_index = self.next_capture_index(open_span)?; |
1250 | 22 | Ok(Either::Right(ast::Group { |
1251 | 22 | span: open_span, |
1252 | 22 | kind: ast::GroupKind::CaptureIndex(capture_index), |
1253 | 22 | ast: Box::new(Ast::Empty(self.span())), |
1254 | 22 | })) |
1255 | | } |
1256 | 36 | } |
1257 | | |
1258 | | /// Parses a capture group name. Assumes that the parser is positioned at |
1259 | | /// the first character in the name following the opening `<` (and may |
1260 | | /// possibly be EOF). This advances the parser to the first character |
1261 | | /// following the closing `>`. |
1262 | | /// |
1263 | | /// The caller must provide the capture index of the group for this name. |
1264 | | #[inline(never)] |
1265 | 0 | fn parse_capture_name( |
1266 | 0 | &self, |
1267 | 0 | capture_index: u32, |
1268 | 0 | ) -> Result<ast::CaptureName> { |
1269 | 0 | if self.is_eof() { |
1270 | 0 | return Err(self |
1271 | 0 | .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); |
1272 | 0 | } |
1273 | 0 | let start = self.pos(); |
1274 | 0 | loop { |
1275 | 0 | if self.char() == '>' { |
1276 | 0 | break; |
1277 | 0 | } |
1278 | 0 | if !is_capture_char(self.char(), self.pos() == start) { |
1279 | 0 | return Err(self.error( |
1280 | 0 | self.span_char(), |
1281 | 0 | ast::ErrorKind::GroupNameInvalid, |
1282 | 0 | )); |
1283 | 0 | } |
1284 | 0 | if !self.bump() { |
1285 | 0 | break; |
1286 | 0 | } |
1287 | | } |
1288 | 0 | let end = self.pos(); |
1289 | 0 | if self.is_eof() { |
1290 | 0 | return Err(self |
1291 | 0 | .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); |
1292 | 0 | } |
1293 | 0 | assert_eq!(self.char(), '>'); |
1294 | 0 | self.bump(); |
1295 | 0 | let name = &self.pattern()[start.offset..end.offset]; |
1296 | 0 | if name.is_empty() { |
1297 | 0 | return Err(self.error( |
1298 | 0 | Span::new(start, start), |
1299 | 0 | ast::ErrorKind::GroupNameEmpty, |
1300 | 0 | )); |
1301 | 0 | } |
1302 | 0 | let capname = ast::CaptureName { |
1303 | 0 | span: Span::new(start, end), |
1304 | 0 | name: name.to_string(), |
1305 | 0 | index: capture_index, |
1306 | 0 | }; |
1307 | 0 | self.add_capture_name(&capname)?; |
1308 | 0 | Ok(capname) |
1309 | 0 | } |
1310 | | |
1311 | | /// Parse a sequence of flags starting at the current character. |
1312 | | /// |
1313 | | /// This advances the parser to the character immediately following the |
1314 | | /// flags, which is guaranteed to be either `:` or `)`. |
1315 | | /// |
1316 | | /// # Errors |
1317 | | /// |
1318 | | /// If any flags are duplicated, then an error is returned. |
1319 | | /// |
1320 | | /// If the negation operator is used more than once, then an error is |
1321 | | /// returned. |
1322 | | /// |
1323 | | /// If no flags could be found or if the negation operation is not followed |
1324 | | /// by any flags, then an error is returned. |
1325 | | #[inline(never)] |
1326 | 14 | fn parse_flags(&self) -> Result<ast::Flags> { |
1327 | 14 | let mut flags = ast::Flags { span: self.span(), items: vec![] }; |
1328 | 14 | let mut last_was_negation = None; |
1329 | 14 | while self.char() != ':' && self.char() != ')' { |
1330 | 0 | if self.char() == '-' { |
1331 | 0 | last_was_negation = Some(self.span_char()); |
1332 | 0 | let item = ast::FlagsItem { |
1333 | 0 | span: self.span_char(), |
1334 | 0 | kind: ast::FlagsItemKind::Negation, |
1335 | 0 | }; |
1336 | 0 | if let Some(i) = flags.add_item(item) { |
1337 | 0 | return Err(self.error( |
1338 | 0 | self.span_char(), |
1339 | 0 | ast::ErrorKind::FlagRepeatedNegation { |
1340 | 0 | original: flags.items[i].span, |
1341 | 0 | }, |
1342 | 0 | )); |
1343 | 0 | } |
1344 | | } else { |
1345 | 0 | last_was_negation = None; |
1346 | 0 | let item = ast::FlagsItem { |
1347 | 0 | span: self.span_char(), |
1348 | 0 | kind: ast::FlagsItemKind::Flag(self.parse_flag()?), |
1349 | | }; |
1350 | 0 | if let Some(i) = flags.add_item(item) { |
1351 | 0 | return Err(self.error( |
1352 | 0 | self.span_char(), |
1353 | 0 | ast::ErrorKind::FlagDuplicate { |
1354 | 0 | original: flags.items[i].span, |
1355 | 0 | }, |
1356 | 0 | )); |
1357 | 0 | } |
1358 | | } |
1359 | 0 | if !self.bump() { |
1360 | 0 | return Err( |
1361 | 0 | self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) |
1362 | 0 | ); |
1363 | 0 | } |
1364 | | } |
1365 | 14 | if let Some(span) = last_was_negation { |
1366 | 0 | return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); |
1367 | 14 | } |
1368 | 14 | flags.span.end = self.pos(); |
1369 | 14 | Ok(flags) |
1370 | 14 | } |
1371 | | |
1372 | | /// Parse the current character as a flag. Do not advance the parser. |
1373 | | /// |
1374 | | /// # Errors |
1375 | | /// |
1376 | | /// If the flag is not recognized, then an error is returned. |
1377 | | #[inline(never)] |
1378 | 0 | fn parse_flag(&self) -> Result<ast::Flag> { |
1379 | 0 | match self.char() { |
1380 | 0 | 'i' => Ok(ast::Flag::CaseInsensitive), |
1381 | 0 | 'm' => Ok(ast::Flag::MultiLine), |
1382 | 0 | 's' => Ok(ast::Flag::DotMatchesNewLine), |
1383 | 0 | 'U' => Ok(ast::Flag::SwapGreed), |
1384 | 0 | 'u' => Ok(ast::Flag::Unicode), |
1385 | 0 | 'x' => Ok(ast::Flag::IgnoreWhitespace), |
1386 | | _ => { |
1387 | 0 | Err(self |
1388 | 0 | .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) |
1389 | | } |
1390 | | } |
1391 | 0 | } |
1392 | | |
1393 | | /// Parse a primitive AST. e.g., A literal, non-set character class or |
1394 | | /// assertion. |
1395 | | /// |
1396 | | /// This assumes that the parser expects a primitive at the current |
1397 | | /// location. i.e., All other non-primitive cases have been handled. |
1398 | | /// For example, if the parser's position is at `|`, then `|` will be |
1399 | | /// treated as a literal (e.g., inside a character class). |
1400 | | /// |
1401 | | /// This advances the parser to the first character immediately following |
1402 | | /// the primitive. |
1403 | 200 | fn parse_primitive(&self) -> Result<Primitive> { |
1404 | 200 | match self.char() { |
1405 | 0 | '\\' => self.parse_escape(), |
1406 | | '.' => { |
1407 | 2 | let ast = Primitive::Dot(self.span_char()); |
1408 | 2 | self.bump(); |
1409 | 2 | Ok(ast) |
1410 | | } |
1411 | | '^' => { |
1412 | 6 | let ast = Primitive::Assertion(ast::Assertion { |
1413 | 6 | span: self.span_char(), |
1414 | 6 | kind: ast::AssertionKind::StartLine, |
1415 | 6 | }); |
1416 | 6 | self.bump(); |
1417 | 6 | Ok(ast) |
1418 | | } |
1419 | | '$' => { |
1420 | 6 | let ast = Primitive::Assertion(ast::Assertion { |
1421 | 6 | span: self.span_char(), |
1422 | 6 | kind: ast::AssertionKind::EndLine, |
1423 | 6 | }); |
1424 | 6 | self.bump(); |
1425 | 6 | Ok(ast) |
1426 | | } |
1427 | 186 | c => { |
1428 | 186 | let ast = Primitive::Literal(ast::Literal { |
1429 | 186 | span: self.span_char(), |
1430 | 186 | kind: ast::LiteralKind::Verbatim, |
1431 | 186 | c: c, |
1432 | 186 | }); |
1433 | 186 | self.bump(); |
1434 | 186 | Ok(ast) |
1435 | | } |
1436 | | } |
1437 | 200 | } |
1438 | | |
1439 | | /// Parse an escape sequence as a primitive AST. |
1440 | | /// |
1441 | | /// This assumes the parser is positioned at the start of the escape |
1442 | | /// sequence, i.e., `\`. It advances the parser to the first position |
1443 | | /// immediately following the escape sequence. |
1444 | | #[inline(never)] |
1445 | 0 | fn parse_escape(&self) -> Result<Primitive> { |
1446 | 0 | assert_eq!(self.char(), '\\'); |
1447 | 0 | let start = self.pos(); |
1448 | 0 | if !self.bump() { |
1449 | 0 | return Err(self.error( |
1450 | 0 | Span::new(start, self.pos()), |
1451 | 0 | ast::ErrorKind::EscapeUnexpectedEof, |
1452 | 0 | )); |
1453 | 0 | } |
1454 | 0 | let c = self.char(); |
1455 | | // Put some of the more complicated routines into helpers. |
1456 | 0 | match c { |
1457 | 0 | '0'..='7' => { |
1458 | 0 | if !self.parser().octal { |
1459 | 0 | return Err(self.error( |
1460 | 0 | Span::new(start, self.span_char().end), |
1461 | 0 | ast::ErrorKind::UnsupportedBackreference, |
1462 | 0 | )); |
1463 | 0 | } |
1464 | 0 | let mut lit = self.parse_octal(); |
1465 | 0 | lit.span.start = start; |
1466 | 0 | return Ok(Primitive::Literal(lit)); |
1467 | | } |
1468 | 0 | '8'..='9' if !self.parser().octal => { |
1469 | 0 | return Err(self.error( |
1470 | 0 | Span::new(start, self.span_char().end), |
1471 | 0 | ast::ErrorKind::UnsupportedBackreference, |
1472 | 0 | )); |
1473 | | } |
1474 | | 'x' | 'u' | 'U' => { |
1475 | 0 | let mut lit = self.parse_hex()?; |
1476 | 0 | lit.span.start = start; |
1477 | 0 | return Ok(Primitive::Literal(lit)); |
1478 | | } |
1479 | | 'p' | 'P' => { |
1480 | 0 | let mut cls = self.parse_unicode_class()?; |
1481 | 0 | cls.span.start = start; |
1482 | 0 | return Ok(Primitive::Unicode(cls)); |
1483 | | } |
1484 | | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
1485 | 0 | let mut cls = self.parse_perl_class(); |
1486 | 0 | cls.span.start = start; |
1487 | 0 | return Ok(Primitive::Perl(cls)); |
1488 | | } |
1489 | 0 | _ => {} |
1490 | 0 | } |
1491 | 0 |
|
1492 | 0 | // Handle all of the one letter sequences inline. |
1493 | 0 | self.bump(); |
1494 | 0 | let span = Span::new(start, self.pos()); |
1495 | 0 | if is_meta_character(c) { |
1496 | 0 | return Ok(Primitive::Literal(ast::Literal { |
1497 | 0 | span: span, |
1498 | 0 | kind: ast::LiteralKind::Punctuation, |
1499 | 0 | c: c, |
1500 | 0 | })); |
1501 | 0 | } |
1502 | 0 | let special = |kind, c| { |
1503 | 0 | Ok(Primitive::Literal(ast::Literal { |
1504 | 0 | span: span, |
1505 | 0 | kind: ast::LiteralKind::Special(kind), |
1506 | 0 | c: c, |
1507 | 0 | })) |
1508 | 0 | }; |
1509 | 0 | match c { |
1510 | 0 | 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), |
1511 | 0 | 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), |
1512 | 0 | 't' => special(ast::SpecialLiteralKind::Tab, '\t'), |
1513 | 0 | 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), |
1514 | 0 | 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), |
1515 | 0 | 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), |
1516 | 0 | ' ' if self.ignore_whitespace() => { |
1517 | 0 | special(ast::SpecialLiteralKind::Space, ' ') |
1518 | | } |
1519 | 0 | 'A' => Ok(Primitive::Assertion(ast::Assertion { |
1520 | 0 | span: span, |
1521 | 0 | kind: ast::AssertionKind::StartText, |
1522 | 0 | })), |
1523 | 0 | 'z' => Ok(Primitive::Assertion(ast::Assertion { |
1524 | 0 | span: span, |
1525 | 0 | kind: ast::AssertionKind::EndText, |
1526 | 0 | })), |
1527 | 0 | 'b' => Ok(Primitive::Assertion(ast::Assertion { |
1528 | 0 | span: span, |
1529 | 0 | kind: ast::AssertionKind::WordBoundary, |
1530 | 0 | })), |
1531 | 0 | 'B' => Ok(Primitive::Assertion(ast::Assertion { |
1532 | 0 | span: span, |
1533 | 0 | kind: ast::AssertionKind::NotWordBoundary, |
1534 | 0 | })), |
1535 | 0 | _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), |
1536 | | } |
1537 | 0 | } |
1538 | | |
1539 | | /// Parse an octal representation of a Unicode codepoint up to 3 digits |
1540 | | /// long. This expects the parser to be positioned at the first octal |
1541 | | /// digit and advances the parser to the first character immediately |
1542 | | /// following the octal number. This also assumes that parsing octal |
1543 | | /// escapes is enabled. |
1544 | | /// |
1545 | | /// Assuming the preconditions are met, this routine can never fail. |
1546 | | #[inline(never)] |
1547 | 0 | fn parse_octal(&self) -> ast::Literal { |
1548 | 0 | use std::char; |
1549 | 0 | use std::u32; |
1550 | 0 |
|
1551 | 0 | assert!(self.parser().octal); |
1552 | 0 | assert!('0' <= self.char() && self.char() <= '7'); |
1553 | 0 | let start = self.pos(); |
1554 | | // Parse up to two more digits. |
1555 | 0 | while self.bump() |
1556 | 0 | && '0' <= self.char() |
1557 | 0 | && self.char() <= '7' |
1558 | 0 | && self.pos().offset - start.offset <= 2 |
1559 | 0 | {} |
1560 | 0 | let end = self.pos(); |
1561 | 0 | let octal = &self.pattern()[start.offset..end.offset]; |
1562 | 0 | // Parsing the octal should never fail since the above guarantees a |
1563 | 0 | // valid number. |
1564 | 0 | let codepoint = |
1565 | 0 | u32::from_str_radix(octal, 8).expect("valid octal number"); |
1566 | 0 | // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no |
1567 | 0 | // invalid Unicode scalar values. |
1568 | 0 | let c = char::from_u32(codepoint).expect("Unicode scalar value"); |
1569 | 0 | ast::Literal { |
1570 | 0 | span: Span::new(start, end), |
1571 | 0 | kind: ast::LiteralKind::Octal, |
1572 | 0 | c: c, |
1573 | 0 | } |
1574 | 0 | } |
1575 | | |
1576 | | /// Parse a hex representation of a Unicode codepoint. This handles both |
1577 | | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
1578 | | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
1579 | | /// the first character immediately following the hexadecimal literal. |
1580 | | #[inline(never)] |
1581 | 0 | fn parse_hex(&self) -> Result<ast::Literal> { |
1582 | 0 | assert!( |
1583 | 0 | self.char() == 'x' || self.char() == 'u' || self.char() == 'U' |
1584 | | ); |
1585 | | |
1586 | 0 | let hex_kind = match self.char() { |
1587 | 0 | 'x' => ast::HexLiteralKind::X, |
1588 | 0 | 'u' => ast::HexLiteralKind::UnicodeShort, |
1589 | 0 | _ => ast::HexLiteralKind::UnicodeLong, |
1590 | | }; |
1591 | 0 | if !self.bump_and_bump_space() { |
1592 | 0 | return Err( |
1593 | 0 | self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) |
1594 | 0 | ); |
1595 | 0 | } |
1596 | 0 | if self.char() == '{' { |
1597 | 0 | self.parse_hex_brace(hex_kind) |
1598 | | } else { |
1599 | 0 | self.parse_hex_digits(hex_kind) |
1600 | | } |
1601 | 0 | } |
1602 | | |
1603 | | /// Parse an N-digit hex representation of a Unicode codepoint. This |
1604 | | /// expects the parser to be positioned at the first digit and will advance |
1605 | | /// the parser to the first character immediately following the escape |
1606 | | /// sequence. |
1607 | | /// |
1608 | | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
1609 | | /// or 8 (for `\UNNNNNNNN`). |
1610 | | #[inline(never)] |
1611 | 0 | fn parse_hex_digits( |
1612 | 0 | &self, |
1613 | 0 | kind: ast::HexLiteralKind, |
1614 | 0 | ) -> Result<ast::Literal> { |
1615 | 0 | use std::char; |
1616 | 0 | use std::u32; |
1617 | 0 |
|
1618 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1619 | 0 | scratch.clear(); |
1620 | 0 |
|
1621 | 0 | let start = self.pos(); |
1622 | 0 | for i in 0..kind.digits() { |
1623 | 0 | if i > 0 && !self.bump_and_bump_space() { |
1624 | 0 | return Err(self |
1625 | 0 | .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); |
1626 | 0 | } |
1627 | 0 | if !is_hex(self.char()) { |
1628 | 0 | return Err(self.error( |
1629 | 0 | self.span_char(), |
1630 | 0 | ast::ErrorKind::EscapeHexInvalidDigit, |
1631 | 0 | )); |
1632 | 0 | } |
1633 | 0 | scratch.push(self.char()); |
1634 | | } |
1635 | | // The final bump just moves the parser past the literal, which may |
1636 | | // be EOF. |
1637 | 0 | self.bump_and_bump_space(); |
1638 | 0 | let end = self.pos(); |
1639 | 0 | let hex = scratch.as_str(); |
1640 | 0 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { |
1641 | 0 | None => Err(self.error( |
1642 | 0 | Span::new(start, end), |
1643 | 0 | ast::ErrorKind::EscapeHexInvalid, |
1644 | 0 | )), |
1645 | 0 | Some(c) => Ok(ast::Literal { |
1646 | 0 | span: Span::new(start, end), |
1647 | 0 | kind: ast::LiteralKind::HexFixed(kind), |
1648 | 0 | c: c, |
1649 | 0 | }), |
1650 | | } |
1651 | 0 | } |
1652 | | |
1653 | | /// Parse a hex representation of any Unicode scalar value. This expects |
1654 | | /// the parser to be positioned at the opening brace `{` and will advance |
1655 | | /// the parser to the first character following the closing brace `}`. |
1656 | | #[inline(never)] |
1657 | 0 | fn parse_hex_brace( |
1658 | 0 | &self, |
1659 | 0 | kind: ast::HexLiteralKind, |
1660 | 0 | ) -> Result<ast::Literal> { |
1661 | 0 | use std::char; |
1662 | 0 | use std::u32; |
1663 | 0 |
|
1664 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1665 | 0 | scratch.clear(); |
1666 | 0 |
|
1667 | 0 | let brace_pos = self.pos(); |
1668 | 0 | let start = self.span_char().end; |
1669 | 0 | while self.bump_and_bump_space() && self.char() != '}' { |
1670 | 0 | if !is_hex(self.char()) { |
1671 | 0 | return Err(self.error( |
1672 | 0 | self.span_char(), |
1673 | 0 | ast::ErrorKind::EscapeHexInvalidDigit, |
1674 | 0 | )); |
1675 | 0 | } |
1676 | 0 | scratch.push(self.char()); |
1677 | | } |
1678 | 0 | if self.is_eof() { |
1679 | 0 | return Err(self.error( |
1680 | 0 | Span::new(brace_pos, self.pos()), |
1681 | 0 | ast::ErrorKind::EscapeUnexpectedEof, |
1682 | 0 | )); |
1683 | 0 | } |
1684 | 0 | let end = self.pos(); |
1685 | 0 | let hex = scratch.as_str(); |
1686 | 0 | assert_eq!(self.char(), '}'); |
1687 | 0 | self.bump_and_bump_space(); |
1688 | 0 |
|
1689 | 0 | if hex.is_empty() { |
1690 | 0 | return Err(self.error( |
1691 | 0 | Span::new(brace_pos, self.pos()), |
1692 | 0 | ast::ErrorKind::EscapeHexEmpty, |
1693 | 0 | )); |
1694 | 0 | } |
1695 | 0 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { |
1696 | 0 | None => Err(self.error( |
1697 | 0 | Span::new(start, end), |
1698 | 0 | ast::ErrorKind::EscapeHexInvalid, |
1699 | 0 | )), |
1700 | 0 | Some(c) => Ok(ast::Literal { |
1701 | 0 | span: Span::new(start, self.pos()), |
1702 | 0 | kind: ast::LiteralKind::HexBrace(kind), |
1703 | 0 | c: c, |
1704 | 0 | }), |
1705 | | } |
1706 | 0 | } |
1707 | | |
1708 | | /// Parse a decimal number into a u32 while trimming leading and trailing |
1709 | | /// whitespace. |
1710 | | /// |
1711 | | /// This expects the parser to be positioned at the first position where |
1712 | | /// a decimal digit could occur. This will advance the parser to the byte |
1713 | | /// immediately following the last contiguous decimal digit. |
1714 | | /// |
1715 | | /// If no decimal digit could be found or if there was a problem parsing |
1716 | | /// the complete set of digits into a u32, then an error is returned. |
1717 | 0 | fn parse_decimal(&self) -> Result<u32> { |
1718 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
1719 | 0 | scratch.clear(); |
1720 | | |
1721 | 0 | while !self.is_eof() && self.char().is_whitespace() { |
1722 | 0 | self.bump(); |
1723 | 0 | } |
1724 | 0 | let start = self.pos(); |
1725 | 0 | while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { |
1726 | 0 | scratch.push(self.char()); |
1727 | 0 | self.bump_and_bump_space(); |
1728 | 0 | } |
1729 | 0 | let span = Span::new(start, self.pos()); |
1730 | 0 | while !self.is_eof() && self.char().is_whitespace() { |
1731 | 0 | self.bump_and_bump_space(); |
1732 | 0 | } |
1733 | 0 | let digits = scratch.as_str(); |
1734 | 0 | if digits.is_empty() { |
1735 | 0 | return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); |
1736 | 0 | } |
1737 | 0 | match u32::from_str_radix(digits, 10).ok() { |
1738 | 0 | Some(n) => Ok(n), |
1739 | 0 | None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), |
1740 | | } |
1741 | 0 | } |
1742 | | |
1743 | | /// Parse a standard character class consisting primarily of characters or |
1744 | | /// character ranges, but can also contain nested character classes of |
1745 | | /// any type (sans `.`). |
1746 | | /// |
1747 | | /// This assumes the parser is positioned at the opening `[`. If parsing |
1748 | | /// is successful, then the parser is advanced to the position immediately |
1749 | | /// following the closing `]`. |
1750 | | #[inline(never)] |
1751 | 68 | fn parse_set_class(&self) -> Result<ast::Class> { |
1752 | 68 | assert_eq!(self.char(), '['); |
1753 | | |
1754 | 68 | let mut union = |
1755 | 68 | ast::ClassSetUnion { span: self.span(), items: vec![] }; |
1756 | 212 | loop { |
1757 | 212 | self.bump_space(); |
1758 | 212 | if self.is_eof() { |
1759 | 0 | return Err(self.unclosed_class_error()); |
1760 | 212 | } |
1761 | 212 | match self.char() { |
1762 | | '[' => { |
1763 | | // If we've already parsed the opening bracket, then |
1764 | | // attempt to treat this as the beginning of an ASCII |
1765 | | // class. If ASCII class parsing fails, then the parser |
1766 | | // backs up to `[`. |
1767 | 128 | if !self.parser().stack_class.borrow().is_empty() { |
1768 | 60 | if let Some(cls) = self.maybe_parse_ascii_class() { |
1769 | 60 | union.push(ast::ClassSetItem::Ascii(cls)); |
1770 | 60 | continue; |
1771 | 0 | } |
1772 | 68 | } |
1773 | 68 | union = self.push_class_open(union)?; |
1774 | | } |
1775 | 68 | ']' => match self.pop_class(union)? { |
1776 | 0 | Either::Left(nested_union) => { |
1777 | 0 | union = nested_union; |
1778 | 0 | } |
1779 | 68 | Either::Right(class) => return Ok(class), |
1780 | | }, |
1781 | 0 | '&' if self.peek() == Some('&') => { |
1782 | 0 | assert!(self.bump_if("&&")); |
1783 | 0 | union = self.push_class_op( |
1784 | 0 | ast::ClassSetBinaryOpKind::Intersection, |
1785 | 0 | union, |
1786 | 0 | ); |
1787 | | } |
1788 | 0 | '-' if self.peek() == Some('-') => { |
1789 | 0 | assert!(self.bump_if("--")); |
1790 | 0 | union = self.push_class_op( |
1791 | 0 | ast::ClassSetBinaryOpKind::Difference, |
1792 | 0 | union, |
1793 | 0 | ); |
1794 | | } |
1795 | 0 | '~' if self.peek() == Some('~') => { |
1796 | 0 | assert!(self.bump_if("~~")); |
1797 | 0 | union = self.push_class_op( |
1798 | 0 | ast::ClassSetBinaryOpKind::SymmetricDifference, |
1799 | 0 | union, |
1800 | 0 | ); |
1801 | | } |
1802 | | _ => { |
1803 | 16 | union.push(self.parse_set_class_range()?); |
1804 | | } |
1805 | | } |
1806 | | } |
1807 | 68 | } |
1808 | | |
1809 | | /// Parse a single primitive item in a character class set. The item to |
1810 | | /// be parsed can either be one of a simple literal character, a range |
1811 | | /// between two simple literal characters or a "primitive" character |
1812 | | /// class like \w or \p{Greek}. |
1813 | | /// |
1814 | | /// If an invalid escape is found, or if a character class is found where |
1815 | | /// a simple literal is expected (e.g., in a range), then an error is |
1816 | | /// returned. |
1817 | | #[inline(never)] |
1818 | 16 | fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> { |
1819 | 16 | let prim1 = self.parse_set_class_item()?; |
1820 | 16 | self.bump_space(); |
1821 | 16 | if self.is_eof() { |
1822 | 0 | return Err(self.unclosed_class_error()); |
1823 | 16 | } |
1824 | 16 | // If the next char isn't a `-`, then we don't have a range. |
1825 | 16 | // There are two exceptions. If the char after a `-` is a `]`, then |
1826 | 16 | // `-` is interpreted as a literal `-`. Alternatively, if the char |
1827 | 16 | // after a `-` is a `-`, then `--` corresponds to a "difference" |
1828 | 16 | // operation. |
1829 | 16 | if self.char() != '-' |
1830 | 4 | || self.peek_space() == Some(']') |
1831 | 4 | || self.peek_space() == Some('-') |
1832 | | { |
1833 | 12 | return prim1.into_class_set_item(self); |
1834 | 4 | } |
1835 | 4 | // OK, now we're parsing a range, so bump past the `-` and parse the |
1836 | 4 | // second half of the range. |
1837 | 4 | if !self.bump_and_bump_space() { |
1838 | 0 | return Err(self.unclosed_class_error()); |
1839 | 4 | } |
1840 | 4 | let prim2 = self.parse_set_class_item()?; |
1841 | 4 | let range = ast::ClassSetRange { |
1842 | 4 | span: Span::new(prim1.span().start, prim2.span().end), |
1843 | 4 | start: prim1.into_class_literal(self)?, |
1844 | 4 | end: prim2.into_class_literal(self)?, |
1845 | | }; |
1846 | 4 | if !range.is_valid() { |
1847 | 0 | return Err( |
1848 | 0 | self.error(range.span, ast::ErrorKind::ClassRangeInvalid) |
1849 | 0 | ); |
1850 | 4 | } |
1851 | 4 | Ok(ast::ClassSetItem::Range(range)) |
1852 | 16 | } |
1853 | | |
1854 | | /// Parse a single item in a character class as a primitive, where the |
1855 | | /// primitive either consists of a verbatim literal or a single escape |
1856 | | /// sequence. |
1857 | | /// |
1858 | | /// This assumes the parser is positioned at the beginning of a primitive, |
1859 | | /// and advances the parser to the first position after the primitive if |
1860 | | /// successful. |
1861 | | /// |
1862 | | /// Note that it is the caller's responsibility to report an error if an |
1863 | | /// illegal primitive was parsed. |
1864 | | #[inline(never)] |
1865 | 20 | fn parse_set_class_item(&self) -> Result<Primitive> { |
1866 | 20 | if self.char() == '\\' { |
1867 | 0 | self.parse_escape() |
1868 | | } else { |
1869 | 20 | let x = Primitive::Literal(ast::Literal { |
1870 | 20 | span: self.span_char(), |
1871 | 20 | kind: ast::LiteralKind::Verbatim, |
1872 | 20 | c: self.char(), |
1873 | 20 | }); |
1874 | 20 | self.bump(); |
1875 | 20 | Ok(x) |
1876 | | } |
1877 | 20 | } |
1878 | | |
1879 | | /// Parses the opening of a character class set. This includes the opening |
1880 | | /// bracket along with `^` if present to indicate negation. This also |
1881 | | /// starts parsing the opening set of unioned items if applicable, since |
1882 | | /// there are special rules applied to certain characters in the opening |
1883 | | /// of a character class. For example, `[^]]` is the class of all |
1884 | | /// characters not equal to `]`. (`]` would need to be escaped in any other |
1885 | | /// position.) Similarly for `-`. |
1886 | | /// |
1887 | | /// In all cases, the op inside the returned `ast::ClassBracketed` is an |
1888 | | /// empty union. This empty union should be replaced with the actual item |
1889 | | /// when it is popped from the parser's stack. |
1890 | | /// |
1891 | | /// This assumes the parser is positioned at the opening `[` and advances |
1892 | | /// the parser to the first non-special byte of the character class. |
1893 | | /// |
1894 | | /// An error is returned if EOF is found. |
1895 | | #[inline(never)] |
1896 | 68 | fn parse_set_class_open( |
1897 | 68 | &self, |
1898 | 68 | ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { |
1899 | 68 | assert_eq!(self.char(), '['); |
1900 | 68 | let start = self.pos(); |
1901 | 68 | if !self.bump_and_bump_space() { |
1902 | 0 | return Err(self.error( |
1903 | 0 | Span::new(start, self.pos()), |
1904 | 0 | ast::ErrorKind::ClassUnclosed, |
1905 | 0 | )); |
1906 | 68 | } |
1907 | | |
1908 | 68 | let negated = if self.char() != '^' { |
1909 | 68 | false |
1910 | | } else { |
1911 | 0 | if !self.bump_and_bump_space() { |
1912 | 0 | return Err(self.error( |
1913 | 0 | Span::new(start, self.pos()), |
1914 | 0 | ast::ErrorKind::ClassUnclosed, |
1915 | 0 | )); |
1916 | 0 | } |
1917 | 0 | true |
1918 | | }; |
1919 | | // Accept any number of `-` as literal `-`. |
1920 | 68 | let mut union = |
1921 | 68 | ast::ClassSetUnion { span: self.span(), items: vec![] }; |
1922 | 68 | while self.char() == '-' { |
1923 | 0 | union.push(ast::ClassSetItem::Literal(ast::Literal { |
1924 | 0 | span: self.span_char(), |
1925 | 0 | kind: ast::LiteralKind::Verbatim, |
1926 | 0 | c: '-', |
1927 | 0 | })); |
1928 | 0 | if !self.bump_and_bump_space() { |
1929 | 0 | return Err(self.error( |
1930 | 0 | Span::new(start, self.pos()), |
1931 | 0 | ast::ErrorKind::ClassUnclosed, |
1932 | 0 | )); |
1933 | 0 | } |
1934 | | } |
1935 | | // If `]` is the *first* char in a set, then interpret it as a literal |
1936 | | // `]`. That is, an empty class is impossible to write. |
1937 | 68 | if union.items.is_empty() && self.char() == ']' { |
1938 | 0 | union.push(ast::ClassSetItem::Literal(ast::Literal { |
1939 | 0 | span: self.span_char(), |
1940 | 0 | kind: ast::LiteralKind::Verbatim, |
1941 | 0 | c: ']', |
1942 | 0 | })); |
1943 | 0 | if !self.bump_and_bump_space() { |
1944 | 0 | return Err(self.error( |
1945 | 0 | Span::new(start, self.pos()), |
1946 | 0 | ast::ErrorKind::ClassUnclosed, |
1947 | 0 | )); |
1948 | 0 | } |
1949 | 68 | } |
1950 | 68 | let set = ast::ClassBracketed { |
1951 | 68 | span: Span::new(start, self.pos()), |
1952 | 68 | negated: negated, |
1953 | 68 | kind: ast::ClassSet::union(ast::ClassSetUnion { |
1954 | 68 | span: Span::new(union.span.start, union.span.start), |
1955 | 68 | items: vec![], |
1956 | 68 | }), |
1957 | 68 | }; |
1958 | 68 | Ok((set, union)) |
1959 | 68 | } |
1960 | | |
1961 | | /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. |
1962 | | /// |
1963 | | /// This assumes the parser is positioned at the opening `[`. |
1964 | | /// |
1965 | | /// If no valid ASCII character class could be found, then this does not |
1966 | | /// advance the parser and `None` is returned. Otherwise, the parser is |
1967 | | /// advanced to the first byte following the closing `]` and the |
1968 | | /// corresponding ASCII class is returned. |
1969 | | #[inline(never)] |
1970 | 60 | fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> { |
1971 | 60 | // ASCII character classes are interesting from a parsing perspective |
1972 | 60 | // because parsing cannot fail with any interesting error. For example, |
1973 | 60 | // in order to use an ASCII character class, it must be enclosed in |
1974 | 60 | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
1975 | 60 | // of it as "ASCII character characters have the syntax `[:NAME:]` |
1976 | 60 | // which can only appear within character brackets." This means that |
1977 | 60 | // things like `[[:lower:]A]` are legal constructs. |
1978 | 60 | // |
1979 | 60 | // However, if one types an incorrect ASCII character class, e.g., |
1980 | 60 | // `[[:loower:]]`, then we treat that as a normal nested character |
1981 | 60 | // class containing the characters `:elorw`. One might argue that we |
1982 | 60 | // should return an error instead since the repeated colons give away |
1983 | 60 | // the intent to write an ASCII class. But what if the user typed |
1984 | 60 | // `[[:lower]]` instead? How can we tell that was intended to be an |
1985 | 60 | // ASCII class and not just a normal nested class? |
1986 | 60 | // |
1987 | 60 | // Reasonable people can probably disagree over this, but for better |
1988 | 60 | // or worse, we implement semantics that never fails at the expense |
1989 | 60 | // of better failure modes. |
1990 | 60 | assert_eq!(self.char(), '['); |
1991 | | // If parsing fails, then we back up the parser to this starting point. |
1992 | 60 | let start = self.pos(); |
1993 | 60 | let mut negated = false; |
1994 | 60 | if !self.bump() || self.char() != ':' { |
1995 | 0 | self.parser().pos.set(start); |
1996 | 0 | return None; |
1997 | 60 | } |
1998 | 60 | if !self.bump() { |
1999 | 0 | self.parser().pos.set(start); |
2000 | 0 | return None; |
2001 | 60 | } |
2002 | 60 | if self.char() == '^' { |
2003 | 0 | negated = true; |
2004 | 0 | if !self.bump() { |
2005 | 0 | self.parser().pos.set(start); |
2006 | 0 | return None; |
2007 | 0 | } |
2008 | 60 | } |
2009 | 60 | let name_start = self.offset(); |
2010 | 360 | while self.char() != ':' && self.bump() {} |
2011 | 60 | if self.is_eof() { |
2012 | 0 | self.parser().pos.set(start); |
2013 | 0 | return None; |
2014 | 60 | } |
2015 | 60 | let name = &self.pattern()[name_start..self.offset()]; |
2016 | 60 | if !self.bump_if(":]") { |
2017 | 0 | self.parser().pos.set(start); |
2018 | 0 | return None; |
2019 | 60 | } |
2020 | 60 | let kind = match ast::ClassAsciiKind::from_name(name) { |
2021 | 60 | Some(kind) => kind, |
2022 | | None => { |
2023 | 0 | self.parser().pos.set(start); |
2024 | 0 | return None; |
2025 | | } |
2026 | | }; |
2027 | 60 | Some(ast::ClassAscii { |
2028 | 60 | span: Span::new(start, self.pos()), |
2029 | 60 | kind: kind, |
2030 | 60 | negated: negated, |
2031 | 60 | }) |
2032 | 60 | } |
2033 | | |
2034 | | /// Parse a Unicode class in either the single character notation, `\pN` |
2035 | | /// or the multi-character bracketed notation, `\p{Greek}`. This assumes |
2036 | | /// the parser is positioned at the `p` (or `P` for negation) and will |
2037 | | /// advance the parser to the character immediately following the class. |
2038 | | /// |
2039 | | /// Note that this does not check whether the class name is valid or not. |
2040 | | #[inline(never)] |
2041 | 0 | fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> { |
2042 | 0 | assert!(self.char() == 'p' || self.char() == 'P'); |
2043 | | |
2044 | 0 | let mut scratch = self.parser().scratch.borrow_mut(); |
2045 | 0 | scratch.clear(); |
2046 | 0 |
|
2047 | 0 | let negated = self.char() == 'P'; |
2048 | 0 | if !self.bump_and_bump_space() { |
2049 | 0 | return Err( |
2050 | 0 | self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) |
2051 | 0 | ); |
2052 | 0 | } |
2053 | 0 | let (start, kind) = if self.char() == '{' { |
2054 | 0 | let start = self.span_char().end; |
2055 | 0 | while self.bump_and_bump_space() && self.char() != '}' { |
2056 | 0 | scratch.push(self.char()); |
2057 | 0 | } |
2058 | 0 | if self.is_eof() { |
2059 | 0 | return Err(self |
2060 | 0 | .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); |
2061 | 0 | } |
2062 | 0 | assert_eq!(self.char(), '}'); |
2063 | 0 | self.bump(); |
2064 | 0 |
|
2065 | 0 | let name = scratch.as_str(); |
2066 | 0 | if let Some(i) = name.find("!=") { |
2067 | 0 | ( |
2068 | 0 | start, |
2069 | 0 | ast::ClassUnicodeKind::NamedValue { |
2070 | 0 | op: ast::ClassUnicodeOpKind::NotEqual, |
2071 | 0 | name: name[..i].to_string(), |
2072 | 0 | value: name[i + 2..].to_string(), |
2073 | 0 | }, |
2074 | 0 | ) |
2075 | 0 | } else if let Some(i) = name.find(':') { |
2076 | 0 | ( |
2077 | 0 | start, |
2078 | 0 | ast::ClassUnicodeKind::NamedValue { |
2079 | 0 | op: ast::ClassUnicodeOpKind::Colon, |
2080 | 0 | name: name[..i].to_string(), |
2081 | 0 | value: name[i + 1..].to_string(), |
2082 | 0 | }, |
2083 | 0 | ) |
2084 | 0 | } else if let Some(i) = name.find('=') { |
2085 | 0 | ( |
2086 | 0 | start, |
2087 | 0 | ast::ClassUnicodeKind::NamedValue { |
2088 | 0 | op: ast::ClassUnicodeOpKind::Equal, |
2089 | 0 | name: name[..i].to_string(), |
2090 | 0 | value: name[i + 1..].to_string(), |
2091 | 0 | }, |
2092 | 0 | ) |
2093 | | } else { |
2094 | 0 | (start, ast::ClassUnicodeKind::Named(name.to_string())) |
2095 | | } |
2096 | | } else { |
2097 | 0 | let start = self.pos(); |
2098 | 0 | let c = self.char(); |
2099 | 0 | if c == '\\' { |
2100 | 0 | return Err(self.error( |
2101 | 0 | self.span_char(), |
2102 | 0 | ast::ErrorKind::UnicodeClassInvalid, |
2103 | 0 | )); |
2104 | 0 | } |
2105 | 0 | self.bump_and_bump_space(); |
2106 | 0 | let kind = ast::ClassUnicodeKind::OneLetter(c); |
2107 | 0 | (start, kind) |
2108 | | }; |
2109 | 0 | Ok(ast::ClassUnicode { |
2110 | 0 | span: Span::new(start, self.pos()), |
2111 | 0 | negated: negated, |
2112 | 0 | kind: kind, |
2113 | 0 | }) |
2114 | 0 | } |
2115 | | |
2116 | | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
2117 | | /// parser is currently at a valid character class name and will be |
2118 | | /// advanced to the character immediately following the class. |
2119 | | #[inline(never)] |
2120 | 0 | fn parse_perl_class(&self) -> ast::ClassPerl { |
2121 | 0 | let c = self.char(); |
2122 | 0 | let span = self.span_char(); |
2123 | 0 | self.bump(); |
2124 | 0 | let (negated, kind) = match c { |
2125 | 0 | 'd' => (false, ast::ClassPerlKind::Digit), |
2126 | 0 | 'D' => (true, ast::ClassPerlKind::Digit), |
2127 | 0 | 's' => (false, ast::ClassPerlKind::Space), |
2128 | 0 | 'S' => (true, ast::ClassPerlKind::Space), |
2129 | 0 | 'w' => (false, ast::ClassPerlKind::Word), |
2130 | 0 | 'W' => (true, ast::ClassPerlKind::Word), |
2131 | 0 | c => panic!("expected valid Perl class but got '{}'", c), |
2132 | | }; |
2133 | 0 | ast::ClassPerl { span: span, kind: kind, negated: negated } |
2134 | 0 | } |
2135 | | } |
2136 | | |
2137 | | /// A type that traverses a fully parsed Ast and checks whether its depth |
2138 | | /// exceeds the specified nesting limit. If it does, then an error is returned. |
2139 | 0 | #[derive(Debug)] |
2140 | | struct NestLimiter<'p, 's, P> { |
2141 | | /// The parser that is checking the nest limit. |
2142 | | p: &'p ParserI<'s, P>, |
2143 | | /// The current depth while walking an Ast. |
2144 | | depth: u32, |
2145 | | } |
2146 | | |
2147 | | impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> { |
2148 | 6 | fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { |
2149 | 6 | NestLimiter { p: p, depth: 0 } |
2150 | 6 | } |
2151 | | |
2152 | | #[inline(never)] |
2153 | 6 | fn check(self, ast: &Ast) -> Result<()> { |
2154 | 6 | ast::visit(ast, self) |
2155 | 6 | } |
2156 | | |
2157 | 256 | fn increment_depth(&mut self, span: &Span) -> Result<()> { |
2158 | 256 | let new = self.depth.checked_add(1).ok_or_else(|| { |
2159 | 0 | self.p.error( |
2160 | 0 | span.clone(), |
2161 | 0 | ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), |
2162 | 0 | ) |
2163 | 256 | })?; |
2164 | 256 | let limit = self.p.parser().nest_limit; |
2165 | 256 | if new > limit { |
2166 | 0 | return Err(self.p.error( |
2167 | 0 | span.clone(), |
2168 | 0 | ast::ErrorKind::NestLimitExceeded(limit), |
2169 | 0 | )); |
2170 | 256 | } |
2171 | 256 | self.depth = new; |
2172 | 256 | Ok(()) |
2173 | 256 | } |
2174 | | |
2175 | 256 | fn decrement_depth(&mut self) { |
2176 | 256 | // Assuming the correctness of the visitor, this should never drop |
2177 | 256 | // below 0. |
2178 | 256 | self.depth = self.depth.checked_sub(1).unwrap(); |
2179 | 256 | } |
2180 | | } |
2181 | | |
2182 | | impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> { |
2183 | | type Output = (); |
2184 | | type Err = ast::Error; |
2185 | | |
2186 | 6 | fn finish(self) -> Result<()> { |
2187 | 6 | Ok(()) |
2188 | 6 | } |
2189 | | |
2190 | 448 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { |
2191 | 248 | let span = match *ast { |
2192 | | Ast::Empty(_) |
2193 | | | Ast::Flags(_) |
2194 | | | Ast::Literal(_) |
2195 | | | Ast::Dot(_) |
2196 | | | Ast::Assertion(_) |
2197 | | | Ast::Class(ast::Class::Unicode(_)) |
2198 | | | Ast::Class(ast::Class::Perl(_)) => { |
2199 | | // These are all base cases, so we don't increment depth. |
2200 | 200 | return Ok(()); |
2201 | | } |
2202 | 68 | Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, |
2203 | 100 | Ast::Repetition(ref x) => &x.span, |
2204 | 36 | Ast::Group(ref x) => &x.span, |
2205 | 4 | Ast::Alternation(ref x) => &x.span, |
2206 | 40 | Ast::Concat(ref x) => &x.span, |
2207 | | }; |
2208 | 248 | self.increment_depth(span) |
2209 | 448 | } |
2210 | | |
2211 | | fn visit_post(&mut self, ast: &Ast) -> Result<()> { |
2212 | 68 | match *ast { |
2213 | | Ast::Empty(_) |
2214 | | | Ast::Flags(_) |
2215 | | | Ast::Literal(_) |
2216 | | | Ast::Dot(_) |
2217 | | | Ast::Assertion(_) |
2218 | | | Ast::Class(ast::Class::Unicode(_)) |
2219 | | | Ast::Class(ast::Class::Perl(_)) => { |
2220 | | // These are all base cases, so we don't decrement depth. |
2221 | 200 | Ok(()) |
2222 | | } |
2223 | | Ast::Class(ast::Class::Bracketed(_)) |
2224 | | | Ast::Repetition(_) |
2225 | | | Ast::Group(_) |
2226 | | | Ast::Alternation(_) |
2227 | | | Ast::Concat(_) => { |
2228 | 248 | self.decrement_depth(); |
2229 | 248 | Ok(()) |
2230 | | } |
2231 | | } |
2232 | 448 | } |
2233 | | |
2234 | 84 | fn visit_class_set_item_pre( |
2235 | 84 | &mut self, |
2236 | 84 | ast: &ast::ClassSetItem, |
2237 | 84 | ) -> Result<()> { |
2238 | 84 | let span = match *ast { |
2239 | | ast::ClassSetItem::Empty(_) |
2240 | | | ast::ClassSetItem::Literal(_) |
2241 | | | ast::ClassSetItem::Range(_) |
2242 | | | ast::ClassSetItem::Ascii(_) |
2243 | | | ast::ClassSetItem::Unicode(_) |
2244 | | | ast::ClassSetItem::Perl(_) => { |
2245 | | // These are all base cases, so we don't increment depth. |
2246 | 76 | return Ok(()); |
2247 | | } |
2248 | 0 | ast::ClassSetItem::Bracketed(ref x) => &x.span, |
2249 | 8 | ast::ClassSetItem::Union(ref x) => &x.span, |
2250 | | }; |
2251 | 8 | self.increment_depth(span) |
2252 | 84 | } |
2253 | | |
2254 | 84 | fn visit_class_set_item_post( |
2255 | 84 | &mut self, |
2256 | 84 | ast: &ast::ClassSetItem, |
2257 | 84 | ) -> Result<()> { |
2258 | 84 | match *ast { |
2259 | | ast::ClassSetItem::Empty(_) |
2260 | | | ast::ClassSetItem::Literal(_) |
2261 | | | ast::ClassSetItem::Range(_) |
2262 | | | ast::ClassSetItem::Ascii(_) |
2263 | | | ast::ClassSetItem::Unicode(_) |
2264 | | | ast::ClassSetItem::Perl(_) => { |
2265 | | // These are all base cases, so we don't decrement depth. |
2266 | 76 | Ok(()) |
2267 | | } |
2268 | | ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => { |
2269 | 8 | self.decrement_depth(); |
2270 | 8 | Ok(()) |
2271 | | } |
2272 | | } |
2273 | 84 | } |
2274 | | |
2275 | 0 | fn visit_class_set_binary_op_pre( |
2276 | 0 | &mut self, |
2277 | 0 | ast: &ast::ClassSetBinaryOp, |
2278 | 0 | ) -> Result<()> { |
2279 | 0 | self.increment_depth(&ast.span) |
2280 | 0 | } |
2281 | | |
2282 | 0 | fn visit_class_set_binary_op_post( |
2283 | 0 | &mut self, |
2284 | 0 | _ast: &ast::ClassSetBinaryOp, |
2285 | 0 | ) -> Result<()> { |
2286 | 0 | self.decrement_depth(); |
2287 | 0 | Ok(()) |
2288 | 0 | } |
2289 | | } |
2290 | | |
2291 | | /// When the result is an error, transforms the ast::ErrorKind from the source |
2292 | | /// Result into another one. This function is used to return clearer error |
2293 | | /// messages when possible. |
2294 | | fn specialize_err<T>( |
2295 | | result: Result<T>, |
2296 | | from: ast::ErrorKind, |
2297 | | to: ast::ErrorKind, |
2298 | | ) -> Result<T> { |
2299 | 0 | if let Err(e) = result { |
2300 | 0 | if e.kind == from { |
2301 | 0 | Err(ast::Error { kind: to, pattern: e.pattern, span: e.span }) |
2302 | | } else { |
2303 | 0 | Err(e) |
2304 | | } |
2305 | | } else { |
2306 | 0 | result |
2307 | | } |
2308 | 0 | } |
2309 | | |
2310 | | #[cfg(test)] |
2311 | | mod tests { |
2312 | | use std::ops::Range; |
2313 | | |
2314 | | use super::{Parser, ParserBuilder, ParserI, Primitive}; |
2315 | | use crate::ast::{self, Ast, Position, Span}; |
2316 | | |
2317 | | // Our own assert_eq, which has slightly better formatting (but honestly |
2318 | | // still kind of crappy). |
2319 | | macro_rules! assert_eq { |
2320 | | ($left:expr, $right:expr) => {{ |
2321 | | match (&$left, &$right) { |
2322 | | (left_val, right_val) => { |
2323 | | if !(*left_val == *right_val) { |
2324 | | panic!( |
2325 | | "assertion failed: `(left == right)`\n\n\ |
2326 | | left: `{:?}`\nright: `{:?}`\n\n", |
2327 | | left_val, right_val |
2328 | | ) |
2329 | | } |
2330 | | } |
2331 | | } |
2332 | | }}; |
2333 | | } |
2334 | | |
2335 | | // We create these errors to compare with real ast::Errors in the tests. |
2336 | | // We define equality between TestError and ast::Error to disregard the |
2337 | | // pattern string in ast::Error, which is annoying to provide in tests. |
2338 | | #[derive(Clone, Debug)] |
2339 | | struct TestError { |
2340 | | span: Span, |
2341 | | kind: ast::ErrorKind, |
2342 | | } |
2343 | | |
2344 | | impl PartialEq<ast::Error> for TestError { |
2345 | | fn eq(&self, other: &ast::Error) -> bool { |
2346 | | self.span == other.span && self.kind == other.kind |
2347 | | } |
2348 | | } |
2349 | | |
2350 | | impl PartialEq<TestError> for ast::Error { |
2351 | | fn eq(&self, other: &TestError) -> bool { |
2352 | | self.span == other.span && self.kind == other.kind |
2353 | | } |
2354 | | } |
2355 | | |
2356 | | fn s(str: &str) -> String { |
2357 | | str.to_string() |
2358 | | } |
2359 | | |
2360 | | fn parser(pattern: &str) -> ParserI<'_, Parser> { |
2361 | | ParserI::new(Parser::new(), pattern) |
2362 | | } |
2363 | | |
2364 | | fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { |
2365 | | let parser = ParserBuilder::new().octal(true).build(); |
2366 | | ParserI::new(parser, pattern) |
2367 | | } |
2368 | | |
2369 | | fn parser_nest_limit( |
2370 | | pattern: &str, |
2371 | | nest_limit: u32, |
2372 | | ) -> ParserI<'_, Parser> { |
2373 | | let p = ParserBuilder::new().nest_limit(nest_limit).build(); |
2374 | | ParserI::new(p, pattern) |
2375 | | } |
2376 | | |
2377 | | fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { |
2378 | | let p = ParserBuilder::new().ignore_whitespace(true).build(); |
2379 | | ParserI::new(p, pattern) |
2380 | | } |
2381 | | |
2382 | | /// Short alias for creating a new span. |
2383 | | fn nspan(start: Position, end: Position) -> Span { |
2384 | | Span::new(start, end) |
2385 | | } |
2386 | | |
2387 | | /// Short alias for creating a new position. |
2388 | | fn npos(offset: usize, line: usize, column: usize) -> Position { |
2389 | | Position::new(offset, line, column) |
2390 | | } |
2391 | | |
2392 | | /// Create a new span from the given offset range. This assumes a single |
2393 | | /// line and sets the columns based on the offsets. i.e., This only works |
2394 | | /// out of the box for ASCII, which is fine for most tests. |
2395 | | fn span(range: Range<usize>) -> Span { |
2396 | | let start = Position::new(range.start, 1, range.start + 1); |
2397 | | let end = Position::new(range.end, 1, range.end + 1); |
2398 | | Span::new(start, end) |
2399 | | } |
2400 | | |
2401 | | /// Create a new span for the corresponding byte range in the given string. |
2402 | | fn span_range(subject: &str, range: Range<usize>) -> Span { |
2403 | | let start = Position { |
2404 | | offset: range.start, |
2405 | | line: 1 + subject[..range.start].matches('\n').count(), |
2406 | | column: 1 + subject[..range.start] |
2407 | | .chars() |
2408 | | .rev() |
2409 | | .position(|c| c == '\n') |
2410 | | .unwrap_or(subject[..range.start].chars().count()), |
2411 | | }; |
2412 | | let end = Position { |
2413 | | offset: range.end, |
2414 | | line: 1 + subject[..range.end].matches('\n').count(), |
2415 | | column: 1 + subject[..range.end] |
2416 | | .chars() |
2417 | | .rev() |
2418 | | .position(|c| c == '\n') |
2419 | | .unwrap_or(subject[..range.end].chars().count()), |
2420 | | }; |
2421 | | Span::new(start, end) |
2422 | | } |
2423 | | |
2424 | | /// Create a verbatim literal starting at the given position. |
2425 | | fn lit(c: char, start: usize) -> Ast { |
2426 | | lit_with(c, span(start..start + c.len_utf8())) |
2427 | | } |
2428 | | |
2429 | | /// Create a punctuation literal starting at the given position. |
2430 | | fn punct_lit(c: char, span: Span) -> Ast { |
2431 | | Ast::Literal(ast::Literal { |
2432 | | span: span, |
2433 | | kind: ast::LiteralKind::Punctuation, |
2434 | | c: c, |
2435 | | }) |
2436 | | } |
2437 | | |
2438 | | /// Create a verbatim literal with the given span. |
2439 | | fn lit_with(c: char, span: Span) -> Ast { |
2440 | | Ast::Literal(ast::Literal { |
2441 | | span: span, |
2442 | | kind: ast::LiteralKind::Verbatim, |
2443 | | c: c, |
2444 | | }) |
2445 | | } |
2446 | | |
2447 | | /// Create a concatenation with the given range. |
2448 | | fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast { |
2449 | | concat_with(span(range), asts) |
2450 | | } |
2451 | | |
2452 | | /// Create a concatenation with the given span. |
2453 | | fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { |
2454 | | Ast::Concat(ast::Concat { span: span, asts: asts }) |
2455 | | } |
2456 | | |
2457 | | /// Create an alternation with the given span. |
2458 | | fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { |
2459 | | Ast::Alternation(ast::Alternation { span: span(range), asts: asts }) |
2460 | | } |
2461 | | |
2462 | | /// Create a capturing group with the given span. |
2463 | | fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { |
2464 | | Ast::Group(ast::Group { |
2465 | | span: span(range), |
2466 | | kind: ast::GroupKind::CaptureIndex(index), |
2467 | | ast: Box::new(ast), |
2468 | | }) |
2469 | | } |
2470 | | |
2471 | | /// Create an ast::SetFlags. |
2472 | | /// |
2473 | | /// The given pattern should be the full pattern string. The range given |
2474 | | /// should correspond to the byte offsets where the flag set occurs. |
2475 | | /// |
2476 | | /// If negated is true, then the set is interpreted as beginning with a |
2477 | | /// negation. |
2478 | | fn flag_set( |
2479 | | pat: &str, |
2480 | | range: Range<usize>, |
2481 | | flag: ast::Flag, |
2482 | | negated: bool, |
2483 | | ) -> Ast { |
2484 | | let mut items = vec![ast::FlagsItem { |
2485 | | span: span_range(pat, (range.end - 2)..(range.end - 1)), |
2486 | | kind: ast::FlagsItemKind::Flag(flag), |
2487 | | }]; |
2488 | | if negated { |
2489 | | items.insert( |
2490 | | 0, |
2491 | | ast::FlagsItem { |
2492 | | span: span_range(pat, (range.start + 2)..(range.end - 2)), |
2493 | | kind: ast::FlagsItemKind::Negation, |
2494 | | }, |
2495 | | ); |
2496 | | } |
2497 | | Ast::Flags(ast::SetFlags { |
2498 | | span: span_range(pat, range.clone()), |
2499 | | flags: ast::Flags { |
2500 | | span: span_range(pat, (range.start + 2)..(range.end - 1)), |
2501 | | items: items, |
2502 | | }, |
2503 | | }) |
2504 | | } |
2505 | | |
#[test]
fn parse_nest_limit() {
    // Asserts that parsing `pattern` under `limit` fails with
    // `NestLimitExceeded(limit)` reported at the given offsets.
    fn assert_exceeded(pattern: &str, limit: u32, at: Range<usize>) {
        assert_eq!(
            parser_nest_limit(pattern, limit).parse().unwrap_err(),
            TestError {
                span: span(at),
                kind: ast::ErrorKind::NestLimitExceeded(limit),
            }
        );
    }

    // Builds the expected node for a greedy repetition.
    fn greedy(
        whole: Range<usize>,
        op: Range<usize>,
        kind: ast::RepetitionKind,
        inner: Ast,
    ) -> Ast {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind: kind },
            greedy: true,
            ast: Box::new(inner),
        })
    }

    // A nest limit of 0 still allows some types of regexes.
    assert_eq!(
        parser_nest_limit("", 0).parse(),
        Ok(Ast::Empty(span(0..0)))
    );
    assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));

    // Repetition operators need one level of nesting each.
    assert_exceeded("a+", 0, 0..2);
    assert_eq!(
        parser_nest_limit("a+", 1).parse(),
        Ok(greedy(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            lit('a', 0)
        ))
    );
    assert_exceeded("(a)+", 1, 0..3);
    assert_exceeded("a+*", 1, 0..2);
    assert_eq!(
        parser_nest_limit("a+*", 2).parse(),
        Ok(greedy(
            0..3,
            2..3,
            ast::RepetitionKind::ZeroOrMore,
            greedy(0..2, 1..2, ast::RepetitionKind::OneOrMore, lit('a', 0))
        ))
    );

    // A concatenation needs one level of nesting.
    assert_exceeded("ab", 0, 0..2);
    assert_eq!(
        parser_nest_limit("ab", 1).parse(),
        Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
    );
    assert_eq!(
        parser_nest_limit("abc", 1).parse(),
        Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
    );

    // An alternation needs one level of nesting.
    assert_exceeded("a|b", 0, 0..3);
    assert_eq!(
        parser_nest_limit("a|b", 1).parse(),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser_nest_limit("a|b|c", 1).parse(),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
    );

    // Character classes form their own mini-recursive syntax!
    assert_exceeded("[a]", 0, 0..3);
    let single_item = ast::ClassSetItem::Literal(ast::Literal {
        span: span(1..2),
        kind: ast::LiteralKind::Verbatim,
        c: 'a',
    });
    assert_eq!(
        parser_nest_limit("[a]", 1).parse(),
        Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
            span: span(0..3),
            negated: false,
            kind: ast::ClassSet::Item(single_item),
        })))
    );
    assert_exceeded("[ab]", 1, 1..3);
    assert_exceeded("[ab[cd]]", 2, 3..7);
    assert_exceeded("[ab[cd]]", 3, 4..6);
    assert_exceeded("[a--b]", 1, 1..5);
    assert_exceeded("[a--bc]", 2, 4..6);
}
2663 | | |
#[test]
fn parse_comments() {
    // The pattern is written one source line per piece so that every byte,
    // including the newlines and the two spaces that precede comment 3, is
    // explicit; the offsets asserted below depend on them.
    let pat = concat!(
        "(?x)\n",
        "# This is comment 1.\n",
        "foo # This is comment 2.\n",
        "  # This is comment 3.\n",
        "bar\n",
        "# This is comment 4.",
    );
    let astc = parser(pat).parse_with_comments().unwrap();

    // Only the flag directive and the letters of `foo` and `bar` end up in
    // the AST.
    let mut expected_asts =
        vec![flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false)];
    for &(word, start) in &[("foo", 26), ("bar", 74)] {
        expected_asts.extend(word.chars().enumerate().map(|(i, c)| {
            let at = start + i;
            lit_with(c, span_range(pat, at..at + 1))
        }));
    }
    assert_eq!(
        astc.ast,
        concat_with(span_range(pat, 0..pat.len()), expected_asts)
    );

    // Each comment is reported separately, with its span and its text.
    let expected_comments: Vec<ast::Comment> = vec![
        (5..26, " This is comment 1."),
        (30..51, " This is comment 2."),
        (53..74, " This is comment 3."),
        (78..98, " This is comment 4."),
    ]
    .into_iter()
    .map(|(range, text)| ast::Comment {
        span: span_range(pat, range),
        comment: s(text),
    })
    .collect();
    assert_eq!(astc.comments, expected_comments);
}
2710 | | |
#[test]
fn parse_holistic() {
    // A lone closing bracket is an ordinary literal.
    assert_eq!(parser("]").parse(), Ok(lit(']', 0)));

    // Every escaped punctuation character takes two bytes in the pattern, so
    // the i-th one spans `2i..2i + 2`.
    let escaped = [
        '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^',
        '$', '#', '&', '-', '~',
    ];
    let expected: Vec<Ast> = escaped
        .iter()
        .enumerate()
        .map(|(i, &c)| punct_lit(c, span(2 * i..2 * i + 2)))
        .collect();
    assert_eq!(
        parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
        Ok(concat(0..36, expected))
    );
}
2741 | | |
#[test]
fn parse_ignore_whitespace() {
    // The leading `(?x)` directive that most patterns below start with.
    fn x_on(pat: &str) -> Ast {
        flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false)
    }

    // Asserts that `pat` parses to a concatenation of `items` that spans the
    // whole pattern.
    fn assert_concat(pat: &str, items: Vec<Ast>) {
        assert_eq!(
            parser(pat).parse(),
            Ok(concat_with(span_range(pat, 0..pat.len()), items))
        );
    }

    // Test that basic whitespace insensitivity works.
    let pat = "(?x)a b";
    assert_concat(pat, vec![x_on(pat), lit('a', 4), lit('b', 6)]);

    // Test that we can toggle whitespace insensitivity.
    let pat = "(?x)a b(?-x)a b";
    assert_concat(
        pat,
        vec![
            x_on(pat),
            lit('a', 4),
            lit('b', 6),
            flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
            lit('a', 12),
            lit(' ', 13),
            lit('b', 14),
        ],
    );

    // Test that nesting whitespace insensitive flags works.
    let pat = "a (?x:a )a ";
    let scoped_flags = ast::Flags {
        span: span(4..5),
        items: vec![ast::FlagsItem {
            span: span(4..5),
            kind: ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace),
        }],
    };
    assert_concat(
        pat,
        vec![
            lit('a', 0),
            lit(' ', 1),
            Ast::Group(ast::Group {
                span: span(2..9),
                kind: ast::GroupKind::NonCapturing(scoped_flags),
                ast: Box::new(lit('a', 6)),
            }),
            lit('a', 9),
            lit(' ', 10),
        ],
    );

    // Test that whitespace after an opening paren is insignificant.
    let pat = "(?x)( ?P<foo> a )";
    assert_concat(
        pat,
        vec![
            x_on(pat),
            Ast::Group(ast::Group {
                span: span(4..pat.len()),
                kind: ast::GroupKind::CaptureName(ast::CaptureName {
                    span: span(9..12),
                    name: s("foo"),
                    index: 1,
                }),
                ast: Box::new(lit('a', 14)),
            }),
        ],
    );
    // Two spaces follow the paren here: the literal sits at offset 7.
    let pat = "(?x)(  a )";
    assert_concat(
        pat,
        vec![x_on(pat), group(4..pat.len(), 1, lit('a', 7))],
    );
    // Two spaces before `?:` and two after it: the (empty) flags sit at
    // offset 8 and the literal at offset 11.
    let pat = "(?x)(  ?:  a )";
    assert_concat(
        pat,
        vec![
            x_on(pat),
            Ast::Group(ast::Group {
                span: span(4..pat.len()),
                kind: ast::GroupKind::NonCapturing(ast::Flags {
                    span: span(8..8),
                    items: vec![],
                }),
                ast: Box::new(lit('a', 11)),
            }),
        ],
    );
    let pat = r"(?x)\x { 53 }";
    assert_concat(
        pat,
        vec![
            x_on(pat),
            Ast::Literal(ast::Literal {
                span: span(4..13),
                kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
                c: 'S',
            }),
        ],
    );

    // Test that whitespace after an escape is OK.
    let pat = r"(?x)\ ";
    assert_concat(
        pat,
        vec![
            x_on(pat),
            Ast::Literal(ast::Literal {
                span: span(4..6),
                kind: ast::LiteralKind::Special(
                    ast::SpecialLiteralKind::Space,
                ),
                c: ' ',
            }),
        ],
    );
    // ... but only when `x` mode is enabled.
    let pat = r"\ ";
    assert_eq!(
        parser(pat).parse().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        }
    );
}
2903 | | |
#[test]
fn parse_newlines() {
    // A newline between two dots is an ordinary literal.
    let pat = ".\n.";
    let at = |range: Range<usize>| span_range(pat, range);
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            at(0..3),
            vec![Ast::Dot(at(0..1)), lit_with('\n', at(1..2)), Ast::Dot(at(2..3))]
        ))
    );

    // Each entry is the (offset, line, column) at which one character of the
    // pattern begins; the final entry is where the last character ends. Every
    // literal therefore runs from one entry to the next.
    let pat = "foobar\nbaz\nquux\n";
    let stops = [
        (0, 1, 1),
        (1, 1, 2),
        (2, 1, 3),
        (3, 1, 4),
        (4, 1, 5),
        (5, 1, 6),
        (6, 1, 7),
        (7, 2, 1),
        (8, 2, 2),
        (9, 2, 3),
        (10, 2, 4),
        (11, 3, 1),
        (12, 3, 2),
        (13, 3, 3),
        (14, 3, 4),
        (15, 3, 5),
        (16, 4, 1),
    ];
    let expected: Vec<Ast> = pat
        .chars()
        .zip(stops.windows(2))
        .map(|(c, pair)| {
            let (from, to) = (pair[0], pair[1]);
            lit_with(
                c,
                nspan(npos(from.0, from.1, from.2), npos(to.0, to.1, to.2)),
            )
        })
        .collect();
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(span_range(pat, 0..pat.len()), expected))
    );
}
2945 | | |
#[test]
fn parse_uncounted_repetition() {
    // Builds the expected repetition node: `whole` is the span of the entire
    // repetition and `op` the span of the operator (including a trailing `?`
    // when the repetition is lazy).
    fn rep(
        whole: Range<usize>,
        op: Range<usize>,
        kind: ast::RepetitionKind,
        greedy: bool,
        inner: Ast,
    ) -> Ast {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind: kind },
            greedy: greedy,
            ast: Box::new(inner),
        })
    }

    // Same as `rep`, fixed to the `?` operator.
    fn optional(
        whole: Range<usize>,
        op: Range<usize>,
        greedy: bool,
        inner: Ast,
    ) -> Ast {
        rep(whole, op, ast::RepetitionKind::ZeroOrOne, greedy, inner)
    }

    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(0..2, 1..2, ast::RepetitionKind::ZeroOrMore, true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(0..2, 1..2, ast::RepetitionKind::OneOrMore, true, lit('a', 0)))
    );

    assert_eq!(
        parser(r"a?").parse(),
        Ok(optional(0..2, 1..2, true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a??").parse(),
        Ok(optional(0..3, 1..3, false, lit('a', 0)))
    );
    // This repeats the `a?` assertion above.
    assert_eq!(
        parser(r"a?").parse(),
        Ok(optional(0..2, 1..2, true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(
            0..3,
            vec![optional(0..2, 1..2, true, lit('a', 0)), lit('b', 2)]
        ))
    );
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(
            0..4,
            vec![optional(0..3, 1..3, false, lit('a', 0)), lit('b', 3)]
        ))
    );
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(
            0..3,
            vec![lit('a', 0), optional(1..3, 2..3, true, lit('b', 1))]
        ))
    );
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(optional(
            0..5,
            4..5,
            true,
            group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)]))
        ))
    );
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(
            0..3,
            vec![
                Ast::Empty(span(0..0)),
                optional(1..3, 2..3, true, lit('a', 1)),
            ]
        ))
    );

    // An operator with nothing to repeat is rejected; the error is an empty
    // span at the listed offset.
    let missing = [
        (r"*", 0),
        (r"(?i)*", 4),
        (r"(*)", 1),
        (r"(?:?)", 3),
        (r"+", 0),
        (r"?", 0),
        (r"(?)", 1),
        (r"|*", 1),
        (r"|+", 1),
        (r"|?", 1),
    ];
    for &(pattern, at) in &missing {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::RepetitionMissing,
            }
        );
    }
}
3169 | | |
#[test]
fn parse_counted_repetition() {
    // Builds the expected node for a `{...}` repetition: `whole` is the span
    // of the entire repetition and `op` the span of the operator (including a
    // trailing `?` when the repetition is lazy).
    fn counted(
        whole: Range<usize>,
        op: Range<usize>,
        bounds: ast::RepetitionRange,
        greedy: bool,
        inner: Ast,
    ) -> Ast {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp {
                span: span(op),
                kind: ast::RepetitionKind::Range(bounds),
            },
            greedy: greedy,
            ast: Box::new(inner),
        })
    }

    assert_eq!(
        parser(r"a{5}").parse(),
        Ok(counted(
            0..4,
            1..4,
            ast::RepetitionRange::Exactly(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5,}").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::AtLeast(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5,9}").parse(),
        Ok(counted(
            0..6,
            1..6,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5}?").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::Exactly(5),
            false,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"ab{5}").parse(),
        Ok(concat(
            0..5,
            vec![
                lit('a', 0),
                counted(
                    1..5,
                    2..5,
                    ast::RepetitionRange::Exactly(5),
                    true,
                    lit('b', 1)
                ),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab{5}c").parse(),
        Ok(concat(
            0..6,
            vec![
                lit('a', 0),
                counted(
                    1..5,
                    2..5,
                    ast::RepetitionRange::Exactly(5),
                    true,
                    lit('b', 1)
                ),
                lit('c', 5),
            ]
        ))
    );

    // Spaces inside the braces are accepted.
    assert_eq!(
        parser(r"a{ 5 }").parse(),
        Ok(counted(
            0..6,
            1..6,
            ast::RepetitionRange::Exactly(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{ 5 , 9 }").parse(),
        Ok(counted(
            0..10,
            1..10,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            lit('a', 0)
        ))
    );
    // With whitespace ignored, a space may separate the braces from the `?`.
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(counted(
            0..8,
            1..8,
            ast::RepetitionRange::Bounded(5, 9),
            false,
            lit('a', 0)
        ))
    );

    // Malformed or misplaced counted repetitions, each with the span and the
    // kind of the error that is expected.
    let failures = vec![
        (r"(?i){0}", 4..4, ast::ErrorKind::RepetitionMissing),
        (r"(?m){1,1}", 4..4, ast::ErrorKind::RepetitionMissing),
        (r"a{]}", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{1,]}", 4..4, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{", 1..2, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{}", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{a", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{9999999999}", 2..12, ast::ErrorKind::DecimalInvalid),
        (r"a{9", 1..3, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,a", 4..4, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{9,9999999999}", 4..14, ast::ErrorKind::DecimalInvalid),
        (r"a{9,", 1..4, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,11", 1..6, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{2,1}", 1..6, ast::ErrorKind::RepetitionCountInvalid),
        (r"{5}", 0..0, ast::ErrorKind::RepetitionMissing),
        (r"|{5}", 1..1, ast::ErrorKind::RepetitionMissing),
    ];
    for (pattern, at, kind) in failures {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(at), kind: kind }
        );
    }
}
3426 | | |
3427 | | #[test] |
3428 | | fn parse_alternate() { |
3429 | | assert_eq!( |
3430 | | parser(r"a|b").parse(), |
3431 | | Ok(Ast::Alternation(ast::Alternation { |
3432 | | span: span(0..3), |
3433 | | asts: vec![lit('a', 0), lit('b', 2)], |
3434 | | })) |
3435 | | ); |
3436 | | assert_eq!( |
3437 | | parser(r"(a|b)").parse(), |
3438 | | Ok(group( |
3439 | | 0..5, |
3440 | | 1, |
3441 | | Ast::Alternation(ast::Alternation { |
3442 | | span: span(1..4), |
3443 | | asts: vec![lit('a', 1), lit('b', 3)], |
3444 | | }) |
3445 | | )) |
3446 | | ); |
3447 | | |
3448 | | assert_eq!( |
3449 | | parser(r"a|b|c").parse(), |
3450 | | Ok(Ast::Alternation(ast::Alternation { |
3451 | | span: span(0..5), |
3452 | | asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], |
3453 | | })) |
3454 | | ); |
3455 | | assert_eq!( |
3456 | | parser(r"ax|by|cz").parse(), |
3457 | | Ok(Ast::Alternation(ast::Alternation { |
3458 | | span: span(0..8), |
3459 | | asts: vec![ |
3460 | | concat(0..2, vec![lit('a', 0), lit('x', 1)]), |
3461 | | concat(3..5, vec![lit('b', 3), lit('y', 4)]), |
3462 | | concat(6..8, vec![lit('c', 6), lit('z', 7)]), |
3463 | | ], |
3464 | | })) |
3465 | | ); |
3466 | | assert_eq!( |
3467 | | parser(r"(ax|by|cz)").parse(), |
3468 | | Ok(group( |
3469 | | 0..10, |
3470 | | 1, |
3471 | | Ast::Alternation(ast::Alternation { |
3472 | | span: span(1..9), |
3473 | | asts: vec![ |
3474 | | concat(1..3, vec![lit('a', 1), lit('x', 2)]), |
3475 | | concat(4..6, vec![lit('b', 4), lit('y', 5)]), |
3476 | | concat(7..9, vec![lit('c', 7), lit('z', 8)]), |
3477 | | ], |
3478 | | }) |
3479 | | )) |
3480 | | ); |
3481 | | assert_eq!( |
3482 | | parser(r"(ax|(by|(cz)))").parse(), |
3483 | | Ok(group( |
3484 | | 0..14, |
3485 | | 1, |
3486 | | alt( |
3487 | | 1..13, |
3488 | | vec![ |
3489 | | concat(1..3, vec![lit('a', 1), lit('x', 2)]), |
3490 | | group( |
3491 | | 4..13, |
3492 | | 2, |
3493 | | alt( |
3494 | | 5..12, |
3495 | | vec![ |
3496 | | concat( |
3497 | | 5..7, |
3498 | | vec![lit('b', 5), lit('y', 6)] |
3499 | | ), |
3500 | | group( |
3501 | | 8..12, |
3502 | | 3, |
3503 | | concat( |
3504 | | 9..11, |
3505 | | vec![lit('c', 9), lit('z', 10),] |
3506 | | ) |
3507 | | ), |
3508 | | ] |
3509 | | ) |
3510 | | ), |
3511 | | ] |
3512 | | ) |
3513 | | )) |
3514 | | ); |
3515 | | |
3516 | | assert_eq!( |
3517 | | parser(r"|").parse(), |
3518 | | Ok(alt( |
3519 | | 0..1, |
3520 | | vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] |
3521 | | )) |
3522 | | ); |
3523 | | assert_eq!( |
3524 | | parser(r"||").parse(), |
3525 | | Ok(alt( |
3526 | | 0..2, |
3527 | | vec![ |
3528 | | Ast::Empty(span(0..0)), |
3529 | | Ast::Empty(span(1..1)), |
3530 | | Ast::Empty(span(2..2)), |
3531 | | ] |
3532 | | )) |
3533 | | ); |
3534 | | assert_eq!( |
3535 | | parser(r"a|").parse(), |
3536 | | Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) |
3537 | | ); |
3538 | | assert_eq!( |
3539 | | parser(r"|a").parse(), |
3540 | | Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) |
3541 | | ); |
3542 | | |
3543 | | assert_eq!( |
3544 | | parser(r"(|)").parse(), |
3545 | | Ok(group( |
3546 | | 0..3, |
3547 | | 1, |
3548 | | alt( |
3549 | | 1..2, |
3550 | | vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] |
3551 | | ) |
3552 | | )) |
3553 | | ); |
3554 | | assert_eq!( |
3555 | | parser(r"(a|)").parse(), |
3556 | | Ok(group( |
3557 | | 0..4, |
3558 | | 1, |
3559 | | alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) |
3560 | | )) |
3561 | | ); |
3562 | | assert_eq!( |
3563 | | parser(r"(|a)").parse(), |
3564 | | Ok(group( |
3565 | | 0..4, |
3566 | | 1, |
3567 | | alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) |
3568 | | )) |
3569 | | ); |
3570 | | |
3571 | | assert_eq!( |
3572 | | parser(r"a|b)").parse().unwrap_err(), |
3573 | | TestError { |
3574 | | span: span(3..4), |
3575 | | kind: ast::ErrorKind::GroupUnopened, |
3576 | | } |
3577 | | ); |
3578 | | assert_eq!( |
3579 | | parser(r"(a|b").parse().unwrap_err(), |
3580 | | TestError { |
3581 | | span: span(0..1), |
3582 | | kind: ast::ErrorKind::GroupUnclosed, |
3583 | | } |
3584 | | ); |
3585 | | } |
3586 | | |
3587 | | #[test] |
3588 | | fn parse_unsupported_lookaround() { |
3589 | | assert_eq!( |
3590 | | parser(r"(?=a)").parse().unwrap_err(), |
3591 | | TestError { |
3592 | | span: span(0..3), |
3593 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3594 | | } |
3595 | | ); |
3596 | | assert_eq!( |
3597 | | parser(r"(?!a)").parse().unwrap_err(), |
3598 | | TestError { |
3599 | | span: span(0..3), |
3600 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3601 | | } |
3602 | | ); |
3603 | | assert_eq!( |
3604 | | parser(r"(?<=a)").parse().unwrap_err(), |
3605 | | TestError { |
3606 | | span: span(0..4), |
3607 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3608 | | } |
3609 | | ); |
3610 | | assert_eq!( |
3611 | | parser(r"(?<!a)").parse().unwrap_err(), |
3612 | | TestError { |
3613 | | span: span(0..4), |
3614 | | kind: ast::ErrorKind::UnsupportedLookAround, |
3615 | | } |
3616 | | ); |
3617 | | } |
3618 | | |
3619 | | #[test] |
3620 | | fn parse_group() { |
3621 | | assert_eq!( |
3622 | | parser("(?i)").parse(), |
3623 | | Ok(Ast::Flags(ast::SetFlags { |
3624 | | span: span(0..4), |
3625 | | flags: ast::Flags { |
3626 | | span: span(2..3), |
3627 | | items: vec![ast::FlagsItem { |
3628 | | span: span(2..3), |
3629 | | kind: ast::FlagsItemKind::Flag( |
3630 | | ast::Flag::CaseInsensitive |
3631 | | ), |
3632 | | }], |
3633 | | }, |
3634 | | })) |
3635 | | ); |
3636 | | assert_eq!( |
3637 | | parser("(?iU)").parse(), |
3638 | | Ok(Ast::Flags(ast::SetFlags { |
3639 | | span: span(0..5), |
3640 | | flags: ast::Flags { |
3641 | | span: span(2..4), |
3642 | | items: vec![ |
3643 | | ast::FlagsItem { |
3644 | | span: span(2..3), |
3645 | | kind: ast::FlagsItemKind::Flag( |
3646 | | ast::Flag::CaseInsensitive |
3647 | | ), |
3648 | | }, |
3649 | | ast::FlagsItem { |
3650 | | span: span(3..4), |
3651 | | kind: ast::FlagsItemKind::Flag( |
3652 | | ast::Flag::SwapGreed |
3653 | | ), |
3654 | | }, |
3655 | | ], |
3656 | | }, |
3657 | | })) |
3658 | | ); |
3659 | | assert_eq!( |
3660 | | parser("(?i-U)").parse(), |
3661 | | Ok(Ast::Flags(ast::SetFlags { |
3662 | | span: span(0..6), |
3663 | | flags: ast::Flags { |
3664 | | span: span(2..5), |
3665 | | items: vec![ |
3666 | | ast::FlagsItem { |
3667 | | span: span(2..3), |
3668 | | kind: ast::FlagsItemKind::Flag( |
3669 | | ast::Flag::CaseInsensitive |
3670 | | ), |
3671 | | }, |
3672 | | ast::FlagsItem { |
3673 | | span: span(3..4), |
3674 | | kind: ast::FlagsItemKind::Negation, |
3675 | | }, |
3676 | | ast::FlagsItem { |
3677 | | span: span(4..5), |
3678 | | kind: ast::FlagsItemKind::Flag( |
3679 | | ast::Flag::SwapGreed |
3680 | | ), |
3681 | | }, |
3682 | | ], |
3683 | | }, |
3684 | | })) |
3685 | | ); |
3686 | | |
3687 | | assert_eq!( |
3688 | | parser("()").parse(), |
3689 | | Ok(Ast::Group(ast::Group { |
3690 | | span: span(0..2), |
3691 | | kind: ast::GroupKind::CaptureIndex(1), |
3692 | | ast: Box::new(Ast::Empty(span(1..1))), |
3693 | | })) |
3694 | | ); |
3695 | | assert_eq!( |
3696 | | parser("(a)").parse(), |
3697 | | Ok(Ast::Group(ast::Group { |
3698 | | span: span(0..3), |
3699 | | kind: ast::GroupKind::CaptureIndex(1), |
3700 | | ast: Box::new(lit('a', 1)), |
3701 | | })) |
3702 | | ); |
3703 | | assert_eq!( |
3704 | | parser("(())").parse(), |
3705 | | Ok(Ast::Group(ast::Group { |
3706 | | span: span(0..4), |
3707 | | kind: ast::GroupKind::CaptureIndex(1), |
3708 | | ast: Box::new(Ast::Group(ast::Group { |
3709 | | span: span(1..3), |
3710 | | kind: ast::GroupKind::CaptureIndex(2), |
3711 | | ast: Box::new(Ast::Empty(span(2..2))), |
3712 | | })), |
3713 | | })) |
3714 | | ); |
3715 | | |
3716 | | assert_eq!( |
3717 | | parser("(?:a)").parse(), |
3718 | | Ok(Ast::Group(ast::Group { |
3719 | | span: span(0..5), |
3720 | | kind: ast::GroupKind::NonCapturing(ast::Flags { |
3721 | | span: span(2..2), |
3722 | | items: vec![], |
3723 | | }), |
3724 | | ast: Box::new(lit('a', 3)), |
3725 | | })) |
3726 | | ); |
3727 | | |
3728 | | assert_eq!( |
3729 | | parser("(?i:a)").parse(), |
3730 | | Ok(Ast::Group(ast::Group { |
3731 | | span: span(0..6), |
3732 | | kind: ast::GroupKind::NonCapturing(ast::Flags { |
3733 | | span: span(2..3), |
3734 | | items: vec![ast::FlagsItem { |
3735 | | span: span(2..3), |
3736 | | kind: ast::FlagsItemKind::Flag( |
3737 | | ast::Flag::CaseInsensitive |
3738 | | ), |
3739 | | },], |
3740 | | }), |
3741 | | ast: Box::new(lit('a', 4)), |
3742 | | })) |
3743 | | ); |
3744 | | assert_eq!( |
3745 | | parser("(?i-U:a)").parse(), |
3746 | | Ok(Ast::Group(ast::Group { |
3747 | | span: span(0..8), |
3748 | | kind: ast::GroupKind::NonCapturing(ast::Flags { |
3749 | | span: span(2..5), |
3750 | | items: vec![ |
3751 | | ast::FlagsItem { |
3752 | | span: span(2..3), |
3753 | | kind: ast::FlagsItemKind::Flag( |
3754 | | ast::Flag::CaseInsensitive |
3755 | | ), |
3756 | | }, |
3757 | | ast::FlagsItem { |
3758 | | span: span(3..4), |
3759 | | kind: ast::FlagsItemKind::Negation, |
3760 | | }, |
3761 | | ast::FlagsItem { |
3762 | | span: span(4..5), |
3763 | | kind: ast::FlagsItemKind::Flag( |
3764 | | ast::Flag::SwapGreed |
3765 | | ), |
3766 | | }, |
3767 | | ], |
3768 | | }), |
3769 | | ast: Box::new(lit('a', 6)), |
3770 | | })) |
3771 | | ); |
3772 | | |
3773 | | assert_eq!( |
3774 | | parser("(").parse().unwrap_err(), |
3775 | | TestError { |
3776 | | span: span(0..1), |
3777 | | kind: ast::ErrorKind::GroupUnclosed, |
3778 | | } |
3779 | | ); |
3780 | | assert_eq!( |
3781 | | parser("(?").parse().unwrap_err(), |
3782 | | TestError { |
3783 | | span: span(0..1), |
3784 | | kind: ast::ErrorKind::GroupUnclosed, |
3785 | | } |
3786 | | ); |
3787 | | assert_eq!( |
3788 | | parser("(?P").parse().unwrap_err(), |
3789 | | TestError { |
3790 | | span: span(2..3), |
3791 | | kind: ast::ErrorKind::FlagUnrecognized, |
3792 | | } |
3793 | | ); |
3794 | | assert_eq!( |
3795 | | parser("(?P<").parse().unwrap_err(), |
3796 | | TestError { |
3797 | | span: span(4..4), |
3798 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
3799 | | } |
3800 | | ); |
3801 | | assert_eq!( |
3802 | | parser("(a").parse().unwrap_err(), |
3803 | | TestError { |
3804 | | span: span(0..1), |
3805 | | kind: ast::ErrorKind::GroupUnclosed, |
3806 | | } |
3807 | | ); |
3808 | | assert_eq!( |
3809 | | parser("(()").parse().unwrap_err(), |
3810 | | TestError { |
3811 | | span: span(0..1), |
3812 | | kind: ast::ErrorKind::GroupUnclosed, |
3813 | | } |
3814 | | ); |
3815 | | assert_eq!( |
3816 | | parser(")").parse().unwrap_err(), |
3817 | | TestError { |
3818 | | span: span(0..1), |
3819 | | kind: ast::ErrorKind::GroupUnopened, |
3820 | | } |
3821 | | ); |
3822 | | assert_eq!( |
3823 | | parser("a)").parse().unwrap_err(), |
3824 | | TestError { |
3825 | | span: span(1..2), |
3826 | | kind: ast::ErrorKind::GroupUnopened, |
3827 | | } |
3828 | | ); |
3829 | | } |
3830 | | |
3831 | | #[test] |
3832 | | fn parse_capture_name() { |
3833 | | assert_eq!( |
3834 | | parser("(?P<a>z)").parse(), |
3835 | | Ok(Ast::Group(ast::Group { |
3836 | | span: span(0..8), |
3837 | | kind: ast::GroupKind::CaptureName(ast::CaptureName { |
3838 | | span: span(4..5), |
3839 | | name: s("a"), |
3840 | | index: 1, |
3841 | | }), |
3842 | | ast: Box::new(lit('z', 6)), |
3843 | | })) |
3844 | | ); |
3845 | | assert_eq!( |
3846 | | parser("(?P<abc>z)").parse(), |
3847 | | Ok(Ast::Group(ast::Group { |
3848 | | span: span(0..10), |
3849 | | kind: ast::GroupKind::CaptureName(ast::CaptureName { |
3850 | | span: span(4..7), |
3851 | | name: s("abc"), |
3852 | | index: 1, |
3853 | | }), |
3854 | | ast: Box::new(lit('z', 8)), |
3855 | | })) |
3856 | | ); |
3857 | | |
3858 | | assert_eq!( |
3859 | | parser("(?P<a_1>z)").parse(), |
3860 | | Ok(Ast::Group(ast::Group { |
3861 | | span: span(0..10), |
3862 | | kind: ast::GroupKind::CaptureName(ast::CaptureName { |
3863 | | span: span(4..7), |
3864 | | name: s("a_1"), |
3865 | | index: 1, |
3866 | | }), |
3867 | | ast: Box::new(lit('z', 8)), |
3868 | | })) |
3869 | | ); |
3870 | | |
3871 | | assert_eq!( |
3872 | | parser("(?P<a.1>z)").parse(), |
3873 | | Ok(Ast::Group(ast::Group { |
3874 | | span: span(0..10), |
3875 | | kind: ast::GroupKind::CaptureName(ast::CaptureName { |
3876 | | span: span(4..7), |
3877 | | name: s("a.1"), |
3878 | | index: 1, |
3879 | | }), |
3880 | | ast: Box::new(lit('z', 8)), |
3881 | | })) |
3882 | | ); |
3883 | | |
3884 | | assert_eq!( |
3885 | | parser("(?P<a[1]>z)").parse(), |
3886 | | Ok(Ast::Group(ast::Group { |
3887 | | span: span(0..11), |
3888 | | kind: ast::GroupKind::CaptureName(ast::CaptureName { |
3889 | | span: span(4..8), |
3890 | | name: s("a[1]"), |
3891 | | index: 1, |
3892 | | }), |
3893 | | ast: Box::new(lit('z', 9)), |
3894 | | })) |
3895 | | ); |
3896 | | |
3897 | | assert_eq!( |
3898 | | parser("(?P<").parse().unwrap_err(), |
3899 | | TestError { |
3900 | | span: span(4..4), |
3901 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
3902 | | } |
3903 | | ); |
3904 | | assert_eq!( |
3905 | | parser("(?P<>z)").parse().unwrap_err(), |
3906 | | TestError { |
3907 | | span: span(4..4), |
3908 | | kind: ast::ErrorKind::GroupNameEmpty, |
3909 | | } |
3910 | | ); |
3911 | | assert_eq!( |
3912 | | parser("(?P<a").parse().unwrap_err(), |
3913 | | TestError { |
3914 | | span: span(5..5), |
3915 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
3916 | | } |
3917 | | ); |
3918 | | assert_eq!( |
3919 | | parser("(?P<ab").parse().unwrap_err(), |
3920 | | TestError { |
3921 | | span: span(6..6), |
3922 | | kind: ast::ErrorKind::GroupNameUnexpectedEof, |
3923 | | } |
3924 | | ); |
3925 | | assert_eq!( |
3926 | | parser("(?P<0a").parse().unwrap_err(), |
3927 | | TestError { |
3928 | | span: span(4..5), |
3929 | | kind: ast::ErrorKind::GroupNameInvalid, |
3930 | | } |
3931 | | ); |
3932 | | assert_eq!( |
3933 | | parser("(?P<~").parse().unwrap_err(), |
3934 | | TestError { |
3935 | | span: span(4..5), |
3936 | | kind: ast::ErrorKind::GroupNameInvalid, |
3937 | | } |
3938 | | ); |
3939 | | assert_eq!( |
3940 | | parser("(?P<abc~").parse().unwrap_err(), |
3941 | | TestError { |
3942 | | span: span(7..8), |
3943 | | kind: ast::ErrorKind::GroupNameInvalid, |
3944 | | } |
3945 | | ); |
3946 | | assert_eq!( |
3947 | | parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(), |
3948 | | TestError { |
3949 | | span: span(12..13), |
3950 | | kind: ast::ErrorKind::GroupNameDuplicate { |
3951 | | original: span(4..5), |
3952 | | }, |
3953 | | } |
3954 | | ); |
3955 | | } |
3956 | | |
3957 | | #[test] |
3958 | | fn parse_flags() { |
3959 | | assert_eq!( |
3960 | | parser("i:").parse_flags(), |
3961 | | Ok(ast::Flags { |
3962 | | span: span(0..1), |
3963 | | items: vec![ast::FlagsItem { |
3964 | | span: span(0..1), |
3965 | | kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), |
3966 | | }], |
3967 | | }) |
3968 | | ); |
3969 | | assert_eq!( |
3970 | | parser("i)").parse_flags(), |
3971 | | Ok(ast::Flags { |
3972 | | span: span(0..1), |
3973 | | items: vec![ast::FlagsItem { |
3974 | | span: span(0..1), |
3975 | | kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), |
3976 | | }], |
3977 | | }) |
3978 | | ); |
3979 | | |
3980 | | assert_eq!( |
3981 | | parser("isU:").parse_flags(), |
3982 | | Ok(ast::Flags { |
3983 | | span: span(0..3), |
3984 | | items: vec![ |
3985 | | ast::FlagsItem { |
3986 | | span: span(0..1), |
3987 | | kind: ast::FlagsItemKind::Flag( |
3988 | | ast::Flag::CaseInsensitive |
3989 | | ), |
3990 | | }, |
3991 | | ast::FlagsItem { |
3992 | | span: span(1..2), |
3993 | | kind: ast::FlagsItemKind::Flag( |
3994 | | ast::Flag::DotMatchesNewLine |
3995 | | ), |
3996 | | }, |
3997 | | ast::FlagsItem { |
3998 | | span: span(2..3), |
3999 | | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), |
4000 | | }, |
4001 | | ], |
4002 | | }) |
4003 | | ); |
4004 | | |
4005 | | assert_eq!( |
4006 | | parser("-isU:").parse_flags(), |
4007 | | Ok(ast::Flags { |
4008 | | span: span(0..4), |
4009 | | items: vec![ |
4010 | | ast::FlagsItem { |
4011 | | span: span(0..1), |
4012 | | kind: ast::FlagsItemKind::Negation, |
4013 | | }, |
4014 | | ast::FlagsItem { |
4015 | | span: span(1..2), |
4016 | | kind: ast::FlagsItemKind::Flag( |
4017 | | ast::Flag::CaseInsensitive |
4018 | | ), |
4019 | | }, |
4020 | | ast::FlagsItem { |
4021 | | span: span(2..3), |
4022 | | kind: ast::FlagsItemKind::Flag( |
4023 | | ast::Flag::DotMatchesNewLine |
4024 | | ), |
4025 | | }, |
4026 | | ast::FlagsItem { |
4027 | | span: span(3..4), |
4028 | | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), |
4029 | | }, |
4030 | | ], |
4031 | | }) |
4032 | | ); |
4033 | | assert_eq!( |
4034 | | parser("i-sU:").parse_flags(), |
4035 | | Ok(ast::Flags { |
4036 | | span: span(0..4), |
4037 | | items: vec![ |
4038 | | ast::FlagsItem { |
4039 | | span: span(0..1), |
4040 | | kind: ast::FlagsItemKind::Flag( |
4041 | | ast::Flag::CaseInsensitive |
4042 | | ), |
4043 | | }, |
4044 | | ast::FlagsItem { |
4045 | | span: span(1..2), |
4046 | | kind: ast::FlagsItemKind::Negation, |
4047 | | }, |
4048 | | ast::FlagsItem { |
4049 | | span: span(2..3), |
4050 | | kind: ast::FlagsItemKind::Flag( |
4051 | | ast::Flag::DotMatchesNewLine |
4052 | | ), |
4053 | | }, |
4054 | | ast::FlagsItem { |
4055 | | span: span(3..4), |
4056 | | kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), |
4057 | | }, |
4058 | | ], |
4059 | | }) |
4060 | | ); |
4061 | | |
4062 | | assert_eq!( |
4063 | | parser("isU").parse_flags().unwrap_err(), |
4064 | | TestError { |
4065 | | span: span(3..3), |
4066 | | kind: ast::ErrorKind::FlagUnexpectedEof, |
4067 | | } |
4068 | | ); |
4069 | | assert_eq!( |
4070 | | parser("isUa:").parse_flags().unwrap_err(), |
4071 | | TestError { |
4072 | | span: span(3..4), |
4073 | | kind: ast::ErrorKind::FlagUnrecognized, |
4074 | | } |
4075 | | ); |
4076 | | assert_eq!( |
4077 | | parser("isUi:").parse_flags().unwrap_err(), |
4078 | | TestError { |
4079 | | span: span(3..4), |
4080 | | kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, |
4081 | | } |
4082 | | ); |
4083 | | assert_eq!( |
4084 | | parser("i-sU-i:").parse_flags().unwrap_err(), |
4085 | | TestError { |
4086 | | span: span(4..5), |
4087 | | kind: ast::ErrorKind::FlagRepeatedNegation { |
4088 | | original: span(1..2), |
4089 | | }, |
4090 | | } |
4091 | | ); |
4092 | | assert_eq!( |
4093 | | parser("-)").parse_flags().unwrap_err(), |
4094 | | TestError { |
4095 | | span: span(0..1), |
4096 | | kind: ast::ErrorKind::FlagDanglingNegation, |
4097 | | } |
4098 | | ); |
4099 | | assert_eq!( |
4100 | | parser("i-)").parse_flags().unwrap_err(), |
4101 | | TestError { |
4102 | | span: span(1..2), |
4103 | | kind: ast::ErrorKind::FlagDanglingNegation, |
4104 | | } |
4105 | | ); |
4106 | | assert_eq!( |
4107 | | parser("iU-)").parse_flags().unwrap_err(), |
4108 | | TestError { |
4109 | | span: span(2..3), |
4110 | | kind: ast::ErrorKind::FlagDanglingNegation, |
4111 | | } |
4112 | | ); |
4113 | | } |
4114 | | |
4115 | | #[test] |
4116 | | fn parse_flag() { |
4117 | | assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); |
4118 | | assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); |
4119 | | assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); |
4120 | | assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); |
4121 | | assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); |
4122 | | assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); |
4123 | | |
4124 | | assert_eq!( |
4125 | | parser("a").parse_flag().unwrap_err(), |
4126 | | TestError { |
4127 | | span: span(0..1), |
4128 | | kind: ast::ErrorKind::FlagUnrecognized, |
4129 | | } |
4130 | | ); |
4131 | | assert_eq!( |
4132 | | parser("☃").parse_flag().unwrap_err(), |
4133 | | TestError { |
4134 | | span: span_range("☃", 0..3), |
4135 | | kind: ast::ErrorKind::FlagUnrecognized, |
4136 | | } |
4137 | | ); |
4138 | | } |
4139 | | |
4140 | | #[test] |
4141 | | fn parse_primitive_non_escape() { |
4142 | | assert_eq!( |
4143 | | parser(r".").parse_primitive(), |
4144 | | Ok(Primitive::Dot(span(0..1))) |
4145 | | ); |
4146 | | assert_eq!( |
4147 | | parser(r"^").parse_primitive(), |
4148 | | Ok(Primitive::Assertion(ast::Assertion { |
4149 | | span: span(0..1), |
4150 | | kind: ast::AssertionKind::StartLine, |
4151 | | })) |
4152 | | ); |
4153 | | assert_eq!( |
4154 | | parser(r"$").parse_primitive(), |
4155 | | Ok(Primitive::Assertion(ast::Assertion { |
4156 | | span: span(0..1), |
4157 | | kind: ast::AssertionKind::EndLine, |
4158 | | })) |
4159 | | ); |
4160 | | |
4161 | | assert_eq!( |
4162 | | parser(r"a").parse_primitive(), |
4163 | | Ok(Primitive::Literal(ast::Literal { |
4164 | | span: span(0..1), |
4165 | | kind: ast::LiteralKind::Verbatim, |
4166 | | c: 'a', |
4167 | | })) |
4168 | | ); |
4169 | | assert_eq!( |
4170 | | parser(r"|").parse_primitive(), |
4171 | | Ok(Primitive::Literal(ast::Literal { |
4172 | | span: span(0..1), |
4173 | | kind: ast::LiteralKind::Verbatim, |
4174 | | c: '|', |
4175 | | })) |
4176 | | ); |
4177 | | assert_eq!( |
4178 | | parser(r"☃").parse_primitive(), |
4179 | | Ok(Primitive::Literal(ast::Literal { |
4180 | | span: span_range("☃", 0..3), |
4181 | | kind: ast::LiteralKind::Verbatim, |
4182 | | c: '☃', |
4183 | | })) |
4184 | | ); |
4185 | | } |
4186 | | |
4187 | | #[test] |
4188 | | fn parse_escape() { |
4189 | | assert_eq!( |
4190 | | parser(r"\|").parse_primitive(), |
4191 | | Ok(Primitive::Literal(ast::Literal { |
4192 | | span: span(0..2), |
4193 | | kind: ast::LiteralKind::Punctuation, |
4194 | | c: '|', |
4195 | | })) |
4196 | | ); |
4197 | | let specials = &[ |
4198 | | (r"\a", '\x07', ast::SpecialLiteralKind::Bell), |
4199 | | (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), |
4200 | | (r"\t", '\t', ast::SpecialLiteralKind::Tab), |
4201 | | (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), |
4202 | | (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), |
4203 | | (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), |
4204 | | ]; |
4205 | | for &(pat, c, ref kind) in specials { |
4206 | | assert_eq!( |
4207 | | parser(pat).parse_primitive(), |
4208 | | Ok(Primitive::Literal(ast::Literal { |
4209 | | span: span(0..2), |
4210 | | kind: ast::LiteralKind::Special(kind.clone()), |
4211 | | c: c, |
4212 | | })) |
4213 | | ); |
4214 | | } |
4215 | | assert_eq!( |
4216 | | parser(r"\A").parse_primitive(), |
4217 | | Ok(Primitive::Assertion(ast::Assertion { |
4218 | | span: span(0..2), |
4219 | | kind: ast::AssertionKind::StartText, |
4220 | | })) |
4221 | | ); |
4222 | | assert_eq!( |
4223 | | parser(r"\z").parse_primitive(), |
4224 | | Ok(Primitive::Assertion(ast::Assertion { |
4225 | | span: span(0..2), |
4226 | | kind: ast::AssertionKind::EndText, |
4227 | | })) |
4228 | | ); |
4229 | | assert_eq!( |
4230 | | parser(r"\b").parse_primitive(), |
4231 | | Ok(Primitive::Assertion(ast::Assertion { |
4232 | | span: span(0..2), |
4233 | | kind: ast::AssertionKind::WordBoundary, |
4234 | | })) |
4235 | | ); |
4236 | | assert_eq!( |
4237 | | parser(r"\B").parse_primitive(), |
4238 | | Ok(Primitive::Assertion(ast::Assertion { |
4239 | | span: span(0..2), |
4240 | | kind: ast::AssertionKind::NotWordBoundary, |
4241 | | })) |
4242 | | ); |
4243 | | |
4244 | | assert_eq!( |
4245 | | parser(r"\").parse_escape().unwrap_err(), |
4246 | | TestError { |
4247 | | span: span(0..1), |
4248 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4249 | | } |
4250 | | ); |
4251 | | assert_eq!( |
4252 | | parser(r"\y").parse_escape().unwrap_err(), |
4253 | | TestError { |
4254 | | span: span(0..2), |
4255 | | kind: ast::ErrorKind::EscapeUnrecognized, |
4256 | | } |
4257 | | ); |
4258 | | } |
4259 | | |
4260 | | #[test] |
4261 | | fn parse_unsupported_backreference() { |
4262 | | assert_eq!( |
4263 | | parser(r"\0").parse_escape().unwrap_err(), |
4264 | | TestError { |
4265 | | span: span(0..2), |
4266 | | kind: ast::ErrorKind::UnsupportedBackreference, |
4267 | | } |
4268 | | ); |
4269 | | assert_eq!( |
4270 | | parser(r"\9").parse_escape().unwrap_err(), |
4271 | | TestError { |
4272 | | span: span(0..2), |
4273 | | kind: ast::ErrorKind::UnsupportedBackreference, |
4274 | | } |
4275 | | ); |
4276 | | } |
4277 | | |
4278 | | #[test] |
4279 | | fn parse_octal() { |
4280 | | for i in 0..511 { |
4281 | | let pat = format!(r"\{:o}", i); |
4282 | | assert_eq!( |
4283 | | parser_octal(&pat).parse_escape(), |
4284 | | Ok(Primitive::Literal(ast::Literal { |
4285 | | span: span(0..pat.len()), |
4286 | | kind: ast::LiteralKind::Octal, |
4287 | | c: ::std::char::from_u32(i).unwrap(), |
4288 | | })) |
4289 | | ); |
4290 | | } |
4291 | | assert_eq!( |
4292 | | parser_octal(r"\778").parse_escape(), |
4293 | | Ok(Primitive::Literal(ast::Literal { |
4294 | | span: span(0..3), |
4295 | | kind: ast::LiteralKind::Octal, |
4296 | | c: '?', |
4297 | | })) |
4298 | | ); |
4299 | | assert_eq!( |
4300 | | parser_octal(r"\7777").parse_escape(), |
4301 | | Ok(Primitive::Literal(ast::Literal { |
4302 | | span: span(0..4), |
4303 | | kind: ast::LiteralKind::Octal, |
4304 | | c: '\u{01FF}', |
4305 | | })) |
4306 | | ); |
4307 | | assert_eq!( |
4308 | | parser_octal(r"\778").parse(), |
4309 | | Ok(Ast::Concat(ast::Concat { |
4310 | | span: span(0..4), |
4311 | | asts: vec![ |
4312 | | Ast::Literal(ast::Literal { |
4313 | | span: span(0..3), |
4314 | | kind: ast::LiteralKind::Octal, |
4315 | | c: '?', |
4316 | | }), |
4317 | | Ast::Literal(ast::Literal { |
4318 | | span: span(3..4), |
4319 | | kind: ast::LiteralKind::Verbatim, |
4320 | | c: '8', |
4321 | | }), |
4322 | | ], |
4323 | | })) |
4324 | | ); |
4325 | | assert_eq!( |
4326 | | parser_octal(r"\7777").parse(), |
4327 | | Ok(Ast::Concat(ast::Concat { |
4328 | | span: span(0..5), |
4329 | | asts: vec![ |
4330 | | Ast::Literal(ast::Literal { |
4331 | | span: span(0..4), |
4332 | | kind: ast::LiteralKind::Octal, |
4333 | | c: '\u{01FF}', |
4334 | | }), |
4335 | | Ast::Literal(ast::Literal { |
4336 | | span: span(4..5), |
4337 | | kind: ast::LiteralKind::Verbatim, |
4338 | | c: '7', |
4339 | | }), |
4340 | | ], |
4341 | | })) |
4342 | | ); |
4343 | | |
4344 | | assert_eq!( |
4345 | | parser_octal(r"\8").parse_escape().unwrap_err(), |
4346 | | TestError { |
4347 | | span: span(0..2), |
4348 | | kind: ast::ErrorKind::EscapeUnrecognized, |
4349 | | } |
4350 | | ); |
4351 | | } |
4352 | | |
4353 | | #[test] |
4354 | | fn parse_hex_two() { |
4355 | | for i in 0..256 { |
4356 | | let pat = format!(r"\x{:02x}", i); |
4357 | | assert_eq!( |
4358 | | parser(&pat).parse_escape(), |
4359 | | Ok(Primitive::Literal(ast::Literal { |
4360 | | span: span(0..pat.len()), |
4361 | | kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), |
4362 | | c: ::std::char::from_u32(i).unwrap(), |
4363 | | })) |
4364 | | ); |
4365 | | } |
4366 | | |
4367 | | assert_eq!( |
4368 | | parser(r"\xF").parse_escape().unwrap_err(), |
4369 | | TestError { |
4370 | | span: span(3..3), |
4371 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4372 | | } |
4373 | | ); |
4374 | | assert_eq!( |
4375 | | parser(r"\xG").parse_escape().unwrap_err(), |
4376 | | TestError { |
4377 | | span: span(2..3), |
4378 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4379 | | } |
4380 | | ); |
4381 | | assert_eq!( |
4382 | | parser(r"\xFG").parse_escape().unwrap_err(), |
4383 | | TestError { |
4384 | | span: span(3..4), |
4385 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4386 | | } |
4387 | | ); |
4388 | | } |
4389 | | |
4390 | | #[test] |
4391 | | fn parse_hex_four() { |
4392 | | for i in 0..65536 { |
4393 | | let c = match ::std::char::from_u32(i) { |
4394 | | None => continue, |
4395 | | Some(c) => c, |
4396 | | }; |
4397 | | let pat = format!(r"\u{:04x}", i); |
4398 | | assert_eq!( |
4399 | | parser(&pat).parse_escape(), |
4400 | | Ok(Primitive::Literal(ast::Literal { |
4401 | | span: span(0..pat.len()), |
4402 | | kind: ast::LiteralKind::HexFixed( |
4403 | | ast::HexLiteralKind::UnicodeShort |
4404 | | ), |
4405 | | c: c, |
4406 | | })) |
4407 | | ); |
4408 | | } |
4409 | | |
4410 | | assert_eq!( |
4411 | | parser(r"\uF").parse_escape().unwrap_err(), |
4412 | | TestError { |
4413 | | span: span(3..3), |
4414 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4415 | | } |
4416 | | ); |
4417 | | assert_eq!( |
4418 | | parser(r"\uG").parse_escape().unwrap_err(), |
4419 | | TestError { |
4420 | | span: span(2..3), |
4421 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4422 | | } |
4423 | | ); |
4424 | | assert_eq!( |
4425 | | parser(r"\uFG").parse_escape().unwrap_err(), |
4426 | | TestError { |
4427 | | span: span(3..4), |
4428 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4429 | | } |
4430 | | ); |
4431 | | assert_eq!( |
4432 | | parser(r"\uFFG").parse_escape().unwrap_err(), |
4433 | | TestError { |
4434 | | span: span(4..5), |
4435 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4436 | | } |
4437 | | ); |
4438 | | assert_eq!( |
4439 | | parser(r"\uFFFG").parse_escape().unwrap_err(), |
4440 | | TestError { |
4441 | | span: span(5..6), |
4442 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4443 | | } |
4444 | | ); |
4445 | | assert_eq!( |
4446 | | parser(r"\uD800").parse_escape().unwrap_err(), |
4447 | | TestError { |
4448 | | span: span(2..6), |
4449 | | kind: ast::ErrorKind::EscapeHexInvalid, |
4450 | | } |
4451 | | ); |
4452 | | } |
4453 | | |
4454 | | #[test] |
4455 | | fn parse_hex_eight() { |
4456 | | for i in 0..65536 { |
4457 | | let c = match ::std::char::from_u32(i) { |
4458 | | None => continue, |
4459 | | Some(c) => c, |
4460 | | }; |
4461 | | let pat = format!(r"\U{:08x}", i); |
4462 | | assert_eq!( |
4463 | | parser(&pat).parse_escape(), |
4464 | | Ok(Primitive::Literal(ast::Literal { |
4465 | | span: span(0..pat.len()), |
4466 | | kind: ast::LiteralKind::HexFixed( |
4467 | | ast::HexLiteralKind::UnicodeLong |
4468 | | ), |
4469 | | c: c, |
4470 | | })) |
4471 | | ); |
4472 | | } |
4473 | | |
4474 | | assert_eq!( |
4475 | | parser(r"\UF").parse_escape().unwrap_err(), |
4476 | | TestError { |
4477 | | span: span(3..3), |
4478 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4479 | | } |
4480 | | ); |
4481 | | assert_eq!( |
4482 | | parser(r"\UG").parse_escape().unwrap_err(), |
4483 | | TestError { |
4484 | | span: span(2..3), |
4485 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4486 | | } |
4487 | | ); |
4488 | | assert_eq!( |
4489 | | parser(r"\UFG").parse_escape().unwrap_err(), |
4490 | | TestError { |
4491 | | span: span(3..4), |
4492 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4493 | | } |
4494 | | ); |
4495 | | assert_eq!( |
4496 | | parser(r"\UFFG").parse_escape().unwrap_err(), |
4497 | | TestError { |
4498 | | span: span(4..5), |
4499 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4500 | | } |
4501 | | ); |
4502 | | assert_eq!( |
4503 | | parser(r"\UFFFG").parse_escape().unwrap_err(), |
4504 | | TestError { |
4505 | | span: span(5..6), |
4506 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4507 | | } |
4508 | | ); |
4509 | | assert_eq!( |
4510 | | parser(r"\UFFFFG").parse_escape().unwrap_err(), |
4511 | | TestError { |
4512 | | span: span(6..7), |
4513 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4514 | | } |
4515 | | ); |
4516 | | assert_eq!( |
4517 | | parser(r"\UFFFFFG").parse_escape().unwrap_err(), |
4518 | | TestError { |
4519 | | span: span(7..8), |
4520 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4521 | | } |
4522 | | ); |
4523 | | assert_eq!( |
4524 | | parser(r"\UFFFFFFG").parse_escape().unwrap_err(), |
4525 | | TestError { |
4526 | | span: span(8..9), |
4527 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4528 | | } |
4529 | | ); |
4530 | | assert_eq!( |
4531 | | parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), |
4532 | | TestError { |
4533 | | span: span(9..10), |
4534 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4535 | | } |
4536 | | ); |
4537 | | } |
4538 | | |
4539 | | #[test] |
4540 | | fn parse_hex_brace() { |
4541 | | assert_eq!( |
4542 | | parser(r"\u{26c4}").parse_escape(), |
4543 | | Ok(Primitive::Literal(ast::Literal { |
4544 | | span: span(0..8), |
4545 | | kind: ast::LiteralKind::HexBrace( |
4546 | | ast::HexLiteralKind::UnicodeShort |
4547 | | ), |
4548 | | c: '⛄', |
4549 | | })) |
4550 | | ); |
4551 | | assert_eq!( |
4552 | | parser(r"\U{26c4}").parse_escape(), |
4553 | | Ok(Primitive::Literal(ast::Literal { |
4554 | | span: span(0..8), |
4555 | | kind: ast::LiteralKind::HexBrace( |
4556 | | ast::HexLiteralKind::UnicodeLong |
4557 | | ), |
4558 | | c: '⛄', |
4559 | | })) |
4560 | | ); |
4561 | | assert_eq!( |
4562 | | parser(r"\x{26c4}").parse_escape(), |
4563 | | Ok(Primitive::Literal(ast::Literal { |
4564 | | span: span(0..8), |
4565 | | kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), |
4566 | | c: '⛄', |
4567 | | })) |
4568 | | ); |
4569 | | assert_eq!( |
4570 | | parser(r"\x{26C4}").parse_escape(), |
4571 | | Ok(Primitive::Literal(ast::Literal { |
4572 | | span: span(0..8), |
4573 | | kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), |
4574 | | c: '⛄', |
4575 | | })) |
4576 | | ); |
4577 | | assert_eq!( |
4578 | | parser(r"\x{10fFfF}").parse_escape(), |
4579 | | Ok(Primitive::Literal(ast::Literal { |
4580 | | span: span(0..10), |
4581 | | kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), |
4582 | | c: '\u{10FFFF}', |
4583 | | })) |
4584 | | ); |
4585 | | |
4586 | | assert_eq!( |
4587 | | parser(r"\x").parse_escape().unwrap_err(), |
4588 | | TestError { |
4589 | | span: span(2..2), |
4590 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4591 | | } |
4592 | | ); |
4593 | | assert_eq!( |
4594 | | parser(r"\x{").parse_escape().unwrap_err(), |
4595 | | TestError { |
4596 | | span: span(2..3), |
4597 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4598 | | } |
4599 | | ); |
4600 | | assert_eq!( |
4601 | | parser(r"\x{FF").parse_escape().unwrap_err(), |
4602 | | TestError { |
4603 | | span: span(2..5), |
4604 | | kind: ast::ErrorKind::EscapeUnexpectedEof, |
4605 | | } |
4606 | | ); |
4607 | | assert_eq!( |
4608 | | parser(r"\x{}").parse_escape().unwrap_err(), |
4609 | | TestError { |
4610 | | span: span(2..4), |
4611 | | kind: ast::ErrorKind::EscapeHexEmpty, |
4612 | | } |
4613 | | ); |
4614 | | assert_eq!( |
4615 | | parser(r"\x{FGF}").parse_escape().unwrap_err(), |
4616 | | TestError { |
4617 | | span: span(4..5), |
4618 | | kind: ast::ErrorKind::EscapeHexInvalidDigit, |
4619 | | } |
4620 | | ); |
4621 | | assert_eq!( |
4622 | | parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), |
4623 | | TestError { |
4624 | | span: span(3..9), |
4625 | | kind: ast::ErrorKind::EscapeHexInvalid, |
4626 | | } |
4627 | | ); |
4628 | | assert_eq!( |
4629 | | parser(r"\x{D800}").parse_escape().unwrap_err(), |
4630 | | TestError { |
4631 | | span: span(3..7), |
4632 | | kind: ast::ErrorKind::EscapeHexInvalid, |
4633 | | } |
4634 | | ); |
4635 | | assert_eq!( |
4636 | | parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), |
4637 | | TestError { |
4638 | | span: span(3..12), |
4639 | | kind: ast::ErrorKind::EscapeHexInvalid, |
4640 | | } |
4641 | | ); |
4642 | | } |
4643 | | |
4644 | | #[test] |
4645 | | fn parse_decimal() { |
4646 | | assert_eq!(parser("123").parse_decimal(), Ok(123)); |
4647 | | assert_eq!(parser("0").parse_decimal(), Ok(0)); |
4648 | | assert_eq!(parser("01").parse_decimal(), Ok(1)); |
4649 | | |
4650 | | assert_eq!( |
4651 | | parser("-1").parse_decimal().unwrap_err(), |
4652 | | TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } |
4653 | | ); |
4654 | | assert_eq!( |
4655 | | parser("").parse_decimal().unwrap_err(), |
4656 | | TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } |
4657 | | ); |
4658 | | assert_eq!( |
4659 | | parser("9999999999").parse_decimal().unwrap_err(), |
4660 | | TestError { |
4661 | | span: span(0..10), |
4662 | | kind: ast::ErrorKind::DecimalInvalid, |
4663 | | } |
4664 | | ); |
4665 | | } |
4666 | | |
4667 | | #[test] |
4668 | | fn parse_set_class() { |
4669 | | fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet { |
4670 | | ast::ClassSet::union(ast::ClassSetUnion { |
4671 | | span: span, |
4672 | | items: items, |
4673 | | }) |
4674 | | } |
4675 | | |
4676 | | fn intersection( |
4677 | | span: Span, |
4678 | | lhs: ast::ClassSet, |
4679 | | rhs: ast::ClassSet, |
4680 | | ) -> ast::ClassSet { |
4681 | | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
4682 | | span: span, |
4683 | | kind: ast::ClassSetBinaryOpKind::Intersection, |
4684 | | lhs: Box::new(lhs), |
4685 | | rhs: Box::new(rhs), |
4686 | | }) |
4687 | | } |
4688 | | |
4689 | | fn difference( |
4690 | | span: Span, |
4691 | | lhs: ast::ClassSet, |
4692 | | rhs: ast::ClassSet, |
4693 | | ) -> ast::ClassSet { |
4694 | | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
4695 | | span: span, |
4696 | | kind: ast::ClassSetBinaryOpKind::Difference, |
4697 | | lhs: Box::new(lhs), |
4698 | | rhs: Box::new(rhs), |
4699 | | }) |
4700 | | } |
4701 | | |
4702 | | fn symdifference( |
4703 | | span: Span, |
4704 | | lhs: ast::ClassSet, |
4705 | | rhs: ast::ClassSet, |
4706 | | ) -> ast::ClassSet { |
4707 | | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
4708 | | span: span, |
4709 | | kind: ast::ClassSetBinaryOpKind::SymmetricDifference, |
4710 | | lhs: Box::new(lhs), |
4711 | | rhs: Box::new(rhs), |
4712 | | }) |
4713 | | } |
4714 | | |
4715 | | fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { |
4716 | | ast::ClassSet::Item(item) |
4717 | | } |
4718 | | |
4719 | | fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { |
4720 | | ast::ClassSetItem::Ascii(cls) |
4721 | | } |
4722 | | |
4723 | | fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { |
4724 | | ast::ClassSetItem::Unicode(cls) |
4725 | | } |
4726 | | |
4727 | | fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { |
4728 | | ast::ClassSetItem::Perl(cls) |
4729 | | } |
4730 | | |
4731 | | fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { |
4732 | | ast::ClassSetItem::Bracketed(Box::new(cls)) |
4733 | | } |
4734 | | |
4735 | | fn lit(span: Span, c: char) -> ast::ClassSetItem { |
4736 | | ast::ClassSetItem::Literal(ast::Literal { |
4737 | | span: span, |
4738 | | kind: ast::LiteralKind::Verbatim, |
4739 | | c: c, |
4740 | | }) |
4741 | | } |
4742 | | |
4743 | | fn empty(span: Span) -> ast::ClassSetItem { |
4744 | | ast::ClassSetItem::Empty(span) |
4745 | | } |
4746 | | |
4747 | | fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { |
4748 | | let pos1 = Position { |
4749 | | offset: span.start.offset + start.len_utf8(), |
4750 | | column: span.start.column + 1, |
4751 | | ..span.start |
4752 | | }; |
4753 | | let pos2 = Position { |
4754 | | offset: span.end.offset - end.len_utf8(), |
4755 | | column: span.end.column - 1, |
4756 | | ..span.end |
4757 | | }; |
4758 | | ast::ClassSetItem::Range(ast::ClassSetRange { |
4759 | | span: span, |
4760 | | start: ast::Literal { |
4761 | | span: Span { end: pos1, ..span }, |
4762 | | kind: ast::LiteralKind::Verbatim, |
4763 | | c: start, |
4764 | | }, |
4765 | | end: ast::Literal { |
4766 | | span: Span { start: pos2, ..span }, |
4767 | | kind: ast::LiteralKind::Verbatim, |
4768 | | c: end, |
4769 | | }, |
4770 | | }) |
4771 | | } |
4772 | | |
4773 | | fn alnum(span: Span, negated: bool) -> ast::ClassAscii { |
4774 | | ast::ClassAscii { |
4775 | | span: span, |
4776 | | kind: ast::ClassAsciiKind::Alnum, |
4777 | | negated: negated, |
4778 | | } |
4779 | | } |
4780 | | |
4781 | | fn lower(span: Span, negated: bool) -> ast::ClassAscii { |
4782 | | ast::ClassAscii { |
4783 | | span: span, |
4784 | | kind: ast::ClassAsciiKind::Lower, |
4785 | | negated: negated, |
4786 | | } |
4787 | | } |
4788 | | |
4789 | | assert_eq!( |
4790 | | parser("[[:alnum:]]").parse(), |
4791 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4792 | | span: span(0..11), |
4793 | | negated: false, |
4794 | | kind: itemset(item_ascii(alnum(span(1..10), false))), |
4795 | | }))) |
4796 | | ); |
4797 | | assert_eq!( |
4798 | | parser("[[[:alnum:]]]").parse(), |
4799 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4800 | | span: span(0..13), |
4801 | | negated: false, |
4802 | | kind: itemset(item_bracket(ast::ClassBracketed { |
4803 | | span: span(1..12), |
4804 | | negated: false, |
4805 | | kind: itemset(item_ascii(alnum(span(2..11), false))), |
4806 | | })), |
4807 | | }))) |
4808 | | ); |
4809 | | assert_eq!( |
4810 | | parser("[[:alnum:]&&[:lower:]]").parse(), |
4811 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4812 | | span: span(0..22), |
4813 | | negated: false, |
4814 | | kind: intersection( |
4815 | | span(1..21), |
4816 | | itemset(item_ascii(alnum(span(1..10), false))), |
4817 | | itemset(item_ascii(lower(span(12..21), false))), |
4818 | | ), |
4819 | | }))) |
4820 | | ); |
4821 | | assert_eq!( |
4822 | | parser("[[:alnum:]--[:lower:]]").parse(), |
4823 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4824 | | span: span(0..22), |
4825 | | negated: false, |
4826 | | kind: difference( |
4827 | | span(1..21), |
4828 | | itemset(item_ascii(alnum(span(1..10), false))), |
4829 | | itemset(item_ascii(lower(span(12..21), false))), |
4830 | | ), |
4831 | | }))) |
4832 | | ); |
4833 | | assert_eq!( |
4834 | | parser("[[:alnum:]~~[:lower:]]").parse(), |
4835 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4836 | | span: span(0..22), |
4837 | | negated: false, |
4838 | | kind: symdifference( |
4839 | | span(1..21), |
4840 | | itemset(item_ascii(alnum(span(1..10), false))), |
4841 | | itemset(item_ascii(lower(span(12..21), false))), |
4842 | | ), |
4843 | | }))) |
4844 | | ); |
4845 | | |
4846 | | assert_eq!( |
4847 | | parser("[a]").parse(), |
4848 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4849 | | span: span(0..3), |
4850 | | negated: false, |
4851 | | kind: itemset(lit(span(1..2), 'a')), |
4852 | | }))) |
4853 | | ); |
4854 | | assert_eq!( |
4855 | | parser(r"[a\]]").parse(), |
4856 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4857 | | span: span(0..5), |
4858 | | negated: false, |
4859 | | kind: union( |
4860 | | span(1..4), |
4861 | | vec![ |
4862 | | lit(span(1..2), 'a'), |
4863 | | ast::ClassSetItem::Literal(ast::Literal { |
4864 | | span: span(2..4), |
4865 | | kind: ast::LiteralKind::Punctuation, |
4866 | | c: ']', |
4867 | | }), |
4868 | | ] |
4869 | | ), |
4870 | | }))) |
4871 | | ); |
4872 | | assert_eq!( |
4873 | | parser(r"[a\-z]").parse(), |
4874 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4875 | | span: span(0..6), |
4876 | | negated: false, |
4877 | | kind: union( |
4878 | | span(1..5), |
4879 | | vec![ |
4880 | | lit(span(1..2), 'a'), |
4881 | | ast::ClassSetItem::Literal(ast::Literal { |
4882 | | span: span(2..4), |
4883 | | kind: ast::LiteralKind::Punctuation, |
4884 | | c: '-', |
4885 | | }), |
4886 | | lit(span(4..5), 'z'), |
4887 | | ] |
4888 | | ), |
4889 | | }))) |
4890 | | ); |
4891 | | assert_eq!( |
4892 | | parser("[ab]").parse(), |
4893 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4894 | | span: span(0..4), |
4895 | | negated: false, |
4896 | | kind: union( |
4897 | | span(1..3), |
4898 | | vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] |
4899 | | ), |
4900 | | }))) |
4901 | | ); |
4902 | | assert_eq!( |
4903 | | parser("[a-]").parse(), |
4904 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4905 | | span: span(0..4), |
4906 | | negated: false, |
4907 | | kind: union( |
4908 | | span(1..3), |
4909 | | vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] |
4910 | | ), |
4911 | | }))) |
4912 | | ); |
4913 | | assert_eq!( |
4914 | | parser("[-a]").parse(), |
4915 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4916 | | span: span(0..4), |
4917 | | negated: false, |
4918 | | kind: union( |
4919 | | span(1..3), |
4920 | | vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] |
4921 | | ), |
4922 | | }))) |
4923 | | ); |
4924 | | assert_eq!( |
4925 | | parser(r"[\pL]").parse(), |
4926 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4927 | | span: span(0..5), |
4928 | | negated: false, |
4929 | | kind: itemset(item_unicode(ast::ClassUnicode { |
4930 | | span: span(1..4), |
4931 | | negated: false, |
4932 | | kind: ast::ClassUnicodeKind::OneLetter('L'), |
4933 | | })), |
4934 | | }))) |
4935 | | ); |
4936 | | assert_eq!( |
4937 | | parser(r"[\w]").parse(), |
4938 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4939 | | span: span(0..4), |
4940 | | negated: false, |
4941 | | kind: itemset(item_perl(ast::ClassPerl { |
4942 | | span: span(1..3), |
4943 | | kind: ast::ClassPerlKind::Word, |
4944 | | negated: false, |
4945 | | })), |
4946 | | }))) |
4947 | | ); |
4948 | | assert_eq!( |
4949 | | parser(r"[a\wz]").parse(), |
4950 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4951 | | span: span(0..6), |
4952 | | negated: false, |
4953 | | kind: union( |
4954 | | span(1..5), |
4955 | | vec![ |
4956 | | lit(span(1..2), 'a'), |
4957 | | item_perl(ast::ClassPerl { |
4958 | | span: span(2..4), |
4959 | | kind: ast::ClassPerlKind::Word, |
4960 | | negated: false, |
4961 | | }), |
4962 | | lit(span(4..5), 'z'), |
4963 | | ] |
4964 | | ), |
4965 | | }))) |
4966 | | ); |
4967 | | |
4968 | | assert_eq!( |
4969 | | parser("[a-z]").parse(), |
4970 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4971 | | span: span(0..5), |
4972 | | negated: false, |
4973 | | kind: itemset(range(span(1..4), 'a', 'z')), |
4974 | | }))) |
4975 | | ); |
4976 | | assert_eq!( |
4977 | | parser("[a-cx-z]").parse(), |
4978 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4979 | | span: span(0..8), |
4980 | | negated: false, |
4981 | | kind: union( |
4982 | | span(1..7), |
4983 | | vec![ |
4984 | | range(span(1..4), 'a', 'c'), |
4985 | | range(span(4..7), 'x', 'z'), |
4986 | | ] |
4987 | | ), |
4988 | | }))) |
4989 | | ); |
4990 | | assert_eq!( |
4991 | | parser(r"[\w&&a-cx-z]").parse(), |
4992 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
4993 | | span: span(0..12), |
4994 | | negated: false, |
4995 | | kind: intersection( |
4996 | | span(1..11), |
4997 | | itemset(item_perl(ast::ClassPerl { |
4998 | | span: span(1..3), |
4999 | | kind: ast::ClassPerlKind::Word, |
5000 | | negated: false, |
5001 | | })), |
5002 | | union( |
5003 | | span(5..11), |
5004 | | vec![ |
5005 | | range(span(5..8), 'a', 'c'), |
5006 | | range(span(8..11), 'x', 'z'), |
5007 | | ] |
5008 | | ), |
5009 | | ), |
5010 | | }))) |
5011 | | ); |
5012 | | assert_eq!( |
5013 | | parser(r"[a-cx-z&&\w]").parse(), |
5014 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5015 | | span: span(0..12), |
5016 | | negated: false, |
5017 | | kind: intersection( |
5018 | | span(1..11), |
5019 | | union( |
5020 | | span(1..7), |
5021 | | vec![ |
5022 | | range(span(1..4), 'a', 'c'), |
5023 | | range(span(4..7), 'x', 'z'), |
5024 | | ] |
5025 | | ), |
5026 | | itemset(item_perl(ast::ClassPerl { |
5027 | | span: span(9..11), |
5028 | | kind: ast::ClassPerlKind::Word, |
5029 | | negated: false, |
5030 | | })), |
5031 | | ), |
5032 | | }))) |
5033 | | ); |
5034 | | assert_eq!( |
5035 | | parser(r"[a--b--c]").parse(), |
5036 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5037 | | span: span(0..9), |
5038 | | negated: false, |
5039 | | kind: difference( |
5040 | | span(1..8), |
5041 | | difference( |
5042 | | span(1..5), |
5043 | | itemset(lit(span(1..2), 'a')), |
5044 | | itemset(lit(span(4..5), 'b')), |
5045 | | ), |
5046 | | itemset(lit(span(7..8), 'c')), |
5047 | | ), |
5048 | | }))) |
5049 | | ); |
5050 | | assert_eq!( |
5051 | | parser(r"[a~~b~~c]").parse(), |
5052 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5053 | | span: span(0..9), |
5054 | | negated: false, |
5055 | | kind: symdifference( |
5056 | | span(1..8), |
5057 | | symdifference( |
5058 | | span(1..5), |
5059 | | itemset(lit(span(1..2), 'a')), |
5060 | | itemset(lit(span(4..5), 'b')), |
5061 | | ), |
5062 | | itemset(lit(span(7..8), 'c')), |
5063 | | ), |
5064 | | }))) |
5065 | | ); |
5066 | | assert_eq!( |
5067 | | parser(r"[\^&&^]").parse(), |
5068 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5069 | | span: span(0..7), |
5070 | | negated: false, |
5071 | | kind: intersection( |
5072 | | span(1..6), |
5073 | | itemset(ast::ClassSetItem::Literal(ast::Literal { |
5074 | | span: span(1..3), |
5075 | | kind: ast::LiteralKind::Punctuation, |
5076 | | c: '^', |
5077 | | })), |
5078 | | itemset(lit(span(5..6), '^')), |
5079 | | ), |
5080 | | }))) |
5081 | | ); |
5082 | | assert_eq!( |
5083 | | parser(r"[\&&&&]").parse(), |
5084 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5085 | | span: span(0..7), |
5086 | | negated: false, |
5087 | | kind: intersection( |
5088 | | span(1..6), |
5089 | | itemset(ast::ClassSetItem::Literal(ast::Literal { |
5090 | | span: span(1..3), |
5091 | | kind: ast::LiteralKind::Punctuation, |
5092 | | c: '&', |
5093 | | })), |
5094 | | itemset(lit(span(5..6), '&')), |
5095 | | ), |
5096 | | }))) |
5097 | | ); |
5098 | | assert_eq!( |
5099 | | parser(r"[&&&&]").parse(), |
5100 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5101 | | span: span(0..6), |
5102 | | negated: false, |
5103 | | kind: intersection( |
5104 | | span(1..5), |
5105 | | intersection( |
5106 | | span(1..3), |
5107 | | itemset(empty(span(1..1))), |
5108 | | itemset(empty(span(3..3))), |
5109 | | ), |
5110 | | itemset(empty(span(5..5))), |
5111 | | ), |
5112 | | }))) |
5113 | | ); |
5114 | | |
5115 | | let pat = "[☃-⛄]"; |
5116 | | assert_eq!( |
5117 | | parser(pat).parse(), |
5118 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5119 | | span: span_range(pat, 0..9), |
5120 | | negated: false, |
5121 | | kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { |
5122 | | span: span_range(pat, 1..8), |
5123 | | start: ast::Literal { |
5124 | | span: span_range(pat, 1..4), |
5125 | | kind: ast::LiteralKind::Verbatim, |
5126 | | c: '☃', |
5127 | | }, |
5128 | | end: ast::Literal { |
5129 | | span: span_range(pat, 5..8), |
5130 | | kind: ast::LiteralKind::Verbatim, |
5131 | | c: '⛄', |
5132 | | }, |
5133 | | })), |
5134 | | }))) |
5135 | | ); |
5136 | | |
5137 | | assert_eq!( |
5138 | | parser(r"[]]").parse(), |
5139 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5140 | | span: span(0..3), |
5141 | | negated: false, |
5142 | | kind: itemset(lit(span(1..2), ']')), |
5143 | | }))) |
5144 | | ); |
5145 | | assert_eq!( |
5146 | | parser(r"[]\[]").parse(), |
5147 | | Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5148 | | span: span(0..5), |
5149 | | negated: false, |
5150 | | kind: union( |
5151 | | span(1..4), |
5152 | | vec![ |
5153 | | lit(span(1..2), ']'), |
5154 | | ast::ClassSetItem::Literal(ast::Literal { |
5155 | | span: span(2..4), |
5156 | | kind: ast::LiteralKind::Punctuation, |
5157 | | c: '[', |
5158 | | }), |
5159 | | ] |
5160 | | ), |
5161 | | }))) |
5162 | | ); |
5163 | | assert_eq!( |
5164 | | parser(r"[\[]]").parse(), |
5165 | | Ok(concat( |
5166 | | 0..5, |
5167 | | vec![ |
5168 | | Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { |
5169 | | span: span(0..4), |
5170 | | negated: false, |
5171 | | kind: itemset(ast::ClassSetItem::Literal( |
5172 | | ast::Literal { |
5173 | | span: span(1..3), |
5174 | | kind: ast::LiteralKind::Punctuation, |
5175 | | c: '[', |
5176 | | } |
5177 | | )), |
5178 | | })), |
5179 | | Ast::Literal(ast::Literal { |
5180 | | span: span(4..5), |
5181 | | kind: ast::LiteralKind::Verbatim, |
5182 | | c: ']', |
5183 | | }), |
5184 | | ] |
5185 | | )) |
5186 | | ); |
5187 | | |
5188 | | assert_eq!( |
5189 | | parser("[").parse().unwrap_err(), |
5190 | | TestError { |
5191 | | span: span(0..1), |
5192 | | kind: ast::ErrorKind::ClassUnclosed, |
5193 | | } |
5194 | | ); |
5195 | | assert_eq!( |
5196 | | parser("[[").parse().unwrap_err(), |
5197 | | TestError { |
5198 | | span: span(1..2), |
5199 | | kind: ast::ErrorKind::ClassUnclosed, |
5200 | | } |
5201 | | ); |
5202 | | assert_eq!( |
5203 | | parser("[[-]").parse().unwrap_err(), |
5204 | | TestError { |
5205 | | span: span(0..1), |
5206 | | kind: ast::ErrorKind::ClassUnclosed, |
5207 | | } |
5208 | | ); |
5209 | | assert_eq!( |
5210 | | parser("[[[:alnum:]").parse().unwrap_err(), |
5211 | | TestError { |
5212 | | span: span(1..2), |
5213 | | kind: ast::ErrorKind::ClassUnclosed, |
5214 | | } |
5215 | | ); |
5216 | | assert_eq!( |
5217 | | parser(r"[\b]").parse().unwrap_err(), |
5218 | | TestError { |
5219 | | span: span(1..3), |
5220 | | kind: ast::ErrorKind::ClassEscapeInvalid, |
5221 | | } |
5222 | | ); |
5223 | | assert_eq!( |
5224 | | parser(r"[\w-a]").parse().unwrap_err(), |
5225 | | TestError { |
5226 | | span: span(1..3), |
5227 | | kind: ast::ErrorKind::ClassRangeLiteral, |
5228 | | } |
5229 | | ); |
5230 | | assert_eq!( |
5231 | | parser(r"[a-\w]").parse().unwrap_err(), |
5232 | | TestError { |
5233 | | span: span(3..5), |
5234 | | kind: ast::ErrorKind::ClassRangeLiteral, |
5235 | | } |
5236 | | ); |
5237 | | assert_eq!( |
5238 | | parser(r"[z-a]").parse().unwrap_err(), |
5239 | | TestError { |
5240 | | span: span(1..4), |
5241 | | kind: ast::ErrorKind::ClassRangeInvalid, |
5242 | | } |
5243 | | ); |
5244 | | |
5245 | | assert_eq!( |
5246 | | parser_ignore_whitespace("[a ").parse().unwrap_err(), |
5247 | | TestError { |
5248 | | span: span(0..1), |
5249 | | kind: ast::ErrorKind::ClassUnclosed, |
5250 | | } |
5251 | | ); |
5252 | | assert_eq!( |
5253 | | parser_ignore_whitespace("[a- ").parse().unwrap_err(), |
5254 | | TestError { |
5255 | | span: span(0..1), |
5256 | | kind: ast::ErrorKind::ClassUnclosed, |
5257 | | } |
5258 | | ); |
5259 | | } |
5260 | | |
5261 | | #[test] |
5262 | | fn parse_set_class_open() { |
5263 | | assert_eq!(parser("[a]").parse_set_class_open(), { |
5264 | | let set = ast::ClassBracketed { |
5265 | | span: span(0..1), |
5266 | | negated: false, |
5267 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5268 | | span: span(1..1), |
5269 | | items: vec![], |
5270 | | }), |
5271 | | }; |
5272 | | let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; |
5273 | | Ok((set, union)) |
5274 | | }); |
5275 | | assert_eq!( |
5276 | | parser_ignore_whitespace("[ a]").parse_set_class_open(), |
5277 | | { |
5278 | | let set = ast::ClassBracketed { |
5279 | | span: span(0..4), |
5280 | | negated: false, |
5281 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5282 | | span: span(4..4), |
5283 | | items: vec![], |
5284 | | }), |
5285 | | }; |
5286 | | let union = |
5287 | | ast::ClassSetUnion { span: span(4..4), items: vec![] }; |
5288 | | Ok((set, union)) |
5289 | | } |
5290 | | ); |
5291 | | assert_eq!(parser("[^a]").parse_set_class_open(), { |
5292 | | let set = ast::ClassBracketed { |
5293 | | span: span(0..2), |
5294 | | negated: true, |
5295 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5296 | | span: span(2..2), |
5297 | | items: vec![], |
5298 | | }), |
5299 | | }; |
5300 | | let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; |
5301 | | Ok((set, union)) |
5302 | | }); |
5303 | | assert_eq!( |
5304 | | parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), |
5305 | | { |
5306 | | let set = ast::ClassBracketed { |
5307 | | span: span(0..4), |
5308 | | negated: true, |
5309 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5310 | | span: span(4..4), |
5311 | | items: vec![], |
5312 | | }), |
5313 | | }; |
5314 | | let union = |
5315 | | ast::ClassSetUnion { span: span(4..4), items: vec![] }; |
5316 | | Ok((set, union)) |
5317 | | } |
5318 | | ); |
5319 | | assert_eq!(parser("[-a]").parse_set_class_open(), { |
5320 | | let set = ast::ClassBracketed { |
5321 | | span: span(0..2), |
5322 | | negated: false, |
5323 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5324 | | span: span(1..1), |
5325 | | items: vec![], |
5326 | | }), |
5327 | | }; |
5328 | | let union = ast::ClassSetUnion { |
5329 | | span: span(1..2), |
5330 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5331 | | span: span(1..2), |
5332 | | kind: ast::LiteralKind::Verbatim, |
5333 | | c: '-', |
5334 | | })], |
5335 | | }; |
5336 | | Ok((set, union)) |
5337 | | }); |
5338 | | assert_eq!( |
5339 | | parser_ignore_whitespace("[ - a]").parse_set_class_open(), |
5340 | | { |
5341 | | let set = ast::ClassBracketed { |
5342 | | span: span(0..4), |
5343 | | negated: false, |
5344 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5345 | | span: span(2..2), |
5346 | | items: vec![], |
5347 | | }), |
5348 | | }; |
5349 | | let union = ast::ClassSetUnion { |
5350 | | span: span(2..3), |
5351 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5352 | | span: span(2..3), |
5353 | | kind: ast::LiteralKind::Verbatim, |
5354 | | c: '-', |
5355 | | })], |
5356 | | }; |
5357 | | Ok((set, union)) |
5358 | | } |
5359 | | ); |
5360 | | assert_eq!(parser("[^-a]").parse_set_class_open(), { |
5361 | | let set = ast::ClassBracketed { |
5362 | | span: span(0..3), |
5363 | | negated: true, |
5364 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5365 | | span: span(2..2), |
5366 | | items: vec![], |
5367 | | }), |
5368 | | }; |
5369 | | let union = ast::ClassSetUnion { |
5370 | | span: span(2..3), |
5371 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5372 | | span: span(2..3), |
5373 | | kind: ast::LiteralKind::Verbatim, |
5374 | | c: '-', |
5375 | | })], |
5376 | | }; |
5377 | | Ok((set, union)) |
5378 | | }); |
5379 | | assert_eq!(parser("[--a]").parse_set_class_open(), { |
5380 | | let set = ast::ClassBracketed { |
5381 | | span: span(0..3), |
5382 | | negated: false, |
5383 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5384 | | span: span(1..1), |
5385 | | items: vec![], |
5386 | | }), |
5387 | | }; |
5388 | | let union = ast::ClassSetUnion { |
5389 | | span: span(1..3), |
5390 | | items: vec![ |
5391 | | ast::ClassSetItem::Literal(ast::Literal { |
5392 | | span: span(1..2), |
5393 | | kind: ast::LiteralKind::Verbatim, |
5394 | | c: '-', |
5395 | | }), |
5396 | | ast::ClassSetItem::Literal(ast::Literal { |
5397 | | span: span(2..3), |
5398 | | kind: ast::LiteralKind::Verbatim, |
5399 | | c: '-', |
5400 | | }), |
5401 | | ], |
5402 | | }; |
5403 | | Ok((set, union)) |
5404 | | }); |
5405 | | assert_eq!(parser("[]a]").parse_set_class_open(), { |
5406 | | let set = ast::ClassBracketed { |
5407 | | span: span(0..2), |
5408 | | negated: false, |
5409 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5410 | | span: span(1..1), |
5411 | | items: vec![], |
5412 | | }), |
5413 | | }; |
5414 | | let union = ast::ClassSetUnion { |
5415 | | span: span(1..2), |
5416 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5417 | | span: span(1..2), |
5418 | | kind: ast::LiteralKind::Verbatim, |
5419 | | c: ']', |
5420 | | })], |
5421 | | }; |
5422 | | Ok((set, union)) |
5423 | | }); |
5424 | | assert_eq!( |
5425 | | parser_ignore_whitespace("[ ] a]").parse_set_class_open(), |
5426 | | { |
5427 | | let set = ast::ClassBracketed { |
5428 | | span: span(0..4), |
5429 | | negated: false, |
5430 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5431 | | span: span(2..2), |
5432 | | items: vec![], |
5433 | | }), |
5434 | | }; |
5435 | | let union = ast::ClassSetUnion { |
5436 | | span: span(2..3), |
5437 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5438 | | span: span(2..3), |
5439 | | kind: ast::LiteralKind::Verbatim, |
5440 | | c: ']', |
5441 | | })], |
5442 | | }; |
5443 | | Ok((set, union)) |
5444 | | } |
5445 | | ); |
5446 | | assert_eq!(parser("[^]a]").parse_set_class_open(), { |
5447 | | let set = ast::ClassBracketed { |
5448 | | span: span(0..3), |
5449 | | negated: true, |
5450 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5451 | | span: span(2..2), |
5452 | | items: vec![], |
5453 | | }), |
5454 | | }; |
5455 | | let union = ast::ClassSetUnion { |
5456 | | span: span(2..3), |
5457 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5458 | | span: span(2..3), |
5459 | | kind: ast::LiteralKind::Verbatim, |
5460 | | c: ']', |
5461 | | })], |
5462 | | }; |
5463 | | Ok((set, union)) |
5464 | | }); |
5465 | | assert_eq!(parser("[-]a]").parse_set_class_open(), { |
5466 | | let set = ast::ClassBracketed { |
5467 | | span: span(0..2), |
5468 | | negated: false, |
5469 | | kind: ast::ClassSet::union(ast::ClassSetUnion { |
5470 | | span: span(1..1), |
5471 | | items: vec![], |
5472 | | }), |
5473 | | }; |
5474 | | let union = ast::ClassSetUnion { |
5475 | | span: span(1..2), |
5476 | | items: vec![ast::ClassSetItem::Literal(ast::Literal { |
5477 | | span: span(1..2), |
5478 | | kind: ast::LiteralKind::Verbatim, |
5479 | | c: '-', |
5480 | | })], |
5481 | | }; |
5482 | | Ok((set, union)) |
5483 | | }); |
5484 | | |
5485 | | assert_eq!( |
5486 | | parser("[").parse_set_class_open().unwrap_err(), |
5487 | | TestError { |
5488 | | span: span(0..1), |
5489 | | kind: ast::ErrorKind::ClassUnclosed, |
5490 | | } |
5491 | | ); |
5492 | | assert_eq!( |
5493 | | parser_ignore_whitespace("[ ") |
5494 | | .parse_set_class_open() |
5495 | | .unwrap_err(), |
5496 | | TestError { |
5497 | | span: span(0..5), |
5498 | | kind: ast::ErrorKind::ClassUnclosed, |
5499 | | } |
5500 | | ); |
5501 | | assert_eq!( |
5502 | | parser("[^").parse_set_class_open().unwrap_err(), |
5503 | | TestError { |
5504 | | span: span(0..2), |
5505 | | kind: ast::ErrorKind::ClassUnclosed, |
5506 | | } |
5507 | | ); |
5508 | | assert_eq!( |
5509 | | parser("[]").parse_set_class_open().unwrap_err(), |
5510 | | TestError { |
5511 | | span: span(0..2), |
5512 | | kind: ast::ErrorKind::ClassUnclosed, |
5513 | | } |
5514 | | ); |
5515 | | assert_eq!( |
5516 | | parser("[-").parse_set_class_open().unwrap_err(), |
5517 | | TestError { |
5518 | | span: span(0..2), |
5519 | | kind: ast::ErrorKind::ClassUnclosed, |
5520 | | } |
5521 | | ); |
5522 | | assert_eq!( |
5523 | | parser("[--").parse_set_class_open().unwrap_err(), |
5524 | | TestError { |
5525 | | span: span(0..3), |
5526 | | kind: ast::ErrorKind::ClassUnclosed, |
5527 | | } |
5528 | | ); |
5529 | | } |
5530 | | |
// Exercises `maybe_parse_ascii_class` on bracketed ASCII classes such as
// `[:alnum:]`. Well-formed input must produce the matching `ClassAscii`;
// malformed input must produce `None` and leave the parser at offset 0.
#[test]
fn maybe_parse_ascii_class() {
    // Asserts that `pattern` is recognised as an `alnum` class spanning
    // `0..end` with the given negation flag.
    let assert_alnum = |pattern: &str, end: usize, negated: bool| {
        assert_eq!(
            parser(pattern).maybe_parse_ascii_class(),
            Some(ast::ClassAscii {
                span: span(0..end),
                kind: ast::ClassAsciiKind::Alnum,
                negated,
            })
        );
    };
    // Asserts that `pattern` is not recognised and that the failed attempt
    // did not advance the parser.
    let assert_rejected = |pattern: &str| {
        let p = parser(pattern);
        assert_eq!(p.maybe_parse_ascii_class(), None);
        assert_eq!(p.offset(), 0);
    };

    // Recognised forms: plain, followed by extra input, and negated.
    assert_alnum(r"[:alnum:]", 9, false);
    assert_alnum(r"[:alnum:]A", 9, false);
    assert_alnum(r"[:^alnum:]", 10, true);

    // Truncated openers.
    assert_rejected(r"[:");
    assert_rejected(r"[:^");
    // Negation marker in the wrong place.
    assert_rejected(r"[^:alnum:]");
    // Unknown class name.
    assert_rejected(r"[:alnnum:]");
    // Missing pieces of the closing `:]`.
    assert_rejected(r"[:alnum]");
    assert_rejected(r"[:alnum:");
}
5582 | | |
// Checks parsing of `\p`/`\P` Unicode class escapes: the one-letter form,
// the braced named form, the `name:value` / `name=value` / `name!=value`
// forms (including empty names and values), premature end of input, and
// classes embedded in a larger pattern.
#[test]
fn parse_unicode_class() {
    // Expected class node covering `0..end`.
    let class = |end: usize, negated: bool, kind: ast::ClassUnicodeKind| {
        ast::ClassUnicode { span: span(0..end), negated, kind }
    };
    let one_letter_n = || ast::ClassUnicodeKind::OneLetter('N');
    let named = |name: &str| ast::ClassUnicodeKind::Named(s(name));
    let named_value =
        |op: ast::ClassUnicodeOpKind, name: &str, value: &str| {
            ast::ClassUnicodeKind::NamedValue {
                op,
                name: s(name),
                value: s(value),
            }
        };
    // Asserts that `parse_escape` yields exactly `expected` for `pattern`.
    let assert_escape = |pattern: &str, expected: ast::ClassUnicode| {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(Primitive::Unicode(expected))
        );
    };
    // Asserts that `parse_escape` fails with an unexpected-EOF error whose
    // span is the empty range at `offset`.
    let assert_eof_at = |pattern: &str, offset: usize| {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(offset..offset),
                kind: ast::ErrorKind::EscapeUnexpectedEof,
            }
        );
    };
    // Asserts that a full parse of `pattern` is a non-negated class ending
    // at `class_end`, immediately followed by the verbatim literal `z`.
    let assert_then_z =
        |pattern: &str, class_end: usize, kind: ast::ClassUnicodeKind| {
            let expected = ast::Concat {
                span: span(0..class_end + 1),
                asts: vec![
                    Ast::Class(ast::Class::Unicode(class(
                        class_end, false, kind,
                    ))),
                    Ast::Literal(ast::Literal {
                        span: span(class_end..class_end + 1),
                        kind: ast::LiteralKind::Verbatim,
                        c: 'z',
                    }),
                ],
            };
            assert_eq!(parser(pattern).parse(), Ok(Ast::Concat(expected)));
        };

    // One-letter and braced names; uppercase `\P` sets `negated`.
    assert_escape(r"\pN", class(3, false, one_letter_n()));
    assert_escape(r"\PN", class(3, true, one_letter_n()));
    assert_escape(r"\p{N}", class(5, false, named("N")));
    assert_escape(r"\P{N}", class(5, true, named("N")));
    assert_escape(r"\p{Greek}", class(9, false, named("Greek")));

    // `name<op>value` forms. Note that `!=` is recorded as the operator and
    // does not flip `negated`.
    assert_escape(
        r"\p{scx:Katakana}",
        class(
            16,
            false,
            named_value(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana"),
        ),
    );
    assert_escape(
        r"\p{scx=Katakana}",
        class(
            16,
            false,
            named_value(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana"),
        ),
    );
    assert_escape(
        r"\p{scx!=Katakana}",
        class(
            17,
            false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "scx", "Katakana"),
        ),
    );

    // An operator with nothing on either side still parses, producing empty
    // name and value strings.
    assert_escape(
        r"\p{:}",
        class(5, false, named_value(ast::ClassUnicodeOpKind::Colon, "", "")),
    );
    assert_escape(
        r"\p{=}",
        class(5, false, named_value(ast::ClassUnicodeOpKind::Equal, "", "")),
    );
    assert_escape(
        r"\p{!=}",
        class(
            6,
            false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "", ""),
        ),
    );

    // Input that ends before the class is complete.
    assert_eof_at(r"\p", 2);
    assert_eof_at(r"\p{", 3);
    assert_eof_at(r"\p{N", 4);
    assert_eof_at(r"\p{Greek", 8);

    // Classes followed by more pattern, going through the full parser.
    assert_then_z(r"\pNz", 3, one_letter_n());
    assert_then_z(r"\p{Greek}z", 9, named("Greek"));

    // A backslash where the class name should be is rejected, for both the
    // plain and the negated escape.
    let invalid_class = TestError {
        span: span(2..3),
        kind: ast::ErrorKind::UnicodeClassInvalid,
    };
    assert_eq!(parser(r"\p\{").parse().unwrap_err(), invalid_class);
    assert_eq!(parser(r"\P\{").parse().unwrap_err(), invalid_class);
}
5780 | | |
// Checks parsing of the Perl-style class escapes `\d`, `\s`, `\w` and their
// uppercase negated counterparts, both through `parse_escape` and through a
// full `parse`.
#[test]
fn parse_perl_class() {
    // Expected class node; every escape tested here spans `0..2`.
    let perl = |kind: ast::ClassPerlKind, negated: bool| ast::ClassPerl {
        span: span(0..2),
        kind,
        negated,
    };
    // Asserts that `parse_escape` yields the given Perl class for `pattern`.
    let assert_escape =
        |pattern: &str, kind: ast::ClassPerlKind, negated: bool| {
            assert_eq!(
                parser(pattern).parse_escape(),
                Ok(Primitive::Perl(perl(kind, negated)))
            );
        };

    assert_escape(r"\d", ast::ClassPerlKind::Digit, false);
    assert_escape(r"\D", ast::ClassPerlKind::Digit, true);
    assert_escape(r"\s", ast::ClassPerlKind::Space, false);
    assert_escape(r"\S", ast::ClassPerlKind::Space, true);
    assert_escape(r"\w", ast::ClassPerlKind::Word, false);
    assert_escape(r"\W", ast::ClassPerlKind::Word, true);

    // A lone class parses to a bare class node.
    let lone_digit =
        Ast::Class(ast::Class::Perl(perl(ast::ClassPerlKind::Digit, false)));
    assert_eq!(parser(r"\d").parse(), Ok(lone_digit));

    // A class followed by a literal parses to a two-element concatenation.
    let digit_then_z = ast::Concat {
        span: span(0..3),
        asts: vec![
            Ast::Class(ast::Class::Perl(perl(
                ast::ClassPerlKind::Digit,
                false,
            ))),
            Ast::Literal(ast::Literal {
                span: span(2..3),
                kind: ast::LiteralKind::Verbatim,
                c: 'z',
            }),
        ],
    };
    assert_eq!(parser(r"\dz").parse(), Ok(Ast::Concat(digit_then_z)));
}
5859 | | |
5860 | | // Regression test: the nest limit checker wasn't decrementing its depth during |
5861 | | // post-traversal, which caused long regexes to trip the default limit too aggressively. |
5862 | | // The pattern below is long but nests groups only two deep, so a limit of 50 must accept it. |
5863 | | #[test] |
5864 | | fn regression_454_nest_too_big() { |
5865 | | let pattern = r#" |
5866 | | 2(?: |
5867 | | [45]\d{3}| |
5868 | | 7(?: |
5869 | | 1[0-267]| |
5870 | | 2[0-289]| |
5871 | | 3[0-29]| |
5872 | | 4[01]| |
5873 | | 5[1-3]| |
5874 | | 6[013]| |
5875 | | 7[0178]| |
5876 | | 91 |
5877 | | )| |
5878 | | 8(?: |
5879 | | 0[125]| |
5880 | | [139][1-6]| |
5881 | | 2[0157-9]| |
5882 | | 41| |
5883 | | 6[1-35]| |
5884 | | 7[1-5]| |
5885 | | 8[1-8]| |
5886 | | 90 |
5887 | | )| |
5888 | | 9(?: |
5889 | | 0[0-2]| |
5890 | | 1[0-4]| |
5891 | | 2[568]| |
5892 | | 3[3-6]| |
5893 | | 5[5-7]| |
5894 | | 6[0167]| |
5895 | | 7[15]| |
5896 | | 8[0146-9] |
5897 | | ) |
5898 | | )\d{4} |
5899 | | "#; |
5900 | | assert!(parser_nest_limit(pattern, 50).parse().is_ok()); |
5901 | | } |
5902 | | |
5903 | | // This tests that a trailing `-` in a character class is treated as a literal `-` |
5904 | | // even when whitespace mode is enabled and there is whitespace after the trailing `-`. |
5905 | | // Inputs that never close the class with `]` must still fail to parse. |
5906 | | #[test] |
5907 | | fn regression_455_trailing_dash_ignore_whitespace() { |
5908 | | assert!(parser("(?x)[ / - ]").parse().is_ok()); |
5909 | | assert!(parser("(?x)[ a - ]").parse().is_ok()); |
5910 | | assert!(parser( |
5911 | | "(?x)[ |
5912 | | a |
5913 | | - ] |
5914 | | " |
5915 | | ) |
5916 | | .parse() |
5917 | | .is_ok()); |
5918 | | assert!(parser( |
5919 | | "(?x)[ |
5920 | | a # wat |
5921 | | - ] |
5922 | | " |
5923 | | ) |
5924 | | .parse() |
5925 | | .is_ok()); |
5926 | | |
5927 | | assert!(parser("(?x)[ / -").parse().is_err()); |
5928 | | assert!(parser("(?x)[ / - ").parse().is_err()); |
5929 | | assert!(parser( |
5930 | | "(?x)[ |
5931 | | / - |
5932 | | " |
5933 | | ) |
5934 | | .parse() |
5935 | | .is_err()); |
5936 | | assert!(parser( |
5937 | | "(?x)[ |
5938 | | / - # wat |
5939 | | " |
5940 | | ) |
5941 | | .parse() |
5942 | | .is_err()); |
5943 | | } |
5944 | | } |