Coverage Report

Created: 2026-02-14 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-lite/src/hir/parse.rs
Line
Count
Source
1
use core::cell::{Cell, RefCell};
2
3
use alloc::{
4
    boxed::Box,
5
    string::{String, ToString},
6
    vec,
7
    vec::Vec,
8
};
9
10
use crate::{
11
    error::Error,
12
    hir::{self, Config, Flags, Hir, HirKind},
13
};
14
15
// These are all of the errors that can occur while parsing a regex. Unlike
16
// regex-syntax, our errors are not particularly great. They are just enough
17
// to get a general sense of what went wrong. But in exchange, the error
18
// reporting mechanism is *much* simpler than what's in regex-syntax.
19
//
20
// By convention, we use each of these messages in exactly one place. That
21
// way, every branch that leads to an error has a unique message. This in turn
22
// means that given a message, one can precisely identify which part of the
23
// parser reported it.
24
//
25
// Finally, we give names to each message so that we can reference them in
26
// tests.
27
const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
28
const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
29
const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
30
const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
31
const ERR_UNCLOSED_GROUP_QUESTION: &str =
32
    "expected closing ')', but got end of pattern";
33
const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
34
const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
35
const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
36
const ERR_MISSING_GROUP_NAME: &str =
37
    "expected capture group name, but got end of pattern";
38
const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
39
const ERR_UNCLOSED_GROUP_NAME: &str =
40
    "expected end of capture group name, but got end of pattern";
41
const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
42
const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
43
const ERR_FLAG_REPEATED_NEGATION: &str =
44
    "inline flag negation cannot be repeated";
45
const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
46
const ERR_FLAG_UNEXPECTED_EOF: &str =
47
    "expected ':' or ')' to end inline flags, but got end of pattern";
48
const ERR_FLAG_DANGLING_NEGATION: &str =
49
    "inline flags cannot end with negation directive";
50
const ERR_DECIMAL_NO_DIGITS: &str =
51
    "expected decimal number, but found no digits";
52
const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
53
const ERR_HEX_BRACE_INVALID_DIGIT: &str =
54
    "expected hexadecimal number in braces, but got non-hex digit";
55
const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
56
    "expected hexadecimal number, but saw end of pattern before closing brace";
57
const ERR_HEX_BRACE_EMPTY: &str =
58
    "expected hexadecimal number in braces, but got no digits";
59
const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces";
60
const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
61
    "expected fixed length hexadecimal number, but saw end of pattern first";
62
const ERR_HEX_FIXED_INVALID_DIGIT: &str =
63
    "expected fixed length hexadecimal number, but got non-hex digit";
64
const ERR_HEX_FIXED_INVALID: &str =
65
    "got invalid fixed length hexadecimal number";
66
const ERR_HEX_UNEXPECTED_EOF: &str =
67
    "expected hexadecimal number, but saw end of pattern first";
68
const ERR_ESCAPE_UNEXPECTED_EOF: &str =
69
    "saw start of escape sequence, but saw end of pattern before it finished";
70
const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
71
const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
72
    "Unicode character classes are not supported";
73
const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
74
const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
75
    "unrecognized POSIX character class";
76
const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
77
    "uncounted repetition operator must be applied to a sub-expression";
78
const ERR_COUNTED_REP_SUB_MISSING: &str =
79
    "counted repetition operator must be applied to a sub-expression";
80
const ERR_COUNTED_REP_UNCLOSED: &str =
81
    "found unclosed counted repetition operator";
82
const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
83
    "found incomplete and unclosed counted repetition operator";
84
const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
85
    "found counted repetition operator with a comma that is unclosed";
86
const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
87
    "found counted repetition with min and max that is unclosed";
88
const ERR_COUNTED_REP_INVALID: &str =
89
    "expected closing brace for counted repetition, but got something else";
90
const ERR_COUNTED_REP_INVALID_RANGE: &str =
91
    "found counted repetition with a min bigger than its max";
92
const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str =
93
    "non-empty character class has no closing bracket";
94
const ERR_CLASS_INVALID_RANGE_ITEM: &str =
95
    "character class ranges must start and end with a single character";
96
const ERR_CLASS_INVALID_ITEM: &str =
97
    "invalid escape sequence in character class";
98
const ERR_CLASS_UNCLOSED_AFTER_DASH: &str =
99
    "non-empty character class has no closing bracket after dash";
100
const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str =
101
    "negated character class has no closing bracket";
102
const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str =
103
    "character class begins with literal ']' but has no closing bracket";
104
const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class";
105
const ERR_CLASS_UNCLOSED: &str = "found unclosed character class";
106
const ERR_CLASS_NEST_UNSUPPORTED: &str =
107
    "nested character classes are not supported";
108
const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str =
109
    "character class intersection is not supported";
110
const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111
    "character class difference is not supported";
112
const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
113
    "character class symmetric difference is not supported";
114
const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
115
    "special word boundary assertion is unclosed or has an invalid character";
116
const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
117
    "special word boundary assertion is unrecognized";
118
const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
119
    "found start of special word boundary or repetition without an end";
120
121
/// A regular expression parser.
122
///
123
/// This parses a string representation of a regular expression into an
124
/// abstract syntax tree. The size of the tree is proportional to the length
125
/// of the regular expression pattern.
126
///
127
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
128
#[derive(Clone, Debug)]
129
pub(super) struct Parser<'a> {
130
    /// The configuration of the parser as given by the caller.
131
    config: Config,
132
    /// The pattern we're parsing as given by the caller.
133
    pattern: &'a str,
134
    /// The call depth of the parser. This is incremented for each
135
    /// sub-expression parsed. Its peak value is the maximum nesting of the
136
    /// pattern.
137
    depth: Cell<u32>,
138
    /// The current position of the parser.
139
    pos: Cell<usize>,
140
    /// The current codepoint of the parser. The codepoint corresponds to the
141
    /// codepoint encoded in `pattern` beginning at `pos`.
142
    ///
143
    /// This is `None` if and only if `pos == pattern.len()`.
144
    char: Cell<Option<char>>,
145
    /// The current capture index.
146
    capture_index: Cell<u32>,
147
    /// The flags that are currently set.
148
    flags: RefCell<Flags>,
149
    /// A sorted sequence of capture names. This is used to detect duplicate
150
    /// capture names and report an error if one is detected.
151
    capture_names: RefCell<Vec<String>>,
152
}
153
154
/// The constructor and a variety of helper routines.
155
impl<'a> Parser<'a> {
156
    /// Build a parser from this configuration with the given pattern.
157
7.98k
    pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
158
7.98k
        Parser {
159
7.98k
            config,
160
7.98k
            pattern,
161
7.98k
            depth: Cell::new(0),
162
7.98k
            pos: Cell::new(0),
163
7.98k
            char: Cell::new(pattern.chars().next()),
164
7.98k
            capture_index: Cell::new(0),
165
7.98k
            flags: RefCell::new(config.flags),
166
7.98k
            capture_names: RefCell::new(vec![]),
167
7.98k
        }
168
7.98k
    }
169
170
    /// Returns the full pattern string that we're parsing.
171
87.7M
    fn pattern(&self) -> &str {
172
87.7M
        self.pattern
173
87.7M
    }
174
175
    /// Return the current byte offset of the parser.
176
    ///
177
    /// The offset starts at `0` from the beginning of the regular expression
178
    /// pattern string.
179
398M
    fn pos(&self) -> usize {
180
398M
        self.pos.get()
181
398M
    }
182
183
    /// Increments the call depth of the parser.
184
    ///
185
    /// If the call depth would exceed the configured nest limit, then this
186
    /// returns an error.
187
    ///
188
    /// This returns the old depth.
189
444k
    fn increment_depth(&self) -> Result<u32, Error> {
190
444k
        let old = self.depth.get();
191
444k
        if old > self.config.nest_limit {
192
2
            return Err(Error::new(ERR_TOO_MUCH_NESTING));
193
444k
        }
194
        // OK because our depth starts at 0, and we return an error if it
195
        // ever reaches the limit. So the call depth can never exceed u32::MAX.
196
444k
        let new = old.checked_add(1).unwrap();
197
444k
        self.depth.set(new);
198
444k
        Ok(old)
199
444k
    }
200
201
    /// Decrements the call depth of the parser.
202
    ///
203
    /// This panics if the current depth is 0.
204
440k
    fn decrement_depth(&self) {
205
440k
        let old = self.depth.get();
206
        // If this fails then the caller has a bug in how they're incrementing
207
        // and decrementing the depth of the parser's call stack.
208
440k
        let new = old.checked_sub(1).unwrap();
209
440k
        self.depth.set(new);
210
440k
    }
211
212
    /// Return the codepoint at the current position of the parser.
213
    ///
214
    /// This panics if the parser is positioned at the end of the pattern.
215
319M
    fn char(&self) -> char {
216
319M
        self.char.get().expect("codepoint, but parser is done")
217
319M
    }
218
219
    /// Returns true if the next call to `bump` would return false.
220
223M
    fn is_done(&self) -> bool {
221
223M
        self.pos() == self.pattern.len()
222
223M
    }
223
224
    /// Returns the flags that are current set for this regex.
225
114M
    fn flags(&self) -> Flags {
226
114M
        *self.flags.borrow()
227
114M
    }
228
229
    /// Bump the parser to the next Unicode scalar value.
230
    ///
231
    /// If the end of the input has been reached, then `false` is returned.
232
84.0M
    fn bump(&self) -> bool {
233
84.0M
        if self.is_done() {
234
0
            return false;
235
84.0M
        }
236
84.0M
        self.pos.set(self.pos() + self.char().len_utf8());
237
84.0M
        self.char.set(self.pattern()[self.pos()..].chars().next());
238
84.0M
        self.char.get().is_some()
239
84.0M
    }
240
241
    /// If the substring starting at the current position of the parser has
242
    /// the given prefix, then bump the parser to the character immediately
243
    /// following the prefix and return true. Otherwise, don't bump the parser
244
    /// and return false.
245
3.07M
    fn bump_if(&self, prefix: &str) -> bool {
246
3.07M
        if self.pattern()[self.pos()..].starts_with(prefix) {
247
41.5k
            for _ in 0..prefix.chars().count() {
248
41.5k
                self.bump();
249
41.5k
            }
250
22.4k
            true
251
        } else {
252
3.04M
            false
253
        }
254
3.07M
    }
255
256
    /// Bump the parser, and if the `x` flag is enabled, bump through any
257
    /// subsequent spaces. Return true if and only if the parser is not done.
258
3.05M
    fn bump_and_bump_space(&self) -> bool {
259
3.05M
        if !self.bump() {
260
1.31k
            return false;
261
3.05M
        }
262
3.05M
        self.bump_space();
263
3.05M
        !self.is_done()
264
3.05M
    }
265
266
    /// If the `x` flag is enabled (i.e., whitespace insensitivity with
267
    /// comments), then this will advance the parser through all whitespace
268
    /// and comments to the next non-whitespace non-comment byte.
269
    ///
270
    /// If the `x` flag is disabled, then this is a no-op.
271
    ///
272
    /// This should be used selectively throughout the parser where
273
    /// arbitrary whitespace is permitted when the `x` flag is enabled. For
274
    /// example, `{   5  , 6}` is equivalent to `{5,6}`.
275
99.6M
    fn bump_space(&self) {
276
99.6M
        if !self.flags().ignore_whitespace {
277
83.0M
            return;
278
16.5M
        }
279
17.2M
        while !self.is_done() {
280
17.2M
            if self.char().is_whitespace() {
281
643k
                self.bump();
282
16.5M
            } else if self.char() == '#' {
283
739
                self.bump();
284
1.99M
                while !self.is_done() {
285
1.99M
                    let c = self.char();
286
1.99M
                    self.bump();
287
1.99M
                    if c == '\n' {
288
686
                        break;
289
1.99M
                    }
290
                }
291
            } else {
292
16.5M
                break;
293
            }
294
        }
295
99.6M
    }
296
297
    /// Peek at the next character in the input without advancing the parser.
298
    ///
299
    /// If the input has been exhausted, then this returns `None`.
300
203k
    fn peek(&self) -> Option<char> {
301
203k
        if self.is_done() {
302
0
            return None;
303
203k
        }
304
203k
        self.pattern()[self.pos() + self.char().len_utf8()..].chars().next()
305
203k
    }
306
307
    /// Peeks at the next character in the pattern from the current offset, and
308
    /// will ignore spaces when the parser is in whitespace insensitive mode.
309
337k
    fn peek_space(&self) -> Option<char> {
310
337k
        if !self.flags().ignore_whitespace {
311
127k
            return self.peek();
312
210k
        }
313
210k
        if self.is_done() {
314
0
            return None;
315
210k
        }
316
210k
        let mut start = self.pos() + self.char().len_utf8();
317
210k
        let mut in_comment = false;
318
309k
        for (i, ch) in self.pattern()[start..].char_indices() {
319
309k
            if ch.is_whitespace() {
320
98.9k
                continue;
321
210k
            } else if !in_comment && ch == '#' {
322
676
                in_comment = true;
323
210k
            } else if in_comment && ch == '\n' {
324
0
                in_comment = false;
325
0
            } else {
326
210k
                start += i;
327
210k
                break;
328
            }
329
        }
330
210k
        self.pattern()[start..].chars().next()
331
337k
    }
332
333
    /// Return the next capturing index. Each subsequent call increments the
334
    /// internal index. Since the way capture indices are computed is a public
335
    /// API guarantee, use of this routine depends on the parser being depth
336
    /// first and left-to-right.
337
    ///
338
    /// If the capture limit is exceeded, then an error is returned.
339
435k
    fn next_capture_index(&self) -> Result<u32, Error> {
340
435k
        let current = self.capture_index.get();
341
435k
        let next = current
342
435k
            .checked_add(1)
343
435k
            .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?;
344
435k
        self.capture_index.set(next);
345
435k
        Ok(next)
346
435k
    }
347
348
    /// Adds the given capture name to this parser. If this capture name has
349
    /// already been used, then an error is returned.
350
8.96k
    fn add_capture_name(&self, name: &str) -> Result<(), Error> {
351
8.96k
        let mut names = self.capture_names.borrow_mut();
352
42.9k
        match names.binary_search_by(|n| name.cmp(n)) {
353
7
            Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
354
8.96k
            Err(i) => {
355
8.96k
                names.insert(i, name.to_string());
356
8.96k
                Ok(())
357
            }
358
        }
359
8.96k
    }
360
361
    /// Returns true if and only if the parser is positioned at a look-around
362
    /// prefix. The conditions under which this returns true must always
363
    /// correspond to a regular expression that would otherwise be consider
364
    /// invalid.
365
    ///
366
    /// This should only be called immediately after parsing the opening of
367
    /// a group or a set of flags.
368
438k
    fn is_lookaround_prefix(&self) -> bool {
369
438k
        self.bump_if("?=")
370
438k
            || self.bump_if("?!")
371
438k
            || self.bump_if("?<=")
372
438k
            || self.bump_if("?<!")
373
438k
    }
374
}
375
376
/// The actual parser. We try to break out each kind of regex syntax into its
377
/// own routine.
378
impl<'a> Parser<'a> {
379
7.98k
    pub(super) fn parse(&self) -> Result<Hir, Error> {
380
7.98k
        let hir = self.parse_inner()?;
381
        // While we also check nesting during parsing, that only checks the
382
        // number of recursive parse calls. It does not necessarily cover
383
        // all possible recursive nesting of the Hir itself. For example,
384
        // repetition operators don't require recursive parse calls. So one
385
        // can stack them arbitrarily without overflowing the stack in the
386
        // *parser*. But then if one recurses over the resulting Hir, a stack
387
        // overflow is possible. So here we check the Hir nesting level
388
        // thoroughly to ensure it isn't nested too deeply.
389
        //
390
        // Note that we do still need the nesting limit check in the parser as
391
        // well, since that will avoid overflowing the stack during parse time
392
        // before the complete Hir value is constructed.
393
5.85k
        check_hir_nesting(&hir, self.config.nest_limit)?;
394
5.84k
        Ok(hir)
395
7.98k
    }
396
397
444k
    fn parse_inner(&self) -> Result<Hir, Error> {
398
444k
        let depth = self.increment_depth()?;
399
444k
        let mut alternates = vec![];
400
444k
        let mut concat = vec![];
401
        loop {
402
18.6M
            self.bump_space();
403
18.6M
            if self.is_done() {
404
6.10k
                break;
405
18.6M
            }
406
18.6M
            match self.char() {
407
                '(' => {
408
                    // Save the old flags and reset them only when we close
409
                    // the group.
410
438k
                    let oldflags = *self.flags.borrow();
411
438k
                    if let Some(sub) = self.parse_group()? {
412
434k
                        concat.push(sub);
413
434k
                        // We only reset them here because if 'parse_group'
414
434k
                        // returns None, then that means it handled a flag
415
434k
                        // directive, e.g., '(?ism)'. And the whole point is
416
434k
                        // that those flags remain active until either disabled
417
434k
                        // or the end of the pattern or current group.
418
434k
                        *self.flags.borrow_mut() = oldflags;
419
434k
                    }
420
436k
                    if self.char.get() != Some(')') {
421
249
                        return Err(Error::new(ERR_UNCLOSED_GROUP));
422
436k
                    }
423
436k
                    self.bump();
424
                }
425
                ')' => {
426
434k
                    if depth == 0 {
427
26
                        return Err(Error::new(ERR_UNOPENED_GROUP));
428
434k
                    }
429
434k
                    break;
430
                }
431
4.24M
                '|' => {
432
4.24M
                    alternates.push(Hir::concat(core::mem::take(&mut concat)));
433
4.24M
                    self.bump();
434
4.24M
                }
435
151k
                '[' => concat.push(self.parse_class()?),
436
                '?' | '*' | '+' => {
437
176k
                    concat = self.parse_uncounted_repetition(concat)?;
438
                }
439
                '{' => {
440
199k
                    concat = self.parse_counted_repetition(concat)?;
441
                }
442
13.0M
                _ => concat.push(self.parse_primitive()?),
443
            }
444
        }
445
440k
        self.decrement_depth();
446
440k
        alternates.push(Hir::concat(concat));
447
        // N.B. This strips off the "alternation" if there's only one branch.
448
440k
        Ok(Hir::alternation(alternates))
449
444k
    }
450
451
    /// Parses a "primitive" pattern. A primitive is any expression that does
452
    /// not contain any sub-expressions.
453
    ///
454
    /// This assumes the parser is pointing at the beginning of the primitive.
455
13.0M
    fn parse_primitive(&self) -> Result<Hir, Error> {
456
13.0M
        let ch = self.char();
457
13.0M
        self.bump();
458
13.0M
        match ch {
459
82.6k
            '\\' => self.parse_escape(),
460
27.0k
            '.' => Ok(self.hir_dot()),
461
12.2k
            '^' => Ok(self.hir_anchor_start()),
462
27.8k
            '$' => Ok(self.hir_anchor_end()),
463
12.8M
            ch => Ok(self.hir_char(ch)),
464
        }
465
13.0M
    }
466
467
    /// Parse an escape sequence. This always results in a "primitive" HIR,
468
    /// that is, an HIR with no sub-expressions.
469
    ///
470
    /// This assumes the parser is positioned at the start of the sequence,
471
    /// immediately *after* the `\`. It advances the parser to the first
472
    /// position immediately following the escape sequence.
473
17.6M
    fn parse_escape(&self) -> Result<Hir, Error> {
474
17.6M
        if self.is_done() {
475
61
            return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
476
17.6M
        }
477
17.6M
        let ch = self.char();
478
        // Put some of the more complicated routines into helpers.
479
17.6M
        match ch {
480
17.5M
            '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
481
            'p' | 'P' => {
482
2
                return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
483
            }
484
8.74k
            'x' | 'u' | 'U' => return self.parse_hex(),
485
            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
486
16.8M
                return Ok(self.parse_perl_class());
487
            }
488
785k
            _ => {}
489
        }
490
491
        // Handle all of the one letter sequences inline.
492
785k
        self.bump();
493
785k
        if hir::is_meta_character(ch) || hir::is_escapable_character(ch) {
494
741k
            return Ok(self.hir_char(ch));
495
43.8k
        }
496
43.8k
        let special = |ch| Ok(self.hir_char(ch));
497
43.8k
        match ch {
498
8.07k
            'a' => special('\x07'),
499
23.5k
            'f' => special('\x0C'),
500
457
            't' => special('\t'),
501
1.22k
            'n' => special('\n'),
502
491
            'r' => special('\r'),
503
3.80k
            'v' => special('\x0B'),
504
294
            'A' => Ok(Hir::look(hir::Look::Start)),
505
1.91k
            'z' => Ok(Hir::look(hir::Look::End)),
506
            'b' => {
507
2.86k
                let mut hir = Hir::look(hir::Look::Word);
508
2.86k
                if !self.is_done() && self.char() == '{' {
509
1.38k
                    if let Some(special) =
510
2.19k
                        self.maybe_parse_special_word_boundary()?
511
1.38k
                    {
512
1.38k
                        hir = special;
513
1.38k
                    }
514
665
                }
515
2.71k
                Ok(hir)
516
            }
517
448
            'B' => Ok(Hir::look(hir::Look::WordNegate)),
518
436
            '<' => Ok(Hir::look(hir::Look::WordStart)),
519
278
            '>' => Ok(Hir::look(hir::Look::WordEnd)),
520
40
            _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
521
        }
522
17.6M
    }
523
524
    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
525
    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
526
    ///
527
    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
528
    /// if it fails it will just return `None` with no error. This is done
529
    /// because `\b{5}` is a valid expression and we want to let that be parsed
530
    /// by the existing counted repetition parsing code. (I thought about just
531
    /// invoking the counted repetition code from here, but it seemed a little
532
    /// ham-fisted.)
533
    ///
534
    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
535
    /// Namely, if we definitely know it isn't a counted repetition, then we
536
    /// return an error specific to the specialty word boundaries.
537
    ///
538
    /// This assumes the parser is positioned at a `{` immediately following
539
    /// a `\b`. When `None` is returned, the parser is returned to the position
540
    /// at which it started: pointing at a `{`.
541
    ///
542
    /// The position given should correspond to the start of the `\b`.
543
2.19k
    fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
544
2.19k
        assert_eq!(self.char(), '{');
545
546
14.7k
        let is_valid_char = |c| match c {
547
13.0k
            'A'..='Z' | 'a'..='z' | '-' => true,
548
2.16k
            _ => false,
549
14.7k
        };
550
2.19k
        let start = self.pos();
551
2.19k
        if !self.bump_and_bump_space() {
552
3
            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
553
2.19k
        }
554
        // This is one of the critical bits: if the first non-whitespace
555
        // character isn't in [-A-Za-z] (i.e., this can't be a special word
556
        // boundary), then we bail and let the counted repetition parser deal
557
        // with this.
558
2.19k
        if !is_valid_char(self.char()) {
559
663
            self.pos.set(start);
560
663
            self.char.set(Some('{'));
561
663
            return Ok(None);
562
1.52k
        }
563
564
        // Now collect up our chars until we see a '}'.
565
1.52k
        let mut scratch = String::new();
566
12.5k
        while !self.is_done() && is_valid_char(self.char()) {
567
11.0k
            scratch.push(self.char());
568
11.0k
            self.bump_and_bump_space();
569
11.0k
        }
570
1.52k
        if self.is_done() || self.char() != '}' {
571
89
            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
572
1.44k
        }
573
1.44k
        self.bump();
574
1.44k
        let kind = match scratch.as_str() {
575
1.44k
            "start" => hir::Look::WordStart,
576
1.23k
            "end" => hir::Look::WordEnd,
577
1.03k
            "start-half" => hir::Look::WordStartHalf,
578
811
            "end-half" => hir::Look::WordEndHalf,
579
            _ => {
580
55
                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
581
            }
582
        };
583
1.38k
        Ok(Some(Hir::look(kind)))
584
2.19k
    }
585
586
    /// Parse a hex representation of a Unicode codepoint. This handles both
587
    /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
588
    /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
589
    /// the first character immediately following the hexadecimal literal.
590
8.74k
    fn parse_hex(&self) -> Result<Hir, Error> {
591
8.74k
        let digit_len = match self.char() {
592
3.33k
            'x' => 2,
593
3.55k
            'u' => 4,
594
1.85k
            'U' => 8,
595
0
            unk => unreachable!(
596
                "invalid start of fixed length hexadecimal number {unk}"
597
            ),
598
        };
599
8.74k
        if !self.bump_and_bump_space() {
600
11
            return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
601
8.73k
        }
602
8.73k
        if self.char() == '{' {
603
3.10k
            self.parse_hex_brace()
604
        } else {
605
5.62k
            self.parse_hex_digits(digit_len)
606
        }
607
8.74k
    }
608
609
    /// Parse an N-digit hex representation of a Unicode codepoint. This
610
    /// expects the parser to be positioned at the first digit and will advance
611
    /// the parser to the first character immediately following the escape
612
    /// sequence.
613
    ///
614
    /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
615
    /// or 8 (for `\UNNNNNNNN`).
616
5.62k
    fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
617
5.62k
        let mut scratch = String::new();
618
16.6k
        for i in 0..digit_len {
619
16.6k
            if i > 0 && !self.bump_and_bump_space() {
620
32
                return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
621
16.6k
            }
622
16.6k
            if !is_hex(self.char()) {
623
77
                return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
624
16.5k
            }
625
16.5k
            scratch.push(self.char());
626
        }
627
        // The final bump just moves the parser past the literal, which may
628
        // be EOF.
629
5.51k
        self.bump_and_bump_space();
630
5.51k
        match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
631
32
            None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
632
5.48k
            Some(ch) => Ok(self.hir_char(ch)),
633
        }
634
5.62k
    }
635
636
    /// Parse a hex representation of any Unicode scalar value. This expects
637
    /// the parser to be positioned at the opening brace `{` and will advance
638
    /// the parser to the first character following the closing brace `}`.
639
3.10k
    fn parse_hex_brace(&self) -> Result<Hir, Error> {
640
3.10k
        let mut scratch = String::new();
641
341k
        while self.bump_and_bump_space() && self.char() != '}' {
642
338k
            if !is_hex(self.char()) {
643
47
                return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
644
338k
            }
645
338k
            scratch.push(self.char());
646
        }
647
3.05k
        if self.is_done() {
648
30
            return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
649
3.02k
        }
650
3.02k
        assert_eq!(self.char(), '}');
651
3.02k
        self.bump_and_bump_space();
652
653
3.02k
        if scratch.is_empty() {
654
4
            return Err(Error::new(ERR_HEX_BRACE_EMPTY));
655
3.02k
        }
656
3.02k
        match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
657
52
            None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
658
2.96k
            Some(ch) => Ok(self.hir_char(ch)),
659
        }
660
3.10k
    }
661
662
    /// Parse a decimal number into a u32 while trimming leading and trailing
663
    /// whitespace.
664
    ///
665
    /// This expects the parser to be positioned at the first position where
666
    /// a decimal digit could occur. This will advance the parser to the byte
667
    /// immediately following the last contiguous decimal digit.
668
    ///
669
    /// If no decimal digit could be found or if there was a problem parsing
670
    /// the complete set of digits into a u32, then an error is returned.
671
200k
    fn parse_decimal(&self) -> Result<u32, Error> {
672
200k
        let mut scratch = String::new();
673
201k
        while !self.is_done() && self.char().is_whitespace() {
674
1.55k
            self.bump();
675
1.55k
        }
676
1.72M
        while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
677
1.52M
            scratch.push(self.char());
678
1.52M
            self.bump_and_bump_space();
679
1.52M
        }
680
201k
        while !self.is_done() && self.char().is_whitespace() {
681
1.57k
            self.bump_and_bump_space();
682
1.57k
        }
683
200k
        let digits = scratch.as_str();
684
200k
        if digits.is_empty() {
685
212
            return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
686
200k
        }
687
200k
        match u32::from_str_radix(digits, 10).ok() {
688
200k
            Some(n) => Ok(n),
689
14
            None => Err(Error::new(ERR_DECIMAL_INVALID)),
690
        }
691
200k
    }
692
693
    /// Parses an uncounted repetition operator. An uncounted repetition
694
    /// operator includes `?`, `*` and `+`, but does not include the `{m,n}`
695
    /// syntax. The current character should be one of `?`, `*` or `+`. Any
696
    /// other character will result in a panic.
697
    ///
698
    /// This assumes that the parser is currently positioned at the repetition
699
    /// operator and advances the parser to the first character after the
700
    /// operator. (Note that the operator may include a single additional `?`,
701
    /// which makes the operator ungreedy.)
702
    ///
703
    /// The caller should include the concatenation that is being built. The
704
    /// concatenation returned includes the repetition operator applied to the
705
    /// last expression in the given concatenation.
706
    ///
707
    /// If the concatenation is empty, then this returns an error.
708
176k
    fn parse_uncounted_repetition(
709
176k
        &self,
710
176k
        mut concat: Vec<Hir>,
711
176k
    ) -> Result<Vec<Hir>, Error> {
712
176k
        let sub = match concat.pop() {
713
176k
            Some(hir) => Box::new(hir),
714
            None => {
715
7
                return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
716
            }
717
        };
718
176k
        let (min, max) = match self.char() {
719
76.9k
            '?' => (0, Some(1)),
720
46.0k
            '*' => (0, None),
721
53.8k
            '+' => (1, None),
722
0
            unk => unreachable!("unrecognized repetition operator '{unk}'"),
723
        };
724
176k
        let mut greedy = true;
725
176k
        if self.bump() && self.char() == '?' {
726
33.7k
            greedy = false;
727
33.7k
            self.bump();
728
143k
        }
729
176k
        if self.flags().swap_greed {
730
61.0k
            greedy = !greedy;
731
115k
        }
732
176k
        concat.push(Hir::repetition(hir::Repetition {
733
176k
            min,
734
176k
            max,
735
176k
            greedy,
736
176k
            sub,
737
176k
        }));
738
176k
        Ok(concat)
739
176k
    }
740
741
    /// Parses a counted repetition operation. A counted repetition operator
742
    /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or
743
    /// `+` operators.
744
    ///
745
    /// This assumes that the parser is currently at the opening `{` and
746
    /// advances the parser to the first character after the operator. (Note
747
    /// that the operator may include a single additional `?`, which makes the
748
    /// operator ungreedy.)
749
    ///
750
    /// The caller should include the concatenation that is being built. The
751
    /// concatenation returned includes the repetition operator applied to the
752
    /// last expression in the given concatenation.
753
    ///
754
    /// If the concatenation is empty, then this returns an error.
755
199k
    fn parse_counted_repetition(
756
199k
        &self,
757
199k
        mut concat: Vec<Hir>,
758
199k
    ) -> Result<Vec<Hir>, Error> {
759
199k
        assert_eq!(self.char(), '{', "expected opening brace");
760
199k
        let sub = match concat.pop() {
761
199k
            Some(hir) => Box::new(hir),
762
            None => {
763
5
                return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
764
            }
765
        };
766
199k
        if !self.bump_and_bump_space() {
767
22
            return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
768
199k
        }
769
199k
        let min = self.parse_decimal()?;
770
198k
        let mut max = Some(min);
771
198k
        if self.is_done() {
772
43
            return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
773
198k
        }
774
198k
        if self.char() == ',' {
775
1.84k
            if !self.bump_and_bump_space() {
776
4
                return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
777
1.83k
            }
778
1.83k
            if self.char() != '}' {
779
1.17k
                max = Some(self.parse_decimal()?);
780
664
            } else {
781
664
                max = None;
782
664
            }
783
1.80k
            if self.is_done() {
784
3
                return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
785
1.80k
            }
786
197k
        }
787
198k
        if self.char() != '}' {
788
69
            return Err(Error::new(ERR_COUNTED_REP_INVALID));
789
198k
        }
790
791
198k
        let mut greedy = true;
792
198k
        if self.bump_and_bump_space() && self.char() == '?' {
793
421
            greedy = false;
794
421
            self.bump();
795
198k
        }
796
198k
        if self.flags().swap_greed {
797
29.5k
            greedy = !greedy;
798
169k
        }
799
800
198k
        if max.map_or(false, |max| min > max) {
801
17
            return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
802
198k
        }
803
198k
        concat.push(Hir::repetition(hir::Repetition {
804
198k
            min,
805
198k
            max,
806
198k
            greedy,
807
198k
            sub,
808
198k
        }));
809
198k
        Ok(concat)
810
199k
    }
811
812
    /// Parses the part of a pattern that starts with a `(`. This is usually
813
    /// a group sub-expression, but might just be a directive that enables
814
    /// (or disables) certain flags.
815
    ///
816
    /// This assumes the parser is pointing at the opening `(`.
817
438k
    fn parse_group(&self) -> Result<Option<Hir>, Error> {
818
438k
        assert_eq!(self.char(), '(');
819
438k
        self.bump_and_bump_space();
820
438k
        if self.is_lookaround_prefix() {
821
6
            return Err(Error::new(ERR_LOOK_UNSUPPORTED));
822
438k
        }
823
438k
        if self.bump_if("?P<") || self.bump_if("?<") {
824
9.12k
            let index = self.next_capture_index()?;
825
9.12k
            let name = Some(Box::from(self.parse_capture_name()?));
826
8.96k
            let sub = Box::new(self.parse_inner()?);
827
8.77k
            let cap = hir::Capture { index, name, sub };
828
8.77k
            Ok(Some(Hir::capture(cap)))
829
429k
        } else if self.bump_if("?") {
830
3.46k
            if self.is_done() {
831
5
                return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
832
3.46k
            }
833
3.46k
            let start = self.pos();
834
            // The flags get reset in the top-level 'parse' routine.
835
3.46k
            *self.flags.borrow_mut() = self.parse_flags()?;
836
3.38k
            let consumed = self.pos() - start;
837
3.38k
            if self.char() == ')' {
838
                // We don't allow empty flags, e.g., `(?)`.
839
2.04k
                if consumed == 0 {
840
2
                    return Err(Error::new(ERR_EMPTY_FLAGS));
841
2.04k
                }
842
2.04k
                Ok(None)
843
            } else {
844
1.33k
                assert_eq!(':', self.char());
845
1.33k
                self.bump();
846
1.33k
                self.parse_inner().map(Some)
847
            }
848
        } else {
849
426k
            let index = self.next_capture_index()?;
850
426k
            let sub = Box::new(self.parse_inner()?);
851
424k
            let cap = hir::Capture { index, name: None, sub };
852
424k
            Ok(Some(Hir::capture(cap)))
853
        }
854
438k
    }
855
856
    /// Parses a capture group name. Assumes that the parser is positioned at
857
    /// the first character in the name following the opening `<` (and may
858
    /// possibly be EOF). This advances the parser to the first character
859
    /// following the closing `>`.
860
9.12k
    fn parse_capture_name(&self) -> Result<&str, Error> {
861
9.12k
        if self.is_done() {
862
8
            return Err(Error::new(ERR_MISSING_GROUP_NAME));
863
9.11k
        }
864
9.11k
        let start = self.pos();
865
        loop {
866
3.56M
            if self.char() == '>' {
867
8.97k
                break;
868
3.55M
            }
869
3.55M
            if !is_capture_char(self.char(), self.pos() == start) {
870
54
                return Err(Error::new(ERR_INVALID_GROUP_NAME));
871
3.55M
            }
872
3.55M
            if !self.bump() {
873
89
                break;
874
3.55M
            }
875
        }
876
9.06k
        let end = self.pos();
877
9.06k
        if self.is_done() {
878
89
            return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
879
8.97k
        }
880
8.97k
        assert_eq!(self.char(), '>');
881
8.97k
        self.bump();
882
8.97k
        let name = &self.pattern()[start..end];
883
8.97k
        if name.is_empty() {
884
6
            return Err(Error::new(ERR_EMPTY_GROUP_NAME));
885
8.96k
        }
886
8.96k
        self.add_capture_name(name)?;
887
8.96k
        Ok(name)
888
9.12k
    }
889
890
    /// Parse a sequence of flags starting at the current character.
891
    ///
892
    /// This advances the parser to the character immediately following the
893
    /// flags, which is guaranteed to be either `:` or `)`.
894
    ///
895
    /// # Errors
896
    ///
897
    /// If any flags are duplicated, then an error is returned.
898
    ///
899
    /// If the negation operator is used more than once, then an error is
900
    /// returned.
901
    ///
902
    /// If no flags could be found or if the negation operation is not followed
903
    /// by any flags, then an error is returned.
904
3.46k
    fn parse_flags(&self) -> Result<Flags, Error> {
905
3.46k
        let mut flags = *self.flags.borrow();
906
3.46k
        let mut negate = false;
907
        // Keeps track of whether the previous flag item was a '-'. We use this
908
        // to detect whether there is a dangling '-', which is invalid.
909
3.46k
        let mut last_was_negation = false;
910
        // A set to keep track of the flags we've seen. Since all flags are
911
        // ASCII, we only need 128 bytes.
912
3.46k
        let mut seen = [false; 128];
913
6.84k
        while self.char() != ':' && self.char() != ')' {
914
3.45k
            if self.char() == '-' {
915
290
                last_was_negation = true;
916
290
                if negate {
917
1
                    return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
918
289
                }
919
289
                negate = true;
920
            } else {
921
3.16k
                last_was_negation = false;
922
3.16k
                self.parse_flag(&mut flags, negate)?;
923
                // OK because every valid flag is ASCII, and we're only here if
924
                // the flag is valid.
925
3.13k
                let flag_byte = u8::try_from(self.char()).unwrap();
926
3.13k
                if seen[usize::from(flag_byte)] {
927
28
                    return Err(Error::new(ERR_FLAG_DUPLICATE));
928
3.10k
                }
929
3.10k
                seen[usize::from(flag_byte)] = true;
930
            }
931
3.39k
            if !self.bump() {
932
16
                return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
933
3.38k
            }
934
        }
935
3.38k
        if last_was_negation {
936
1
            return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
937
3.38k
        }
938
3.38k
        Ok(flags)
939
3.46k
    }
940
941
    /// Parse the current character as a flag. Do not advance the parser.
942
    ///
943
    /// This sets the appropriate boolean value in place on the set of flags
944
    /// given. The boolean is inverted when `negate` is true.
945
    ///
946
    /// # Errors
947
    ///
948
    /// If the flag is not recognized, then an error is returned.
949
3.16k
    fn parse_flag(
950
3.16k
        &self,
951
3.16k
        flags: &mut Flags,
952
3.16k
        negate: bool,
953
3.16k
    ) -> Result<(), Error> {
954
3.16k
        let enabled = !negate;
955
3.16k
        match self.char() {
956
317
            'i' => flags.case_insensitive = enabled,
957
523
            'm' => flags.multi_line = enabled,
958
293
            's' => flags.dot_matches_new_line = enabled,
959
428
            'U' => flags.swap_greed = enabled,
960
297
            'R' => flags.crlf = enabled,
961
975
            'x' => flags.ignore_whitespace = enabled,
962
            // We make a special exception for this flag where we let it
963
            // through as a recognized flag, but treat it as a no-op. This in
964
            // practice retains some compatibility with the regex crate. It is
965
            // a little suspect to do this, but for example, '(?-u:\b).+' in
966
            // the regex crate is equivalent to '\b.+' in regex-lite.
967
303
            'u' => {}
968
33
            _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
969
        }
970
3.13k
        Ok(())
971
3.16k
    }
972
973
    /// Parse a standard character class consisting primarily of characters or
974
    /// character ranges.
975
    ///
976
    /// This assumes the parser is positioned at the opening `[`. If parsing
977
    /// is successful, then the parser is advanced to the position immediately
978
    /// following the closing `]`.
979
151k
    fn parse_class(&self) -> Result<Hir, Error> {
980
151k
        assert_eq!(self.char(), '[');
981
982
151k
        let mut union = vec![];
983
151k
        if !self.bump_and_bump_space() {
984
8
            return Err(Error::new(ERR_CLASS_UNCLOSED));
985
151k
        }
986
        // Determine whether the class is negated or not.
987
151k
        let negate = if self.char() != '^' {
988
140k
            false
989
        } else {
990
10.8k
            if !self.bump_and_bump_space() {
991
4
                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
992
10.8k
            }
993
10.8k
            true
994
        };
995
        // Accept any number of `-` as literal `-`.
996
160k
        while self.char() == '-' {
997
9.22k
            union.push(hir::ClassRange { start: '-', end: '-' });
998
9.22k
            if !self.bump_and_bump_space() {
999
13
                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1000
9.21k
            }
1001
        }
1002
        // If `]` is the *first* char in a set, then interpret it as a literal
1003
        // `]`. That is, an empty class is impossible to write.
1004
151k
        if union.is_empty() && self.char() == ']' {
1005
2.37k
            union.push(hir::ClassRange { start: ']', end: ']' });
1006
2.37k
            if !self.bump_and_bump_space() {
1007
5
                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
1008
2.37k
            }
1009
149k
        }
1010
        loop {
1011
39.0M
            self.bump_space();
1012
39.0M
            if self.is_done() {
1013
132
                return Err(Error::new(ERR_CLASS_UNCLOSED));
1014
39.0M
            }
1015
39.0M
            match self.char() {
1016
                '[' => {
1017
                    // Attempt to treat this as the beginning of a POSIX class.
1018
                    // If POSIX class parsing fails, then the parser backs up
1019
                    // to `[`.
1020
9.94k
                    if let Some(class) = self.maybe_parse_posix_class() {
1021
9.78k
                        union.extend_from_slice(&class.ranges);
1022
9.78k
                        continue;
1023
163
                    }
1024
                    // ... otherwise we don't support nested classes.
1025
163
                    return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
1026
                }
1027
                ']' => {
1028
151k
                    self.bump();
1029
151k
                    let mut class = hir::Class::new(union);
1030
                    // Note that we must apply case folding before negation!
1031
                    // Consider `(?i)[^x]`. If we applied negation first, then
1032
                    // the result would be the character class that matched any
1033
                    // Unicode scalar value.
1034
151k
                    if self.flags().case_insensitive {
1035
48.5k
                        class.ascii_case_fold();
1036
102k
                    }
1037
151k
                    if negate {
1038
10.8k
                        class.negate();
1039
140k
                    }
1040
151k
                    return Ok(Hir::class(class));
1041
                }
1042
2.63k
                '&' if self.peek() == Some('&') => {
1043
1
                    return Err(Error::new(
1044
1
                        ERR_CLASS_INTERSECTION_UNSUPPORTED,
1045
1
                    ));
1046
                }
1047
64.5k
                '-' if self.peek() == Some('-') => {
1048
10
                    return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
1049
                }
1050
8.57k
                '~' if self.peek() == Some('~') => {
1051
1
                    return Err(Error::new(
1052
1
                        ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
1053
1
                    ));
1054
                }
1055
38.8M
                _ => self.parse_class_range(&mut union)?,
1056
            }
1057
        }
1058
151k
    }
1059
1060
    /// Parse a single primitive item in a character class set. The item to
1061
    /// be parsed can either be one of a simple literal character, a range
1062
    /// between two simple literal characters or a "primitive" character
1063
    /// class like `\w`.
1064
    ///
1065
    /// If an invalid escape is found, or if a character class is found where
1066
    /// a simple literal is expected (e.g., in a range), then an error is
1067
    /// returned.
1068
    ///
1069
    /// Otherwise, the range (or ranges) are appended to the given union of
1070
    /// ranges.
1071
38.8M
    fn parse_class_range(
1072
38.8M
        &self,
1073
38.8M
        union: &mut Vec<hir::ClassRange>,
1074
38.8M
    ) -> Result<(), Error> {
1075
38.8M
        let prim1 = self.parse_class_item()?;
1076
38.8M
        self.bump_space();
1077
38.8M
        if self.is_done() {
1078
207
            return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
1079
38.8M
        }
1080
        // If the next char isn't a `-`, then we don't have a range.
1081
        // There are two exceptions. If the char after a `-` is a `]`, then
1082
        // `-` is interpreted as a literal `-`. Alternatively, if the char
1083
        // after a `-` is a `-`, then `--` corresponds to a "difference"
1084
        // operation. (Which we don't support in regex-lite, but error about
1085
        // specifically in an effort to be loud about differences between the
1086
        // main regex crate where possible.)
1087
38.8M
        if self.char() != '-'
1088
200k
            || self.peek_space() == Some(']')
1089
137k
            || self.peek_space() == Some('-')
1090
        {
1091
38.7M
            union.extend_from_slice(&into_class_item_ranges(prim1)?);
1092
38.7M
            return Ok(());
1093
136k
        }
1094
        // OK, now we're parsing a range, so bump past the `-` and parse the
1095
        // second half of the range.
1096
136k
        if !self.bump_and_bump_space() {
1097
63
            return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1098
136k
        }
1099
136k
        let prim2 = self.parse_class_item()?;
1100
136k
        let range = hir::ClassRange {
1101
136k
            start: into_class_item_range(prim1)?,
1102
136k
            end: into_class_item_range(prim2)?,
1103
        };
1104
136k
        if range.start > range.end {
1105
39
            return Err(Error::new(ERR_CLASS_INVALID_RANGE));
1106
136k
        }
1107
136k
        union.push(range);
1108
136k
        Ok(())
1109
38.8M
    }
1110
1111
    /// Parse a single item in a character class as a primitive, where the
1112
    /// primitive either consists of a verbatim literal or a single escape
1113
    /// sequence.
1114
    ///
1115
    /// This assumes the parser is positioned at the beginning of a primitive,
1116
    /// and advances the parser to the first position after the primitive if
1117
    /// successful.
1118
    ///
1119
    /// Note that it is the caller's responsibility to report an error if an
1120
    /// illegal primitive was parsed.
1121
39.0M
    fn parse_class_item(&self) -> Result<Hir, Error> {
1122
39.0M
        let ch = self.char();
1123
39.0M
        self.bump();
1124
39.0M
        if ch == '\\' {
1125
17.5M
            self.parse_escape()
1126
        } else {
1127
21.4M
            Ok(Hir::char(ch))
1128
        }
1129
39.0M
    }
1130
1131
    /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`.
1132
    ///
1133
    /// This assumes the parser is positioned at the opening `[`.
1134
    ///
1135
    /// If no valid POSIX character class could be found, then this does not
1136
    /// advance the parser and `None` is returned. Otherwise, the parser is
1137
    /// advanced to the first byte following the closing `]` and the
1138
    /// corresponding POSIX class is returned.
1139
9.94k
    fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
1140
        // POSIX character classes are interesting from a parsing perspective
1141
        // because parsing cannot fail with any interesting error. For example,
1142
        // in order to use an POSIX character class, it must be enclosed in
1143
        // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1144
        // of it as "POSIX character classes have the syntax `[:NAME:]` which
1145
        // can only appear within character brackets." This means that things
1146
        // like `[[:lower:]A]` are legal constructs.
1147
        //
1148
        // However, if one types an incorrect POSIX character class, e.g.,
1149
        // `[[:loower:]]`, then we treat that as if it were normal nested
1150
        // character class containing the characters `:elorw`. (Which isn't
1151
        // supported and results in an error in regex-lite.) One might argue
1152
        // that we should return an error instead since the repeated colons
1153
        // give away the intent to write an POSIX class. But what if the user
1154
        // typed `[[:lower]]` instead? How can we tell that was intended to be
1155
        // a POSIX class and not just a normal nested class?
1156
        //
1157
        // Reasonable people can probably disagree over this, but for better
1158
        // or worse, we implement semantics that never fails at the expense of
1159
        // better failure modes.
1160
9.94k
        assert_eq!(self.char(), '[');
1161
1162
        // If parsing fails, then we back up the parser to this starting point.
1163
9.94k
        let start_pos = self.pos();
1164
9.94k
        let start_char = self.char.get();
1165
9.94k
        let reset = || {
1166
163
            self.pos.set(start_pos);
1167
163
            self.char.set(start_char);
1168
163
        };
1169
1170
9.94k
        let mut negated = false;
1171
9.94k
        if !self.bump() || self.char() != ':' {
1172
51
            reset();
1173
51
            return None;
1174
9.89k
        }
1175
9.89k
        if !self.bump() {
1176
4
            reset();
1177
4
            return None;
1178
9.89k
        }
1179
9.89k
        if self.char() == '^' {
1180
2.54k
            negated = true;
1181
2.54k
            if !self.bump() {
1182
1
                reset();
1183
1
                return None;
1184
2.54k
            }
1185
7.34k
        }
1186
9.89k
        let name_start = self.pos();
1187
64.1k
        while self.char() != ':' && self.bump() {}
1188
9.89k
        if self.is_done() {
1189
57
            reset();
1190
57
            return None;
1191
9.83k
        }
1192
9.83k
        let name = &self.pattern()[name_start..self.pos()];
1193
9.83k
        if !self.bump_if(":]") {
1194
25
            reset();
1195
25
            return None;
1196
9.80k
        }
1197
9.80k
        if let Ok(ranges) = posix_class(name) {
1198
9.78k
            let mut class = hir::Class::new(ranges);
1199
9.78k
            if negated {
1200
2.53k
                class.negate();
1201
7.24k
            }
1202
9.78k
            return Some(class);
1203
25
        }
1204
25
        reset();
1205
25
        None
1206
9.94k
    }
1207
1208
    /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
1209
    /// parser is currently at a valid character class name and will be
1210
    /// advanced to the character immediately following the class.
1211
16.8M
    fn parse_perl_class(&self) -> Hir {
1212
16.8M
        let ch = self.char();
1213
16.8M
        self.bump();
1214
16.8M
        let mut class = hir::Class::new(match ch {
1215
2.44k
            'd' | 'D' => posix_class("digit").unwrap(),
1216
10.9k
            's' | 'S' => posix_class("space").unwrap(),
1217
16.8M
            'w' | 'W' => posix_class("word").unwrap(),
1218
0
            unk => unreachable!("invalid Perl class \\{unk}"),
1219
        });
1220
16.8M
        if ch.is_ascii_uppercase() {
1221
16.7M
            class.negate();
1222
16.7M
        }
1223
16.8M
        Hir::class(class)
1224
16.8M
    }
1225
1226
27.0k
    fn hir_dot(&self) -> Hir {
1227
27.0k
        if self.flags().dot_matches_new_line {
1228
3.11k
            Hir::class(hir::Class::new([hir::ClassRange {
1229
3.11k
                start: '\x00',
1230
3.11k
                end: '\u{10FFFF}',
1231
3.11k
            }]))
1232
23.9k
        } else if self.flags().crlf {
1233
1.93k
            Hir::class(hir::Class::new([
1234
1.93k
                hir::ClassRange { start: '\x00', end: '\x09' },
1235
1.93k
                hir::ClassRange { start: '\x0B', end: '\x0C' },
1236
1.93k
                hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' },
1237
1.93k
            ]))
1238
        } else {
1239
21.9k
            Hir::class(hir::Class::new([
1240
21.9k
                hir::ClassRange { start: '\x00', end: '\x09' },
1241
21.9k
                hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' },
1242
21.9k
            ]))
1243
        }
1244
27.0k
    }
1245
1246
12.2k
    fn hir_anchor_start(&self) -> Hir {
1247
12.2k
        let look = if self.flags().multi_line {
1248
2.70k
            if self.flags().crlf {
1249
1.60k
                hir::Look::StartCRLF
1250
            } else {
1251
1.10k
                hir::Look::StartLF
1252
            }
1253
        } else {
1254
9.55k
            hir::Look::Start
1255
        };
1256
12.2k
        Hir::look(look)
1257
12.2k
    }
1258
1259
27.8k
    fn hir_anchor_end(&self) -> Hir {
1260
27.8k
        let look = if self.flags().multi_line {
1261
10.5k
            if self.flags().crlf {
1262
4.21k
                hir::Look::EndCRLF
1263
            } else {
1264
6.32k
                hir::Look::EndLF
1265
            }
1266
        } else {
1267
17.3k
            hir::Look::End
1268
        };
1269
27.8k
        Hir::look(look)
1270
27.8k
    }
1271
1272
13.6M
    fn hir_char(&self, ch: char) -> Hir {
1273
13.6M
        if self.flags().case_insensitive {
1274
5.95M
            let this = hir::ClassRange { start: ch, end: ch };
1275
5.95M
            if let Some(folded) = this.ascii_case_fold() {
1276
4.06M
                return Hir::class(hir::Class::new([this, folded]));
1277
1.89M
            }
1278
7.68M
        }
1279
9.57M
        Hir::char(ch)
1280
13.6M
    }
1281
}
1282
1283
/// This checks the depth of the given `Hir` value, and if it exceeds the given
1284
/// limit, then an error is returned.
1285
5.85k
fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
1286
6.60M
    fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
1287
6.60M
        if depth > limit {
1288
11
            return Err(Error::new(ERR_TOO_MUCH_NESTING));
1289
6.60M
        }
1290
6.60M
        let Some(next_depth) = depth.checked_add(1) else {
1291
0
            return Err(Error::new(ERR_TOO_MUCH_NESTING));
1292
        };
1293
6.60M
        match *hir.kind() {
1294
            HirKind::Empty
1295
            | HirKind::Char(_)
1296
            | HirKind::Class(_)
1297
6.51M
            | HirKind::Look(_) => Ok(()),
1298
33.4k
            HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
1299
33.4k
                recurse(sub, limit, next_depth)
1300
            }
1301
26.7k
            HirKind::Capture(hir::Capture { ref sub, .. }) => {
1302
26.7k
                recurse(sub, limit, next_depth)
1303
            }
1304
26.6k
            HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
1305
6.53M
                for sub in subs.iter() {
1306
6.53M
                    recurse(sub, limit, next_depth)?;
1307
                }
1308
28.2k
                Ok(())
1309
            }
1310
        }
1311
6.60M
    }
1312
5.85k
    recurse(hir, limit, 0)
1313
5.85k
}
1314
1315
/// Converts the given Hir to a literal char if the Hir is just a single
1316
/// character. Otherwise this returns an error.
1317
///
1318
/// This is useful in contexts where you can only accept a single character,
1319
/// but where it is convenient to parse something more general. For example,
1320
/// parsing a single part of a character class range. It's useful to reuse
1321
/// the literal parsing code, but that code can itself return entire classes
1322
/// which can't be used as the start/end of a class range.
1323
272k
fn into_class_item_range(hir: Hir) -> Result<char, Error> {
1324
272k
    match hir.kind {
1325
272k
        HirKind::Char(ch) => Ok(ch),
1326
3
        _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
1327
    }
1328
272k
}
1329
1330
38.7M
fn into_class_item_ranges(
1331
38.7M
    mut hir: Hir,
1332
38.7M
) -> Result<Vec<hir::ClassRange>, Error> {
1333
38.7M
    match core::mem::replace(&mut hir.kind, HirKind::Empty) {
1334
21.9M
        HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
1335
16.8M
        HirKind::Class(hir::Class { ranges }) => Ok(ranges),
1336
2
        _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
1337
    }
1338
38.7M
}
1339
1340
/// Returns an iterator of character class ranges for the given named POSIX
1341
/// character class. If no such character class exists for the name given, then
1342
/// an error is returned.
1343
16.8M
fn posix_class(
1344
16.8M
    kind: &str,
1345
16.8M
) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
1346
16.8M
    let slice: &'static [(u8, u8)] = match kind {
1347
16.8M
        "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1348
16.8M
        "alpha" => &[(b'A', b'Z'), (b'a', b'z')],
1349
16.8M
        "ascii" => &[(b'\x00', b'\x7F')],
1350
16.8M
        "blank" => &[(b'\t', b'\t'), (b' ', b' ')],
1351
16.8M
        "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1352
16.8M
        "digit" => &[(b'0', b'9')],
1353
16.8M
        "graph" => &[(b'!', b'~')],
1354
16.8M
        "lower" => &[(b'a', b'z')],
1355
16.8M
        "print" => &[(b' ', b'~')],
1356
16.8M
        "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1357
16.8M
        "space" => &[
1358
10.9k
            (b'\t', b'\t'),
1359
10.9k
            (b'\n', b'\n'),
1360
10.9k
            (b'\x0B', b'\x0B'),
1361
10.9k
            (b'\x0C', b'\x0C'),
1362
10.9k
            (b'\r', b'\r'),
1363
10.9k
            (b' ', b' '),
1364
10.9k
        ],
1365
16.8M
        "upper" => &[(b'A', b'Z')],
1366
16.8M
        "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1367
5.07k
        "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1368
25
        _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
1369
    };
1370
16.8M
    Ok(slice.iter().map(|&(start, end)| hir::ClassRange {
1371
67.3M
        start: char::from(start),
1372
67.3M
        end: char::from(end),
1373
67.3M
    }))
1374
16.8M
}
1375
1376
/// Returns true if the given character is a hexadecimal digit.
1377
355k
fn is_hex(c: char) -> bool {
1378
355k
    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
1379
355k
}
1380
1381
/// Returns true if the given character is a valid in a capture group name.
1382
///
1383
/// If `first` is true, then `c` is treated as the first character in the
1384
/// group name (which must be alphabetic or underscore).
1385
3.55M
fn is_capture_char(c: char, first: bool) -> bool {
1386
3.55M
    if first {
1387
9.11k
        c == '_' || c.is_alphabetic()
1388
    } else {
1389
3.54M
        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
1390
    }
1391
3.55M
}
1392
1393
#[cfg(test)]
1394
mod tests {
1395
    use super::*;
1396
1397
    fn p(pattern: &str) -> Hir {
1398
        Parser::new(Config::default(), pattern).parse_inner().unwrap()
1399
    }
1400
1401
    fn perr(pattern: &str) -> String {
1402
        Parser::new(Config::default(), pattern)
1403
            .parse_inner()
1404
            .unwrap_err()
1405
            .to_string()
1406
    }
1407
1408
    fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
1409
        Hir::class(hir::Class::new(
1410
            it.into_iter().map(|(start, end)| hir::ClassRange { start, end }),
1411
        ))
1412
    }
1413
1414
    fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
1415
        Hir::class(hir::Class::new(
1416
            it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }),
1417
        ))
1418
    }
1419
1420
    fn posix(name: &str) -> Hir {
1421
        Hir::class(hir::Class::new(posix_class(name).unwrap()))
1422
    }
1423
1424
    fn cap(index: u32, sub: Hir) -> Hir {
1425
        Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
1426
    }
1427
1428
    fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
1429
        Hir::capture(hir::Capture {
1430
            index,
1431
            name: Some(Box::from(name)),
1432
            sub: Box::new(sub),
1433
        })
1434
    }
1435
1436
    #[test]
1437
    fn ok_literal() {
1438
        assert_eq!(p("a"), Hir::char('a'));
1439
        assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
1440
        assert_eq!(p("💩"), Hir::char('💩'));
1441
    }
1442
1443
    #[test]
1444
    fn ok_meta_escapes() {
1445
        assert_eq!(p(r"\*"), Hir::char('*'));
1446
        assert_eq!(p(r"\+"), Hir::char('+'));
1447
        assert_eq!(p(r"\?"), Hir::char('?'));
1448
        assert_eq!(p(r"\|"), Hir::char('|'));
1449
        assert_eq!(p(r"\("), Hir::char('('));
1450
        assert_eq!(p(r"\)"), Hir::char(')'));
1451
        assert_eq!(p(r"\^"), Hir::char('^'));
1452
        assert_eq!(p(r"\$"), Hir::char('$'));
1453
        assert_eq!(p(r"\["), Hir::char('['));
1454
        assert_eq!(p(r"\]"), Hir::char(']'));
1455
    }
1456
1457
    #[test]
1458
    fn ok_special_escapes() {
1459
        assert_eq!(p(r"\a"), Hir::char('\x07'));
1460
        assert_eq!(p(r"\f"), Hir::char('\x0C'));
1461
        assert_eq!(p(r"\t"), Hir::char('\t'));
1462
        assert_eq!(p(r"\n"), Hir::char('\n'));
1463
        assert_eq!(p(r"\r"), Hir::char('\r'));
1464
        assert_eq!(p(r"\v"), Hir::char('\x0B'));
1465
        assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
1466
        assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
1467
        assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
1468
        assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
1469
    }
1470
1471
    #[test]
1472
    fn ok_hex() {
1473
        // fixed length
1474
        assert_eq!(p(r"\x41"), Hir::char('A'));
1475
        assert_eq!(p(r"\u2603"), Hir::char('☃'));
1476
        assert_eq!(p(r"\U0001F4A9"), Hir::char('💩'));
1477
        // braces
1478
        assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩'));
1479
        assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩'));
1480
        assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩'));
1481
    }
1482
1483
    #[test]
1484
    fn ok_perl() {
1485
        assert_eq!(p(r"\d"), posix("digit"));
1486
        assert_eq!(p(r"\s"), posix("space"));
1487
        assert_eq!(p(r"\w"), posix("word"));
1488
1489
        let negated = |name| {
1490
            let mut class = hir::Class::new(posix_class(name).unwrap());
1491
            class.negate();
1492
            Hir::class(class)
1493
        };
1494
        assert_eq!(p(r"\D"), negated("digit"));
1495
        assert_eq!(p(r"\S"), negated("space"));
1496
        assert_eq!(p(r"\W"), negated("word"));
1497
    }
1498
1499
    #[test]
1500
    fn ok_flags_and_primitives() {
1501
        assert_eq!(p(r"a"), Hir::char('a'));
1502
        assert_eq!(p(r"(?i:a)"), singles(['A', 'a']));
1503
1504
        assert_eq!(p(r"^"), Hir::look(hir::Look::Start));
1505
        assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF));
1506
        assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF));
1507
1508
        assert_eq!(p(r"$"), Hir::look(hir::Look::End));
1509
        assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF));
1510
        assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF));
1511
1512
        assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')]));
1513
        assert_eq!(
1514
            p(r"(?R:.)"),
1515
            class([
1516
                ('\x00', '\x09'),
1517
                ('\x0B', '\x0C'),
1518
                ('\x0E', '\u{10FFFF}'),
1519
            ])
1520
        );
1521
        assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')]));
1522
        assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')]));
1523
    }
1524
1525
    #[test]
1526
    fn ok_alternate() {
1527
        assert_eq!(
1528
            p(r"a|b"),
1529
            Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1530
        );
1531
        assert_eq!(
1532
            p(r"(?:a|b)"),
1533
            Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1534
        );
1535
1536
        assert_eq!(
1537
            p(r"(a|b)"),
1538
            cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')]))
1539
        );
1540
        assert_eq!(
1541
            p(r"(?<foo>a|b)"),
1542
            named_cap(
1543
                1,
1544
                "foo",
1545
                Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1546
            )
1547
        );
1548
1549
        assert_eq!(
1550
            p(r"a|b|c"),
1551
            Hir::alternation(vec![
1552
                Hir::char('a'),
1553
                Hir::char('b'),
1554
                Hir::char('c')
1555
            ])
1556
        );
1557
1558
        assert_eq!(
1559
            p(r"ax|by|cz"),
1560
            Hir::alternation(vec![
1561
                Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1562
                Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1563
                Hir::concat(vec![Hir::char('c'), Hir::char('z')]),
1564
            ])
1565
        );
1566
        assert_eq!(
1567
            p(r"(ax|(by|(cz)))"),
1568
            cap(
1569
                1,
1570
                Hir::alternation(vec![
1571
                    Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1572
                    cap(
1573
                        2,
1574
                        Hir::alternation(vec![
1575
                            Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1576
                            cap(
1577
                                3,
1578
                                Hir::concat(vec![
1579
                                    Hir::char('c'),
1580
                                    Hir::char('z')
1581
                                ])
1582
                            ),
1583
                        ])
1584
                    ),
1585
                ])
1586
            )
1587
        );
1588
1589
        assert_eq!(
1590
            p(r"|"),
1591
            Hir::alternation(vec![Hir::empty(), Hir::empty()])
1592
        );
1593
        assert_eq!(
1594
            p(r"||"),
1595
            Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()])
1596
        );
1597
1598
        assert_eq!(
1599
            p(r"a|"),
1600
            Hir::alternation(vec![Hir::char('a'), Hir::empty()])
1601
        );
1602
        assert_eq!(
1603
            p(r"|a"),
1604
            Hir::alternation(vec![Hir::empty(), Hir::char('a')])
1605
        );
1606
1607
        assert_eq!(
1608
            p(r"(|)"),
1609
            cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()]))
1610
        );
1611
        assert_eq!(
1612
            p(r"(a|)"),
1613
            cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()]))
1614
        );
1615
        assert_eq!(
1616
            p(r"(|a)"),
1617
            cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')]))
1618
        );
1619
    }
1620
1621
    #[test]
1622
    fn ok_flag_group() {
1623
        assert_eq!(
1624
            p("a(?i:b)"),
1625
            Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1626
        );
1627
    }
1628
1629
    #[test]
1630
    fn ok_flag_directive() {
1631
        assert_eq!(p("(?i)a"), singles(['A', 'a']));
1632
        assert_eq!(p("a(?i)"), Hir::char('a'));
1633
        assert_eq!(
1634
            p("a(?i)b"),
1635
            Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1636
        );
1637
        assert_eq!(
1638
            p("a(?i)a(?-i)a"),
1639
            Hir::concat(vec![
1640
                Hir::char('a'),
1641
                singles(['A', 'a']),
1642
                Hir::char('a'),
1643
            ])
1644
        );
1645
        assert_eq!(
1646
            p("a(?:(?i)a)a"),
1647
            Hir::concat(vec![
1648
                Hir::char('a'),
1649
                singles(['A', 'a']),
1650
                Hir::char('a'),
1651
            ])
1652
        );
1653
        assert_eq!(
1654
            p("a((?i)a)a"),
1655
            Hir::concat(vec![
1656
                Hir::char('a'),
1657
                cap(1, singles(['A', 'a'])),
1658
                Hir::char('a'),
1659
            ])
1660
        );
1661
    }
1662
1663
    #[test]
1664
    fn ok_uncounted_repetition() {
1665
        assert_eq!(
1666
            p(r"a?"),
1667
            Hir::repetition(hir::Repetition {
1668
                min: 0,
1669
                max: Some(1),
1670
                greedy: true,
1671
                sub: Box::new(Hir::char('a')),
1672
            }),
1673
        );
1674
        assert_eq!(
1675
            p(r"a*"),
1676
            Hir::repetition(hir::Repetition {
1677
                min: 0,
1678
                max: None,
1679
                greedy: true,
1680
                sub: Box::new(Hir::char('a')),
1681
            }),
1682
        );
1683
        assert_eq!(
1684
            p(r"a+"),
1685
            Hir::repetition(hir::Repetition {
1686
                min: 1,
1687
                max: None,
1688
                greedy: true,
1689
                sub: Box::new(Hir::char('a')),
1690
            }),
1691
        );
1692
1693
        assert_eq!(
1694
            p(r"a??"),
1695
            Hir::repetition(hir::Repetition {
1696
                min: 0,
1697
                max: Some(1),
1698
                greedy: false,
1699
                sub: Box::new(Hir::char('a')),
1700
            }),
1701
        );
1702
        assert_eq!(
1703
            p(r"a*?"),
1704
            Hir::repetition(hir::Repetition {
1705
                min: 0,
1706
                max: None,
1707
                greedy: false,
1708
                sub: Box::new(Hir::char('a')),
1709
            }),
1710
        );
1711
        assert_eq!(
1712
            p(r"a+?"),
1713
            Hir::repetition(hir::Repetition {
1714
                min: 1,
1715
                max: None,
1716
                greedy: false,
1717
                sub: Box::new(Hir::char('a')),
1718
            }),
1719
        );
1720
1721
        assert_eq!(
1722
            p(r"a?b"),
1723
            Hir::concat(vec![
1724
                Hir::repetition(hir::Repetition {
1725
                    min: 0,
1726
                    max: Some(1),
1727
                    greedy: true,
1728
                    sub: Box::new(Hir::char('a')),
1729
                }),
1730
                Hir::char('b'),
1731
            ]),
1732
        );
1733
1734
        assert_eq!(
1735
            p(r"ab?"),
1736
            Hir::concat(vec![
1737
                Hir::char('a'),
1738
                Hir::repetition(hir::Repetition {
1739
                    min: 0,
1740
                    max: Some(1),
1741
                    greedy: true,
1742
                    sub: Box::new(Hir::char('b')),
1743
                }),
1744
            ]),
1745
        );
1746
1747
        assert_eq!(
1748
            p(r"(?:ab)?"),
1749
            Hir::repetition(hir::Repetition {
1750
                min: 0,
1751
                max: Some(1),
1752
                greedy: true,
1753
                sub: Box::new(Hir::concat(vec![
1754
                    Hir::char('a'),
1755
                    Hir::char('b')
1756
                ])),
1757
            }),
1758
        );
1759
1760
        assert_eq!(
1761
            p(r"(ab)?"),
1762
            Hir::repetition(hir::Repetition {
1763
                min: 0,
1764
                max: Some(1),
1765
                greedy: true,
1766
                sub: Box::new(cap(
1767
                    1,
1768
                    Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1769
                )),
1770
            }),
1771
        );
1772
1773
        assert_eq!(
1774
            p(r"|a?"),
1775
            Hir::alternation(vec![
1776
                Hir::empty(),
1777
                Hir::repetition(hir::Repetition {
1778
                    min: 0,
1779
                    max: Some(1),
1780
                    greedy: true,
1781
                    sub: Box::new(Hir::char('a')),
1782
                })
1783
            ]),
1784
        );
1785
    }
1786
1787
    #[test]
1788
    fn ok_counted_repetition() {
1789
        assert_eq!(
1790
            p(r"a{5}"),
1791
            Hir::repetition(hir::Repetition {
1792
                min: 5,
1793
                max: Some(5),
1794
                greedy: true,
1795
                sub: Box::new(Hir::char('a')),
1796
            }),
1797
        );
1798
        assert_eq!(
1799
            p(r"a{5}?"),
1800
            Hir::repetition(hir::Repetition {
1801
                min: 5,
1802
                max: Some(5),
1803
                greedy: false,
1804
                sub: Box::new(Hir::char('a')),
1805
            }),
1806
        );
1807
1808
        assert_eq!(
1809
            p(r"a{5,}"),
1810
            Hir::repetition(hir::Repetition {
1811
                min: 5,
1812
                max: None,
1813
                greedy: true,
1814
                sub: Box::new(Hir::char('a')),
1815
            }),
1816
        );
1817
1818
        assert_eq!(
1819
            p(r"a{5,9}"),
1820
            Hir::repetition(hir::Repetition {
1821
                min: 5,
1822
                max: Some(9),
1823
                greedy: true,
1824
                sub: Box::new(Hir::char('a')),
1825
            }),
1826
        );
1827
1828
        assert_eq!(
1829
            p(r"ab{5}c"),
1830
            Hir::concat(vec![
1831
                Hir::char('a'),
1832
                Hir::repetition(hir::Repetition {
1833
                    min: 5,
1834
                    max: Some(5),
1835
                    greedy: true,
1836
                    sub: Box::new(Hir::char('b')),
1837
                }),
1838
                Hir::char('c'),
1839
            ]),
1840
        );
1841
1842
        assert_eq!(
1843
            p(r"a{ 5 }"),
1844
            Hir::repetition(hir::Repetition {
1845
                min: 5,
1846
                max: Some(5),
1847
                greedy: true,
1848
                sub: Box::new(Hir::char('a')),
1849
            }),
1850
        );
1851
        assert_eq!(
1852
            p(r"a{ 5 , 9 }"),
1853
            Hir::repetition(hir::Repetition {
1854
                min: 5,
1855
                max: Some(9),
1856
                greedy: true,
1857
                sub: Box::new(Hir::char('a')),
1858
            }),
1859
        );
1860
    }
1861
1862
    #[test]
1863
    fn ok_group_unnamed() {
1864
        assert_eq!(p("(a)"), cap(1, Hir::char('a')));
1865
        assert_eq!(
1866
            p("(ab)"),
1867
            cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')]))
1868
        );
1869
    }
1870
1871
    #[test]
1872
    fn ok_group_named() {
1873
        assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1874
        assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1875
1876
        assert_eq!(
1877
            p("(?P<foo>ab)"),
1878
            named_cap(
1879
                1,
1880
                "foo",
1881
                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1882
            )
1883
        );
1884
        assert_eq!(
1885
            p("(?<foo>ab)"),
1886
            named_cap(
1887
                1,
1888
                "foo",
1889
                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1890
            )
1891
        );
1892
1893
        assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z')));
1894
        assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z')));
1895
1896
        assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1897
        assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1898
1899
        assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1900
        assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1901
1902
        assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1903
        assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1904
1905
        assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1906
        assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1907
1908
        assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1909
        assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1910
    }
1911
1912
    #[test]
1913
    fn ok_class() {
1914
        assert_eq!(p(r"[a]"), singles(['a']));
1915
        assert_eq!(p(r"[a\]]"), singles(['a', ']']));
1916
        assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z']));
1917
        assert_eq!(p(r"[ab]"), class([('a', 'b')]));
1918
        assert_eq!(p(r"[a-]"), singles(['a', '-']));
1919
        assert_eq!(p(r"[-a]"), singles(['a', '-']));
1920
        assert_eq!(p(r"[--a]"), singles(['a', '-']));
1921
        assert_eq!(p(r"[---a]"), singles(['a', '-']));
1922
        assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
1923
        assert_eq!(p(r"[\w]"), posix("word"));
1924
        assert_eq!(p(r"[a\wz]"), posix("word"));
1925
        assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')]));
1926
        assert_eq!(p(r"[^\s\S]"), Hir::fail());
1927
        assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')]));
1928
        assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')]));
1929
        assert_eq!(p(r"[]]"), singles([']']));
1930
        assert_eq!(p(r"[]a]"), singles([']', 'a']));
1931
        assert_eq!(p(r"[]\[]"), singles(['[', ']']));
1932
        assert_eq!(p(r"[\[]"), singles(['[']));
1933
1934
        assert_eq!(p(r"(?i)[a]"), singles(['A', 'a']));
1935
        assert_eq!(p(r"(?i)[A]"), singles(['A', 'a']));
1936
        assert_eq!(p(r"(?i)[k]"), singles(['K', 'k']));
1937
        assert_eq!(p(r"(?i)[s]"), singles(['S', 's']));
1938
        assert_eq!(p(r"(?i)[β]"), singles(['β']));
1939
1940
        assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')]));
1941
        assert_eq!(
1942
            p(r"[^-a]"),
1943
            class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')])
1944
        );
1945
1946
        assert_eq!(
1947
            p(r"[-]a]"),
1948
            Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')])
1949
        );
1950
    }
1951
1952
    #[test]
1953
    fn ok_verbatim() {
1954
        assert_eq!(
1955
            p(r"(?x)a{5,9} ?"),
1956
            Hir::repetition(hir::Repetition {
1957
                min: 5,
1958
                max: Some(9),
1959
                greedy: false,
1960
                sub: Box::new(Hir::char('a')),
1961
            })
1962
        );
1963
        assert_eq!(p(r"(?x)[   a]"), singles(['a']));
1964
        assert_eq!(
1965
            p(r"(?x)[ ^  a]"),
1966
            class([('\x00', '`'), ('b', '\u{10FFFF}')])
1967
        );
1968
        assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
1969
        assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));
1970
1971
        assert_eq!(
1972
            p(r"(?x)a b"),
1973
            Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1974
        );
1975
        assert_eq!(
1976
            p(r"(?x)a b(?-x)a b"),
1977
            Hir::concat(vec![
1978
                Hir::char('a'),
1979
                Hir::char('b'),
1980
                Hir::char('a'),
1981
                Hir::char(' '),
1982
                Hir::char('b'),
1983
            ])
1984
        );
1985
        assert_eq!(
1986
            p(r"a (?x:a )a "),
1987
            Hir::concat(vec![
1988
                Hir::char('a'),
1989
                Hir::char(' '),
1990
                Hir::char('a'),
1991
                Hir::char('a'),
1992
                Hir::char(' '),
1993
            ])
1994
        );
1995
        assert_eq!(
1996
            p(r"(?x)( ?P<foo> a )"),
1997
            named_cap(1, "foo", Hir::char('a')),
1998
        );
1999
        assert_eq!(p(r"(?x)(  a )"), cap(1, Hir::char('a')));
2000
        assert_eq!(p(r"(?x)(   ?:  a )"), Hir::char('a'));
2001
        assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53'));
2002
        assert_eq!(p(r"(?x)\ "), Hir::char(' '));
2003
    }
2004
2005
    #[test]
2006
    fn ok_comments() {
2007
        let pat = "(?x)
2008
# This is comment 1.
2009
foo # This is comment 2.
2010
  # This is comment 3.
2011
bar
2012
# This is comment 4.";
2013
        assert_eq!(
2014
            p(pat),
2015
            Hir::concat(vec![
2016
                Hir::char('f'),
2017
                Hir::char('o'),
2018
                Hir::char('o'),
2019
                Hir::char('b'),
2020
                Hir::char('a'),
2021
                Hir::char('r'),
2022
            ])
2023
        );
2024
    }
2025
2026
    #[test]
2027
    fn err_standard() {
2028
        assert_eq!(
2029
            ERR_TOO_MUCH_NESTING,
2030
            perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"),
2031
        );
2032
        // This one is tricky, because the only way it can happen is if the
2033
        // number of captures overflows u32. Perhaps we should allow setting a
2034
        // lower limit?
2035
        // assert_eq!(ERR_TOO_MANY_CAPTURES, perr(""));
2036
        assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)"));
2037
        assert_eq!(ERR_UNCLOSED_GROUP, perr("("));
2038
        assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?"));
2039
        assert_eq!(ERR_UNOPENED_GROUP, perr(")"));
2040
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)"));
2041
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)"));
2042
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)"));
2043
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)"));
2044
        assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)"));
2045
        assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<"));
2046
        assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<"));
2047
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)"));
2048
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)"));
2049
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)"));
2050
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)"));
2051
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)"));
2052
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)"));
2053
        assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo"));
2054
        assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo"));
2055
        assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)"));
2056
        assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)"));
2057
        assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)"));
2058
        assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)"));
2059
        assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
2060
        assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)"));
2061
        assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is"));
2062
        assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));
2063
        assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}"));
2064
        assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{"));
2065
        assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A"));
2066
        assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}"));
2067
        assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}"));
2068
        assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA"));
2069
        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ"));
2070
        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA"));
2071
        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ"));
2072
        assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800"));
2073
        assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF"));
2074
        assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x"));
2075
        assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\"));
2076
        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0"));
2077
        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));
2078
        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8"));
2079
        assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
2080
        assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
2081
        assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
2082
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
2083
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
2084
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
2085
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)"));
2086
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?"));
2087
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?"));
2088
        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}"));
2089
        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})"));
2090
        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}"));
2091
        assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{"));
2092
        assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5"));
2093
        assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,"));
2094
        assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6"));
2095
        assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z"));
2096
        assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}"));
2097
        assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}"));
2098
        assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}"));
2099
        assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}"));
2100
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a"));
2101
        assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));
2102
        assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]"));
2103
        assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]"));
2104
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-"));
2105
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^"));
2106
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]"));
2107
        assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]"));
2108
        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"["));
2109
        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z"));
2110
        assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]"));
2111
        assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));
2112
        assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
2113
        assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
2114
        assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
2115
        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
2116
        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
2117
        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
2118
        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
2119
        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
2120
    }
2121
2122
    #[test]
2123
    fn err_verbatim() {
2124
        // See: https://github.com/rust-lang/regex/issues/792
2125
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]"));
2126
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a "));
2127
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- "));
2128
        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[         "));
2129
    }
2130
2131
    // This tests a bug fix where the nest limit checker wasn't decrementing
2132
    // its depth during post-traversal, which causes long regexes to trip
2133
    // the default limit too aggressively.
2134
    #[test]
2135
    fn regression_454_nest_too_big() {
2136
        let pattern = r#"
2137
        2(?:
2138
          [45]\d{3}|
2139
          7(?:
2140
            1[0-267]|
2141
            2[0-289]|
2142
            3[0-29]|
2143
            4[01]|
2144
            5[1-3]|
2145
            6[013]|
2146
            7[0178]|
2147
            91
2148
          )|
2149
          8(?:
2150
            0[125]|
2151
            [139][1-6]|
2152
            2[0157-9]|
2153
            41|
2154
            6[1-35]|
2155
            7[1-5]|
2156
            8[1-8]|
2157
            90
2158
          )|
2159
          9(?:
2160
            0[0-2]|
2161
            1[0-4]|
2162
            2[568]|
2163
            3[3-6]|
2164
            5[5-7]|
2165
            6[0167]|
2166
            7[15]|
2167
            8[0146-9]
2168
          )
2169
        )\d{4}
2170
        "#;
2171
        p(pattern);
2172
    }
2173
2174
    // This tests that we treat a trailing `-` in a character class as a
2175
    // literal `-` even when whitespace mode is enabled and there is whitespace
2176
    // after the trailing `-`.
2177
    #[test]
2178
    fn regression_455_trailing_dash_ignore_whitespace() {
2179
        p("(?x)[ / - ]");
2180
        p("(?x)[ a - ]");
2181
        p("(?x)[
2182
            a
2183
            - ]
2184
        ");
2185
        p("(?x)[
2186
            a # wat
2187
            - ]
2188
        ");
2189
2190
        perr("(?x)[ / -");
2191
        perr("(?x)[ / - ");
2192
        perr(
2193
            "(?x)[
2194
            / -
2195
        ",
2196
        );
2197
        perr(
2198
            "(?x)[
2199
            / - # wat
2200
        ",
2201
        );
2202
    }
2203
2204
    #[test]
2205
    fn regression_capture_indices() {
2206
        let got = p(r"(a|ab|c|bcd){4,10}(d*)");
2207
        assert_eq!(
2208
            got,
2209
            Hir::concat(vec![
2210
                Hir::repetition(hir::Repetition {
2211
                    min: 4,
2212
                    max: Some(10),
2213
                    greedy: true,
2214
                    sub: Box::new(cap(
2215
                        1,
2216
                        Hir::alternation(vec![
2217
                            Hir::char('a'),
2218
                            Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
2219
                            Hir::char('c'),
2220
                            Hir::concat(vec![
2221
                                Hir::char('b'),
2222
                                Hir::char('c'),
2223
                                Hir::char('d')
2224
                            ]),
2225
                        ])
2226
                    ))
2227
                }),
2228
                cap(
2229
                    2,
2230
                    Hir::repetition(hir::Repetition {
2231
                        min: 0,
2232
                        max: None,
2233
                        greedy: true,
2234
                        sub: Box::new(Hir::char('d')),
2235
                    })
2236
                ),
2237
            ])
2238
        );
2239
    }
2240
}