Coverage Report

Created: 2024-12-17 06:15

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.6.29/src/ast/mod.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Defines an abstract syntax for regular expressions.
3
*/
4
5
use std::cmp::Ordering;
6
use std::error;
7
use std::fmt;
8
9
pub use crate::ast::visitor::{visit, Visitor};
10
11
pub mod parse;
12
pub mod print;
13
mod visitor;
14
15
/// An error that occurred while parsing a regular expression into an abstract
16
/// syntax tree.
17
///
18
/// Note that not all ASTs represents a valid regular expression. For example,
19
/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20
/// valid Unicode property name. That particular error is reported when
21
/// translating an AST to the high-level intermediate representation (`HIR`).
22
#[derive(Clone, Debug, Eq, PartialEq)]
23
pub struct Error {
24
    /// The kind of error.
25
    kind: ErrorKind,
26
    /// The original pattern that the parser generated the error from. Every
27
    /// span in an error is a valid range into this string.
28
    pattern: String,
29
    /// The span of this error.
30
    span: Span,
31
}
32
33
impl Error {
34
    /// Return the type of this error.
35
0
    pub fn kind(&self) -> &ErrorKind {
36
0
        &self.kind
37
0
    }
38
39
    /// The original pattern string in which this error occurred.
40
    ///
41
    /// Every span reported by this error is reported in terms of this string.
42
0
    pub fn pattern(&self) -> &str {
43
0
        &self.pattern
44
0
    }
45
46
    /// Return the span at which this error occurred.
47
0
    pub fn span(&self) -> &Span {
48
0
        &self.span
49
0
    }
50
51
    /// Return an auxiliary span. This span exists only for some errors that
52
    /// benefit from being able to point to two locations in the original
53
    /// regular expression. For example, "duplicate" errors will have the
54
    /// main error position set to the duplicate occurrence while its
55
    /// auxiliary span will be set to the initial occurrence.
56
0
    pub fn auxiliary_span(&self) -> Option<&Span> {
57
        use self::ErrorKind::*;
58
0
        match self.kind {
59
0
            FlagDuplicate { ref original } => Some(original),
60
0
            FlagRepeatedNegation { ref original, .. } => Some(original),
61
0
            GroupNameDuplicate { ref original, .. } => Some(original),
62
0
            _ => None,
63
        }
64
0
    }
65
}
66
67
/// The type of an error that occurred while building an AST.
68
#[derive(Clone, Debug, Eq, PartialEq)]
69
pub enum ErrorKind {
70
    /// The capturing group limit was exceeded.
71
    ///
72
    /// Note that this represents a limit on the total number of capturing
73
    /// groups in a regex and not necessarily the number of nested capturing
74
    /// groups. That is, the nest limit can be low and it is still possible for
75
    /// this error to occur.
76
    CaptureLimitExceeded,
77
    /// An invalid escape sequence was found in a character class set.
78
    ClassEscapeInvalid,
79
    /// An invalid character class range was found. An invalid range is any
80
    /// range where the start is greater than the end.
81
    ClassRangeInvalid,
82
    /// An invalid range boundary was found in a character class. Range
83
    /// boundaries must be a single literal codepoint, but this error indicates
84
    /// that something else was found, such as a nested class.
85
    ClassRangeLiteral,
86
    /// An opening `[` was found with no corresponding closing `]`.
87
    ClassUnclosed,
88
    /// Note that this error variant is no longer used. Namely, a decimal
89
    /// number can only appear as a repetition quantifier. When the number
90
    /// in a repetition quantifier is empty, then it gets its own specialized
91
    /// error, `RepetitionCountDecimalEmpty`.
92
    DecimalEmpty,
93
    /// An invalid decimal number was given where one was expected.
94
    DecimalInvalid,
95
    /// A bracketed hex literal was empty.
96
    EscapeHexEmpty,
97
    /// A bracketed hex literal did not correspond to a Unicode scalar value.
98
    EscapeHexInvalid,
99
    /// An invalid hexadecimal digit was found.
100
    EscapeHexInvalidDigit,
101
    /// EOF was found before an escape sequence was completed.
102
    EscapeUnexpectedEof,
103
    /// An unrecognized escape sequence.
104
    EscapeUnrecognized,
105
    /// A dangling negation was used when setting flags, e.g., `i-`.
106
    FlagDanglingNegation,
107
    /// A flag was used twice, e.g., `i-i`.
108
    FlagDuplicate {
109
        /// The position of the original flag. The error position
110
        /// points to the duplicate flag.
111
        original: Span,
112
    },
113
    /// The negation operator was used twice, e.g., `-i-s`.
114
    FlagRepeatedNegation {
115
        /// The position of the original negation operator. The error position
116
        /// points to the duplicate negation operator.
117
        original: Span,
118
    },
119
    /// Expected a flag but got EOF, e.g., `(?`.
120
    FlagUnexpectedEof,
121
    /// Unrecognized flag, e.g., `a`.
122
    FlagUnrecognized,
123
    /// A duplicate capture name was found.
124
    GroupNameDuplicate {
125
        /// The position of the initial occurrence of the capture name. The
126
        /// error position itself points to the duplicate occurrence.
127
        original: Span,
128
    },
129
    /// A capture group name is empty, e.g., `(?P<>abc)`.
130
    GroupNameEmpty,
131
    /// An invalid character was seen for a capture group name. This includes
132
    /// errors where the first character is a digit (even though subsequent
133
    /// characters are allowed to be digits).
134
    GroupNameInvalid,
135
    /// A closing `>` could not be found for a capture group name.
136
    GroupNameUnexpectedEof,
137
    /// An unclosed group, e.g., `(ab`.
138
    ///
139
    /// The span of this error corresponds to the unclosed parenthesis.
140
    GroupUnclosed,
141
    /// An unopened group, e.g., `ab)`.
142
    GroupUnopened,
143
    /// The nest limit was exceeded. The limit stored here is the limit
144
    /// configured in the parser.
145
    NestLimitExceeded(u32),
146
    /// The range provided in a counted repetition operator is invalid. The
147
    /// range is invalid if the start is greater than the end.
148
    RepetitionCountInvalid,
149
    /// An opening `{` was not followed by a valid decimal value.
150
    /// For example, `x{}` or `x{]}` would fail.
151
    RepetitionCountDecimalEmpty,
152
    /// An opening `{` was found with no corresponding closing `}`.
153
    RepetitionCountUnclosed,
154
    /// A repetition operator was applied to a missing sub-expression. This
155
    /// occurs, for example, in the regex consisting of just a `*` or even
156
    /// `(?i)*`. It is, however, possible to create a repetition operating on
157
    /// an empty sub-expression. For example, `()*` is still considered valid.
158
    RepetitionMissing,
159
    /// The Unicode class is not valid. This typically occurs when a `\p` is
160
    /// followed by something other than a `{`.
161
    UnicodeClassInvalid,
162
    /// When octal support is disabled, this error is produced when an octal
163
    /// escape is used. The octal escape is assumed to be an invocation of
164
    /// a backreference, which is the common case.
165
    UnsupportedBackreference,
166
    /// When syntax similar to PCRE's look-around is used, this error is
167
    /// returned. Some example syntaxes that are rejected include, but are
168
    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
169
    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
170
    /// error is used to improve the user experience.
171
    UnsupportedLookAround,
172
    /// Hints that destructuring should not be exhaustive.
173
    ///
174
    /// This enum may grow additional variants, so this makes sure clients
175
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
176
    /// could break existing code.)
177
    #[doc(hidden)]
178
    __Nonexhaustive,
179
}
180
181
impl error::Error for Error {
182
    // TODO: Remove this method entirely on the next breaking semver release.
183
    #[allow(deprecated)]
184
0
    fn description(&self) -> &str {
185
        use self::ErrorKind::*;
186
0
        match self.kind {
187
0
            CaptureLimitExceeded => "capture group limit exceeded",
188
0
            ClassEscapeInvalid => "invalid escape sequence in character class",
189
0
            ClassRangeInvalid => "invalid character class range",
190
0
            ClassRangeLiteral => "invalid range boundary, must be a literal",
191
0
            ClassUnclosed => "unclosed character class",
192
0
            DecimalEmpty => "empty decimal literal",
193
0
            DecimalInvalid => "invalid decimal literal",
194
0
            EscapeHexEmpty => "empty hexadecimal literal",
195
0
            EscapeHexInvalid => "invalid hexadecimal literal",
196
0
            EscapeHexInvalidDigit => "invalid hexadecimal digit",
197
0
            EscapeUnexpectedEof => "unexpected eof (escape sequence)",
198
0
            EscapeUnrecognized => "unrecognized escape sequence",
199
0
            FlagDanglingNegation => "dangling flag negation operator",
200
0
            FlagDuplicate { .. } => "duplicate flag",
201
0
            FlagRepeatedNegation { .. } => "repeated negation",
202
0
            FlagUnexpectedEof => "unexpected eof (flag)",
203
0
            FlagUnrecognized => "unrecognized flag",
204
0
            GroupNameDuplicate { .. } => "duplicate capture group name",
205
0
            GroupNameEmpty => "empty capture group name",
206
0
            GroupNameInvalid => "invalid capture group name",
207
0
            GroupNameUnexpectedEof => "unclosed capture group name",
208
0
            GroupUnclosed => "unclosed group",
209
0
            GroupUnopened => "unopened group",
210
0
            NestLimitExceeded(_) => "nest limit exceeded",
211
0
            RepetitionCountInvalid => "invalid repetition count range",
212
0
            RepetitionCountUnclosed => "unclosed counted repetition",
213
0
            RepetitionMissing => "repetition operator missing expression",
214
0
            UnicodeClassInvalid => "invalid Unicode character class",
215
0
            UnsupportedBackreference => "backreferences are not supported",
216
0
            UnsupportedLookAround => "look-around is not supported",
217
0
            _ => unreachable!(),
218
        }
219
0
    }
220
}
221
222
impl fmt::Display for Error {
223
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224
0
        crate::error::Formatter::from(self).fmt(f)
225
0
    }
226
}
227
228
impl fmt::Display for ErrorKind {
229
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
230
        use self::ErrorKind::*;
231
0
        match *self {
232
0
            CaptureLimitExceeded => write!(
233
0
                f,
234
0
                "exceeded the maximum number of \
235
0
                 capturing groups ({})",
236
0
                ::std::u32::MAX
237
0
            ),
238
            ClassEscapeInvalid => {
239
0
                write!(f, "invalid escape sequence found in character class")
240
            }
241
0
            ClassRangeInvalid => write!(
242
0
                f,
243
0
                "invalid character class range, \
244
0
                 the start must be <= the end"
245
0
            ),
246
            ClassRangeLiteral => {
247
0
                write!(f, "invalid range boundary, must be a literal")
248
            }
249
0
            ClassUnclosed => write!(f, "unclosed character class"),
250
0
            DecimalEmpty => write!(f, "decimal literal empty"),
251
0
            DecimalInvalid => write!(f, "decimal literal invalid"),
252
0
            EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
253
            EscapeHexInvalid => {
254
0
                write!(f, "hexadecimal literal is not a Unicode scalar value")
255
            }
256
0
            EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
257
0
            EscapeUnexpectedEof => write!(
258
0
                f,
259
0
                "incomplete escape sequence, \
260
0
                 reached end of pattern prematurely"
261
0
            ),
262
0
            EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
263
            FlagDanglingNegation => {
264
0
                write!(f, "dangling flag negation operator")
265
            }
266
0
            FlagDuplicate { .. } => write!(f, "duplicate flag"),
267
            FlagRepeatedNegation { .. } => {
268
0
                write!(f, "flag negation operator repeated")
269
            }
270
            FlagUnexpectedEof => {
271
0
                write!(f, "expected flag but got end of regex")
272
            }
273
0
            FlagUnrecognized => write!(f, "unrecognized flag"),
274
            GroupNameDuplicate { .. } => {
275
0
                write!(f, "duplicate capture group name")
276
            }
277
0
            GroupNameEmpty => write!(f, "empty capture group name"),
278
0
            GroupNameInvalid => write!(f, "invalid capture group character"),
279
0
            GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
280
0
            GroupUnclosed => write!(f, "unclosed group"),
281
0
            GroupUnopened => write!(f, "unopened group"),
282
0
            NestLimitExceeded(limit) => write!(
283
0
                f,
284
0
                "exceed the maximum number of \
285
0
                 nested parentheses/brackets ({})",
286
0
                limit
287
0
            ),
288
0
            RepetitionCountInvalid => write!(
289
0
                f,
290
0
                "invalid repetition count range, \
291
0
                 the start must be <= the end"
292
0
            ),
293
            RepetitionCountDecimalEmpty => {
294
0
                write!(f, "repetition quantifier expects a valid decimal")
295
            }
296
            RepetitionCountUnclosed => {
297
0
                write!(f, "unclosed counted repetition")
298
            }
299
            RepetitionMissing => {
300
0
                write!(f, "repetition operator missing expression")
301
            }
302
            UnicodeClassInvalid => {
303
0
                write!(f, "invalid Unicode character class")
304
            }
305
            UnsupportedBackreference => {
306
0
                write!(f, "backreferences are not supported")
307
            }
308
0
            UnsupportedLookAround => write!(
309
0
                f,
310
0
                "look-around, including look-ahead and look-behind, \
311
0
                 is not supported"
312
0
            ),
313
0
            _ => unreachable!(),
314
        }
315
0
    }
316
}
317
318
/// Span represents the position information of a single AST item.
319
///
320
/// All span positions are absolute byte offsets that can be used on the
321
/// original regular expression that was parsed.
322
#[derive(Clone, Copy, Eq, PartialEq)]
323
pub struct Span {
324
    /// The start byte offset.
325
    pub start: Position,
326
    /// The end byte offset.
327
    pub end: Position,
328
}
329
330
impl fmt::Debug for Span {
331
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332
0
        write!(f, "Span({:?}, {:?})", self.start, self.end)
333
0
    }
334
}
335
336
impl Ord for Span {
337
0
    fn cmp(&self, other: &Span) -> Ordering {
338
0
        (&self.start, &self.end).cmp(&(&other.start, &other.end))
339
0
    }
340
}
341
342
impl PartialOrd for Span {
343
0
    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
344
0
        Some(self.cmp(other))
345
0
    }
346
}
347
348
/// A single position in a regular expression.
349
///
350
/// A position encodes one half of a span, and include the byte offset, line
351
/// number and column number.
352
#[derive(Clone, Copy, Eq, PartialEq)]
353
pub struct Position {
354
    /// The absolute offset of this position, starting at `0` from the
355
    /// beginning of the regular expression pattern string.
356
    pub offset: usize,
357
    /// The line number, starting at `1`.
358
    pub line: usize,
359
    /// The approximate column number, starting at `1`.
360
    pub column: usize,
361
}
362
363
impl fmt::Debug for Position {
364
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
365
0
        write!(
366
0
            f,
367
0
            "Position(o: {:?}, l: {:?}, c: {:?})",
368
0
            self.offset, self.line, self.column
369
0
        )
370
0
    }
371
}
372
373
impl Ord for Position {
374
0
    fn cmp(&self, other: &Position) -> Ordering {
375
0
        self.offset.cmp(&other.offset)
376
0
    }
377
}
378
379
impl PartialOrd for Position {
380
0
    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
381
0
        Some(self.cmp(other))
382
0
    }
383
}
384
385
impl Span {
386
    /// Create a new span with the given positions.
387
0
    pub fn new(start: Position, end: Position) -> Span {
388
0
        Span { start, end }
389
0
    }
390
391
    /// Create a new span using the given position as the start and end.
392
0
    pub fn splat(pos: Position) -> Span {
393
0
        Span::new(pos, pos)
394
0
    }
395
396
    /// Create a new span by replacing the starting the position with the one
397
    /// given.
398
0
    pub fn with_start(self, pos: Position) -> Span {
399
0
        Span { start: pos, ..self }
400
0
    }
401
402
    /// Create a new span by replacing the ending the position with the one
403
    /// given.
404
0
    pub fn with_end(self, pos: Position) -> Span {
405
0
        Span { end: pos, ..self }
406
0
    }
407
408
    /// Returns true if and only if this span occurs on a single line.
409
0
    pub fn is_one_line(&self) -> bool {
410
0
        self.start.line == self.end.line
411
0
    }
412
413
    /// Returns true if and only if this span is empty. That is, it points to
414
    /// a single position in the concrete syntax of a regular expression.
415
0
    pub fn is_empty(&self) -> bool {
416
0
        self.start.offset == self.end.offset
417
0
    }
418
}
419
420
impl Position {
421
    /// Create a new position with the given information.
422
    ///
423
    /// `offset` is the absolute offset of the position, starting at `0` from
424
    /// the beginning of the regular expression pattern string.
425
    ///
426
    /// `line` is the line number, starting at `1`.
427
    ///
428
    /// `column` is the approximate column number, starting at `1`.
429
0
    pub fn new(offset: usize, line: usize, column: usize) -> Position {
430
0
        Position { offset, line, column }
431
0
    }
432
}
433
434
/// An abstract syntax tree for a singular expression along with comments
435
/// found.
436
///
437
/// Comments are not stored in the tree itself to avoid complexity. Each
438
/// comment contains a span of precisely where it occurred in the original
439
/// regular expression.
440
#[derive(Clone, Debug, Eq, PartialEq)]
441
pub struct WithComments {
442
    /// The actual ast.
443
    pub ast: Ast,
444
    /// All comments found in the original regular expression.
445
    pub comments: Vec<Comment>,
446
}
447
448
/// A comment from a regular expression with an associated span.
449
///
450
/// A regular expression can only contain comments when the `x` flag is
451
/// enabled.
452
#[derive(Clone, Debug, Eq, PartialEq)]
453
pub struct Comment {
454
    /// The span of this comment, including the beginning `#` and ending `\n`.
455
    pub span: Span,
456
    /// The comment text, starting with the first character following the `#`
457
    /// and ending with the last character preceding the `\n`.
458
    pub comment: String,
459
}
460
461
/// An abstract syntax tree for a single regular expression.
462
///
463
/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
464
/// space proportional to the size of the `Ast`.
465
///
466
/// This type defines its own destructor that uses constant stack space and
467
/// heap space proportional to the size of the `Ast`.
468
#[derive(Clone, Debug, Eq, PartialEq)]
469
pub enum Ast {
470
    /// An empty regex that matches everything.
471
    Empty(Span),
472
    /// A set of flags, e.g., `(?is)`.
473
    Flags(SetFlags),
474
    /// A single character literal, which includes escape sequences.
475
    Literal(Literal),
476
    /// The "any character" class.
477
    Dot(Span),
478
    /// A single zero-width assertion.
479
    Assertion(Assertion),
480
    /// A single character class. This includes all forms of character classes
481
    /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
482
    Class(Class),
483
    /// A repetition operator applied to an arbitrary regular expression.
484
    Repetition(Repetition),
485
    /// A grouped regular expression.
486
    Group(Group),
487
    /// An alternation of regular expressions.
488
    Alternation(Alternation),
489
    /// A concatenation of regular expressions.
490
    Concat(Concat),
491
}
492
493
impl Ast {
494
    /// Return the span of this abstract syntax tree.
495
0
    pub fn span(&self) -> &Span {
496
0
        match *self {
497
0
            Ast::Empty(ref span) => span,
498
0
            Ast::Flags(ref x) => &x.span,
499
0
            Ast::Literal(ref x) => &x.span,
500
0
            Ast::Dot(ref span) => span,
501
0
            Ast::Assertion(ref x) => &x.span,
502
0
            Ast::Class(ref x) => x.span(),
503
0
            Ast::Repetition(ref x) => &x.span,
504
0
            Ast::Group(ref x) => &x.span,
505
0
            Ast::Alternation(ref x) => &x.span,
506
0
            Ast::Concat(ref x) => &x.span,
507
        }
508
0
    }
509
510
    /// Return true if and only if this Ast is empty.
511
0
    pub fn is_empty(&self) -> bool {
512
0
        match *self {
513
0
            Ast::Empty(_) => true,
514
0
            _ => false,
515
        }
516
0
    }
517
518
    /// Returns true if and only if this AST has any (including possibly empty)
519
    /// subexpressions.
520
0
    fn has_subexprs(&self) -> bool {
521
0
        match *self {
522
            Ast::Empty(_)
523
            | Ast::Flags(_)
524
            | Ast::Literal(_)
525
            | Ast::Dot(_)
526
0
            | Ast::Assertion(_) => false,
527
            Ast::Class(_)
528
            | Ast::Repetition(_)
529
            | Ast::Group(_)
530
            | Ast::Alternation(_)
531
0
            | Ast::Concat(_) => true,
532
        }
533
0
    }
534
}
535
536
/// Print a display representation of this Ast.
537
///
538
/// This does not preserve any of the original whitespace formatting that may
539
/// have originally been present in the concrete syntax from which this Ast
540
/// was generated.
541
///
542
/// This implementation uses constant stack space and heap space proportional
543
/// to the size of the `Ast`.
544
impl fmt::Display for Ast {
545
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
546
        use crate::ast::print::Printer;
547
0
        Printer::new().print(self, f)
548
0
    }
549
}
550
551
/// An alternation of regular expressions.
552
#[derive(Clone, Debug, Eq, PartialEq)]
553
pub struct Alternation {
554
    /// The span of this alternation.
555
    pub span: Span,
556
    /// The alternate regular expressions.
557
    pub asts: Vec<Ast>,
558
}
559
560
impl Alternation {
561
    /// Return this alternation as an AST.
562
    ///
563
    /// If this alternation contains zero ASTs, then Ast::Empty is
564
    /// returned. If this alternation contains exactly 1 AST, then the
565
    /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
566
0
    pub fn into_ast(mut self) -> Ast {
567
0
        match self.asts.len() {
568
0
            0 => Ast::Empty(self.span),
569
0
            1 => self.asts.pop().unwrap(),
570
0
            _ => Ast::Alternation(self),
571
        }
572
0
    }
573
}
574
575
/// A concatenation of regular expressions.
576
#[derive(Clone, Debug, Eq, PartialEq)]
577
pub struct Concat {
578
    /// The span of this concatenation.
579
    pub span: Span,
580
    /// The concatenation regular expressions.
581
    pub asts: Vec<Ast>,
582
}
583
584
impl Concat {
585
    /// Return this concatenation as an AST.
586
    ///
587
    /// If this concatenation contains zero ASTs, then Ast::Empty is
588
    /// returned. If this concatenation contains exactly 1 AST, then the
589
    /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
590
0
    pub fn into_ast(mut self) -> Ast {
591
0
        match self.asts.len() {
592
0
            0 => Ast::Empty(self.span),
593
0
            1 => self.asts.pop().unwrap(),
594
0
            _ => Ast::Concat(self),
595
        }
596
0
    }
597
}
598
599
/// A single literal expression.
600
///
601
/// A literal corresponds to a single Unicode scalar value. Literals may be
602
/// represented in their literal form, e.g., `a` or in their escaped form,
603
/// e.g., `\x61`.
604
#[derive(Clone, Debug, Eq, PartialEq)]
605
pub struct Literal {
606
    /// The span of this literal.
607
    pub span: Span,
608
    /// The kind of this literal.
609
    pub kind: LiteralKind,
610
    /// The Unicode scalar value corresponding to this literal.
611
    pub c: char,
612
}
613
614
impl Literal {
615
    /// If this literal was written as a `\x` hex escape, then this returns
616
    /// the corresponding byte value. Otherwise, this returns `None`.
617
0
    pub fn byte(&self) -> Option<u8> {
618
0
        let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
619
0
        if self.c as u32 <= 255 && self.kind == short_hex {
620
0
            Some(self.c as u8)
621
        } else {
622
0
            None
623
        }
624
0
    }
625
}
626
627
/// The kind of a single literal expression.
628
#[derive(Clone, Debug, Eq, PartialEq)]
629
pub enum LiteralKind {
630
    /// The literal is written verbatim, e.g., `a` or `☃`.
631
    Verbatim,
632
    /// The literal is written as an escape because it is punctuation, e.g.,
633
    /// `\*` or `\[`.
634
    Punctuation,
635
    /// The literal is written as an octal escape, e.g., `\141`.
636
    Octal,
637
    /// The literal is written as a hex code with a fixed number of digits
638
    /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
639
    /// `\U00000061`.
640
    HexFixed(HexLiteralKind),
641
    /// The literal is written as a hex code with a bracketed number of
642
    /// digits. The only restriction is that the bracketed hex code must refer
643
    /// to a valid Unicode scalar value.
644
    HexBrace(HexLiteralKind),
645
    /// The literal is written as a specially recognized escape, e.g., `\f`
646
    /// or `\n`.
647
    Special(SpecialLiteralKind),
648
}
649
650
/// The type of a special literal.
651
///
652
/// A special literal is a special escape sequence recognized by the regex
653
/// parser, e.g., `\f` or `\n`.
654
#[derive(Clone, Debug, Eq, PartialEq)]
655
pub enum SpecialLiteralKind {
656
    /// Bell, spelled `\a` (`\x07`).
657
    Bell,
658
    /// Form feed, spelled `\f` (`\x0C`).
659
    FormFeed,
660
    /// Tab, spelled `\t` (`\x09`).
661
    Tab,
662
    /// Line feed, spelled `\n` (`\x0A`).
663
    LineFeed,
664
    /// Carriage return, spelled `\r` (`\x0D`).
665
    CarriageReturn,
666
    /// Vertical tab, spelled `\v` (`\x0B`).
667
    VerticalTab,
668
    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
669
    /// parsing in verbose mode.
670
    Space,
671
}
672
673
/// The type of a Unicode hex literal.
674
///
675
/// Note that all variants behave the same when used with brackets. They only
676
/// differ when used without brackets in the number of hex digits that must
677
/// follow.
678
#[derive(Clone, Debug, Eq, PartialEq)]
679
pub enum HexLiteralKind {
680
    /// A `\x` prefix. When used without brackets, this form is limited to
681
    /// two digits.
682
    X,
683
    /// A `\u` prefix. When used without brackets, this form is limited to
684
    /// four digits.
685
    UnicodeShort,
686
    /// A `\U` prefix. When used without brackets, this form is limited to
687
    /// eight digits.
688
    UnicodeLong,
689
}
690
691
impl HexLiteralKind {
692
    /// The number of digits that must be used with this literal form when
693
    /// used without brackets. When used with brackets, there is no
694
    /// restriction on the number of digits.
695
0
    pub fn digits(&self) -> u32 {
696
0
        match *self {
697
0
            HexLiteralKind::X => 2,
698
0
            HexLiteralKind::UnicodeShort => 4,
699
0
            HexLiteralKind::UnicodeLong => 8,
700
        }
701
0
    }
702
}
703
704
/// A single character class expression.
705
#[derive(Clone, Debug, Eq, PartialEq)]
706
pub enum Class {
707
    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
708
    Unicode(ClassUnicode),
709
    /// A perl character class, e.g., `\d` or `\W`.
710
    Perl(ClassPerl),
711
    /// A bracketed character class set, which may contain zero or more
712
    /// character ranges and/or zero or more nested classes. e.g.,
713
    /// `[a-zA-Z\pL]`.
714
    Bracketed(ClassBracketed),
715
}
716
717
impl Class {
718
    /// Return the span of this character class.
719
0
    pub fn span(&self) -> &Span {
720
0
        match *self {
721
0
            Class::Perl(ref x) => &x.span,
722
0
            Class::Unicode(ref x) => &x.span,
723
0
            Class::Bracketed(ref x) => &x.span,
724
        }
725
0
    }
726
}
727
728
/// A Perl character class.
729
#[derive(Clone, Debug, Eq, PartialEq)]
730
pub struct ClassPerl {
731
    /// The span of this class.
732
    pub span: Span,
733
    /// The kind of Perl class.
734
    pub kind: ClassPerlKind,
735
    /// Whether the class is negated or not. e.g., `\d` is not negated but
736
    /// `\D` is.
737
    pub negated: bool,
738
}
739
740
/// The available Perl character classes.
741
#[derive(Clone, Debug, Eq, PartialEq)]
742
pub enum ClassPerlKind {
743
    /// Decimal numbers.
744
    Digit,
745
    /// Whitespace.
746
    Space,
747
    /// Word characters.
748
    Word,
749
}
750
751
/// An ASCII character class.
752
#[derive(Clone, Debug, Eq, PartialEq)]
753
pub struct ClassAscii {
754
    /// The span of this class.
755
    pub span: Span,
756
    /// The kind of ASCII class.
757
    pub kind: ClassAsciiKind,
758
    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
759
    /// but `[[:^alpha:]]` is.
760
    pub negated: bool,
761
}
762
763
/// The available ASCII character classes.
764
#[derive(Clone, Debug, Eq, PartialEq)]
765
pub enum ClassAsciiKind {
766
    /// `[0-9A-Za-z]`
767
    Alnum,
768
    /// `[A-Za-z]`
769
    Alpha,
770
    /// `[\x00-\x7F]`
771
    Ascii,
772
    /// `[ \t]`
773
    Blank,
774
    /// `[\x00-\x1F\x7F]`
775
    Cntrl,
776
    /// `[0-9]`
777
    Digit,
778
    /// `[!-~]`
779
    Graph,
780
    /// `[a-z]`
781
    Lower,
782
    /// `[ -~]`
783
    Print,
784
    /// `[!-/:-@\[-`{-~]`
785
    Punct,
786
    /// `[\t\n\v\f\r ]`
787
    Space,
788
    /// `[A-Z]`
789
    Upper,
790
    /// `[0-9A-Za-z_]`
791
    Word,
792
    /// `[0-9A-Fa-f]`
793
    Xdigit,
794
}
795
796
impl ClassAsciiKind {
797
    /// Return the corresponding ClassAsciiKind variant for the given name.
798
    ///
799
    /// The name given should correspond to the lowercase version of the
800
    /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
801
    ///
802
    /// If no variant with the corresponding name exists, then `None` is
803
    /// returned.
804
0
    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
805
        use self::ClassAsciiKind::*;
806
0
        match name {
807
0
            "alnum" => Some(Alnum),
808
0
            "alpha" => Some(Alpha),
809
0
            "ascii" => Some(Ascii),
810
0
            "blank" => Some(Blank),
811
0
            "cntrl" => Some(Cntrl),
812
0
            "digit" => Some(Digit),
813
0
            "graph" => Some(Graph),
814
0
            "lower" => Some(Lower),
815
0
            "print" => Some(Print),
816
0
            "punct" => Some(Punct),
817
0
            "space" => Some(Space),
818
0
            "upper" => Some(Upper),
819
0
            "word" => Some(Word),
820
0
            "xdigit" => Some(Xdigit),
821
0
            _ => None,
822
        }
823
0
    }
824
}
825
826
/// A Unicode character class.
827
#[derive(Clone, Debug, Eq, PartialEq)]
828
pub struct ClassUnicode {
829
    /// The span of this class.
830
    pub span: Span,
831
    /// Whether this class is negated or not.
832
    ///
833
    /// Note: be careful when using this attribute. This specifically refers
834
    /// to whether the class is written as `\p` or `\P`, where the latter
835
    /// is `negated = true`. However, it also possible to write something like
836
    /// `\P{scx!=Katakana}` which is actually equivalent to
837
    /// `\p{scx=Katakana}` and is therefore not actually negated even though
838
    /// `negated = true` here. To test whether this class is truly negated
839
    /// or not, use the `is_negated` method.
840
    pub negated: bool,
841
    /// The kind of Unicode class.
842
    pub kind: ClassUnicodeKind,
843
}
844
845
impl ClassUnicode {
846
    /// Returns true if this class has been negated.
847
    ///
848
    /// Note that this takes the Unicode op into account, if it's present.
849
    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
850
0
    pub fn is_negated(&self) -> bool {
851
0
        match self.kind {
852
            ClassUnicodeKind::NamedValue {
853
                op: ClassUnicodeOpKind::NotEqual,
854
                ..
855
0
            } => !self.negated,
856
0
            _ => self.negated,
857
        }
858
0
    }
859
}
860
861
/// The available forms of Unicode character classes.
862
#[derive(Clone, Debug, Eq, PartialEq)]
863
pub enum ClassUnicodeKind {
864
    /// A one letter abbreviated class, e.g., `\pN`.
865
    OneLetter(char),
866
    /// A binary property, general category or script. The string may be
867
    /// empty.
868
    Named(String),
869
    /// A property name and an associated value.
870
    NamedValue {
871
        /// The type of Unicode op used to associate `name` with `value`.
872
        op: ClassUnicodeOpKind,
873
        /// The property name (which may be empty).
874
        name: String,
875
        /// The property value (which may be empty).
876
        value: String,
877
    },
878
}
879
880
/// The type of op used in a Unicode character class.
881
#[derive(Clone, Debug, Eq, PartialEq)]
882
pub enum ClassUnicodeOpKind {
883
    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
884
    Equal,
885
    /// A property set to a specific value using a colon, e.g.,
886
    /// `\p{scx:Katakana}`.
887
    Colon,
888
    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
889
    NotEqual,
890
}
891
892
impl ClassUnicodeOpKind {
893
    /// Whether the op is an equality op or not.
894
0
    pub fn is_equal(&self) -> bool {
895
0
        match *self {
896
0
            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
897
0
            _ => false,
898
        }
899
0
    }
900
}
901
902
/// A bracketed character class, e.g., `[a-z0-9]`.
903
#[derive(Clone, Debug, Eq, PartialEq)]
904
pub struct ClassBracketed {
905
    /// The span of this class.
906
    pub span: Span,
907
    /// Whether this class is negated or not. e.g., `[a]` is not negated but
908
    /// `[^a]` is.
909
    pub negated: bool,
910
    /// The type of this set. A set is either a normal union of things, e.g.,
911
    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
912
    pub kind: ClassSet,
913
}
914
915
/// A character class set.
916
///
917
/// This type corresponds to the internal structure of a bracketed character
918
/// class. That is, every bracketed character is one of two types: a union of
919
/// items (literals, ranges, other bracketed classes) or a tree of binary set
920
/// operations.
921
#[derive(Clone, Debug, Eq, PartialEq)]
922
pub enum ClassSet {
923
    /// An item, which can be a single literal, range, nested character class
924
    /// or a union of items.
925
    Item(ClassSetItem),
926
    /// A single binary operation (i.e., &&, -- or ~~).
927
    BinaryOp(ClassSetBinaryOp),
928
}
929
930
impl ClassSet {
931
    /// Build a set from a union.
932
0
    pub fn union(ast: ClassSetUnion) -> ClassSet {
933
0
        ClassSet::Item(ClassSetItem::Union(ast))
934
0
    }
935
936
    /// Return the span of this character class set.
937
0
    pub fn span(&self) -> &Span {
938
0
        match *self {
939
0
            ClassSet::Item(ref x) => x.span(),
940
0
            ClassSet::BinaryOp(ref x) => &x.span,
941
        }
942
0
    }
943
944
    /// Return true if and only if this class set is empty.
945
0
    fn is_empty(&self) -> bool {
946
0
        match *self {
947
0
            ClassSet::Item(ClassSetItem::Empty(_)) => true,
948
0
            _ => false,
949
        }
950
0
    }
951
}
952
953
/// A single component of a character class set.
954
#[derive(Clone, Debug, Eq, PartialEq)]
955
pub enum ClassSetItem {
956
    /// An empty item.
957
    ///
958
    /// Note that a bracketed character class cannot contain a single empty
959
    /// item. Empty items can appear when using one of the binary operators.
960
    /// For example, `[&&]` is the intersection of two empty classes.
961
    Empty(Span),
962
    /// A single literal.
963
    Literal(Literal),
964
    /// A range between two literals.
965
    Range(ClassSetRange),
966
    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
967
    Ascii(ClassAscii),
968
    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
969
    Unicode(ClassUnicode),
970
    /// A perl character class, e.g., `\d` or `\W`.
971
    Perl(ClassPerl),
972
    /// A bracketed character class set, which may contain zero or more
973
    /// character ranges and/or zero or more nested classes. e.g.,
974
    /// `[a-zA-Z\pL]`.
975
    Bracketed(Box<ClassBracketed>),
976
    /// A union of items.
977
    Union(ClassSetUnion),
978
}
979
980
impl ClassSetItem {
981
    /// Return the span of this character class set item.
982
0
    pub fn span(&self) -> &Span {
983
0
        match *self {
984
0
            ClassSetItem::Empty(ref span) => span,
985
0
            ClassSetItem::Literal(ref x) => &x.span,
986
0
            ClassSetItem::Range(ref x) => &x.span,
987
0
            ClassSetItem::Ascii(ref x) => &x.span,
988
0
            ClassSetItem::Perl(ref x) => &x.span,
989
0
            ClassSetItem::Unicode(ref x) => &x.span,
990
0
            ClassSetItem::Bracketed(ref x) => &x.span,
991
0
            ClassSetItem::Union(ref x) => &x.span,
992
        }
993
0
    }
994
}
995
996
/// A single character class range in a set.
997
#[derive(Clone, Debug, Eq, PartialEq)]
998
pub struct ClassSetRange {
999
    /// The span of this range.
1000
    pub span: Span,
1001
    /// The start of this range.
1002
    pub start: Literal,
1003
    /// The end of this range.
1004
    pub end: Literal,
1005
}
1006
1007
impl ClassSetRange {
1008
    /// Returns true if and only if this character class range is valid.
1009
    ///
1010
    /// The only case where a range is invalid is if its start is greater than
1011
    /// its end.
1012
0
    pub fn is_valid(&self) -> bool {
1013
0
        self.start.c <= self.end.c
1014
0
    }
1015
}
1016
1017
/// A union of items inside a character class set.
1018
#[derive(Clone, Debug, Eq, PartialEq)]
1019
pub struct ClassSetUnion {
1020
    /// The span of the items in this operation. e.g., the `a-z0-9` in
1021
    /// `[^a-z0-9]`
1022
    pub span: Span,
1023
    /// The sequence of items that make up this union.
1024
    pub items: Vec<ClassSetItem>,
1025
}
1026
1027
impl ClassSetUnion {
1028
    /// Push a new item in this union.
1029
    ///
1030
    /// The ending position of this union's span is updated to the ending
1031
    /// position of the span of the item given. If the union is empty, then
1032
    /// the starting position of this union is set to the starting position
1033
    /// of this item.
1034
    ///
1035
    /// In other words, if you only use this method to add items to a union
1036
    /// and you set the spans on each item correctly, then you should never
1037
    /// need to adjust the span of the union directly.
1038
0
    pub fn push(&mut self, item: ClassSetItem) {
1039
0
        if self.items.is_empty() {
1040
0
            self.span.start = item.span().start;
1041
0
        }
1042
0
        self.span.end = item.span().end;
1043
0
        self.items.push(item);
1044
0
    }
1045
1046
    /// Return this union as a character class set item.
1047
    ///
1048
    /// If this union contains zero items, then an empty union is
1049
    /// returned. If this concatenation contains exactly 1 item, then the
1050
    /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1051
    /// returned.
1052
0
    pub fn into_item(mut self) -> ClassSetItem {
1053
0
        match self.items.len() {
1054
0
            0 => ClassSetItem::Empty(self.span),
1055
0
            1 => self.items.pop().unwrap(),
1056
0
            _ => ClassSetItem::Union(self),
1057
        }
1058
0
    }
1059
}
1060
1061
/// A Unicode character class set operation.
1062
#[derive(Clone, Debug, Eq, PartialEq)]
1063
pub struct ClassSetBinaryOp {
1064
    /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1065
    pub span: Span,
1066
    /// The type of this set operation.
1067
    pub kind: ClassSetBinaryOpKind,
1068
    /// The left hand side of the operation.
1069
    pub lhs: Box<ClassSet>,
1070
    /// The right hand side of the operation.
1071
    pub rhs: Box<ClassSet>,
1072
}
1073
1074
/// The type of a Unicode character class set operation.
1075
///
1076
/// Note that this doesn't explicitly represent union since there is no
1077
/// explicit union operator. Concatenation inside a character class corresponds
1078
/// to the union operation.
1079
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1080
pub enum ClassSetBinaryOpKind {
1081
    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1082
    Intersection,
1083
    /// The difference of two sets, e.g., `\pN--[0-9]`.
1084
    Difference,
1085
    /// The symmetric difference of two sets. The symmetric difference is the
1086
    /// set of elements belonging to one but not both sets.
1087
    /// e.g., `[\pL~~[:ascii:]]`.
1088
    SymmetricDifference,
1089
}
1090
1091
/// A single zero-width assertion.
1092
#[derive(Clone, Debug, Eq, PartialEq)]
1093
pub struct Assertion {
1094
    /// The span of this assertion.
1095
    pub span: Span,
1096
    /// The assertion kind, e.g., `\b` or `^`.
1097
    pub kind: AssertionKind,
1098
}
1099
1100
/// An assertion kind.
1101
#[derive(Clone, Debug, Eq, PartialEq)]
1102
pub enum AssertionKind {
1103
    /// `^`
1104
    StartLine,
1105
    /// `$`
1106
    EndLine,
1107
    /// `\A`
1108
    StartText,
1109
    /// `\z`
1110
    EndText,
1111
    /// `\b`
1112
    WordBoundary,
1113
    /// `\B`
1114
    NotWordBoundary,
1115
}
1116
1117
/// A repetition operation applied to a regular expression.
1118
#[derive(Clone, Debug, Eq, PartialEq)]
1119
pub struct Repetition {
1120
    /// The span of this operation.
1121
    pub span: Span,
1122
    /// The actual operation.
1123
    pub op: RepetitionOp,
1124
    /// Whether this operation was applied greedily or not.
1125
    pub greedy: bool,
1126
    /// The regular expression under repetition.
1127
    pub ast: Box<Ast>,
1128
}
1129
1130
/// The repetition operator itself.
1131
#[derive(Clone, Debug, Eq, PartialEq)]
1132
pub struct RepetitionOp {
1133
    /// The span of this operator. This includes things like `+`, `*?` and
1134
    /// `{m,n}`.
1135
    pub span: Span,
1136
    /// The type of operation.
1137
    pub kind: RepetitionKind,
1138
}
1139
1140
/// The kind of a repetition operator.
1141
#[derive(Clone, Debug, Eq, PartialEq)]
1142
pub enum RepetitionKind {
1143
    /// `?`
1144
    ZeroOrOne,
1145
    /// `*`
1146
    ZeroOrMore,
1147
    /// `+`
1148
    OneOrMore,
1149
    /// `{m,n}`
1150
    Range(RepetitionRange),
1151
}
1152
1153
/// A range repetition operator.
1154
#[derive(Clone, Debug, Eq, PartialEq)]
1155
pub enum RepetitionRange {
1156
    /// `{m}`
1157
    Exactly(u32),
1158
    /// `{m,}`
1159
    AtLeast(u32),
1160
    /// `{m,n}`
1161
    Bounded(u32, u32),
1162
}
1163
1164
impl RepetitionRange {
1165
    /// Returns true if and only if this repetition range is valid.
1166
    ///
1167
    /// The only case where a repetition range is invalid is if it is bounded
1168
    /// and its start is greater than its end.
1169
0
    pub fn is_valid(&self) -> bool {
1170
0
        match *self {
1171
0
            RepetitionRange::Bounded(s, e) if s > e => false,
1172
0
            _ => true,
1173
        }
1174
0
    }
1175
}
1176
1177
/// A grouped regular expression.
1178
///
1179
/// This includes both capturing and non-capturing groups. This does **not**
1180
/// include flag-only groups like `(?is)`, but does contain any group that
1181
/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1182
/// `(?is:a)`.
1183
#[derive(Clone, Debug, Eq, PartialEq)]
1184
pub struct Group {
1185
    /// The span of this group.
1186
    pub span: Span,
1187
    /// The kind of this group.
1188
    pub kind: GroupKind,
1189
    /// The regular expression in this group.
1190
    pub ast: Box<Ast>,
1191
}
1192
1193
impl Group {
1194
    /// If this group is non-capturing, then this returns the (possibly empty)
1195
    /// set of flags. Otherwise, `None` is returned.
1196
0
    pub fn flags(&self) -> Option<&Flags> {
1197
0
        match self.kind {
1198
0
            GroupKind::NonCapturing(ref flags) => Some(flags),
1199
0
            _ => None,
1200
        }
1201
0
    }
1202
1203
    /// Returns true if and only if this group is capturing.
1204
0
    pub fn is_capturing(&self) -> bool {
1205
0
        match self.kind {
1206
0
            GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
1207
0
            GroupKind::NonCapturing(_) => false,
1208
        }
1209
0
    }
1210
1211
    /// Returns the capture index of this group, if this is a capturing group.
1212
    ///
1213
    /// This returns a capture index precisely when `is_capturing` is `true`.
1214
0
    pub fn capture_index(&self) -> Option<u32> {
1215
0
        match self.kind {
1216
0
            GroupKind::CaptureIndex(i) => Some(i),
1217
0
            GroupKind::CaptureName(ref x) => Some(x.index),
1218
0
            GroupKind::NonCapturing(_) => None,
1219
        }
1220
0
    }
1221
}
1222
1223
/// The kind of a group.
1224
#[derive(Clone, Debug, Eq, PartialEq)]
1225
pub enum GroupKind {
1226
    /// `(a)`
1227
    CaptureIndex(u32),
1228
    /// `(?P<name>a)`
1229
    CaptureName(CaptureName),
1230
    /// `(?:a)` and `(?i:a)`
1231
    NonCapturing(Flags),
1232
}
1233
1234
/// A capture name.
1235
///
1236
/// This corresponds to the name itself between the angle brackets in, e.g.,
1237
/// `(?P<foo>expr)`.
1238
#[derive(Clone, Debug, Eq, PartialEq)]
1239
pub struct CaptureName {
1240
    /// The span of this capture name.
1241
    pub span: Span,
1242
    /// The capture name.
1243
    pub name: String,
1244
    /// The capture index.
1245
    pub index: u32,
1246
}
1247
1248
/// A group of flags that is not applied to a particular regular expression.
1249
#[derive(Clone, Debug, Eq, PartialEq)]
1250
pub struct SetFlags {
1251
    /// The span of these flags, including the grouping parentheses.
1252
    pub span: Span,
1253
    /// The actual sequence of flags.
1254
    pub flags: Flags,
1255
}
1256
1257
/// A group of flags.
1258
///
1259
/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1260
#[derive(Clone, Debug, Eq, PartialEq)]
1261
pub struct Flags {
1262
    /// The span of this group of flags.
1263
    pub span: Span,
1264
    /// A sequence of flag items. Each item is either a flag or a negation
1265
    /// operator.
1266
    pub items: Vec<FlagsItem>,
1267
}
1268
1269
impl Flags {
1270
    /// Add the given item to this sequence of flags.
1271
    ///
1272
    /// If the item was added successfully, then `None` is returned. If the
1273
    /// given item is a duplicate, then `Some(i)` is returned, where
1274
    /// `items[i].kind == item.kind`.
1275
0
    pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1276
0
        for (i, x) in self.items.iter().enumerate() {
1277
0
            if x.kind == item.kind {
1278
0
                return Some(i);
1279
0
            }
1280
        }
1281
0
        self.items.push(item);
1282
0
        None
1283
0
    }
1284
1285
    /// Returns the state of the given flag in this set.
1286
    ///
1287
    /// If the given flag is in the set but is negated, then `Some(false)` is
1288
    /// returned.
1289
    ///
1290
    /// If the given flag is in the set and is not negated, then `Some(true)`
1291
    /// is returned.
1292
    ///
1293
    /// Otherwise, `None` is returned.
1294
0
    pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1295
0
        let mut negated = false;
1296
0
        for x in &self.items {
1297
0
            match x.kind {
1298
0
                FlagsItemKind::Negation => {
1299
0
                    negated = true;
1300
0
                }
1301
0
                FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1302
0
                    return Some(!negated);
1303
                }
1304
0
                _ => {}
1305
            }
1306
        }
1307
0
        None
1308
0
    }
1309
}
1310
1311
/// A single item in a group of flags.
1312
#[derive(Clone, Debug, Eq, PartialEq)]
1313
pub struct FlagsItem {
1314
    /// The span of this item.
1315
    pub span: Span,
1316
    /// The kind of this item.
1317
    pub kind: FlagsItemKind,
1318
}
1319
1320
/// The kind of an item in a group of flags.
1321
#[derive(Clone, Debug, Eq, PartialEq)]
1322
pub enum FlagsItemKind {
1323
    /// A negation operator applied to all subsequent flags in the enclosing
1324
    /// group.
1325
    Negation,
1326
    /// A single flag in a group.
1327
    Flag(Flag),
1328
}
1329
1330
impl FlagsItemKind {
1331
    /// Returns true if and only if this item is a negation operator.
1332
0
    pub fn is_negation(&self) -> bool {
1333
0
        match *self {
1334
0
            FlagsItemKind::Negation => true,
1335
0
            _ => false,
1336
        }
1337
0
    }
1338
}
1339
1340
/// A single flag.
1341
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1342
pub enum Flag {
1343
    /// `i`
1344
    CaseInsensitive,
1345
    /// `m`
1346
    MultiLine,
1347
    /// `s`
1348
    DotMatchesNewLine,
1349
    /// `U`
1350
    SwapGreed,
1351
    /// `u`
1352
    Unicode,
1353
    /// `x`
1354
    IgnoreWhitespace,
1355
}
1356
1357
/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1358
/// space but heap space proportional to the depth of the `Ast`.
1359
impl Drop for Ast {
1360
0
    fn drop(&mut self) {
1361
        use std::mem;
1362
1363
0
        match *self {
1364
            Ast::Empty(_)
1365
            | Ast::Flags(_)
1366
            | Ast::Literal(_)
1367
            | Ast::Dot(_)
1368
            | Ast::Assertion(_)
1369
            // Classes are recursive, so they get their own Drop impl.
1370
0
            | Ast::Class(_) => return,
1371
0
            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1372
0
            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1373
0
            Ast::Alternation(ref x) if x.asts.is_empty() => return,
1374
0
            Ast::Concat(ref x) if x.asts.is_empty() => return,
1375
0
            _ => {}
1376
0
        }
1377
0
1378
0
        let empty_span = || Span::splat(Position::new(0, 0, 0));
1379
0
        let empty_ast = || Ast::Empty(empty_span());
1380
0
        let mut stack = vec![mem::replace(self, empty_ast())];
1381
0
        while let Some(mut ast) = stack.pop() {
1382
0
            match ast {
1383
                Ast::Empty(_)
1384
                | Ast::Flags(_)
1385
                | Ast::Literal(_)
1386
                | Ast::Dot(_)
1387
                | Ast::Assertion(_)
1388
                // Classes are recursive, so they get their own Drop impl.
1389
0
                | Ast::Class(_) => {}
1390
0
                Ast::Repetition(ref mut x) => {
1391
0
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1392
0
                }
1393
0
                Ast::Group(ref mut x) => {
1394
0
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1395
0
                }
1396
0
                Ast::Alternation(ref mut x) => {
1397
0
                    stack.extend(x.asts.drain(..));
1398
0
                }
1399
0
                Ast::Concat(ref mut x) => {
1400
0
                    stack.extend(x.asts.drain(..));
1401
0
                }
1402
            }
1403
        }
1404
0
    }
1405
}
1406
1407
/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1408
/// stack space but heap space proportional to the depth of the `ClassSet`.
1409
impl Drop for ClassSet {
1410
0
    fn drop(&mut self) {
1411
        use std::mem;
1412
1413
0
        match *self {
1414
0
            ClassSet::Item(ref item) => match *item {
1415
                ClassSetItem::Empty(_)
1416
                | ClassSetItem::Literal(_)
1417
                | ClassSetItem::Range(_)
1418
                | ClassSetItem::Ascii(_)
1419
                | ClassSetItem::Unicode(_)
1420
0
                | ClassSetItem::Perl(_) => return,
1421
0
                ClassSetItem::Bracketed(ref x) => {
1422
0
                    if x.kind.is_empty() {
1423
0
                        return;
1424
0
                    }
1425
                }
1426
0
                ClassSetItem::Union(ref x) => {
1427
0
                    if x.items.is_empty() {
1428
0
                        return;
1429
0
                    }
1430
                }
1431
            },
1432
0
            ClassSet::BinaryOp(ref op) => {
1433
0
                if op.lhs.is_empty() && op.rhs.is_empty() {
1434
0
                    return;
1435
0
                }
1436
            }
1437
        }
1438
1439
0
        let empty_span = || Span::splat(Position::new(0, 0, 0));
1440
0
        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1441
0
        let mut stack = vec![mem::replace(self, empty_set())];
1442
0
        while let Some(mut set) = stack.pop() {
1443
0
            match set {
1444
0
                ClassSet::Item(ref mut item) => match *item {
1445
                    ClassSetItem::Empty(_)
1446
                    | ClassSetItem::Literal(_)
1447
                    | ClassSetItem::Range(_)
1448
                    | ClassSetItem::Ascii(_)
1449
                    | ClassSetItem::Unicode(_)
1450
0
                    | ClassSetItem::Perl(_) => {}
1451
0
                    ClassSetItem::Bracketed(ref mut x) => {
1452
0
                        stack.push(mem::replace(&mut x.kind, empty_set()));
1453
0
                    }
1454
0
                    ClassSetItem::Union(ref mut x) => {
1455
0
                        stack.extend(x.items.drain(..).map(ClassSet::Item));
1456
0
                    }
1457
                },
1458
0
                ClassSet::BinaryOp(ref mut op) => {
1459
0
                    stack.push(mem::replace(&mut op.lhs, empty_set()));
1460
0
                    stack.push(mem::replace(&mut op.rhs, empty_set()));
1461
0
                }
1462
            }
1463
        }
1464
0
    }
1465
}
1466
1467
#[cfg(test)]
1468
mod tests {
1469
    use super::*;
1470
1471
    // We use a thread with an explicit stack size to test that our destructor
1472
    // for Ast can handle arbitrarily sized expressions in constant stack
1473
    // space. In case we run on a platform without threads (WASM?), we limit
1474
    // this test to Windows/Unix.
1475
    #[test]
1476
    #[cfg(any(unix, windows))]
1477
    fn no_stack_overflow_on_drop() {
1478
        use std::thread;
1479
1480
        let run = || {
1481
            let span = || Span::splat(Position::new(0, 0, 0));
1482
            let mut ast = Ast::Empty(span());
1483
            for i in 0..200 {
1484
                ast = Ast::Group(Group {
1485
                    span: span(),
1486
                    kind: GroupKind::CaptureIndex(i),
1487
                    ast: Box::new(ast),
1488
                });
1489
            }
1490
            assert!(!ast.is_empty());
1491
        };
1492
1493
        // We run our test on a thread with a small stack size so we can
1494
        // force the issue more easily.
1495
        //
1496
        // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
1497
        // will just barf with very small stack sizes. So we bump this up a bit
1498
        // to give more room to breath. When I did this, I confirmed that if
1499
        // I remove the custom `Drop` impl for `Ast`, then this test does
1500
        // indeed still fail with a stack overflow. (At the time of writing, I
1501
        // had to bump it all the way up to 32K before the test would pass even
1502
        // without the custom `Drop` impl. So 16K seems like a safe number
1503
        // here.)
1504
        //
1505
        // See: https://github.com/rust-lang/regex/issues/967
1506
        thread::Builder::new()
1507
            .stack_size(16 << 10)
1508
            .spawn(run)
1509
            .unwrap()
1510
            .join()
1511
            .unwrap();
1512
    }
1513
}