Coverage Report

Created: 2025-06-02 07:01

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/ast/mod.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Defines an abstract syntax for regular expressions.
3
*/
4
5
use core::cmp::Ordering;
6
7
use alloc::{boxed::Box, string::String, vec, vec::Vec};
8
9
pub use crate::ast::visitor::{visit, Visitor};
10
11
pub mod parse;
12
pub mod print;
13
mod visitor;
14
15
/// An error that occurred while parsing a regular expression into an abstract
16
/// syntax tree.
17
///
18
/// Note that not all ASTs represents a valid regular expression. For example,
19
/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20
/// valid Unicode property name. That particular error is reported when
21
/// translating an AST to the high-level intermediate representation (`HIR`).
22
#[derive(Clone, Debug, Eq, PartialEq)]
23
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
24
pub struct Error {
25
    /// The kind of error.
26
    kind: ErrorKind,
27
    /// The original pattern that the parser generated the error from. Every
28
    /// span in an error is a valid range into this string.
29
    pattern: String,
30
    /// The span of this error.
31
    span: Span,
32
}
33
34
impl Error {
35
    /// Return the type of this error.
36
0
    pub fn kind(&self) -> &ErrorKind {
37
0
        &self.kind
38
0
    }
39
40
    /// The original pattern string in which this error occurred.
41
    ///
42
    /// Every span reported by this error is reported in terms of this string.
43
0
    pub fn pattern(&self) -> &str {
44
0
        &self.pattern
45
0
    }
46
47
    /// Return the span at which this error occurred.
48
0
    pub fn span(&self) -> &Span {
49
0
        &self.span
50
0
    }
51
52
    /// Return an auxiliary span. This span exists only for some errors that
53
    /// benefit from being able to point to two locations in the original
54
    /// regular expression. For example, "duplicate" errors will have the
55
    /// main error position set to the duplicate occurrence while its
56
    /// auxiliary span will be set to the initial occurrence.
57
0
    pub fn auxiliary_span(&self) -> Option<&Span> {
58
0
        use self::ErrorKind::*;
59
0
        match self.kind {
60
0
            FlagDuplicate { ref original } => Some(original),
61
0
            FlagRepeatedNegation { ref original, .. } => Some(original),
62
0
            GroupNameDuplicate { ref original, .. } => Some(original),
63
0
            _ => None,
64
        }
65
0
    }
66
}
67
68
/// The type of an error that occurred while building an AST.
69
///
70
/// This error type is marked as `non_exhaustive`. This means that adding a
71
/// new variant is not considered a breaking change.
72
#[non_exhaustive]
73
#[derive(Clone, Debug, Eq, PartialEq)]
74
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
75
pub enum ErrorKind {
76
    /// The capturing group limit was exceeded.
77
    ///
78
    /// Note that this represents a limit on the total number of capturing
79
    /// groups in a regex and not necessarily the number of nested capturing
80
    /// groups. That is, the nest limit can be low and it is still possible for
81
    /// this error to occur.
82
    CaptureLimitExceeded,
83
    /// An invalid escape sequence was found in a character class set.
84
    ClassEscapeInvalid,
85
    /// An invalid character class range was found. An invalid range is any
86
    /// range where the start is greater than the end.
87
    ClassRangeInvalid,
88
    /// An invalid range boundary was found in a character class. Range
89
    /// boundaries must be a single literal codepoint, but this error indicates
90
    /// that something else was found, such as a nested class.
91
    ClassRangeLiteral,
92
    /// An opening `[` was found with no corresponding closing `]`.
93
    ClassUnclosed,
94
    /// Note that this error variant is no longer used. Namely, a decimal
95
    /// number can only appear as a repetition quantifier. When the number
96
    /// in a repetition quantifier is empty, then it gets its own specialized
97
    /// error, `RepetitionCountDecimalEmpty`.
98
    DecimalEmpty,
99
    /// An invalid decimal number was given where one was expected.
100
    DecimalInvalid,
101
    /// A bracketed hex literal was empty.
102
    EscapeHexEmpty,
103
    /// A bracketed hex literal did not correspond to a Unicode scalar value.
104
    EscapeHexInvalid,
105
    /// An invalid hexadecimal digit was found.
106
    EscapeHexInvalidDigit,
107
    /// EOF was found before an escape sequence was completed.
108
    EscapeUnexpectedEof,
109
    /// An unrecognized escape sequence.
110
    EscapeUnrecognized,
111
    /// A dangling negation was used when setting flags, e.g., `i-`.
112
    FlagDanglingNegation,
113
    /// A flag was used twice, e.g., `i-i`.
114
    FlagDuplicate {
115
        /// The position of the original flag. The error position
116
        /// points to the duplicate flag.
117
        original: Span,
118
    },
119
    /// The negation operator was used twice, e.g., `-i-s`.
120
    FlagRepeatedNegation {
121
        /// The position of the original negation operator. The error position
122
        /// points to the duplicate negation operator.
123
        original: Span,
124
    },
125
    /// Expected a flag but got EOF, e.g., `(?`.
126
    FlagUnexpectedEof,
127
    /// Unrecognized flag, e.g., `a`.
128
    FlagUnrecognized,
129
    /// A duplicate capture name was found.
130
    GroupNameDuplicate {
131
        /// The position of the initial occurrence of the capture name. The
132
        /// error position itself points to the duplicate occurrence.
133
        original: Span,
134
    },
135
    /// A capture group name is empty, e.g., `(?P<>abc)`.
136
    GroupNameEmpty,
137
    /// An invalid character was seen for a capture group name. This includes
138
    /// errors where the first character is a digit (even though subsequent
139
    /// characters are allowed to be digits).
140
    GroupNameInvalid,
141
    /// A closing `>` could not be found for a capture group name.
142
    GroupNameUnexpectedEof,
143
    /// An unclosed group, e.g., `(ab`.
144
    ///
145
    /// The span of this error corresponds to the unclosed parenthesis.
146
    GroupUnclosed,
147
    /// An unopened group, e.g., `ab)`.
148
    GroupUnopened,
149
    /// The nest limit was exceeded. The limit stored here is the limit
150
    /// configured in the parser.
151
    NestLimitExceeded(u32),
152
    /// The range provided in a counted repetition operator is invalid. The
153
    /// range is invalid if the start is greater than the end.
154
    RepetitionCountInvalid,
155
    /// An opening `{` was not followed by a valid decimal value.
156
    /// For example, `x{}` or `x{]}` would fail.
157
    RepetitionCountDecimalEmpty,
158
    /// An opening `{` was found with no corresponding closing `}`.
159
    RepetitionCountUnclosed,
160
    /// A repetition operator was applied to a missing sub-expression. This
161
    /// occurs, for example, in the regex consisting of just a `*` or even
162
    /// `(?i)*`. It is, however, possible to create a repetition operating on
163
    /// an empty sub-expression. For example, `()*` is still considered valid.
164
    RepetitionMissing,
165
    /// The special word boundary syntax, `\b{something}`, was used, but
166
    /// either EOF without `}` was seen, or an invalid character in the
167
    /// braces was seen.
168
    SpecialWordBoundaryUnclosed,
169
    /// The special word boundary syntax, `\b{something}`, was used, but
170
    /// `something` was not recognized as a valid word boundary kind.
171
    SpecialWordBoundaryUnrecognized,
172
    /// The syntax `\b{` was observed, but afterwards the end of the pattern
173
    /// was observed without being able to tell whether it was meant to be a
174
    /// bounded repetition on the `\b` or the beginning of a special word
175
    /// boundary assertion.
176
    SpecialWordOrRepetitionUnexpectedEof,
177
    /// The Unicode class is not valid. This typically occurs when a `\p` is
178
    /// followed by something other than a `{`.
179
    UnicodeClassInvalid,
180
    /// When octal support is disabled, this error is produced when an octal
181
    /// escape is used. The octal escape is assumed to be an invocation of
182
    /// a backreference, which is the common case.
183
    UnsupportedBackreference,
184
    /// When syntax similar to PCRE's look-around is used, this error is
185
    /// returned. Some example syntaxes that are rejected include, but are
186
    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
187
    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
188
    /// error is used to improve the user experience.
189
    UnsupportedLookAround,
190
}
191
192
#[cfg(feature = "std")]
193
impl std::error::Error for Error {}
194
195
impl core::fmt::Display for Error {
196
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
197
0
        crate::error::Formatter::from(self).fmt(f)
198
0
    }
199
}
200
201
impl core::fmt::Display for ErrorKind {
202
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
203
0
        use self::ErrorKind::*;
204
0
        match *self {
205
0
            CaptureLimitExceeded => write!(
206
0
                f,
207
0
                "exceeded the maximum number of \
208
0
                 capturing groups ({})",
209
0
                u32::MAX
210
0
            ),
211
            ClassEscapeInvalid => {
212
0
                write!(f, "invalid escape sequence found in character class")
213
            }
214
0
            ClassRangeInvalid => write!(
215
0
                f,
216
0
                "invalid character class range, \
217
0
                 the start must be <= the end"
218
0
            ),
219
            ClassRangeLiteral => {
220
0
                write!(f, "invalid range boundary, must be a literal")
221
            }
222
0
            ClassUnclosed => write!(f, "unclosed character class"),
223
0
            DecimalEmpty => write!(f, "decimal literal empty"),
224
0
            DecimalInvalid => write!(f, "decimal literal invalid"),
225
0
            EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
226
            EscapeHexInvalid => {
227
0
                write!(f, "hexadecimal literal is not a Unicode scalar value")
228
            }
229
0
            EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
230
0
            EscapeUnexpectedEof => write!(
231
0
                f,
232
0
                "incomplete escape sequence, \
233
0
                 reached end of pattern prematurely"
234
0
            ),
235
0
            EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
236
            FlagDanglingNegation => {
237
0
                write!(f, "dangling flag negation operator")
238
            }
239
0
            FlagDuplicate { .. } => write!(f, "duplicate flag"),
240
            FlagRepeatedNegation { .. } => {
241
0
                write!(f, "flag negation operator repeated")
242
            }
243
            FlagUnexpectedEof => {
244
0
                write!(f, "expected flag but got end of regex")
245
            }
246
0
            FlagUnrecognized => write!(f, "unrecognized flag"),
247
            GroupNameDuplicate { .. } => {
248
0
                write!(f, "duplicate capture group name")
249
            }
250
0
            GroupNameEmpty => write!(f, "empty capture group name"),
251
0
            GroupNameInvalid => write!(f, "invalid capture group character"),
252
0
            GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
253
0
            GroupUnclosed => write!(f, "unclosed group"),
254
0
            GroupUnopened => write!(f, "unopened group"),
255
0
            NestLimitExceeded(limit) => write!(
256
0
                f,
257
0
                "exceed the maximum number of \
258
0
                 nested parentheses/brackets ({})",
259
0
                limit
260
0
            ),
261
0
            RepetitionCountInvalid => write!(
262
0
                f,
263
0
                "invalid repetition count range, \
264
0
                 the start must be <= the end"
265
0
            ),
266
            RepetitionCountDecimalEmpty => {
267
0
                write!(f, "repetition quantifier expects a valid decimal")
268
            }
269
            RepetitionCountUnclosed => {
270
0
                write!(f, "unclosed counted repetition")
271
            }
272
            RepetitionMissing => {
273
0
                write!(f, "repetition operator missing expression")
274
            }
275
            SpecialWordBoundaryUnclosed => {
276
0
                write!(
277
0
                    f,
278
0
                    "special word boundary assertion is either \
279
0
                     unclosed or contains an invalid character",
280
0
                )
281
            }
282
            SpecialWordBoundaryUnrecognized => {
283
0
                write!(
284
0
                    f,
285
0
                    "unrecognized special word boundary assertion, \
286
0
                     valid choices are: start, end, start-half \
287
0
                     or end-half",
288
0
                )
289
            }
290
            SpecialWordOrRepetitionUnexpectedEof => {
291
0
                write!(
292
0
                    f,
293
0
                    "found either the beginning of a special word \
294
0
                     boundary or a bounded repetition on a \\b with \
295
0
                     an opening brace, but no closing brace",
296
0
                )
297
            }
298
            UnicodeClassInvalid => {
299
0
                write!(f, "invalid Unicode character class")
300
            }
301
            UnsupportedBackreference => {
302
0
                write!(f, "backreferences are not supported")
303
            }
304
0
            UnsupportedLookAround => write!(
305
0
                f,
306
0
                "look-around, including look-ahead and look-behind, \
307
0
                 is not supported"
308
0
            ),
309
        }
310
0
    }
311
}
312
313
/// Span represents the position information of a single AST item.
314
///
315
/// All span positions are absolute byte offsets that can be used on the
316
/// original regular expression that was parsed.
317
#[derive(Clone, Copy, Eq, PartialEq)]
318
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
319
pub struct Span {
320
    /// The start byte offset.
321
    pub start: Position,
322
    /// The end byte offset.
323
    pub end: Position,
324
}
325
326
impl core::fmt::Debug for Span {
327
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
328
0
        write!(f, "Span({:?}, {:?})", self.start, self.end)
329
0
    }
330
}
331
332
impl Ord for Span {
333
0
    fn cmp(&self, other: &Span) -> Ordering {
334
0
        (&self.start, &self.end).cmp(&(&other.start, &other.end))
335
0
    }
336
}
337
338
impl PartialOrd for Span {
339
0
    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
340
0
        Some(self.cmp(other))
341
0
    }
342
}
343
344
/// A single position in a regular expression.
345
///
346
/// A position encodes one half of a span, and include the byte offset, line
347
/// number and column number.
348
#[derive(Clone, Copy, Eq, PartialEq)]
349
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
350
pub struct Position {
351
    /// The absolute offset of this position, starting at `0` from the
352
    /// beginning of the regular expression pattern string.
353
    pub offset: usize,
354
    /// The line number, starting at `1`.
355
    pub line: usize,
356
    /// The approximate column number, starting at `1`.
357
    pub column: usize,
358
}
359
360
impl core::fmt::Debug for Position {
361
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
362
0
        write!(
363
0
            f,
364
0
            "Position(o: {:?}, l: {:?}, c: {:?})",
365
0
            self.offset, self.line, self.column
366
0
        )
367
0
    }
368
}
369
370
impl Ord for Position {
371
0
    fn cmp(&self, other: &Position) -> Ordering {
372
0
        self.offset.cmp(&other.offset)
373
0
    }
374
}
375
376
impl PartialOrd for Position {
377
0
    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
378
0
        Some(self.cmp(other))
379
0
    }
380
}
381
382
impl Span {
383
    /// Create a new span with the given positions.
384
10.8M
    pub fn new(start: Position, end: Position) -> Span {
385
10.8M
        Span { start, end }
386
10.8M
    }
387
388
    /// Create a new span using the given position as the start and end.
389
2.83M
    pub fn splat(pos: Position) -> Span {
390
2.83M
        Span::new(pos, pos)
391
2.83M
    }
392
393
    /// Create a new span by replacing the starting the position with the one
394
    /// given.
395
0
    pub fn with_start(self, pos: Position) -> Span {
396
0
        Span { start: pos, ..self }
397
0
    }
398
399
    /// Create a new span by replacing the ending the position with the one
400
    /// given.
401
196k
    pub fn with_end(self, pos: Position) -> Span {
402
196k
        Span { end: pos, ..self }
403
196k
    }
404
405
    /// Returns true if and only if this span occurs on a single line.
406
0
    pub fn is_one_line(&self) -> bool {
407
0
        self.start.line == self.end.line
408
0
    }
409
410
    /// Returns true if and only if this span is empty. That is, it points to
411
    /// a single position in the concrete syntax of a regular expression.
412
0
    pub fn is_empty(&self) -> bool {
413
0
        self.start.offset == self.end.offset
414
0
    }
415
}
416
417
impl Position {
418
    /// Create a new position with the given information.
419
    ///
420
    /// `offset` is the absolute offset of the position, starting at `0` from
421
    /// the beginning of the regular expression pattern string.
422
    ///
423
    /// `line` is the line number, starting at `1`.
424
    ///
425
    /// `column` is the approximate column number, starting at `1`.
426
888k
    pub fn new(offset: usize, line: usize, column: usize) -> Position {
427
888k
        Position { offset, line, column }
428
888k
    }
429
}
430
431
/// An abstract syntax tree for a singular expression along with comments
432
/// found.
433
///
434
/// Comments are not stored in the tree itself to avoid complexity. Each
435
/// comment contains a span of precisely where it occurred in the original
436
/// regular expression.
437
#[derive(Clone, Debug, Eq, PartialEq)]
438
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
439
pub struct WithComments {
440
    /// The actual ast.
441
    pub ast: Ast,
442
    /// All comments found in the original regular expression.
443
    pub comments: Vec<Comment>,
444
}
445
446
/// A comment from a regular expression with an associated span.
447
///
448
/// A regular expression can only contain comments when the `x` flag is
449
/// enabled.
450
#[derive(Clone, Debug, Eq, PartialEq)]
451
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
452
pub struct Comment {
453
    /// The span of this comment, including the beginning `#` and ending `\n`.
454
    pub span: Span,
455
    /// The comment text, starting with the first character following the `#`
456
    /// and ending with the last character preceding the `\n`.
457
    pub comment: String,
458
}
459
460
/// An abstract syntax tree for a single regular expression.
461
///
462
/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
463
/// space proportional to the size of the `Ast`.
464
///
465
/// This type defines its own destructor that uses constant stack space and
466
/// heap space proportional to the size of the `Ast`.
467
#[derive(Clone, Debug, Eq, PartialEq)]
468
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
469
pub enum Ast {
470
    /// An empty regex that matches everything.
471
    Empty(Box<Span>),
472
    /// A set of flags, e.g., `(?is)`.
473
    Flags(Box<SetFlags>),
474
    /// A single character literal, which includes escape sequences.
475
    Literal(Box<Literal>),
476
    /// The "any character" class.
477
    Dot(Box<Span>),
478
    /// A single zero-width assertion.
479
    Assertion(Box<Assertion>),
480
    /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`.
481
    ClassUnicode(Box<ClassUnicode>),
482
    /// A single perl character class, e.g., `\d` or `\W`.
483
    ClassPerl(Box<ClassPerl>),
484
    /// A single bracketed character class set, which may contain zero or more
485
    /// character ranges and/or zero or more nested classes. e.g.,
486
    /// `[a-zA-Z\pL]`.
487
    ClassBracketed(Box<ClassBracketed>),
488
    /// A repetition operator applied to an arbitrary regular expression.
489
    Repetition(Box<Repetition>),
490
    /// A grouped regular expression.
491
    Group(Box<Group>),
492
    /// An alternation of regular expressions.
493
    Alternation(Box<Alternation>),
494
    /// A concatenation of regular expressions.
495
    Concat(Box<Concat>),
496
}
497
498
impl Ast {
499
    /// Create an "empty" AST item.
500
1.15M
    pub fn empty(span: Span) -> Ast {
501
1.15M
        Ast::Empty(Box::new(span))
502
1.15M
    }
503
504
    /// Create a "flags" AST item.
505
4
    pub fn flags(e: SetFlags) -> Ast {
506
4
        Ast::Flags(Box::new(e))
507
4
    }
508
509
    /// Create a "literal" AST item.
510
5.84M
    pub fn literal(e: Literal) -> Ast {
511
5.84M
        Ast::Literal(Box::new(e))
512
5.84M
    }
513
514
    /// Create a "dot" AST item.
515
28.5k
    pub fn dot(span: Span) -> Ast {
516
28.5k
        Ast::Dot(Box::new(span))
517
28.5k
    }
518
519
    /// Create a "assertion" AST item.
520
304k
    pub fn assertion(e: Assertion) -> Ast {
521
304k
        Ast::Assertion(Box::new(e))
522
304k
    }
523
524
    /// Create a "Unicode class" AST item.
525
73
    pub fn class_unicode(e: ClassUnicode) -> Ast {
526
73
        Ast::ClassUnicode(Box::new(e))
527
73
    }
528
529
    /// Create a "Perl class" AST item.
530
63.2k
    pub fn class_perl(e: ClassPerl) -> Ast {
531
63.2k
        Ast::ClassPerl(Box::new(e))
532
63.2k
    }
533
534
    /// Create a "bracketed class" AST item.
535
140k
    pub fn class_bracketed(e: ClassBracketed) -> Ast {
536
140k
        Ast::ClassBracketed(Box::new(e))
537
140k
    }
538
539
    /// Create a "repetition" AST item.
540
196k
    pub fn repetition(e: Repetition) -> Ast {
541
196k
        Ast::Repetition(Box::new(e))
542
196k
    }
543
544
    /// Create a "group" AST item.
545
370k
    pub fn group(e: Group) -> Ast {
546
370k
        Ast::Group(Box::new(e))
547
370k
    }
548
549
    /// Create a "alternation" AST item.
550
15.8k
    pub fn alternation(e: Alternation) -> Ast {
551
15.8k
        Ast::Alternation(Box::new(e))
552
15.8k
    }
553
554
    /// Create a "concat" AST item.
555
339k
    pub fn concat(e: Concat) -> Ast {
556
339k
        Ast::Concat(Box::new(e))
557
339k
    }
558
559
    /// Return the span of this abstract syntax tree.
560
196k
    pub fn span(&self) -> &Span {
561
196k
        match *self {
562
0
            Ast::Empty(ref span) => span,
563
0
            Ast::Flags(ref x) => &x.span,
564
3.34k
            Ast::Literal(ref x) => &x.span,
565
283
            Ast::Dot(ref span) => span,
566
0
            Ast::Assertion(ref x) => &x.span,
567
6
            Ast::ClassUnicode(ref x) => &x.span,
568
58.0k
            Ast::ClassPerl(ref x) => &x.span,
569
129k
            Ast::ClassBracketed(ref x) => &x.span,
570
0
            Ast::Repetition(ref x) => &x.span,
571
5.22k
            Ast::Group(ref x) => &x.span,
572
0
            Ast::Alternation(ref x) => &x.span,
573
0
            Ast::Concat(ref x) => &x.span,
574
        }
575
196k
    }
576
577
    /// Return true if and only if this Ast is empty.
578
0
    pub fn is_empty(&self) -> bool {
579
0
        match *self {
580
0
            Ast::Empty(_) => true,
581
0
            _ => false,
582
        }
583
0
    }
584
585
    /// Returns true if and only if this AST has any (including possibly empty)
586
    /// subexpressions.
587
598k
    fn has_subexprs(&self) -> bool {
588
598k
        match *self {
589
            Ast::Empty(_)
590
            | Ast::Flags(_)
591
            | Ast::Literal(_)
592
            | Ast::Dot(_)
593
            | Ast::Assertion(_)
594
            | Ast::ClassUnicode(_)
595
567k
            | Ast::ClassPerl(_) => false,
596
            Ast::ClassBracketed(_)
597
            | Ast::Repetition(_)
598
            | Ast::Group(_)
599
            | Ast::Alternation(_)
600
31.7k
            | Ast::Concat(_) => true,
601
        }
602
598k
    }
603
}
604
605
/// Print a display representation of this Ast.
606
///
607
/// This does not preserve any of the original whitespace formatting that may
608
/// have originally been present in the concrete syntax from which this Ast
609
/// was generated.
610
///
611
/// This implementation uses constant stack space and heap space proportional
612
/// to the size of the `Ast`.
613
impl core::fmt::Display for Ast {
614
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
615
0
        use crate::ast::print::Printer;
616
0
        Printer::new().print(self, f)
617
0
    }
618
}
619
620
/// An alternation of regular expressions.
621
#[derive(Clone, Debug, Eq, PartialEq)]
622
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
623
pub struct Alternation {
624
    /// The span of this alternation.
625
    pub span: Span,
626
    /// The alternate regular expressions.
627
    pub asts: Vec<Ast>,
628
}
629
630
impl Alternation {
631
    /// Return this alternation as an AST.
632
    ///
633
    /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
634
    /// If this alternation contains exactly 1 AST, then the corresponding AST
635
    /// is returned. Otherwise, `Ast::alternation` is returned.
636
15.5k
    pub fn into_ast(mut self) -> Ast {
637
15.5k
        match self.asts.len() {
638
0
            0 => Ast::empty(self.span),
639
0
            1 => self.asts.pop().unwrap(),
640
15.5k
            _ => Ast::alternation(self),
641
        }
642
15.5k
    }
643
}
644
645
/// A concatenation of regular expressions.
646
#[derive(Clone, Debug, Eq, PartialEq)]
647
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
648
pub struct Concat {
649
    /// The span of this concatenation.
650
    pub span: Span,
651
    /// The concatenation regular expressions.
652
    pub asts: Vec<Ast>,
653
}
654
655
impl Concat {
656
    /// Return this concatenation as an AST.
657
    ///
658
    /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
659
    /// If this alternation contains exactly 1 AST, then the corresponding AST
660
    /// is returned. Otherwise, `Ast::concat` is returned.
661
695k
    pub fn into_ast(mut self) -> Ast {
662
695k
        match self.asts.len() {
663
362
            0 => Ast::empty(self.span),
664
355k
            1 => self.asts.pop().unwrap(),
665
339k
            _ => Ast::concat(self),
666
        }
667
695k
    }
668
}
669
670
/// A single literal expression.
671
///
672
/// A literal corresponds to a single Unicode scalar value. Literals may be
673
/// represented in their literal form, e.g., `a` or in their escaped form,
674
/// e.g., `\x61`.
675
#[derive(Clone, Debug, Eq, PartialEq)]
676
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
677
pub struct Literal {
678
    /// The span of this literal.
679
    pub span: Span,
680
    /// The kind of this literal.
681
    pub kind: LiteralKind,
682
    /// The Unicode scalar value corresponding to this literal.
683
    pub c: char,
684
}
685
686
impl Literal {
687
    /// If this literal was written as a `\x` hex escape, then this returns
688
    /// the corresponding byte value. Otherwise, this returns `None`.
689
0
    pub fn byte(&self) -> Option<u8> {
690
0
        match self.kind {
691
            LiteralKind::HexFixed(HexLiteralKind::X) => {
692
0
                u8::try_from(self.c).ok()
693
            }
694
0
            _ => None,
695
        }
696
0
    }
697
}
698
699
/// The kind of a single literal expression.
700
#[derive(Clone, Debug, Eq, PartialEq)]
701
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
702
pub enum LiteralKind {
703
    /// The literal is written verbatim, e.g., `a` or `☃`.
704
    Verbatim,
705
    /// The literal is written as an escape because it is otherwise a special
706
    /// regex meta character, e.g., `\*` or `\[`.
707
    Meta,
708
    /// The literal is written as an escape despite the fact that the escape is
709
    /// unnecessary, e.g., `\%` or `\/`.
710
    Superfluous,
711
    /// The literal is written as an octal escape, e.g., `\141`.
712
    Octal,
713
    /// The literal is written as a hex code with a fixed number of digits
714
    /// depending on the type of the escape, e.g., `\x61` or `\u0061` or
715
    /// `\U00000061`.
716
    HexFixed(HexLiteralKind),
717
    /// The literal is written as a hex code with a bracketed number of
718
    /// digits. The only restriction is that the bracketed hex code must refer
719
    /// to a valid Unicode scalar value.
720
    HexBrace(HexLiteralKind),
721
    /// The literal is written as a specially recognized escape, e.g., `\f`
722
    /// or `\n`.
723
    Special(SpecialLiteralKind),
724
}
725
726
/// The type of a special literal.
727
///
728
/// A special literal is a special escape sequence recognized by the regex
729
/// parser, e.g., `\f` or `\n`.
730
#[derive(Clone, Debug, Eq, PartialEq)]
731
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
732
pub enum SpecialLiteralKind {
733
    /// Bell, spelled `\a` (`\x07`).
734
    Bell,
735
    /// Form feed, spelled `\f` (`\x0C`).
736
    FormFeed,
737
    /// Tab, spelled `\t` (`\x09`).
738
    Tab,
739
    /// Line feed, spelled `\n` (`\x0A`).
740
    LineFeed,
741
    /// Carriage return, spelled `\r` (`\x0D`).
742
    CarriageReturn,
743
    /// Vertical tab, spelled `\v` (`\x0B`).
744
    VerticalTab,
745
    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
746
    /// parsing in verbose mode.
747
    Space,
748
}
749
750
/// The type of a Unicode hex literal.
751
///
752
/// Note that all variants behave the same when used with brackets. They only
753
/// differ when used without brackets in the number of hex digits that must
754
/// follow.
755
#[derive(Clone, Debug, Eq, PartialEq)]
756
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
757
pub enum HexLiteralKind {
758
    /// A `\x` prefix. When used without brackets, this form is limited to
759
    /// two digits.
760
    X,
761
    /// A `\u` prefix. When used without brackets, this form is limited to
762
    /// four digits.
763
    UnicodeShort,
764
    /// A `\U` prefix. When used without brackets, this form is limited to
765
    /// eight digits.
766
    UnicodeLong,
767
}
768
769
impl HexLiteralKind {
770
    /// The number of digits that must be used with this literal form when
771
    /// used without brackets. When used with brackets, there is no
772
    /// restriction on the number of digits.
773
0
    pub fn digits(&self) -> u32 {
774
0
        match *self {
775
0
            HexLiteralKind::X => 2,
776
0
            HexLiteralKind::UnicodeShort => 4,
777
0
            HexLiteralKind::UnicodeLong => 8,
778
        }
779
0
    }
780
}
781
782
/// A Perl character class.
783
#[derive(Clone, Debug, Eq, PartialEq)]
784
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
785
pub struct ClassPerl {
786
    /// The span of this class.
787
    pub span: Span,
788
    /// The kind of Perl class.
789
    pub kind: ClassPerlKind,
790
    /// Whether the class is negated or not. e.g., `\d` is not negated but
791
    /// `\D` is.
792
    pub negated: bool,
793
}
794
795
/// The available Perl character classes.
796
#[derive(Clone, Debug, Eq, PartialEq)]
797
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
798
pub enum ClassPerlKind {
799
    /// Decimal numbers.
800
    Digit,
801
    /// Whitespace.
802
    Space,
803
    /// Word characters.
804
    Word,
805
}
806
807
/// An ASCII character class.
808
#[derive(Clone, Debug, Eq, PartialEq)]
809
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
810
pub struct ClassAscii {
811
    /// The span of this class.
812
    pub span: Span,
813
    /// The kind of ASCII class.
814
    pub kind: ClassAsciiKind,
815
    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
816
    /// but `[[:^alpha:]]` is.
817
    pub negated: bool,
818
}
819
820
/// The available ASCII character classes.
821
#[derive(Clone, Debug, Eq, PartialEq)]
822
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
823
pub enum ClassAsciiKind {
824
    /// `[0-9A-Za-z]`
825
    Alnum,
826
    /// `[A-Za-z]`
827
    Alpha,
828
    /// `[\x00-\x7F]`
829
    Ascii,
830
    /// `[ \t]`
831
    Blank,
832
    /// `[\x00-\x1F\x7F]`
833
    Cntrl,
834
    /// `[0-9]`
835
    Digit,
836
    /// `[!-~]`
837
    Graph,
838
    /// `[a-z]`
839
    Lower,
840
    /// `[ -~]`
841
    Print,
842
    /// `[!-/:-@\[-`{-~]`
843
    Punct,
844
    /// `[\t\n\v\f\r ]`
845
    Space,
846
    /// `[A-Z]`
847
    Upper,
848
    /// `[0-9A-Za-z_]`
849
    Word,
850
    /// `[0-9A-Fa-f]`
851
    Xdigit,
852
}
853
854
impl ClassAsciiKind {
855
    /// Return the corresponding ClassAsciiKind variant for the given name.
856
    ///
857
    /// The name given should correspond to the lowercase version of the
858
    /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
859
    ///
860
    /// If no variant with the corresponding name exists, then `None` is
861
    /// returned.
862
127
    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
863
127
        use self::ClassAsciiKind::*;
864
127
        match name {
865
127
            "alnum" => Some(Alnum),
866
127
            "alpha" => Some(Alpha),
867
127
            "ascii" => Some(Ascii),
868
15
            "blank" => Some(Blank),
869
15
            "cntrl" => Some(Cntrl),
870
15
            "digit" => Some(Digit),
871
15
            "graph" => Some(Graph),
872
15
            "lower" => Some(Lower),
873
15
            "print" => Some(Print),
874
15
            "punct" => Some(Punct),
875
15
            "space" => Some(Space),
876
12
            "upper" => Some(Upper),
877
12
            "word" => Some(Word),
878
6
            "xdigit" => Some(Xdigit),
879
0
            _ => None,
880
        }
881
127
    }
882
}
883
884
/// A Unicode character class.
885
#[derive(Clone, Debug, Eq, PartialEq)]
886
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
887
pub struct ClassUnicode {
888
    /// The span of this class.
889
    pub span: Span,
890
    /// Whether this class is negated or not.
891
    ///
892
    /// Note: be careful when using this attribute. This specifically refers
893
    /// to whether the class is written as `\p` or `\P`, where the latter
894
    /// is `negated = true`. However, it also possible to write something like
895
    /// `\P{scx!=Katakana}` which is actually equivalent to
896
    /// `\p{scx=Katakana}` and is therefore not actually negated even though
897
    /// `negated = true` here. To test whether this class is truly negated
898
    /// or not, use the `is_negated` method.
899
    pub negated: bool,
900
    /// The kind of Unicode class.
901
    pub kind: ClassUnicodeKind,
902
}
903
904
impl ClassUnicode {
905
    /// Returns true if this class has been negated.
906
    ///
907
    /// Note that this takes the Unicode op into account, if it's present.
908
    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
909
0
    pub fn is_negated(&self) -> bool {
910
0
        match self.kind {
911
            ClassUnicodeKind::NamedValue {
912
                op: ClassUnicodeOpKind::NotEqual,
913
                ..
914
0
            } => !self.negated,
915
0
            _ => self.negated,
916
        }
917
0
    }
918
}
919
920
/// The available forms of Unicode character classes.
921
#[derive(Clone, Debug, Eq, PartialEq)]
922
pub enum ClassUnicodeKind {
923
    /// A one letter abbreviated class, e.g., `\pN`.
924
    OneLetter(char),
925
    /// A binary property, general category or script. The string may be
926
    /// empty.
927
    Named(String),
928
    /// A property name and an associated value.
929
    NamedValue {
930
        /// The type of Unicode op used to associate `name` with `value`.
931
        op: ClassUnicodeOpKind,
932
        /// The property name (which may be empty).
933
        name: String,
934
        /// The property value (which may be empty).
935
        value: String,
936
    },
937
}
938
939
#[cfg(feature = "arbitrary")]
940
impl arbitrary::Arbitrary<'_> for ClassUnicodeKind {
941
    fn arbitrary(
942
        u: &mut arbitrary::Unstructured,
943
    ) -> arbitrary::Result<ClassUnicodeKind> {
944
        #[cfg(any(
945
            feature = "unicode-age",
946
            feature = "unicode-bool",
947
            feature = "unicode-gencat",
948
            feature = "unicode-perl",
949
            feature = "unicode-script",
950
            feature = "unicode-segment",
951
        ))]
952
        {
953
            use alloc::string::ToString;
954
955
            use super::unicode_tables::{
956
                property_names::PROPERTY_NAMES,
957
                property_values::PROPERTY_VALUES,
958
            };
959
960
            match u.choose_index(3)? {
961
                0 => {
962
                    let all = PROPERTY_VALUES
963
                        .iter()
964
                        .flat_map(|e| e.1.iter())
965
                        .filter(|(name, _)| name.len() == 1)
966
                        .count();
967
                    let idx = u.choose_index(all)?;
968
                    let value = PROPERTY_VALUES
969
                        .iter()
970
                        .flat_map(|e| e.1.iter())
971
                        .take(idx + 1)
972
                        .last()
973
                        .unwrap()
974
                        .0
975
                        .chars()
976
                        .next()
977
                        .unwrap();
978
                    Ok(ClassUnicodeKind::OneLetter(value))
979
                }
980
                1 => {
981
                    let all = PROPERTY_VALUES
982
                        .iter()
983
                        .map(|e| e.1.len())
984
                        .sum::<usize>()
985
                        + PROPERTY_NAMES.len();
986
                    let idx = u.choose_index(all)?;
987
                    let name = PROPERTY_VALUES
988
                        .iter()
989
                        .flat_map(|e| e.1.iter())
990
                        .chain(PROPERTY_NAMES)
991
                        .map(|(_, e)| e)
992
                        .take(idx + 1)
993
                        .last()
994
                        .unwrap();
995
                    Ok(ClassUnicodeKind::Named(name.to_string()))
996
                }
997
                2 => {
998
                    let all = PROPERTY_VALUES
999
                        .iter()
1000
                        .map(|e| e.1.len())
1001
                        .sum::<usize>();
1002
                    let idx = u.choose_index(all)?;
1003
                    let (prop, value) = PROPERTY_VALUES
1004
                        .iter()
1005
                        .flat_map(|e| {
1006
                            e.1.iter().map(|(_, value)| (e.0, value))
1007
                        })
1008
                        .take(idx + 1)
1009
                        .last()
1010
                        .unwrap();
1011
                    Ok(ClassUnicodeKind::NamedValue {
1012
                        op: u.arbitrary()?,
1013
                        name: prop.to_string(),
1014
                        value: value.to_string(),
1015
                    })
1016
                }
1017
                _ => unreachable!("index chosen is impossible"),
1018
            }
1019
        }
1020
        #[cfg(not(any(
1021
            feature = "unicode-age",
1022
            feature = "unicode-bool",
1023
            feature = "unicode-gencat",
1024
            feature = "unicode-perl",
1025
            feature = "unicode-script",
1026
            feature = "unicode-segment",
1027
        )))]
1028
        {
1029
            match u.choose_index(3)? {
1030
                0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)),
1031
                1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)),
1032
                2 => Ok(ClassUnicodeKind::NamedValue {
1033
                    op: u.arbitrary()?,
1034
                    name: u.arbitrary()?,
1035
                    value: u.arbitrary()?,
1036
                }),
1037
                _ => unreachable!("index chosen is impossible"),
1038
            }
1039
        }
1040
    }
1041
1042
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
1043
        #[cfg(any(
1044
            feature = "unicode-age",
1045
            feature = "unicode-bool",
1046
            feature = "unicode-gencat",
1047
            feature = "unicode-perl",
1048
            feature = "unicode-script",
1049
            feature = "unicode-segment",
1050
        ))]
1051
        {
1052
            arbitrary::size_hint::and_all(&[
1053
                usize::size_hint(depth),
1054
                usize::size_hint(depth),
1055
                arbitrary::size_hint::or(
1056
                    (0, Some(0)),
1057
                    ClassUnicodeOpKind::size_hint(depth),
1058
                ),
1059
            ])
1060
        }
1061
        #[cfg(not(any(
1062
            feature = "unicode-age",
1063
            feature = "unicode-bool",
1064
            feature = "unicode-gencat",
1065
            feature = "unicode-perl",
1066
            feature = "unicode-script",
1067
            feature = "unicode-segment",
1068
        )))]
1069
        {
1070
            arbitrary::size_hint::and(
1071
                usize::size_hint(depth),
1072
                arbitrary::size_hint::or_all(&[
1073
                    char::size_hint(depth),
1074
                    String::size_hint(depth),
1075
                    arbitrary::size_hint::and_all(&[
1076
                        String::size_hint(depth),
1077
                        String::size_hint(depth),
1078
                        ClassUnicodeOpKind::size_hint(depth),
1079
                    ]),
1080
                ]),
1081
            )
1082
        }
1083
    }
1084
}
1085
1086
/// The type of op used in a Unicode character class.
1087
#[derive(Clone, Debug, Eq, PartialEq)]
1088
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1089
pub enum ClassUnicodeOpKind {
1090
    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
1091
    Equal,
1092
    /// A property set to a specific value using a colon, e.g.,
1093
    /// `\p{scx:Katakana}`.
1094
    Colon,
1095
    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
1096
    NotEqual,
1097
}
1098
1099
impl ClassUnicodeOpKind {
1100
    /// Whether the op is an equality op or not.
1101
0
    pub fn is_equal(&self) -> bool {
1102
0
        match *self {
1103
0
            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
1104
0
            _ => false,
1105
        }
1106
0
    }
1107
}
1108
1109
/// A bracketed character class, e.g., `[a-z0-9]`.
1110
#[derive(Clone, Debug, Eq, PartialEq)]
1111
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1112
pub struct ClassBracketed {
1113
    /// The span of this class.
1114
    pub span: Span,
1115
    /// Whether this class is negated or not. e.g., `[a]` is not negated but
1116
    /// `[^a]` is.
1117
    pub negated: bool,
1118
    /// The type of this set. A set is either a normal union of things, e.g.,
1119
    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
1120
    pub kind: ClassSet,
1121
}
1122
1123
/// A character class set.
1124
///
1125
/// This type corresponds to the internal structure of a bracketed character
1126
/// class. That is, every bracketed character is one of two types: a union of
1127
/// items (literals, ranges, other bracketed classes) or a tree of binary set
1128
/// operations.
1129
#[derive(Clone, Debug, Eq, PartialEq)]
1130
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1131
pub enum ClassSet {
1132
    /// An item, which can be a single literal, range, nested character class
1133
    /// or a union of items.
1134
    Item(ClassSetItem),
1135
    /// A single binary operation (i.e., &&, -- or ~~).
1136
    BinaryOp(ClassSetBinaryOp),
1137
}
1138
1139
impl ClassSet {
1140
    /// Build a set from a union.
1141
143k
    pub fn union(ast: ClassSetUnion) -> ClassSet {
1142
143k
        ClassSet::Item(ClassSetItem::Union(ast))
1143
143k
    }
1144
1145
    /// Return the span of this character class set.
1146
1.90k
    pub fn span(&self) -> &Span {
1147
1.90k
        match *self {
1148
1.90k
            ClassSet::Item(ref x) => x.span(),
1149
0
            ClassSet::BinaryOp(ref x) => &x.span,
1150
        }
1151
1.90k
    }
1152
1153
    /// Return true if and only if this class set is empty.
1154
5.17k
    fn is_empty(&self) -> bool {
1155
5.17k
        match *self {
1156
4.89k
            ClassSet::Item(ClassSetItem::Empty(_)) => true,
1157
277
            _ => false,
1158
        }
1159
5.17k
    }
1160
}
1161
1162
/// A single component of a character class set.
1163
#[derive(Clone, Debug, Eq, PartialEq)]
1164
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1165
pub enum ClassSetItem {
1166
    /// An empty item.
1167
    ///
1168
    /// Note that a bracketed character class cannot contain a single empty
1169
    /// item. Empty items can appear when using one of the binary operators.
1170
    /// For example, `[&&]` is the intersection of two empty classes.
1171
    Empty(Span),
1172
    /// A single literal.
1173
    Literal(Literal),
1174
    /// A range between two literals.
1175
    Range(ClassSetRange),
1176
    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
1177
    Ascii(ClassAscii),
1178
    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
1179
    Unicode(ClassUnicode),
1180
    /// A perl character class, e.g., `\d` or `\W`.
1181
    Perl(ClassPerl),
1182
    /// A bracketed character class set, which may contain zero or more
1183
    /// character ranges and/or zero or more nested classes. e.g.,
1184
    /// `[a-zA-Z\pL]`.
1185
    Bracketed(Box<ClassBracketed>),
1186
    /// A union of items.
1187
    Union(ClassSetUnion),
1188
}
1189
1190
impl ClassSetItem {
1191
    /// Return the span of this character class set item.
1192
777k
    pub fn span(&self) -> &Span {
1193
777k
        match *self {
1194
0
            ClassSetItem::Empty(ref span) => span,
1195
739k
            ClassSetItem::Literal(ref x) => &x.span,
1196
12.5k
            ClassSetItem::Range(ref x) => &x.span,
1197
254
            ClassSetItem::Ascii(ref x) => &x.span,
1198
6.91k
            ClassSetItem::Perl(ref x) => &x.span,
1199
11.9k
            ClassSetItem::Unicode(ref x) => &x.span,
1200
6.82k
            ClassSetItem::Bracketed(ref x) => &x.span,
1201
112
            ClassSetItem::Union(ref x) => &x.span,
1202
        }
1203
777k
    }
1204
}
1205
1206
/// A single character class range in a set.
1207
#[derive(Clone, Debug, Eq, PartialEq)]
1208
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1209
pub struct ClassSetRange {
1210
    /// The span of this range.
1211
    pub span: Span,
1212
    /// The start of this range.
1213
    pub start: Literal,
1214
    /// The end of this range.
1215
    pub end: Literal,
1216
}
1217
1218
impl ClassSetRange {
1219
    /// Returns true if and only if this character class range is valid.
1220
    ///
1221
    /// The only case where a range is invalid is if its start is greater than
1222
    /// its end.
1223
8.67k
    pub fn is_valid(&self) -> bool {
1224
8.67k
        self.start.c <= self.end.c
1225
8.67k
    }
1226
}
1227
1228
/// A union of items inside a character class set.
1229
#[derive(Clone, Debug, Eq, PartialEq)]
1230
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1231
pub struct ClassSetUnion {
1232
    /// The span of the items in this operation. e.g., the `a-z0-9` in
1233
    /// `[^a-z0-9]`
1234
    pub span: Span,
1235
    /// The sequence of items that make up this union.
1236
    pub items: Vec<ClassSetItem>,
1237
}
1238
1239
impl ClassSetUnion {
1240
    /// Push a new item in this union.
1241
    ///
1242
    /// The ending position of this union's span is updated to the ending
1243
    /// position of the span of the item given. If the union is empty, then
1244
    /// the starting position of this union is set to the starting position
1245
    /// of this item.
1246
    ///
1247
    /// In other words, if you only use this method to add items to a union
1248
    /// and you set the spans on each item correctly, then you should never
1249
    /// need to adjust the span of the union directly.
1250
631k
    pub fn push(&mut self, item: ClassSetItem) {
1251
631k
        if self.items.is_empty() {
1252
144k
            self.span.start = item.span().start;
1253
487k
        }
1254
631k
        self.span.end = item.span().end;
1255
631k
        self.items.push(item);
1256
631k
    }
1257
1258
    /// Return this union as a character class set item.
1259
    ///
1260
    /// If this union contains zero items, then an empty union is
1261
    /// returned. If this concatenation contains exactly 1 item, then the
1262
    /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1263
    /// returned.
1264
144k
    pub fn into_item(mut self) -> ClassSetItem {
1265
144k
        match self.items.len() {
1266
0
            0 => ClassSetItem::Empty(self.span),
1267
40.9k
            1 => self.items.pop().unwrap(),
1268
103k
            _ => ClassSetItem::Union(self),
1269
        }
1270
144k
    }
1271
}
1272
1273
/// A Unicode character class set operation.
1274
#[derive(Clone, Debug, Eq, PartialEq)]
1275
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1276
pub struct ClassSetBinaryOp {
1277
    /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1278
    pub span: Span,
1279
    /// The type of this set operation.
1280
    pub kind: ClassSetBinaryOpKind,
1281
    /// The left hand side of the operation.
1282
    pub lhs: Box<ClassSet>,
1283
    /// The right hand side of the operation.
1284
    pub rhs: Box<ClassSet>,
1285
}
1286
1287
/// The type of a Unicode character class set operation.
1288
///
1289
/// Note that this doesn't explicitly represent union since there is no
1290
/// explicit union operator. Concatenation inside a character class corresponds
1291
/// to the union operation.
1292
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1293
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1294
pub enum ClassSetBinaryOpKind {
1295
    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1296
    Intersection,
1297
    /// The difference of two sets, e.g., `\pN--[0-9]`.
1298
    Difference,
1299
    /// The symmetric difference of two sets. The symmetric difference is the
1300
    /// set of elements belonging to one but not both sets.
1301
    /// e.g., `[\pL~~[:ascii:]]`.
1302
    SymmetricDifference,
1303
}
1304
1305
/// A single zero-width assertion.
1306
#[derive(Clone, Debug, Eq, PartialEq)]
1307
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1308
pub struct Assertion {
1309
    /// The span of this assertion.
1310
    pub span: Span,
1311
    /// The assertion kind, e.g., `\b` or `^`.
1312
    pub kind: AssertionKind,
1313
}
1314
1315
/// An assertion kind.
1316
#[derive(Clone, Debug, Eq, PartialEq)]
1317
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1318
pub enum AssertionKind {
1319
    /// `^`
1320
    StartLine,
1321
    /// `$`
1322
    EndLine,
1323
    /// `\A`
1324
    StartText,
1325
    /// `\z`
1326
    EndText,
1327
    /// `\b`
1328
    WordBoundary,
1329
    /// `\B`
1330
    NotWordBoundary,
1331
    /// `\b{start}`
1332
    WordBoundaryStart,
1333
    /// `\b{end}`
1334
    WordBoundaryEnd,
1335
    /// `\<` (alias for `\b{start}`)
1336
    WordBoundaryStartAngle,
1337
    /// `\>` (alias for `\b{end}`)
1338
    WordBoundaryEndAngle,
1339
    /// `\b{start-half}`
1340
    WordBoundaryStartHalf,
1341
    /// `\b{end-half}`
1342
    WordBoundaryEndHalf,
1343
}
1344
1345
/// A repetition operation applied to a regular expression.
1346
#[derive(Clone, Debug, Eq, PartialEq)]
1347
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1348
pub struct Repetition {
1349
    /// The span of this operation.
1350
    pub span: Span,
1351
    /// The actual operation.
1352
    pub op: RepetitionOp,
1353
    /// Whether this operation was applied greedily or not.
1354
    pub greedy: bool,
1355
    /// The regular expression under repetition.
1356
    pub ast: Box<Ast>,
1357
}
1358
1359
/// The repetition operator itself.
1360
#[derive(Clone, Debug, Eq, PartialEq)]
1361
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1362
pub struct RepetitionOp {
1363
    /// The span of this operator. This includes things like `+`, `*?` and
1364
    /// `{m,n}`.
1365
    pub span: Span,
1366
    /// The type of operation.
1367
    pub kind: RepetitionKind,
1368
}
1369
1370
/// The kind of a repetition operator.
1371
#[derive(Clone, Debug, Eq, PartialEq)]
1372
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1373
pub enum RepetitionKind {
1374
    /// `?`
1375
    ZeroOrOne,
1376
    /// `*`
1377
    ZeroOrMore,
1378
    /// `+`
1379
    OneOrMore,
1380
    /// `{m,n}`
1381
    Range(RepetitionRange),
1382
}
1383
1384
/// A range repetition operator.
1385
#[derive(Clone, Debug, Eq, PartialEq)]
1386
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1387
pub enum RepetitionRange {
1388
    /// `{m}`
1389
    Exactly(u32),
1390
    /// `{m,}`
1391
    AtLeast(u32),
1392
    /// `{m,n}`
1393
    Bounded(u32, u32),
1394
}
1395
1396
impl RepetitionRange {
1397
    /// Returns true if and only if this repetition range is valid.
1398
    ///
1399
    /// The only case where a repetition range is invalid is if it is bounded
1400
    /// and its start is greater than its end.
1401
38.0k
    pub fn is_valid(&self) -> bool {
1402
34.8k
        match *self {
1403
34.8k
            RepetitionRange::Bounded(s, e) if s > e => false,
1404
38.0k
            _ => true,
1405
        }
1406
38.0k
    }
1407
}
1408
1409
/// A grouped regular expression.
1410
///
1411
/// This includes both capturing and non-capturing groups. This does **not**
1412
/// include flag-only groups like `(?is)`, but does contain any group that
1413
/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1414
/// `(?is:a)`.
1415
#[derive(Clone, Debug, Eq, PartialEq)]
1416
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1417
pub struct Group {
1418
    /// The span of this group.
1419
    pub span: Span,
1420
    /// The kind of this group.
1421
    pub kind: GroupKind,
1422
    /// The regular expression in this group.
1423
    pub ast: Box<Ast>,
1424
}
1425
1426
impl Group {
1427
    /// If this group is non-capturing, then this returns the (possibly empty)
1428
    /// set of flags. Otherwise, `None` is returned.
1429
740k
    pub fn flags(&self) -> Option<&Flags> {
1430
740k
        match self.kind {
1431
455k
            GroupKind::NonCapturing(ref flags) => Some(flags),
1432
285k
            _ => None,
1433
        }
1434
740k
    }
1435
1436
    /// Returns true if and only if this group is capturing.
1437
0
    pub fn is_capturing(&self) -> bool {
1438
0
        match self.kind {
1439
0
            GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true,
1440
0
            GroupKind::NonCapturing(_) => false,
1441
        }
1442
0
    }
1443
1444
    /// Returns the capture index of this group, if this is a capturing group.
1445
    ///
1446
    /// This returns a capture index precisely when `is_capturing` is `true`.
1447
0
    pub fn capture_index(&self) -> Option<u32> {
1448
0
        match self.kind {
1449
0
            GroupKind::CaptureIndex(i) => Some(i),
1450
0
            GroupKind::CaptureName { ref name, .. } => Some(name.index),
1451
0
            GroupKind::NonCapturing(_) => None,
1452
        }
1453
0
    }
1454
}
1455
1456
/// The kind of a group.
1457
#[derive(Clone, Debug, Eq, PartialEq)]
1458
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1459
pub enum GroupKind {
1460
    /// `(a)`
1461
    CaptureIndex(u32),
1462
    /// `(?<name>a)` or `(?P<name>a)`
1463
    CaptureName {
1464
        /// True if the `?P<` syntax is used and false if the `?<` syntax is used.
1465
        starts_with_p: bool,
1466
        /// The capture name.
1467
        name: CaptureName,
1468
    },
1469
    /// `(?:a)` and `(?i:a)`
1470
    NonCapturing(Flags),
1471
}
1472
1473
/// A capture name.
1474
///
1475
/// This corresponds to the name itself between the angle brackets in, e.g.,
1476
/// `(?P<foo>expr)`.
1477
#[derive(Clone, Debug, Eq, PartialEq)]
1478
pub struct CaptureName {
1479
    /// The span of this capture name.
1480
    pub span: Span,
1481
    /// The capture name.
1482
    pub name: String,
1483
    /// The capture index.
1484
    pub index: u32,
1485
}
1486
1487
#[cfg(feature = "arbitrary")]
1488
impl arbitrary::Arbitrary<'_> for CaptureName {
1489
    fn arbitrary(
1490
        u: &mut arbitrary::Unstructured,
1491
    ) -> arbitrary::Result<CaptureName> {
1492
        let len = u.arbitrary_len::<char>()?;
1493
        if len == 0 {
1494
            return Err(arbitrary::Error::NotEnoughData);
1495
        }
1496
        let mut name: String = String::new();
1497
        for _ in 0..len {
1498
            let ch: char = u.arbitrary()?;
1499
            let cp = u32::from(ch);
1500
            let ascii_letter_offset = u8::try_from(cp % 26).unwrap();
1501
            let ascii_letter = b'a' + ascii_letter_offset;
1502
            name.push(char::from(ascii_letter));
1503
        }
1504
        Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? })
1505
    }
1506
1507
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
1508
        arbitrary::size_hint::and_all(&[
1509
            Span::size_hint(depth),
1510
            usize::size_hint(depth),
1511
            u32::size_hint(depth),
1512
        ])
1513
    }
1514
}
1515
1516
/// A group of flags that is not applied to a particular regular expression.
1517
#[derive(Clone, Debug, Eq, PartialEq)]
1518
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1519
pub struct SetFlags {
1520
    /// The span of these flags, including the grouping parentheses.
1521
    pub span: Span,
1522
    /// The actual sequence of flags.
1523
    pub flags: Flags,
1524
}
1525
1526
/// A group of flags.
1527
///
1528
/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1529
#[derive(Clone, Debug, Eq, PartialEq)]
1530
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1531
pub struct Flags {
1532
    /// The span of this group of flags.
1533
    pub span: Span,
1534
    /// A sequence of flag items. Each item is either a flag or a negation
1535
    /// operator.
1536
    pub items: Vec<FlagsItem>,
1537
}
1538
1539
impl Flags {
1540
    /// Add the given item to this sequence of flags.
1541
    ///
1542
    /// If the item was added successfully, then `None` is returned. If the
1543
    /// given item is a duplicate, then `Some(i)` is returned, where
1544
    /// `items[i].kind == item.kind`.
1545
222k
    pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1546
222k
        for (i, x) in self.items.iter().enumerate() {
1547
0
            if x.kind == item.kind {
1548
0
                return Some(i);
1549
0
            }
1550
        }
1551
222k
        self.items.push(item);
1552
222k
        None
1553
222k
    }
1554
1555
    /// Returns the state of the given flag in this set.
1556
    ///
1557
    /// If the given flag is in the set but is negated, then `Some(false)` is
1558
    /// returned.
1559
    ///
1560
    /// If the given flag is in the set and is not negated, then `Some(true)`
1561
    /// is returned.
1562
    ///
1563
    /// Otherwise, `None` is returned.
1564
227k
    pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1565
227k
        let mut negated = false;
1566
450k
        for x in &self.items {
1567
222k
            match x.kind {
1568
0
                FlagsItemKind::Negation => {
1569
0
                    negated = true;
1570
0
                }
1571
222k
                FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1572
4
                    return Some(!negated);
1573
                }
1574
222k
                _ => {}
1575
            }
1576
        }
1577
227k
        None
1578
227k
    }
1579
}
1580
1581
/// A single item in a group of flags.
1582
#[derive(Clone, Debug, Eq, PartialEq)]
1583
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1584
pub struct FlagsItem {
1585
    /// The span of this item.
1586
    pub span: Span,
1587
    /// The kind of this item.
1588
    pub kind: FlagsItemKind,
1589
}
1590
1591
/// The kind of an item in a group of flags.
1592
#[derive(Clone, Debug, Eq, PartialEq)]
1593
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1594
pub enum FlagsItemKind {
1595
    /// A negation operator applied to all subsequent flags in the enclosing
1596
    /// group.
1597
    Negation,
1598
    /// A single flag in a group.
1599
    Flag(Flag),
1600
}
1601
1602
impl FlagsItemKind {
1603
    /// Returns true if and only if this item is a negation operator.
1604
0
    pub fn is_negation(&self) -> bool {
1605
0
        match *self {
1606
0
            FlagsItemKind::Negation => true,
1607
0
            _ => false,
1608
        }
1609
0
    }
1610
}
1611
1612
/// A single flag.
1613
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1614
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1615
pub enum Flag {
1616
    /// `i`
1617
    CaseInsensitive,
1618
    /// `m`
1619
    MultiLine,
1620
    /// `s`
1621
    DotMatchesNewLine,
1622
    /// `U`
1623
    SwapGreed,
1624
    /// `u`
1625
    Unicode,
1626
    /// `R`
1627
    CRLF,
1628
    /// `x`
1629
    IgnoreWhitespace,
1630
}
1631
1632
/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1633
/// space but heap space proportional to the depth of the `Ast`.
1634
impl Drop for Ast {
1635
8.45M
    fn drop(&mut self) {
1636
        use core::mem;
1637
1638
196k
        match *self {
1639
            Ast::Empty(_)
1640
            | Ast::Flags(_)
1641
            | Ast::Literal(_)
1642
            | Ast::Dot(_)
1643
            | Ast::Assertion(_)
1644
            | Ast::ClassUnicode(_)
1645
            | Ast::ClassPerl(_)
1646
            // Bracketed classes are recursive, they get their own Drop impl.
1647
7.31M
            | Ast::ClassBracketed(_) => return,
1648
196k
            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1649
402k
            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1650
16.2k
            Ast::Alternation(ref x) if x.asts.is_empty() => return,
1651
522k
            Ast::Concat(ref x) if x.asts.is_empty() => return,
1652
214k
            _ => {}
1653
214k
        }
1654
214k
1655
781k
        let empty_span = || Span::splat(Position::new(0, 0, 0));
1656
781k
        let empty_ast = || Ast::empty(empty_span());
1657
214k
        let mut stack = vec![mem::replace(self, empty_ast())];
1658
7.48M
        while let Some(mut ast) = stack.pop() {
1659
922k
            match ast {
1660
                Ast::Empty(_)
1661
                | Ast::Flags(_)
1662
                | Ast::Literal(_)
1663
                | Ast::Dot(_)
1664
                | Ast::Assertion(_)
1665
                | Ast::ClassUnicode(_)
1666
                | Ast::ClassPerl(_)
1667
                // Bracketed classes are recursive, so they get their own Drop
1668
                // impl.
1669
6.35M
                | Ast::ClassBracketed(_) => {}
1670
196k
                Ast::Repetition(ref mut x) => {
1671
196k
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1672
196k
                }
1673
370k
                Ast::Group(ref mut x) => {
1674
370k
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1675
370k
                }
1676
15.8k
                Ast::Alternation(ref mut x) => {
1677
15.8k
                    stack.extend(x.asts.drain(..));
1678
15.8k
                }
1679
339k
                Ast::Concat(ref mut x) => {
1680
339k
                    stack.extend(x.asts.drain(..));
1681
339k
                }
1682
            }
1683
        }
1684
8.45M
    }
1685
}
1686
1687
/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1688
/// stack space but heap space proportional to the depth of the `ClassSet`.
1689
impl Drop for ClassSet {
1690
986k
    fn drop(&mut self) {
1691
986k
        use core::mem;
1692
986k
1693
986k
        match *self {
1694
985k
            ClassSet::Item(ref item) => match *item {
1695
                ClassSetItem::Empty(_)
1696
                | ClassSetItem::Literal(_)
1697
                | ClassSetItem::Range(_)
1698
                | ClassSetItem::Ascii(_)
1699
                | ClassSetItem::Unicode(_)
1700
633k
                | ClassSetItem::Perl(_) => return,
1701
3.11k
                ClassSetItem::Bracketed(ref x) => {
1702
3.11k
                    if x.kind.is_empty() {
1703
2.99k
                        return;
1704
124
                    }
1705
                }
1706
348k
                ClassSetItem::Union(ref x) => {
1707
348k
                    if x.items.is_empty() {
1708
246k
                        return;
1709
101k
                    }
1710
                }
1711
            },
1712
1.10k
            ClassSet::BinaryOp(ref op) => {
1713
1.10k
                if op.lhs.is_empty() && op.rhs.is_empty() {
1714
950
                    return;
1715
153
                }
1716
            }
1717
        }
1718
1719
106k
        let empty_span = || Span::splat(Position::new(0, 0, 0));
1720
106k
        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1721
102k
        let mut stack = vec![mem::replace(self, empty_set())];
1722
799k
        while let Some(mut set) = stack.pop() {
1723
697k
            match set {
1724
696k
                ClassSet::Item(ref mut item) => match *item {
1725
                    ClassSetItem::Empty(_)
1726
                    | ClassSetItem::Literal(_)
1727
                    | ClassSetItem::Range(_)
1728
                    | ClassSetItem::Ascii(_)
1729
                    | ClassSetItem::Unicode(_)
1730
590k
                    | ClassSetItem::Perl(_) => {}
1731
2.99k
                    ClassSetItem::Bracketed(ref mut x) => {
1732
2.99k
                        stack.push(mem::replace(&mut x.kind, empty_set()));
1733
2.99k
                    }
1734
103k
                    ClassSetItem::Union(ref mut x) => {
1735
103k
                        stack.extend(x.items.drain(..).map(ClassSet::Item));
1736
103k
                    }
1737
                },
1738
950
                ClassSet::BinaryOp(ref mut op) => {
1739
950
                    stack.push(mem::replace(&mut op.lhs, empty_set()));
1740
950
                    stack.push(mem::replace(&mut op.rhs, empty_set()));
1741
950
                }
1742
            }
1743
        }
1744
986k
    }
1745
}
1746
1747
#[cfg(test)]
1748
mod tests {
1749
    use super::*;
1750
1751
    // We use a thread with an explicit stack size to test that our destructor
1752
    // for Ast can handle arbitrarily sized expressions in constant stack
1753
    // space. In case we run on a platform without threads (WASM?), we limit
1754
    // this test to Windows/Unix.
1755
    #[test]
1756
    #[cfg(any(unix, windows))]
1757
    fn no_stack_overflow_on_drop() {
1758
        use std::thread;
1759
1760
        let run = || {
1761
            let span = || Span::splat(Position::new(0, 0, 0));
1762
            let mut ast = Ast::empty(span());
1763
            for i in 0..200 {
1764
                ast = Ast::group(Group {
1765
                    span: span(),
1766
                    kind: GroupKind::CaptureIndex(i),
1767
                    ast: Box::new(ast),
1768
                });
1769
            }
1770
            assert!(!ast.is_empty());
1771
        };
1772
1773
        // We run our test on a thread with a small stack size so we can
1774
        // force the issue more easily.
1775
        //
1776
        // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
1777
        // will just barf with very small stack sizes. So we bump this up a bit
1778
        // to give more room to breath. When I did this, I confirmed that if
1779
        // I remove the custom `Drop` impl for `Ast`, then this test does
1780
        // indeed still fail with a stack overflow. (At the time of writing, I
1781
        // had to bump it all the way up to 32K before the test would pass even
1782
        // without the custom `Drop` impl. So 16K seems like a safe number
1783
        // here.)
1784
        //
1785
        // See: https://github.com/rust-lang/regex/issues/967
1786
        thread::Builder::new()
1787
            .stack_size(16 << 10)
1788
            .spawn(run)
1789
            .unwrap()
1790
            .join()
1791
            .unwrap();
1792
    }
1793
1794
    // This tests that our `Ast` has a reasonable size. This isn't a hard rule
1795
    // and it can be increased if given a good enough reason. But this test
1796
    // exists because the size of `Ast` was at one point over 200 bytes on a
1797
    // 64-bit target. Wow.
1798
    #[test]
1799
    fn ast_size() {
1800
        let max = 2 * core::mem::size_of::<usize>();
1801
        let size = core::mem::size_of::<Ast>();
1802
        assert!(
1803
            size <= max,
1804
            "Ast size of {} bytes is bigger than suggested max {}",
1805
            size,
1806
            max
1807
        );
1808
    }
1809
}