Coverage Report

Created: 2025-12-12 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-lite/src/hir/mod.rs
Line
Count
Source
1
use alloc::{boxed::Box, string::String, vec, vec::Vec};
2
3
use crate::{error::Error, utf8};
4
5
mod parse;
6
7
/// Escapes all regular expression meta characters in `pattern`.
8
///
9
/// The string returned may be safely used as a literal in a regular
10
/// expression.
11
0
pub fn escape(pattern: &str) -> String {
12
0
    let mut buf = String::new();
13
0
    buf.reserve(pattern.len());
14
0
    for ch in pattern.chars() {
15
0
        if is_meta_character(ch) {
16
0
            buf.push('\\');
17
0
        }
18
0
        buf.push(ch);
19
    }
20
0
    buf
21
0
}
22
23
/// Returns true if the given character has significance in a regex.
24
///
25
/// Generally speaking, these are the only characters which _must_ be escaped
26
/// in order to match their literal meaning. For example, to match a literal
27
/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
28
/// example, `-` is treated as a meta character because of its significance
29
/// for writing ranges inside of character classes, but the regex `-` will
30
/// match a literal `-` because `-` has no special meaning outside of character
31
/// classes.
32
///
33
/// In order to determine whether a character may be escaped at all, the
34
/// [`is_escapable_character`] routine should be used. The difference between
35
/// `is_meta_character` and `is_escapable_character` is that the latter will
36
/// return true for some characters that are _not_ meta characters. For
37
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
38
/// words, `is_escapable_character` includes "superfluous" escapes.
39
///
40
/// Note that the set of characters for which this function returns `true` or
41
/// `false` is fixed and won't change in a semver compatible release. (In this
42
/// case, "semver compatible release" actually refers to the `regex` crate
43
/// itself, since reducing or expanding the set of meta characters would be a
44
/// breaking change for not just `regex-syntax` but also `regex` itself.)
45
1.13M
fn is_meta_character(c: char) -> bool {
46
1.13M
    match c {
47
        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
48
924k
        | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
49
213k
        _ => false,
50
    }
51
1.13M
}
52
53
/// Returns true if the given character can be escaped in a regex.
54
///
55
/// This returns true in all cases that `is_meta_character` returns true, but
56
/// also returns true in some cases where `is_meta_character` returns false.
57
/// For example, `%` is not a meta character, but it is escapable. That is,
58
/// `%` and `\%` both match a literal `%` in all contexts.
59
///
60
/// The purpose of this routine is to provide knowledge about what characters
61
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
62
/// where characters without any special significance may be escaped even
63
/// though there is no actual _need_ to do so.
64
///
65
/// This will return false for some characters. For example, `e` is not
66
/// escapable. Therefore, `\e` will either result in a parse error (which is
67
/// true today), or it could backwards compatibly evolve into a new construct
68
/// with its own meaning. Indeed, that is the purpose of banning _some_
69
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
70
/// manner.
71
106k
fn is_escapable_character(c: char) -> bool {
72
    // Certainly escapable if it's a meta character.
73
106k
    if is_meta_character(c) {
74
0
        return true;
75
106k
    }
76
    // Any character that isn't ASCII is definitely not escapable. There's
77
    // no real need to allow things like \☃ right?
78
106k
    if !c.is_ascii() {
79
25
        return false;
80
106k
    }
81
    // Otherwise, we basically say that everything is escapable unless it's a
82
    // letter or digit. Things like \3 are either octal (when enabled) or an
83
    // error, and we should keep it that way. Otherwise, letters are reserved
84
    // for adding new syntax in a backwards compatible way.
85
106k
    match c {
86
44.7k
        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
87
        // While not currently supported, we keep these as not escapable to
88
        // give us some flexibility with respect to supporting the \< and
89
        // \> word boundary assertions in the future. By rejecting them as
90
        // escapable, \< and \> will result in a parse error. Thus, we can
91
        // turn them into something else in the future without it being a
92
        // backwards incompatible change.
93
692
        '<' | '>' => false,
94
69.2k
        _ => true,
95
    }
96
106k
}
97
98
/// The configuration for a regex parser.
99
#[derive(Clone, Copy, Debug)]
100
pub(crate) struct Config {
101
    /// The maximum number of times we're allowed to recurse.
102
    ///
103
    /// Note that unlike the regex-syntax parser, we actually use recursion in
104
    /// this parser for simplicity. My hope is that by setting a conservative
105
    /// default call limit and providing a way to configure it, that we can
106
    /// keep this simplification. But if we must, we can re-work the parser to
107
    /// put the call stack on the heap like regex-syntax does.
108
    pub(crate) nest_limit: u32,
109
    /// Various flags that control how a pattern is interpreted.
110
    pub(crate) flags: Flags,
111
}
112
113
impl Default for Config {
114
7.73k
    fn default() -> Config {
115
7.73k
        Config { nest_limit: 50, flags: Flags::default() }
116
7.73k
    }
117
}
118
119
/// Various flags that control the interpretation of the pattern.
120
///
121
/// These can be set via explicit configuration in code, or change dynamically
122
/// during parsing via inline flags. For example, `foo(?i:bar)baz` will match
123
/// `foo` and `baz` case sensitively and `bar` case insensitively (assuming a
124
/// default configuration).
125
#[derive(Clone, Copy, Debug, Default)]
126
pub(crate) struct Flags {
127
    /// Whether to match case insensitively.
128
    ///
129
    /// This is the `i` flag.
130
    pub(crate) case_insensitive: bool,
131
    /// Whether `^` and `$` should be treated as line anchors or not.
132
    ///
133
    /// This is the `m` flag.
134
    pub(crate) multi_line: bool,
135
    /// Whether `.` should match line terminators or not.
136
    ///
137
    /// This is the `s` flag.
138
    pub(crate) dot_matches_new_line: bool,
139
    /// Whether to swap the meaning of greedy and non-greedy operators.
140
    ///
141
    /// This is the `U` flag.
142
    pub(crate) swap_greed: bool,
143
    /// Whether to enable CRLF mode.
144
    ///
145
    /// This is the `R` flag.
146
    pub(crate) crlf: bool,
147
    /// Whether to ignore whitespace. i.e., verbose mode.
148
    ///
149
    /// This is the `x` flag.
150
    pub(crate) ignore_whitespace: bool,
151
}
152
153
#[derive(Clone, Debug, Eq, PartialEq)]
154
pub(crate) struct Hir {
155
    kind: HirKind,
156
    is_start_anchored: bool,
157
    is_match_empty: bool,
158
    static_explicit_captures_len: Option<usize>,
159
}
160
161
#[derive(Clone, Debug, Eq, PartialEq)]
162
pub(crate) enum HirKind {
163
    Empty,
164
    Char(char),
165
    Class(Class),
166
    Look(Look),
167
    Repetition(Repetition),
168
    Capture(Capture),
169
    Concat(Vec<Hir>),
170
    Alternation(Vec<Hir>),
171
}
172
173
impl Hir {
174
    /// Parses the given pattern string with the given configuration into a
175
    /// structured representation. If the pattern is invalid, then an error
176
    /// is returned.
177
7.73k
    pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> {
178
7.73k
        self::parse::Parser::new(config, pattern).parse()
179
7.73k
    }
180
181
    /// Returns the underlying kind of this high-level intermediate
182
    /// representation.
183
    ///
184
    /// Note that there is explicitly no way to build an `Hir` directly from
185
    /// an `HirKind`. If you need to do that, then you must do case analysis
186
    /// on the `HirKind` and call the appropriate smart constructor on `Hir`.
187
75.1M
    pub(crate) fn kind(&self) -> &HirKind {
188
75.1M
        &self.kind
189
75.1M
    }
190
191
    /// Returns true if and only if this Hir expression can only match at the
192
    /// beginning of a haystack.
193
5.65k
    pub(crate) fn is_start_anchored(&self) -> bool {
194
5.65k
        self.is_start_anchored
195
5.65k
    }
196
197
    /// Returns true if and only if this Hir expression can match the empty
198
    /// string.
199
45.8k
    pub(crate) fn is_match_empty(&self) -> bool {
200
45.8k
        self.is_match_empty
201
45.8k
    }
202
203
    /// If the pattern always reports the same number of matching capture groups
204
    /// for every match, then this returns the number of those groups. This
205
    /// doesn't include the implicit group found in every pattern.
206
5.65k
    pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
207
5.65k
        self.static_explicit_captures_len
208
5.65k
    }
209
210
0
    fn fail() -> Hir {
211
0
        let kind = HirKind::Class(Class { ranges: vec![] });
212
0
        Hir {
213
0
            kind,
214
0
            is_start_anchored: false,
215
0
            is_match_empty: false,
216
0
            static_explicit_captures_len: Some(0),
217
0
        }
218
0
    }
219
220
4.96M
    fn empty() -> Hir {
221
4.96M
        let kind = HirKind::Empty;
222
4.96M
        Hir {
223
4.96M
            kind,
224
4.96M
            is_start_anchored: false,
225
4.96M
            is_match_empty: true,
226
4.96M
            static_explicit_captures_len: Some(0),
227
4.96M
        }
228
4.96M
    }
229
230
33.0M
    fn char(ch: char) -> Hir {
231
33.0M
        let kind = HirKind::Char(ch);
232
33.0M
        Hir {
233
33.0M
            kind,
234
33.0M
            is_start_anchored: false,
235
33.0M
            is_match_empty: false,
236
33.0M
            static_explicit_captures_len: Some(0),
237
33.0M
        }
238
33.0M
    }
239
240
27.9M
    fn class(class: Class) -> Hir {
241
27.9M
        let kind = HirKind::Class(class);
242
27.9M
        Hir {
243
27.9M
            kind,
244
27.9M
            is_start_anchored: false,
245
27.9M
            is_match_empty: false,
246
27.9M
            static_explicit_captures_len: Some(0),
247
27.9M
        }
248
27.9M
    }
249
250
38.2k
    fn look(look: Look) -> Hir {
251
38.2k
        let kind = HirKind::Look(look);
252
        Hir {
253
38.2k
            kind,
254
38.2k
            is_start_anchored: matches!(look, Look::Start),
255
            is_match_empty: true,
256
38.2k
            static_explicit_captures_len: Some(0),
257
        }
258
38.2k
    }
259
260
351k
    fn repetition(rep: Repetition) -> Hir {
261
351k
        if rep.min == 0 && rep.max == Some(0) {
262
195k
            return Hir::empty();
263
156k
        } else if rep.min == 1 && rep.max == Some(1) {
264
776
            return *rep.sub;
265
155k
        }
266
155k
        let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored;
267
155k
        let is_match_empty = rep.min == 0 || rep.sub.is_match_empty;
268
155k
        let mut static_explicit_captures_len =
269
155k
            rep.sub.static_explicit_captures_len;
270
        // If the static captures len of the sub-expression is not known or
271
        // is greater than zero, then it automatically propagates to the
272
        // repetition, regardless of the repetition. Otherwise, it might
273
        // change, but only when the repetition can match 0 times.
274
155k
        if rep.min == 0
275
102k
            && static_explicit_captures_len.map_or(false, |len| len > 0)
276
        {
277
            // If we require a match 0 times, then our captures len is
278
            // guaranteed to be zero. Otherwise, if we *can* match the empty
279
            // string, then it's impossible to know how many captures will be
280
            // in the resulting match.
281
24.2k
            if rep.max == Some(0) {
282
0
                static_explicit_captures_len = Some(0);
283
24.2k
            } else {
284
24.2k
                static_explicit_captures_len = None;
285
24.2k
            }
286
131k
        }
287
155k
        Hir {
288
155k
            kind: HirKind::Repetition(rep),
289
155k
            is_start_anchored,
290
155k
            is_match_empty,
291
155k
            static_explicit_captures_len,
292
155k
        }
293
351k
    }
294
295
410k
    fn capture(cap: Capture) -> Hir {
296
410k
        let is_start_anchored = cap.sub.is_start_anchored;
297
410k
        let is_match_empty = cap.sub.is_match_empty;
298
410k
        let static_explicit_captures_len = cap
299
410k
            .sub
300
410k
            .static_explicit_captures_len
301
410k
            .map(|len| len.saturating_add(1));
302
410k
        let kind = HirKind::Capture(cap);
303
410k
        Hir {
304
410k
            kind,
305
410k
            is_start_anchored,
306
410k
            is_match_empty,
307
410k
            static_explicit_captures_len,
308
410k
        }
309
410k
    }
310
311
4.52M
    fn concat(mut subs: Vec<Hir>) -> Hir {
312
4.52M
        if subs.is_empty() {
313
4.39M
            Hir::empty()
314
130k
        } else if subs.len() == 1 {
315
69.7k
            subs.pop().unwrap()
316
        } else {
317
60.6k
            let is_start_anchored = subs[0].is_start_anchored;
318
60.6k
            let mut is_match_empty = true;
319
60.6k
            let mut static_explicit_captures_len = Some(0usize);
320
13.4M
            for sub in subs.iter() {
321
13.4M
                is_match_empty = is_match_empty && sub.is_match_empty;
322
13.4M
                static_explicit_captures_len = static_explicit_captures_len
323
13.4M
                    .and_then(|len1| {
324
13.3M
                        Some((len1, sub.static_explicit_captures_len?))
325
13.3M
                    })
326
13.4M
                    .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
327
            }
328
60.6k
            Hir {
329
60.6k
                kind: HirKind::Concat(subs),
330
60.6k
                is_start_anchored,
331
60.6k
                is_match_empty,
332
60.6k
                static_explicit_captures_len,
333
60.6k
            }
334
        }
335
4.52M
    }
336
337
417k
    fn alternation(mut subs: Vec<Hir>) -> Hir {
338
417k
        if subs.is_empty() {
339
0
            Hir::fail()
340
417k
        } else if subs.len() == 1 {
341
413k
            subs.pop().unwrap()
342
        } else {
343
3.24k
            let mut it = subs.iter().peekable();
344
3.24k
            let mut is_start_anchored =
345
3.24k
                it.peek().map_or(false, |sub| sub.is_start_anchored);
346
3.24k
            let mut is_match_empty =
347
3.24k
                it.peek().map_or(false, |sub| sub.is_match_empty);
348
3.24k
            let mut static_explicit_captures_len =
349
3.24k
                it.peek().and_then(|sub| sub.static_explicit_captures_len);
350
3.46M
            for sub in it {
351
3.46M
                is_start_anchored = is_start_anchored && sub.is_start_anchored;
352
3.46M
                is_match_empty = is_match_empty || sub.is_match_empty;
353
3.46M
                if static_explicit_captures_len
354
3.46M
                    != sub.static_explicit_captures_len
355
84.2k
                {
356
84.2k
                    static_explicit_captures_len = None;
357
3.38M
                }
358
            }
359
3.24k
            Hir {
360
3.24k
                kind: HirKind::Alternation(subs),
361
3.24k
                is_start_anchored,
362
3.24k
                is_match_empty,
363
3.24k
                static_explicit_captures_len,
364
3.24k
            }
365
        }
366
417k
    }
367
}
368
369
impl HirKind {
370
    /// Returns a slice of this kind's sub-expressions, if any.
371
584k
    fn subs(&self) -> &[Hir] {
372
        use core::slice::from_ref;
373
374
584k
        match *self {
375
            HirKind::Empty
376
            | HirKind::Char(_)
377
            | HirKind::Class(_)
378
565k
            | HirKind::Look(_) => &[],
379
1.40k
            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
380
8.05k
            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
381
9.01k
            HirKind::Concat(ref subs) => subs,
382
400
            HirKind::Alternation(ref subs) => subs,
383
        }
384
584k
    }
385
}
386
387
#[derive(Clone, Debug, Eq, PartialEq)]
388
pub(crate) struct Class {
389
    pub(crate) ranges: Vec<ClassRange>,
390
}
391
392
impl Class {
393
    /// Create a new class from the given ranges. The ranges may be provided
394
    /// in any order or may even overlap. They will be automatically
395
    /// canonicalized.
396
27.9M
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397
27.9M
        let mut class = Class { ranges: ranges.into_iter().collect() };
398
27.9M
        class.canonicalize();
399
27.9M
        class
400
27.9M
    }
<regex_lite::hir::Class>::new::<[regex_lite::hir::ClassRange; 1]>
Line
Count
Source
396
3.01k
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397
3.01k
        let mut class = Class { ranges: ranges.into_iter().collect() };
398
3.01k
        class.canonicalize();
399
3.01k
        class
400
3.01k
    }
<regex_lite::hir::Class>::new::<[regex_lite::hir::ClassRange; 2]>
Line
Count
Source
396
4.63M
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397
4.63M
        let mut class = Class { ranges: ranges.into_iter().collect() };
398
4.63M
        class.canonicalize();
399
4.63M
        class
400
4.63M
    }
<regex_lite::hir::Class>::new::<[regex_lite::hir::ClassRange; 3]>
Line
Count
Source
396
1.42k
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397
1.42k
        let mut class = Class { ranges: ranges.into_iter().collect() };
398
1.42k
        class.canonicalize();
399
1.42k
        class
400
1.42k
    }
<regex_lite::hir::Class>::new::<alloc::vec::Vec<regex_lite::hir::ClassRange>>
Line
Count
Source
396
147k
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397
147k
        let mut class = Class { ranges: ranges.into_iter().collect() };
398
147k
        class.canonicalize();
399
147k
        class
400
147k
    }
<regex_lite::hir::Class>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<(u8, u8)>, regex_lite::hir::parse::posix_class::{closure#0}>>
Line
Count
Source
396
23.1M
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397
23.1M
        let mut class = Class { ranges: ranges.into_iter().collect() };
398
23.1M
        class.canonicalize();
399
23.1M
        class
400
23.1M
    }
401
402
    /// Expand this class such that it matches the ASCII codepoints in this set
403
    /// case insensitively.
404
99.7k
    fn ascii_case_fold(&mut self) {
405
99.7k
        let len = self.ranges.len();
406
353k
        for i in 0..len {
407
353k
            if let Some(folded) = self.ranges[i].ascii_case_fold() {
408
129k
                self.ranges.push(folded);
409
223k
            }
410
        }
411
99.7k
        self.canonicalize();
412
99.7k
    }
413
414
    /// Negate this set.
415
    ///
416
    /// For all `x` where `x` is any element, if `x` was in this set, then it
417
    /// will not be in this set after negation.
418
22.9M
    fn negate(&mut self) {
419
        const MIN: char = '\x00';
420
        const MAX: char = char::MAX;
421
422
22.9M
        if self.ranges.is_empty() {
423
0
            self.ranges.push(ClassRange { start: MIN, end: MAX });
424
0
            return;
425
22.9M
        }
426
427
        // There should be a way to do this in-place with constant memory,
428
        // but I couldn't figure out a simple way to do it. So just append
429
        // the negation to the end of this range, and then drain it before
430
        // we're done.
431
22.9M
        let drain_end = self.ranges.len();
432
433
        // If our class doesn't start the minimum possible char, then negation
434
        // needs to include all codepoints up to the minimum in this set.
435
22.9M
        if self.ranges[0].start > MIN {
436
22.9M
            self.ranges.push(ClassRange {
437
22.9M
                start: MIN,
438
22.9M
                // OK because we know it's bigger than MIN.
439
22.9M
                end: prev_char(self.ranges[0].start).unwrap(),
440
22.9M
            });
441
22.9M
        }
442
68.9M
        for i in 1..drain_end {
443
68.9M
            // let lower = self.ranges[i - 1].upper().increment();
444
68.9M
            // let upper = self.ranges[i].lower().decrement();
445
68.9M
            // self.ranges.push(I::create(lower, upper));
446
68.9M
            self.ranges.push(ClassRange {
447
68.9M
                // OK because we know i-1 is never the last range and therefore
448
68.9M
                // there must be a range greater than it. It therefore follows
449
68.9M
                // that 'end' can never be char::MAX, and thus there must be
450
68.9M
                // a next char.
451
68.9M
                start: next_char(self.ranges[i - 1].end).unwrap(),
452
68.9M
                // Since 'i' is guaranteed to never be the first range, it
453
68.9M
                // follows that there is always a range before this and thus
454
68.9M
                // 'start' can never be '\x00'. Thus, there must be a previous
455
68.9M
                // char.
456
68.9M
                end: prev_char(self.ranges[i].start).unwrap(),
457
68.9M
            });
458
68.9M
        }
459
22.9M
        if self.ranges[drain_end - 1].end < MAX {
460
22.9M
            // let lower = self.ranges[drain_end - 1].upper().increment();
461
22.9M
            // self.ranges.push(I::create(lower, I::Bound::max_value()));
462
22.9M
            self.ranges.push(ClassRange {
463
22.9M
                // OK because we know 'end' is less than char::MAX, and thus
464
22.9M
                // there is a next char.
465
22.9M
                start: next_char(self.ranges[drain_end - 1].end).unwrap(),
466
22.9M
                end: MAX,
467
22.9M
            });
468
22.9M
        }
469
22.9M
        self.ranges.drain(..drain_end);
470
        // We don't need to canonicalize because we processed the ranges above
471
        // in canonical order and the new ranges we added based on those are
472
        // also necessarily in canonical order.
473
22.9M
    }
474
475
    /// Converts this set into a canonical ordering.
476
28.0M
    fn canonicalize(&mut self) {
477
28.0M
        if self.is_canonical() {
478
24.2M
            return;
479
3.81M
        }
480
3.81M
        self.ranges.sort();
481
3.81M
        assert!(!self.ranges.is_empty());
482
483
        // Is there a way to do this in-place with constant memory? I couldn't
484
        // figure out a way to do it. So just append the canonicalization to
485
        // the end of this range, and then drain it before we're done.
486
3.81M
        let drain_end = self.ranges.len();
487
146M
        for oldi in 0..drain_end {
488
            // If we've added at least one new range, then check if we can
489
            // merge this range in the previously added range.
490
146M
            if self.ranges.len() > drain_end {
491
142M
                let (last, rest) = self.ranges.split_last_mut().unwrap();
492
142M
                if let Some(union) = last.union(&rest[oldi]) {
493
138M
                    *last = union;
494
138M
                    continue;
495
4.23M
                }
496
3.81M
            }
497
8.05M
            self.ranges.push(self.ranges[oldi]);
498
        }
499
3.81M
        self.ranges.drain(..drain_end);
500
28.0M
    }
501
502
    /// Returns true if and only if this class is in a canonical ordering.
503
28.0M
    fn is_canonical(&self) -> bool {
504
74.7M
        for pair in self.ranges.windows(2) {
505
74.7M
            if pair[0] >= pair[1] {
506
3.77M
                return false;
507
70.9M
            }
508
70.9M
            if pair[0].is_contiguous(&pair[1]) {
509
44.1k
                return false;
510
70.8M
            }
511
        }
512
24.2M
        true
513
28.0M
    }
514
}
515
516
#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
517
pub(crate) struct ClassRange {
518
    pub(crate) start: char,
519
    pub(crate) end: char,
520
}
521
522
impl ClassRange {
523
    /// Apply simple case folding to this byte range. Only ASCII case mappings
524
    /// (for A-Za-z) are applied.
525
    ///
526
    /// Additional ranges are appended to the given vector. Canonical ordering
527
    /// is *not* maintained in the given vector.
528
7.32M
    fn ascii_case_fold(&self) -> Option<ClassRange> {
529
7.32M
        if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self) {
530
3.71M
            let start = core::cmp::max(self.start, 'a');
531
3.71M
            let end = core::cmp::min(self.end, 'z');
532
3.71M
            return Some(ClassRange {
533
3.71M
                start: char::try_from(u32::from(start) - 32).unwrap(),
534
3.71M
                end: char::try_from(u32::from(end) - 32).unwrap(),
535
3.71M
            });
536
3.61M
        }
537
3.61M
        if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self) {
538
1.03M
            let start = core::cmp::max(self.start, 'A');
539
1.03M
            let end = core::cmp::min(self.end, 'Z');
540
1.03M
            return Some(ClassRange {
541
1.03M
                start: char::try_from(u32::from(start) + 32).unwrap(),
542
1.03M
                end: char::try_from(u32::from(end) + 32).unwrap(),
543
1.03M
            });
544
2.57M
        }
545
2.57M
        None
546
7.32M
    }
547
548
    /// Union the given overlapping range into this range.
549
    ///
550
    /// If the two ranges aren't contiguous, then this returns `None`.
551
142M
    fn union(&self, other: &ClassRange) -> Option<ClassRange> {
552
142M
        if !self.is_contiguous(other) {
553
4.23M
            return None;
554
138M
        }
555
138M
        let start = core::cmp::min(self.start, other.start);
556
138M
        let end = core::cmp::max(self.end, other.end);
557
138M
        Some(ClassRange { start, end })
558
142M
    }
559
560
    /// Returns true if and only if the two ranges are contiguous. Two ranges
561
    /// are contiguous if and only if the ranges are either overlapping or
562
    /// adjacent.
563
213M
    fn is_contiguous(&self, other: &ClassRange) -> bool {
564
213M
        let (s1, e1) = (u32::from(self.start), u32::from(self.end));
565
213M
        let (s2, e2) = (u32::from(other.start), u32::from(other.end));
566
213M
        core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1)
567
213M
    }
568
569
    /// Returns true if and only if the intersection of this range and the
570
    /// other range is empty.
571
10.9M
    fn is_intersection_empty(&self, other: &ClassRange) -> bool {
572
10.9M
        let (s1, e1) = (self.start, self.end);
573
10.9M
        let (s2, e2) = (other.start, other.end);
574
10.9M
        core::cmp::max(s1, s2) > core::cmp::min(e1, e2)
575
10.9M
    }
576
}
577
578
/// The high-level intermediate representation for a look-around assertion.
579
///
580
/// An assertion match is always zero-length. Also called an "empty match."
581
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
582
pub(crate) enum Look {
583
    /// Match the beginning of text. Specifically, this matches at the starting
584
    /// position of the input.
585
    Start = 1 << 0,
586
    /// Match the end of text. Specifically, this matches at the ending
587
    /// position of the input.
588
    End = 1 << 1,
589
    /// Match the beginning of a line or the beginning of text. Specifically,
590
    /// this matches at the starting position of the input, or at the position
591
    /// immediately following a `\n` character.
592
    StartLF = 1 << 2,
593
    /// Match the end of a line or the end of text. Specifically, this matches
594
    /// at the end position of the input, or at the position immediately
595
    /// preceding a `\n` character.
596
    EndLF = 1 << 3,
597
    /// Match the beginning of a line or the beginning of text. Specifically,
598
    /// this matches at the starting position of the input, or at the position
599
    /// immediately following either a `\r` or `\n` character, but never after
600
    /// a `\r` when a `\n` follows.
601
    StartCRLF = 1 << 4,
602
    /// Match the end of a line or the end of text. Specifically, this matches
603
    /// at the end position of the input, or at the position immediately
604
    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
605
    /// precedes it.
606
    EndCRLF = 1 << 5,
607
    /// Match an ASCII-only word boundary. That is, this matches a position
608
    /// where the left adjacent character and right adjacent character
609
    /// correspond to a word and non-word or a non-word and word character.
610
    Word = 1 << 6,
611
    /// Match an ASCII-only negation of a word boundary.
612
    WordNegate = 1 << 7,
613
    /// Match the start of an ASCII-only word boundary. That is, this matches a
614
    /// position at either the beginning of the haystack or where the previous
615
    /// character is not a word character and the following character is a word
616
    /// character.
617
    WordStart = 1 << 8,
618
    /// Match the end of an ASCII-only word boundary. That is, this matches
619
    /// a position at either the end of the haystack or where the previous
620
    /// character is a word character and the following character is not a word
621
    /// character.
622
    WordEnd = 1 << 9,
623
    /// Match the start half of an ASCII-only word boundary. That is, this
624
    /// matches a position at either the beginning of the haystack or where the
625
    /// previous character is not a word character.
626
    WordStartHalf = 1 << 10,
627
    /// Match the end half of an ASCII-only word boundary. That is, this
628
    /// matches a position at either the end of the haystack or where the
629
    /// following character is not a word character.
630
    WordEndHalf = 1 << 11,
631
}
632
633
impl Look {
634
    /// Returns true if the given position in the given haystack matches this
635
    /// look-around assertion.
636
9.53M
    pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool {
637
        use self::Look::*;
638
639
9.53M
        match *self {
640
1.84M
            Start => at == 0,
641
660k
            End => at == haystack.len(),
642
573k
            StartLF => at == 0 || haystack[at - 1] == b'\n',
643
573k
            EndLF => at == haystack.len() || haystack[at] == b'\n',
644
            StartCRLF => {
645
380k
                at == 0
646
378k
                    || haystack[at - 1] == b'\n'
647
375k
                    || (haystack[at - 1] == b'\r'
648
1.69k
                        && (at >= haystack.len() || haystack[at] != b'\n'))
649
            }
650
            EndCRLF => {
651
378k
                at == haystack.len()
652
377k
                    || haystack[at] == b'\r'
653
375k
                    || (haystack[at] == b'\n'
654
3.93k
                        && (at == 0 || haystack[at - 1] != b'\r'))
655
            }
656
            Word => {
657
4.42M
                let word_before =
658
4.42M
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
659
4.42M
                let word_after =
660
4.42M
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
661
4.42M
                word_before != word_after
662
            }
663
            WordNegate => {
664
279k
                let word_before =
665
279k
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
666
279k
                let word_after =
667
279k
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
668
279k
                word_before == word_after
669
            }
670
            WordStart => {
671
5.37k
                let word_before =
672
5.37k
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
673
5.37k
                let word_after =
674
5.37k
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
675
5.37k
                !word_before && word_after
676
            }
677
            WordEnd => {
678
408k
                let word_before =
679
408k
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
680
408k
                let word_after =
681
408k
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
682
408k
                word_before && !word_after
683
            }
684
            WordStartHalf => {
685
1.11k
                let word_before =
686
1.11k
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
687
1.11k
                !word_before
688
            }
689
            WordEndHalf => {
690
2.00k
                let word_after =
691
2.00k
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
692
2.00k
                !word_after
693
            }
694
        }
695
9.53M
    }
696
}
697
698
/// The high-level intermediate representation of a repetition operator.
699
///
700
/// A repetition operator permits the repetition of an arbitrary
701
/// sub-expression.
702
#[derive(Clone, Debug, Eq, PartialEq)]
703
pub(crate) struct Repetition {
704
    /// The minimum range of the repetition.
705
    ///
706
    /// Note that special cases like `?`, `+` and `*` all get translated into
707
    /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
708
    ///
709
    /// When `min` is zero, this expression can match the empty string
710
    /// regardless of what its sub-expression is.
711
    pub(crate) min: u32,
712
    /// The maximum range of the repetition.
713
    ///
714
    /// Note that when `max` is `None`, `min` acts as a lower bound but where
715
    /// there is no upper bound. For something like `x{5}` where the min and
716
    /// max are equivalent, `min` will be set to `5` and `max` will be set to
717
    /// `Some(5)`.
718
    pub(crate) max: Option<u32>,
719
    /// Whether this repetition operator is greedy or not. A greedy operator
720
    /// will match as much as it can. A non-greedy operator will match as
721
    /// little as it can.
722
    ///
723
    /// Typically, operators are greedy by default and are only non-greedy when
724
    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
725
    /// not. However, this can be inverted via the `U` "ungreedy" flag.
726
    pub(crate) greedy: bool,
727
    /// The expression being repeated.
728
    pub(crate) sub: Box<Hir>,
729
}
730
731
/// The high-level intermediate representation for a capturing group.
732
///
733
/// A capturing group always has an index and a child expression. It may
734
/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
735
/// necessary.
736
///
737
/// Note that there is no explicit representation of a non-capturing group
738
/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
739
/// the recursive structure of the `Hir` itself.
740
#[derive(Clone, Debug, Eq, PartialEq)]
741
pub(crate) struct Capture {
742
    /// The capture index of the capture.
743
    pub(crate) index: u32,
744
    /// The name of the capture, if it exists.
745
    pub(crate) name: Option<Box<str>>,
746
    /// The expression inside the capturing group, which may be empty.
747
    pub(crate) sub: Box<Hir>,
748
}
749
750
91.9M
fn next_char(ch: char) -> Option<char> {
751
    // Skip over the surrogate range.
752
91.9M
    if ch == '\u{D7FF}' {
753
2.61k
        return Some('\u{E000}');
754
91.9M
    }
755
    // OK because char::MAX < u32::MAX and we handle U+D7FF above.
756
91.9M
    char::from_u32(u32::from(ch).checked_add(1).unwrap())
757
91.9M
}
758
759
91.9M
fn prev_char(ch: char) -> Option<char> {
760
    // Skip over the surrogate range.
761
91.9M
    if ch == '\u{E000}' {
762
391
        return Some('\u{D7FF}');
763
91.9M
    }
764
    // OK because subtracting 1 from any valid scalar value other than 0
765
    // and U+E000 yields a valid scalar value.
766
91.9M
    Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap())
767
91.9M
}
768
769
impl Drop for Hir {
770
66.6M
    fn drop(&mut self) {
771
        use core::mem;
772
773
66.6M
        match *self.kind() {
774
            HirKind::Empty
775
            | HirKind::Char(_)
776
            | HirKind::Class(_)
777
66.0M
            | HirKind::Look(_) => return,
778
426k
            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
779
158k
            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
780
155k
                return
781
            }
782
76.9k
            HirKind::Concat(ref x) if x.is_empty() => return,
783
4.00k
            HirKind::Alternation(ref x) if x.is_empty() => return,
784
35.9k
            _ => {}
785
        }
786
787
35.9k
        let mut stack = vec![mem::replace(self, Hir::empty())];
788
17.3M
        while let Some(mut expr) = stack.pop() {
789
17.3M
            match expr.kind {
790
                HirKind::Empty
791
                | HirKind::Char(_)
792
                | HirKind::Class(_)
793
16.9M
                | HirKind::Look(_) => {}
794
180k
                HirKind::Capture(ref mut x) => {
795
180k
                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
796
180k
                }
797
153k
                HirKind::Repetition(ref mut x) => {
798
153k
                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
799
153k
                }
800
60.6k
                HirKind::Concat(ref mut x) => {
801
60.6k
                    stack.extend(x.drain(..));
802
60.6k
                }
803
3.24k
                HirKind::Alternation(ref mut x) => {
804
3.24k
                    stack.extend(x.drain(..));
805
3.24k
                }
806
            }
807
        }
808
66.6M
    }
809
}