Coverage Report

Created: 2025-02-21 07:11

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/hir/mod.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Defines a high-level intermediate (HIR) representation for regular expressions.
3
4
The HIR is represented by the [`Hir`] type, and it principally constructed via
5
[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6
may use the smart constructors defined on `Hir` to build their own by hand. The
7
smart constructors simultaneously simplify and "optimize" the HIR, and are also
8
the same routines used by translation.
9
10
Most regex engines only have an HIR like this, and usually construct it
11
directly from the concrete syntax. This crate however first parses the
12
concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13
as mentioned above. It's done this way to facilitate better error reporting,
14
and to have a structured representation of a regex that faithfully represents
15
its concrete syntax. Namely, while an `Hir` value can be converted back to an
16
equivalent regex pattern string, it is unlikely to look like the original due
17
to its simplified structure.
18
*/
19
20
use core::{char, cmp};
21
22
use alloc::{
23
    boxed::Box,
24
    format,
25
    string::{String, ToString},
26
    vec,
27
    vec::Vec,
28
};
29
30
use crate::{
31
    ast::Span,
32
    hir::interval::{Interval, IntervalSet, IntervalSetIter},
33
    unicode,
34
};
35
36
pub use crate::{
37
    hir::visitor::{visit, Visitor},
38
    unicode::CaseFoldError,
39
};
40
41
mod interval;
42
pub mod literal;
43
pub mod print;
44
pub mod translate;
45
mod visitor;
46
47
/// An error that can occur while translating an `Ast` to a `Hir`.
48
#[derive(Clone, Debug, Eq, PartialEq)]
49
pub struct Error {
50
    /// The kind of error.
51
    kind: ErrorKind,
52
    /// The original pattern that the translator's Ast was parsed from. Every
53
    /// span in an error is a valid range into this string.
54
    pattern: String,
55
    /// The span of this error, derived from the Ast given to the translator.
56
    span: Span,
57
}
58
59
impl Error {
60
    /// Return the type of this error.
61
1.40k
    pub fn kind(&self) -> &ErrorKind {
62
1.40k
        &self.kind
63
1.40k
    }
64
65
    /// The original pattern string in which this error occurred.
66
    ///
67
    /// Every span reported by this error is reported in terms of this string.
68
1.40k
    pub fn pattern(&self) -> &str {
69
1.40k
        &self.pattern
70
1.40k
    }
71
72
    /// Return the span at which this error occurred.
73
1.40k
    pub fn span(&self) -> &Span {
74
1.40k
        &self.span
75
1.40k
    }
76
}
77
78
/// The type of an error that occurred while building an `Hir`.
79
///
80
/// This error type is marked as `non_exhaustive`. This means that adding a
81
/// new variant is not considered a breaking change.
82
#[non_exhaustive]
83
#[derive(Clone, Debug, Eq, PartialEq)]
84
pub enum ErrorKind {
85
    /// This error occurs when a Unicode feature is used when Unicode
86
    /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87
    UnicodeNotAllowed,
88
    /// This error occurs when translating a pattern that could match a byte
89
    /// sequence that isn't UTF-8 and `utf8` was enabled.
90
    InvalidUtf8,
91
    /// This error occurs when one uses a non-ASCII byte for a line terminator,
92
    /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93
    InvalidLineTerminator,
94
    /// This occurs when an unrecognized Unicode property name could not
95
    /// be found.
96
    UnicodePropertyNotFound,
97
    /// This occurs when an unrecognized Unicode property value could not
98
    /// be found.
99
    UnicodePropertyValueNotFound,
100
    /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101
    /// `\d`) could not be found. This can occur when the `unicode-perl`
102
    /// crate feature is not enabled.
103
    UnicodePerlClassNotFound,
104
    /// This occurs when the Unicode simple case mapping tables are not
105
    /// available, and the regular expression required Unicode aware case
106
    /// insensitivity.
107
    UnicodeCaseUnavailable,
108
}
109
110
#[cfg(feature = "std")]
111
impl std::error::Error for Error {}
112
113
impl core::fmt::Display for Error {
114
1.40k
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115
1.40k
        crate::error::Formatter::from(self).fmt(f)
116
1.40k
    }
117
}
118
119
impl core::fmt::Display for ErrorKind {
120
1.40k
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121
        use self::ErrorKind::*;
122
123
1.40k
        let msg = match *self {
124
191
            UnicodeNotAllowed => "Unicode not allowed here",
125
604
            InvalidUtf8 => "pattern can match invalid UTF-8",
126
0
            InvalidLineTerminator => "invalid line terminator, must be ASCII",
127
596
            UnicodePropertyNotFound => "Unicode property not found",
128
17
            UnicodePropertyValueNotFound => "Unicode property value not found",
129
            UnicodePerlClassNotFound => {
130
0
                "Unicode-aware Perl class not found \
131
0
                 (make sure the unicode-perl feature is enabled)"
132
            }
133
            UnicodeCaseUnavailable => {
134
0
                "Unicode-aware case insensitivity matching is not available \
135
0
                 (make sure the unicode-case feature is enabled)"
136
            }
137
        };
138
1.40k
        f.write_str(msg)
139
1.40k
    }
140
}
141
142
/// A high-level intermediate representation (HIR) for a regular expression.
143
///
144
/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145
/// An `HirKind` indicates what kind of regular expression it is (a literal,
146
/// a repetition, a look-around assertion, etc.), where as a `Properties`
147
/// describes various facts about the regular expression. For example, whether
148
/// it matches UTF-8 or if it matches the empty string.
149
///
150
/// The HIR of a regular expression represents an intermediate step between
151
/// its abstract syntax (a structured description of the concrete syntax) and
152
/// an actual regex matcher. The purpose of HIR is to make regular expressions
153
/// easier to analyze. In particular, the AST is much more complex than the
154
/// HIR. For example, while an AST supports arbitrarily nested character
155
/// classes, the HIR will flatten all nested classes into a single set. The HIR
156
/// will also "compile away" every flag present in the concrete syntax. For
157
/// example, users of HIR expressions never need to worry about case folding;
158
/// it is handled automatically by the translator (e.g., by translating
159
/// `(?i:A)` to `[aA]`).
160
///
161
/// The specific type of an HIR expression can be accessed via its `kind`
162
/// or `into_kind` methods. This extra level of indirection exists for two
163
/// reasons:
164
///
165
/// 1. Construction of an HIR expression *must* use the constructor methods on
166
/// this `Hir` type instead of building the `HirKind` values directly. This
167
/// permits construction to enforce invariants like "concatenations always
168
/// consist of two or more sub-expressions."
169
/// 2. Every HIR expression contains attributes that are defined inductively,
170
/// and can be computed cheaply during the construction process. For example,
171
/// one such attribute is whether the expression must match at the beginning of
172
/// the haystack.
173
///
174
/// In particular, if you have an `HirKind` value, then there is intentionally
175
/// no way to build an `Hir` value from it. You instead need to do case
176
/// analysis on the `HirKind` value and build the `Hir` value using its smart
177
/// constructors.
178
///
179
/// # UTF-8
180
///
181
/// If the HIR was produced by a translator with
182
/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183
/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184
/// matches.
185
///
186
/// For empty matches, those can occur at any position. It is the
187
/// responsibility of the regex engine to determine whether empty matches are
188
/// permitted between the code units of a single codepoint.
189
///
190
/// # Stack space
191
///
192
/// This type defines its own destructor that uses constant stack space and
193
/// heap space proportional to the size of the HIR.
194
///
195
/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196
/// expression pattern string, and uses constant stack space and heap space
197
/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198
/// be _semantically_ equivalent to the original concrete syntax, but it may
199
/// look very different. (And potentially not practically readable by a human.)
200
///
201
/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202
/// stack space. The implementation will also suppress some details (such as
203
/// the `Properties` inlined into every `Hir` value to make it less noisy).
204
#[derive(Clone, Eq, PartialEq)]
205
pub struct Hir {
206
    /// The underlying HIR kind.
207
    kind: HirKind,
208
    /// Analysis info about this HIR, computed during construction.
209
    props: Properties,
210
}
211
212
/// Methods for accessing the underlying `HirKind` and `Properties`.
213
impl Hir {
214
    /// Returns a reference to the underlying HIR kind.
215
236M
    pub fn kind(&self) -> &HirKind {
216
236M
        &self.kind
217
236M
    }
218
219
    /// Consumes ownership of this HIR expression and returns its underlying
220
    /// `HirKind`.
221
53.7k
    pub fn into_kind(mut self) -> HirKind {
222
53.7k
        core::mem::replace(&mut self.kind, HirKind::Empty)
223
53.7k
    }
224
225
    /// Returns the properties computed for this `Hir`.
226
31.0M
    pub fn properties(&self) -> &Properties {
227
31.0M
        &self.props
228
31.0M
    }
229
230
    /// Splits this HIR into its constituent parts.
231
    ///
232
    /// This is useful because `let Hir { kind, props } = hir;` does not work
233
    /// because of `Hir`'s custom `Drop` implementation.
234
18.4M
    fn into_parts(mut self) -> (HirKind, Properties) {
235
18.4M
        (
236
18.4M
            core::mem::replace(&mut self.kind, HirKind::Empty),
237
18.4M
            core::mem::replace(&mut self.props, Properties::empty()),
238
18.4M
        )
239
18.4M
    }
240
}
241
242
/// Smart constructors for HIR values.
243
///
244
/// These constructors are called "smart" because they do inductive work or
245
/// simplifications. For example, calling `Hir::repetition` with a repetition
246
/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247
/// since it is equivalent to an empty regex. Another example is calling
248
/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249
/// just get back the original `expr` since it's precisely equivalent.
250
///
251
/// Smart constructors enable maintaining invariants about the HIR data type
252
/// while also simulanteously keeping the representation as simple as possible.
253
impl Hir {
254
    /// Returns an empty HIR expression.
255
    ///
256
    /// An empty HIR expression always matches, including the empty string.
257
    #[inline]
258
2.71M
    pub fn empty() -> Hir {
259
2.71M
        let props = Properties::empty();
260
2.71M
        Hir { kind: HirKind::Empty, props }
261
2.71M
    }
<regex_syntax::hir::Hir>::empty
Line
Count
Source
258
161k
    pub fn empty() -> Hir {
259
161k
        let props = Properties::empty();
260
161k
        Hir { kind: HirKind::Empty, props }
261
161k
    }
<regex_syntax::hir::Hir>::empty
Line
Count
Source
258
2.55M
    pub fn empty() -> Hir {
259
2.55M
        let props = Properties::empty();
260
2.55M
        Hir { kind: HirKind::Empty, props }
261
2.55M
    }
262
263
    /// Returns an HIR expression that can never match anything. That is,
264
    /// the size of the set of strings in the language described by the HIR
265
    /// returned is `0`.
266
    ///
267
    /// This is distinct from [`Hir::empty`] in that the empty string matches
268
    /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269
    /// language describe described by `Hir::empty` is non-empty.
270
    ///
271
    /// Note that currently, the HIR returned uses an empty character class to
272
    /// indicate that nothing can match. An equivalent expression that cannot
273
    /// match is an empty alternation, but all such "fail" expressions are
274
    /// normalized (via smart constructors) to empty character classes. This is
275
    /// because empty character classes can be spelled in the concrete syntax
276
    /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277
    /// empty alternations cannot.
278
    #[inline]
279
10.9k
    pub fn fail() -> Hir {
280
10.9k
        let class = Class::Bytes(ClassBytes::empty());
281
10.9k
        let props = Properties::class(&class);
282
10.9k
        // We can't just call Hir::class here because it defers to Hir::fail
283
10.9k
        // in order to canonicalize the Hir value used to represent "cannot
284
10.9k
        // match."
285
10.9k
        Hir { kind: HirKind::Class(class), props }
286
10.9k
    }
<regex_syntax::hir::Hir>::fail
Line
Count
Source
279
2.67k
    pub fn fail() -> Hir {
280
2.67k
        let class = Class::Bytes(ClassBytes::empty());
281
2.67k
        let props = Properties::class(&class);
282
2.67k
        // We can't just call Hir::class here because it defers to Hir::fail
283
2.67k
        // in order to canonicalize the Hir value used to represent "cannot
284
2.67k
        // match."
285
2.67k
        Hir { kind: HirKind::Class(class), props }
286
2.67k
    }
<regex_syntax::hir::Hir>::fail
Line
Count
Source
279
8.30k
    pub fn fail() -> Hir {
280
8.30k
        let class = Class::Bytes(ClassBytes::empty());
281
8.30k
        let props = Properties::class(&class);
282
8.30k
        // We can't just call Hir::class here because it defers to Hir::fail
283
8.30k
        // in order to canonicalize the Hir value used to represent "cannot
284
8.30k
        // match."
285
8.30k
        Hir { kind: HirKind::Class(class), props }
286
8.30k
    }
287
288
    /// Creates a literal HIR expression.
289
    ///
290
    /// This accepts anything that can be converted into a `Box<[u8]>`.
291
    ///
292
    /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293
    /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294
    /// any HIR node) matches valid UTF-8 exclusively can be queried via
295
    /// [`Properties::is_utf8`].
296
    ///
297
    /// # Example
298
    ///
299
    /// This example shows that concatenations of `Literal` HIR values will
300
    /// automatically get flattened and combined together. So for example, even
301
    /// if you concat multiple `Literal` values that are themselves not valid
302
    /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303
    /// how "smart" Hir's smart constructors are.
304
    ///
305
    /// ```
306
    /// use regex_syntax::hir::{Hir, HirKind, Literal};
307
    ///
308
    /// let literals = vec![
309
    ///     Hir::literal([0xE2]),
310
    ///     Hir::literal([0x98]),
311
    ///     Hir::literal([0x83]),
312
    /// ];
313
    /// // Each literal, on its own, is invalid UTF-8.
314
    /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315
    ///
316
    /// let concat = Hir::concat(literals);
317
    /// // But the concatenation is valid UTF-8!
318
    /// assert!(concat.properties().is_utf8());
319
    ///
320
    /// // And also notice that the literals have been concatenated into a
321
    /// // single `Literal`, to the point where there is no explicit `Concat`!
322
    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323
    /// assert_eq!(&expected, concat.kind());
324
    /// ```
325
    ///
326
    /// # Example: building a literal from a `char`
327
    ///
328
    /// This example shows how to build a single `Hir` literal from a `char`
329
    /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330
    /// encode a `char` value:
331
    ///
332
    /// ```
333
    /// use regex_syntax::hir::{Hir, HirKind, Literal};
334
    ///
335
    /// let ch = '☃';
336
    /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337
    ///
338
    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339
    /// assert_eq!(&expected, got.kind());
340
    /// ```
341
    #[inline]
342
12.5M
    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343
12.5M
        let bytes = lit.into();
344
12.5M
        if bytes.is_empty() {
345
0
            return Hir::empty();
346
12.5M
        }
347
12.5M
348
12.5M
        let lit = Literal(bytes);
349
12.5M
        let props = Properties::literal(&lit);
350
12.5M
        Hir { kind: HirKind::Literal(lit), props }
351
12.5M
    }
<regex_syntax::hir::Hir>::literal::<alloc::boxed::Box<[u8]>>
Line
Count
Source
342
527k
    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343
527k
        let bytes = lit.into();
344
527k
        if bytes.is_empty() {
345
0
            return Hir::empty();
346
527k
        }
347
527k
348
527k
        let lit = Literal(bytes);
349
527k
        let props = Properties::literal(&lit);
350
527k
        Hir { kind: HirKind::Literal(lit), props }
351
527k
    }
<regex_syntax::hir::Hir>::literal::<alloc::vec::Vec<u8>>
Line
Count
Source
342
11.9M
    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343
11.9M
        let bytes = lit.into();
344
11.9M
        if bytes.is_empty() {
345
0
            return Hir::empty();
346
11.9M
        }
347
11.9M
348
11.9M
        let lit = Literal(bytes);
349
11.9M
        let props = Properties::literal(&lit);
350
11.9M
        Hir { kind: HirKind::Literal(lit), props }
351
11.9M
    }
352
353
    /// Creates a class HIR expression. The class may either be defined over
354
    /// ranges of Unicode codepoints or ranges of raw byte values.
355
    ///
356
    /// Note that an empty class is permitted. An empty class is equivalent to
357
    /// `Hir::fail()`.
358
    #[inline]
359
3.80M
    pub fn class(class: Class) -> Hir {
360
3.80M
        if class.is_empty() {
361
10.9k
            return Hir::fail();
362
3.78M
        } else if let Some(bytes) = class.literal() {
363
13.4k
            return Hir::literal(bytes);
364
3.77M
        }
365
3.77M
        let props = Properties::class(&class);
366
3.77M
        Hir { kind: HirKind::Class(class), props }
367
3.80M
    }
<regex_syntax::hir::Hir>::class
Line
Count
Source
359
823k
    pub fn class(class: Class) -> Hir {
360
823k
        if class.is_empty() {
361
2.67k
            return Hir::fail();
362
820k
        } else if let Some(bytes) = class.literal() {
363
0
            return Hir::literal(bytes);
364
820k
        }
365
820k
        let props = Properties::class(&class);
366
820k
        Hir { kind: HirKind::Class(class), props }
367
823k
    }
<regex_syntax::hir::Hir>::class
Line
Count
Source
359
2.97M
    pub fn class(class: Class) -> Hir {
360
2.97M
        if class.is_empty() {
361
8.30k
            return Hir::fail();
362
2.96M
        } else if let Some(bytes) = class.literal() {
363
13.4k
            return Hir::literal(bytes);
364
2.95M
        }
365
2.95M
        let props = Properties::class(&class);
366
2.95M
        Hir { kind: HirKind::Class(class), props }
367
2.97M
    }
368
369
    /// Creates a look-around assertion HIR expression.
370
    #[inline]
371
335k
    pub fn look(look: Look) -> Hir {
372
335k
        let props = Properties::look(look);
373
335k
        Hir { kind: HirKind::Look(look), props }
374
335k
    }
<regex_syntax::hir::Hir>::look
Line
Count
Source
371
12.4k
    pub fn look(look: Look) -> Hir {
372
12.4k
        let props = Properties::look(look);
373
12.4k
        Hir { kind: HirKind::Look(look), props }
374
12.4k
    }
<regex_syntax::hir::Hir>::look
Line
Count
Source
371
322k
    pub fn look(look: Look) -> Hir {
372
322k
        let props = Properties::look(look);
373
322k
        Hir { kind: HirKind::Look(look), props }
374
322k
    }
375
376
    /// Creates a repetition HIR expression.
377
    #[inline]
378
702k
    pub fn repetition(mut rep: Repetition) -> Hir {
379
702k
        // If the sub-expression of a repetition can only match the empty
380
702k
        // string, then we force its maximum to be at most 1.
381
702k
        if rep.sub.properties().maximum_len() == Some(0) {
382
29.9k
            rep.min = cmp::min(rep.min, 1);
383
29.9k
            rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384
672k
        }
385
        // The regex 'a{0}' is always equivalent to the empty regex. This is
386
        // true even when 'a' is an expression that never matches anything
387
        // (like '\P{any}').
388
        //
389
        // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390
702k
        if rep.min == 0 && rep.max == Some(0) {
391
4.08k
            return Hir::empty();
392
698k
        } else if rep.min == 1 && rep.max == Some(1) {
393
3.97k
            return *rep.sub;
394
694k
        }
395
694k
        let props = Properties::repetition(&rep);
396
694k
        Hir { kind: HirKind::Repetition(rep), props }
397
702k
    }
<regex_syntax::hir::Hir>::repetition
Line
Count
Source
378
249k
    pub fn repetition(mut rep: Repetition) -> Hir {
379
249k
        // If the sub-expression of a repetition can only match the empty
380
249k
        // string, then we force its maximum to be at most 1.
381
249k
        if rep.sub.properties().maximum_len() == Some(0) {
382
2.17k
            rep.min = cmp::min(rep.min, 1);
383
2.17k
            rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384
247k
        }
385
        // The regex 'a{0}' is always equivalent to the empty regex. This is
386
        // true even when 'a' is an expression that never matches anything
387
        // (like '\P{any}').
388
        //
389
        // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390
249k
        if rep.min == 0 && rep.max == Some(0) {
391
0
            return Hir::empty();
392
249k
        } else if rep.min == 1 && rep.max == Some(1) {
393
0
            return *rep.sub;
394
249k
        }
395
249k
        let props = Properties::repetition(&rep);
396
249k
        Hir { kind: HirKind::Repetition(rep), props }
397
249k
    }
<regex_syntax::hir::Hir>::repetition
Line
Count
Source
378
453k
    pub fn repetition(mut rep: Repetition) -> Hir {
379
453k
        // If the sub-expression of a repetition can only match the empty
380
453k
        // string, then we force its maximum to be at most 1.
381
453k
        if rep.sub.properties().maximum_len() == Some(0) {
382
27.7k
            rep.min = cmp::min(rep.min, 1);
383
27.7k
            rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384
425k
        }
385
        // The regex 'a{0}' is always equivalent to the empty regex. This is
386
        // true even when 'a' is an expression that never matches anything
387
        // (like '\P{any}').
388
        //
389
        // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390
453k
        if rep.min == 0 && rep.max == Some(0) {
391
4.08k
            return Hir::empty();
392
448k
        } else if rep.min == 1 && rep.max == Some(1) {
393
3.97k
            return *rep.sub;
394
444k
        }
395
444k
        let props = Properties::repetition(&rep);
396
444k
        Hir { kind: HirKind::Repetition(rep), props }
397
453k
    }
398
399
    /// Creates a capture HIR expression.
400
    ///
401
    /// Note that there is no explicit HIR value for a non-capturing group.
402
    /// Since a non-capturing group only exists to override precedence in the
403
    /// concrete syntax and since an HIR already does its own grouping based on
404
    /// what is parsed, there is no need to explicitly represent non-capturing
405
    /// groups in the HIR.
406
    #[inline]
407
99.7k
    pub fn capture(capture: Capture) -> Hir {
408
99.7k
        let props = Properties::capture(&capture);
409
99.7k
        Hir { kind: HirKind::Capture(capture), props }
410
99.7k
    }
411
412
    /// Returns the concatenation of the given expressions.
413
    ///
414
    /// This attempts to flatten and simplify the concatenation as appropriate.
415
    ///
416
    /// # Example
417
    ///
418
    /// This shows a simple example of basic flattening of both concatenations
419
    /// and literals.
420
    ///
421
    /// ```
422
    /// use regex_syntax::hir::Hir;
423
    ///
424
    /// let hir = Hir::concat(vec![
425
    ///     Hir::concat(vec![
426
    ///         Hir::literal([b'a']),
427
    ///         Hir::literal([b'b']),
428
    ///         Hir::literal([b'c']),
429
    ///     ]),
430
    ///     Hir::concat(vec![
431
    ///         Hir::literal([b'x']),
432
    ///         Hir::literal([b'y']),
433
    ///         Hir::literal([b'z']),
434
    ///     ]),
435
    /// ]);
436
    /// let expected = Hir::literal("abcxyz".as_bytes());
437
    /// assert_eq!(expected, hir);
438
    /// ```
439
1.38M
    pub fn concat(subs: Vec<Hir>) -> Hir {
440
1.38M
        // We rebuild the concatenation by simplifying it. Would be nice to do
441
1.38M
        // it in place, but that seems a little tricky?
442
1.38M
        let mut new = vec![];
443
1.38M
        // This gobbles up any adjacent literals in a concatenation and smushes
444
1.38M
        // them together. Basically, when we see a literal, we add its bytes
445
1.38M
        // to 'prior_lit', and whenever we see anything else, we first take
446
1.38M
        // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447
1.38M
        let mut prior_lit: Option<Vec<u8>> = None;
448
8.42M
        for sub in subs {
449
7.04M
            let (kind, props) = sub.into_parts();
450
7.04M
            match kind {
451
1.80M
                HirKind::Literal(Literal(bytes)) => {
452
1.80M
                    if let Some(ref mut prior_bytes) = prior_lit {
453
22.8k
                        prior_bytes.extend_from_slice(&bytes);
454
1.78M
                    } else {
455
1.78M
                        prior_lit = Some(bytes.to_vec());
456
1.78M
                    }
457
                }
458
                // We also flatten concats that are direct children of another
459
                // concat. We only need to do this one level deep since
460
                // Hir::concat is the only way to build concatenations, and so
461
                // flattening happens inductively.
462
2.57k
                HirKind::Concat(subs2) => {
463
39.2k
                    for sub2 in subs2 {
464
36.7k
                        let (kind2, props2) = sub2.into_parts();
465
36.7k
                        match kind2 {
466
14.1k
                            HirKind::Literal(Literal(bytes)) => {
467
14.1k
                                if let Some(ref mut prior_bytes) = prior_lit {
468
896
                                    prior_bytes.extend_from_slice(&bytes);
469
13.2k
                                } else {
470
13.2k
                                    prior_lit = Some(bytes.to_vec());
471
13.2k
                                }
472
                            }
473
22.5k
                            kind2 => {
474
22.5k
                                if let Some(prior_bytes) = prior_lit.take() {
475
13.6k
                                    new.push(Hir::literal(prior_bytes));
476
13.6k
                                }
477
22.5k
                                new.push(Hir { kind: kind2, props: props2 });
478
                            }
479
                        }
480
                    }
481
                }
482
                // We can just skip empty HIRs.
483
1.33k
                HirKind::Empty => {}
484
5.22M
                kind => {
485
5.22M
                    if let Some(prior_bytes) = prior_lit.take() {
486
697k
                        new.push(Hir::literal(prior_bytes));
487
4.53M
                    }
488
5.22M
                    new.push(Hir { kind, props });
489
                }
490
            }
491
        }
492
1.38M
        if let Some(prior_bytes) = prior_lit.take() {
493
1.08M
            new.push(Hir::literal(prior_bytes));
494
1.08M
        }
495
1.38M
        if new.is_empty() {
496
673
            return Hir::empty();
497
1.38M
        } else if new.len() == 1 {
498
998k
            return new.pop().unwrap();
499
382k
        }
500
382k
        let props = Properties::concat(&new);
501
382k
        Hir { kind: HirKind::Concat(new), props }
502
1.38M
    }
503
504
    /// Returns the alternation of the given expressions.
505
    ///
506
    /// This flattens and simplifies the alternation as appropriate. This may
507
    /// include factoring out common prefixes or even rewriting the alternation
508
    /// as a character class.
509
    ///
510
    /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511
    /// is not possible for one to write an empty alternation, or even an
512
    /// alternation with a single sub-expression, in the concrete syntax of a
513
    /// regex.)
514
    ///
515
    /// # Example
516
    ///
517
    /// This is a simple example showing how an alternation might get
518
    /// simplified.
519
    ///
520
    /// ```
521
    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522
    ///
523
    /// let hir = Hir::alternation(vec![
524
    ///     Hir::literal([b'a']),
525
    ///     Hir::literal([b'b']),
526
    ///     Hir::literal([b'c']),
527
    ///     Hir::literal([b'd']),
528
    ///     Hir::literal([b'e']),
529
    ///     Hir::literal([b'f']),
530
    /// ]);
531
    /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532
    ///     ClassUnicodeRange::new('a', 'f'),
533
    /// ])));
534
    /// assert_eq!(expected, hir);
535
    /// ```
536
    ///
537
    /// And another example showing how common prefixes might get factored
538
    /// out.
539
    ///
540
    /// ```
541
    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542
    ///
543
    /// let hir = Hir::alternation(vec![
544
    ///     Hir::concat(vec![
545
    ///         Hir::literal("abc".as_bytes()),
546
    ///         Hir::class(Class::Unicode(ClassUnicode::new([
547
    ///             ClassUnicodeRange::new('A', 'Z'),
548
    ///         ]))),
549
    ///     ]),
550
    ///     Hir::concat(vec![
551
    ///         Hir::literal("abc".as_bytes()),
552
    ///         Hir::class(Class::Unicode(ClassUnicode::new([
553
    ///             ClassUnicodeRange::new('a', 'z'),
554
    ///         ]))),
555
    ///     ]),
556
    /// ]);
557
    /// let expected = Hir::concat(vec![
558
    ///     Hir::literal("abc".as_bytes()),
559
    ///     Hir::alternation(vec![
560
    ///         Hir::class(Class::Unicode(ClassUnicode::new([
561
    ///             ClassUnicodeRange::new('A', 'Z'),
562
    ///         ]))),
563
    ///         Hir::class(Class::Unicode(ClassUnicode::new([
564
    ///             ClassUnicodeRange::new('a', 'z'),
565
    ///         ]))),
566
    ///     ]),
567
    /// ]);
568
    /// assert_eq!(expected, hir);
569
    /// ```
570
    ///
571
    /// Note that these sorts of simplifications are not guaranteed.
572
70.6k
    pub fn alternation(subs: Vec<Hir>) -> Hir {
573
70.6k
        // We rebuild the alternation by simplifying it. We proceed similarly
574
70.6k
        // as the concatenation case. But in this case, there's no literal
575
70.6k
        // simplification happening. We're just flattening alternations.
576
70.6k
        let mut new = Vec::with_capacity(subs.len());
577
11.4M
        for sub in subs {
578
11.4M
            let (kind, props) = sub.into_parts();
579
11.4M
            match kind {
580
1.29k
                HirKind::Alternation(subs2) => {
581
1.29k
                    new.extend(subs2);
582
1.29k
                }
583
11.4M
                kind => {
584
11.4M
                    new.push(Hir { kind, props });
585
11.4M
                }
586
            }
587
        }
588
70.6k
        if new.is_empty() {
589
0
            return Hir::fail();
590
70.6k
        } else if new.len() == 1 {
591
0
            return new.pop().unwrap();
592
70.6k
        }
593
        // Now that it's completely flattened, look for the special case of
594
        // 'char1|char2|...|charN' and collapse that into a class. Note that
595
        // we look for 'char' first and then bytes. The issue here is that if
596
        // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597
        // then it isn't actually possible to smush them into a single class.
598
        // (Because classes are either "all codepoints" or "all bytes." You
599
        // can have a class that both matches non-ASCII but valid UTF-8 and
600
        // invalid UTF-8.) So we look for all chars and then all bytes, and
601
        // don't handle anything else.
602
70.6k
        if let Some(singletons) = singleton_chars(&new) {
603
3.65k
            let it = singletons
604
3.65k
                .into_iter()
605
469k
                .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606
3.65k
            return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607
66.9k
        }
608
66.9k
        if let Some(singletons) = singleton_bytes(&new) {
609
0
            let it = singletons
610
0
                .into_iter()
611
0
                .map(|b| ClassBytesRange { start: b, end: b });
612
0
            return Hir::class(Class::Bytes(ClassBytes::new(it)));
613
66.9k
        }
614
        // Similar to singleton chars, we can also look for alternations of
615
        // classes. Those can be smushed into a single class.
616
66.9k
        if let Some(cls) = class_chars(&new) {
617
78
            return Hir::class(cls);
618
66.8k
        }
619
66.8k
        if let Some(cls) = class_bytes(&new) {
620
0
            return Hir::class(cls);
621
66.8k
        }
622
        // Factor out a common prefix if we can, which might potentially
623
        // simplify the expression and unlock other optimizations downstream.
624
        // It also might generally make NFA matching and DFA construction
625
        // faster by reducing the scope of branching in the regex.
626
66.8k
        new = match lift_common_prefix(new) {
627
1.71k
            Ok(hir) => return hir,
628
65.1k
            Err(unchanged) => unchanged,
629
65.1k
        };
630
65.1k
        let props = Properties::alternation(&new);
631
65.1k
        Hir { kind: HirKind::Alternation(new), props }
632
70.6k
    }
633
634
    /// Returns an HIR expression for `.`.
635
    ///
636
    /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637
    /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638
    /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639
    /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640
    /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641
    /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642
    ///
643
    /// # Example
644
    ///
645
    /// Note that this is a convenience routine for constructing the correct
646
    /// character class based on the value of `Dot`. There is no explicit "dot"
647
    /// HIR value. It is just an abbreviation for a common character class.
648
    ///
649
    /// ```
650
    /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651
    ///
652
    /// let hir = Hir::dot(Dot::AnyByte);
653
    /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654
    ///     ClassBytesRange::new(0x00, 0xFF),
655
    /// ])));
656
    /// assert_eq!(expected, hir);
657
    /// ```
658
    #[inline]
659
1.42M
    pub fn dot(dot: Dot) -> Hir {
660
1.42M
        match dot {
661
1.29k
            Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662
1.29k
                ClassUnicodeRange::new('\0', '\u{10FFFF}'),
663
1.29k
            ]))),
664
221k
            Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665
221k
                ClassBytesRange::new(b'\0', b'\xFF'),
666
221k
            ]))),
667
1.20M
            Dot::AnyCharExcept(ch) => {
668
1.20M
                let mut cls =
669
1.20M
                    ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670
1.20M
                cls.negate();
671
1.20M
                Hir::class(Class::Unicode(cls))
672
            }
673
            Dot::AnyCharExceptLF => {
674
0
                Hir::class(Class::Unicode(ClassUnicode::new([
675
0
                    ClassUnicodeRange::new('\0', '\x09'),
676
0
                    ClassUnicodeRange::new('\x0B', '\u{10FFFF}'),
677
0
                ])))
678
            }
679
            Dot::AnyCharExceptCRLF => {
680
319
                Hir::class(Class::Unicode(ClassUnicode::new([
681
319
                    ClassUnicodeRange::new('\0', '\x09'),
682
319
                    ClassUnicodeRange::new('\x0B', '\x0C'),
683
319
                    ClassUnicodeRange::new('\x0E', '\u{10FFFF}'),
684
319
                ])))
685
            }
686
0
            Dot::AnyByteExcept(byte) => {
687
0
                let mut cls =
688
0
                    ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689
0
                cls.negate();
690
0
                Hir::class(Class::Bytes(cls))
691
            }
692
            Dot::AnyByteExceptLF => {
693
0
                Hir::class(Class::Bytes(ClassBytes::new([
694
0
                    ClassBytesRange::new(b'\0', b'\x09'),
695
0
                    ClassBytesRange::new(b'\x0B', b'\xFF'),
696
0
                ])))
697
            }
698
            Dot::AnyByteExceptCRLF => {
699
0
                Hir::class(Class::Bytes(ClassBytes::new([
700
0
                    ClassBytesRange::new(b'\0', b'\x09'),
701
0
                    ClassBytesRange::new(b'\x0B', b'\x0C'),
702
0
                    ClassBytesRange::new(b'\x0E', b'\xFF'),
703
0
                ])))
704
            }
705
        }
706
1.42M
    }
<regex_syntax::hir::Hir>::dot
Line
Count
Source
659
221k
    pub fn dot(dot: Dot) -> Hir {
660
221k
        match dot {
661
0
            Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662
0
                ClassUnicodeRange::new('\0', '\u{10FFFF}'),
663
0
            ]))),
664
221k
            Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665
221k
                ClassBytesRange::new(b'\0', b'\xFF'),
666
221k
            ]))),
667
0
            Dot::AnyCharExcept(ch) => {
668
0
                let mut cls =
669
0
                    ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670
0
                cls.negate();
671
0
                Hir::class(Class::Unicode(cls))
672
            }
673
            Dot::AnyCharExceptLF => {
674
0
                Hir::class(Class::Unicode(ClassUnicode::new([
675
0
                    ClassUnicodeRange::new('\0', '\x09'),
676
0
                    ClassUnicodeRange::new('\x0B', '\u{10FFFF}'),
677
0
                ])))
678
            }
679
            Dot::AnyCharExceptCRLF => {
680
0
                Hir::class(Class::Unicode(ClassUnicode::new([
681
0
                    ClassUnicodeRange::new('\0', '\x09'),
682
0
                    ClassUnicodeRange::new('\x0B', '\x0C'),
683
0
                    ClassUnicodeRange::new('\x0E', '\u{10FFFF}'),
684
0
                ])))
685
            }
686
0
            Dot::AnyByteExcept(byte) => {
687
0
                let mut cls =
688
0
                    ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689
0
                cls.negate();
690
0
                Hir::class(Class::Bytes(cls))
691
            }
692
            Dot::AnyByteExceptLF => {
693
0
                Hir::class(Class::Bytes(ClassBytes::new([
694
0
                    ClassBytesRange::new(b'\0', b'\x09'),
695
0
                    ClassBytesRange::new(b'\x0B', b'\xFF'),
696
0
                ])))
697
            }
698
            Dot::AnyByteExceptCRLF => {
699
0
                Hir::class(Class::Bytes(ClassBytes::new([
700
0
                    ClassBytesRange::new(b'\0', b'\x09'),
701
0
                    ClassBytesRange::new(b'\x0B', b'\x0C'),
702
0
                    ClassBytesRange::new(b'\x0E', b'\xFF'),
703
0
                ])))
704
            }
705
        }
706
221k
    }
<regex_syntax::hir::Hir>::dot
Line
Count
Source
659
1.20M
    pub fn dot(dot: Dot) -> Hir {
660
1.20M
        match dot {
661
1.29k
            Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662
1.29k
                ClassUnicodeRange::new('\0', '\u{10FFFF}'),
663
1.29k
            ]))),
664
0
            Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665
0
                ClassBytesRange::new(b'\0', b'\xFF'),
666
0
            ]))),
667
1.20M
            Dot::AnyCharExcept(ch) => {
668
1.20M
                let mut cls =
669
1.20M
                    ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670
1.20M
                cls.negate();
671
1.20M
                Hir::class(Class::Unicode(cls))
672
            }
673
            Dot::AnyCharExceptLF => {
674
0
                Hir::class(Class::Unicode(ClassUnicode::new([
675
0
                    ClassUnicodeRange::new('\0', '\x09'),
676
0
                    ClassUnicodeRange::new('\x0B', '\u{10FFFF}'),
677
0
                ])))
678
            }
679
            Dot::AnyCharExceptCRLF => {
680
319
                Hir::class(Class::Unicode(ClassUnicode::new([
681
319
                    ClassUnicodeRange::new('\0', '\x09'),
682
319
                    ClassUnicodeRange::new('\x0B', '\x0C'),
683
319
                    ClassUnicodeRange::new('\x0E', '\u{10FFFF}'),
684
319
                ])))
685
            }
686
0
            Dot::AnyByteExcept(byte) => {
687
0
                let mut cls =
688
0
                    ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689
0
                cls.negate();
690
0
                Hir::class(Class::Bytes(cls))
691
            }
692
            Dot::AnyByteExceptLF => {
693
0
                Hir::class(Class::Bytes(ClassBytes::new([
694
0
                    ClassBytesRange::new(b'\0', b'\x09'),
695
0
                    ClassBytesRange::new(b'\x0B', b'\xFF'),
696
0
                ])))
697
            }
698
            Dot::AnyByteExceptCRLF => {
699
0
                Hir::class(Class::Bytes(ClassBytes::new([
700
0
                    ClassBytesRange::new(b'\0', b'\x09'),
701
0
                    ClassBytesRange::new(b'\x0B', b'\x0C'),
702
0
                    ClassBytesRange::new(b'\x0E', b'\xFF'),
703
0
                ])))
704
            }
705
        }
706
1.20M
    }
707
}
708
709
/// The underlying kind of an arbitrary [`Hir`] expression.
710
///
711
/// An `HirKind` is principally useful for doing case analysis on the type
712
/// of a regular expression. If you're looking to build new `Hir` values,
713
/// then you _must_ use the smart constructors defined on `Hir`, like
714
/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
715
/// not expose any way of building an `Hir` directly from an `HirKind`.
716
#[derive(Clone, Debug, Eq, PartialEq)]
717
pub enum HirKind {
718
    /// The empty regular expression, which matches everything, including the
719
    /// empty string.
720
    Empty,
721
    /// A literalstring that matches exactly these bytes.
722
    Literal(Literal),
723
    /// A single character class that matches any of the characters in the
724
    /// class. A class can either consist of Unicode scalar values as
725
    /// characters, or it can use bytes.
726
    ///
727
    /// A class may be empty. In which case, it matches nothing.
728
    Class(Class),
729
    /// A look-around assertion. A look-around match always has zero length.
730
    Look(Look),
731
    /// A repetition operation applied to a sub-expression.
732
    Repetition(Repetition),
733
    /// A capturing group, which contains a sub-expression.
734
    Capture(Capture),
735
    /// A concatenation of expressions.
736
    ///
737
    /// A concatenation matches only if each of its sub-expressions match one
738
    /// after the other.
739
    ///
740
    /// Concatenations are guaranteed by `Hir`'s smart constructors to always
741
    /// have at least two sub-expressions.
742
    Concat(Vec<Hir>),
743
    /// An alternation of expressions.
744
    ///
745
    /// An alternation matches only if at least one of its sub-expressions
746
    /// match. If multiple sub-expressions match, then the leftmost is
747
    /// preferred.
748
    ///
749
    /// Alternations are guaranteed by `Hir`'s smart constructors to always
750
    /// have at least two sub-expressions.
751
    Alternation(Vec<Hir>),
752
}
753
754
impl HirKind {
755
    /// Returns a slice of this kind's sub-expressions, if any.
756
833k
    pub fn subs(&self) -> &[Hir] {
757
        use core::slice::from_ref;
758
759
833k
        match *self {
760
            HirKind::Empty
761
            | HirKind::Literal(_)
762
            | HirKind::Class(_)
763
794k
            | HirKind::Look(_) => &[],
764
31.8k
            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
765
547
            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
766
3.66k
            HirKind::Concat(ref subs) => subs,
767
2.43k
            HirKind::Alternation(ref subs) => subs,
768
        }
769
833k
    }
770
}
771
772
impl core::fmt::Debug for Hir {
773
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774
0
        self.kind.fmt(f)
775
0
    }
776
}
777
778
/// Print a display representation of this Hir.
779
///
780
/// The result of this is a valid regular expression pattern string.
781
///
782
/// This implementation uses constant stack space and heap space proportional
783
/// to the size of the `Hir`.
784
impl core::fmt::Display for Hir {
785
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
786
0
        crate::hir::print::Printer::new().print(self, f)
787
0
    }
788
}
789
790
/// The high-level intermediate representation of a literal.
791
///
792
/// A literal corresponds to `0` or more bytes that should be matched
793
/// literally. The smart constructors defined on `Hir` will automatically
794
/// concatenate adjacent literals into one literal, and will even automatically
795
/// replace empty literals with `Hir::empty()`.
796
///
797
/// Note that despite a literal being represented by a sequence of bytes, its
798
/// `Debug` implementation will attempt to print it as a normal string. (That
799
/// is, not a sequence of decimal numbers.)
800
#[derive(Clone, Eq, PartialEq)]
801
pub struct Literal(pub Box<[u8]>);
802
803
impl core::fmt::Debug for Literal {
804
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
805
0
        crate::debug::Bytes(&self.0).fmt(f)
806
0
    }
807
}
808
809
/// The high-level intermediate representation of a character class.
810
///
811
/// A character class corresponds to a set of characters. A character is either
812
/// defined by a Unicode scalar value or a byte.
813
///
814
/// A character class, regardless of its character type, is represented by a
815
/// sequence of non-overlapping non-adjacent ranges of characters.
816
///
817
/// There are no guarantees about which class variant is used. Generally
818
/// speaking, the Unicode variat is used whenever a class needs to contain
819
/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
820
/// when Unicode mode is disabled. For example, at the time of writing, the
821
/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
822
/// `[a\u00A0]` due to optimizations.
823
///
824
/// Note that `Bytes` variant may be produced even when it exclusively matches
825
/// valid UTF-8. This is because a `Bytes` variant represents an intention by
826
/// the author of the regular expression to disable Unicode mode, which in turn
827
/// impacts the semantics of case insensitive matching. For example, `(?i)k`
828
/// and `(?i-u)k` will not match the same set of strings.
829
#[derive(Clone, Eq, PartialEq)]
830
pub enum Class {
831
    /// A set of characters represented by Unicode scalar values.
832
    Unicode(ClassUnicode),
833
    /// A set of characters represented by arbitrary bytes (one byte per
834
    /// character).
835
    Bytes(ClassBytes),
836
}
837
838
impl Class {
839
    /// Apply Unicode simple case folding to this character class, in place.
840
    /// The character class will be expanded to include all simple case folded
841
    /// character variants.
842
    ///
843
    /// If this is a byte oriented character class, then this will be limited
844
    /// to the ASCII ranges `A-Z` and `a-z`.
845
    ///
846
    /// # Panics
847
    ///
848
    /// This routine panics when the case mapping data necessary for this
849
    /// routine to complete is unavailable. This occurs when the `unicode-case`
850
    /// feature is not enabled and the underlying class is Unicode oriented.
851
    ///
852
    /// Callers should prefer using `try_case_fold_simple` instead, which will
853
    /// return an error instead of panicking.
854
0
    pub fn case_fold_simple(&mut self) {
855
0
        match *self {
856
0
            Class::Unicode(ref mut x) => x.case_fold_simple(),
857
0
            Class::Bytes(ref mut x) => x.case_fold_simple(),
858
        }
859
0
    }
860
861
    /// Apply Unicode simple case folding to this character class, in place.
862
    /// The character class will be expanded to include all simple case folded
863
    /// character variants.
864
    ///
865
    /// If this is a byte oriented character class, then this will be limited
866
    /// to the ASCII ranges `A-Z` and `a-z`.
867
    ///
868
    /// # Error
869
    ///
870
    /// This routine returns an error when the case mapping data necessary
871
    /// for this routine to complete is unavailable. This occurs when the
872
    /// `unicode-case` feature is not enabled and the underlying class is
873
    /// Unicode oriented.
874
0
    pub fn try_case_fold_simple(
875
0
        &mut self,
876
0
    ) -> core::result::Result<(), CaseFoldError> {
877
0
        match *self {
878
0
            Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
879
0
            Class::Bytes(ref mut x) => x.case_fold_simple(),
880
        }
881
0
        Ok(())
882
0
    }
883
884
    /// Negate this character class in place.
885
    ///
886
    /// After completion, this character class will contain precisely the
887
    /// characters that weren't previously in the class.
888
0
    pub fn negate(&mut self) {
889
0
        match *self {
890
0
            Class::Unicode(ref mut x) => x.negate(),
891
0
            Class::Bytes(ref mut x) => x.negate(),
892
        }
893
0
    }
894
895
    /// Returns true if and only if this character class will only ever match
896
    /// valid UTF-8.
897
    ///
898
    /// A character class can match invalid UTF-8 only when the following
899
    /// conditions are met:
900
    ///
901
    /// 1. The translator was configured to permit generating an expression
902
    ///    that can match invalid UTF-8. (By default, this is disabled.)
903
    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
904
    ///    syntax or in the parser builder. By default, Unicode mode is
905
    ///    enabled.
906
3.78M
    pub fn is_utf8(&self) -> bool {
907
3.78M
        match *self {
908
3.39M
            Class::Unicode(_) => true,
909
389k
            Class::Bytes(ref x) => x.is_ascii(),
910
        }
911
3.78M
    }
912
913
    /// Returns the length, in bytes, of the smallest string matched by this
914
    /// character class.
915
    ///
916
    /// For non-empty byte oriented classes, this always returns `1`. For
917
    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
918
    /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
919
    /// be returned.
920
    ///
921
    /// # Example
922
    ///
923
    /// This example shows some examples of regexes and their corresponding
924
    /// minimum length, if any.
925
    ///
926
    /// ```
927
    /// use regex_syntax::{hir::Properties, parse};
928
    ///
929
    /// // The empty string has a min length of 0.
930
    /// let hir = parse(r"")?;
931
    /// assert_eq!(Some(0), hir.properties().minimum_len());
932
    /// // As do other types of regexes that only match the empty string.
933
    /// let hir = parse(r"^$\b\B")?;
934
    /// assert_eq!(Some(0), hir.properties().minimum_len());
935
    /// // A regex that can match the empty string but match more is still 0.
936
    /// let hir = parse(r"a*")?;
937
    /// assert_eq!(Some(0), hir.properties().minimum_len());
938
    /// // A regex that matches nothing has no minimum defined.
939
    /// let hir = parse(r"[a&&b]")?;
940
    /// assert_eq!(None, hir.properties().minimum_len());
941
    /// // Character classes usually have a minimum length of 1.
942
    /// let hir = parse(r"\w")?;
943
    /// assert_eq!(Some(1), hir.properties().minimum_len());
944
    /// // But sometimes Unicode classes might be bigger!
945
    /// let hir = parse(r"\p{Cyrillic}")?;
946
    /// assert_eq!(Some(2), hir.properties().minimum_len());
947
    ///
948
    /// # Ok::<(), Box<dyn std::error::Error>>(())
949
    /// ```
950
3.78M
    pub fn minimum_len(&self) -> Option<usize> {
951
3.78M
        match *self {
952
3.39M
            Class::Unicode(ref x) => x.minimum_len(),
953
389k
            Class::Bytes(ref x) => x.minimum_len(),
954
        }
955
3.78M
    }
956
957
    /// Returns the length, in bytes, of the longest string matched by this
958
    /// character class.
959
    ///
960
    /// For non-empty byte oriented classes, this always returns `1`. For
961
    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
962
    /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
963
    /// be returned.
964
    ///
965
    /// # Example
966
    ///
967
    /// This example shows some examples of regexes and their corresponding
968
    /// maximum length, if any.
969
    ///
970
    /// ```
971
    /// use regex_syntax::{hir::Properties, parse};
972
    ///
973
    /// // The empty string has a max length of 0.
974
    /// let hir = parse(r"")?;
975
    /// assert_eq!(Some(0), hir.properties().maximum_len());
976
    /// // As do other types of regexes that only match the empty string.
977
    /// let hir = parse(r"^$\b\B")?;
978
    /// assert_eq!(Some(0), hir.properties().maximum_len());
979
    /// // A regex that matches nothing has no maximum defined.
980
    /// let hir = parse(r"[a&&b]")?;
981
    /// assert_eq!(None, hir.properties().maximum_len());
982
    /// // Bounded repeats work as you expect.
983
    /// let hir = parse(r"x{2,10}")?;
984
    /// assert_eq!(Some(10), hir.properties().maximum_len());
985
    /// // An unbounded repeat means there is no maximum.
986
    /// let hir = parse(r"x{2,}")?;
987
    /// assert_eq!(None, hir.properties().maximum_len());
988
    /// // With Unicode enabled, \w can match up to 4 bytes!
989
    /// let hir = parse(r"\w")?;
990
    /// assert_eq!(Some(4), hir.properties().maximum_len());
991
    /// // Without Unicode enabled, \w matches at most 1 byte.
992
    /// let hir = parse(r"(?-u)\w")?;
993
    /// assert_eq!(Some(1), hir.properties().maximum_len());
994
    ///
995
    /// # Ok::<(), Box<dyn std::error::Error>>(())
996
    /// ```
997
3.78M
    pub fn maximum_len(&self) -> Option<usize> {
998
3.78M
        match *self {
999
3.39M
            Class::Unicode(ref x) => x.maximum_len(),
1000
389k
            Class::Bytes(ref x) => x.maximum_len(),
1001
        }
1002
3.78M
    }
1003
1004
    /// Returns true if and only if this character class is empty. That is,
1005
    /// it has no elements.
1006
    ///
1007
    /// An empty character can never match anything, including an empty string.
1008
3.80M
    pub fn is_empty(&self) -> bool {
1009
3.80M
        match *self {
1010
3.41M
            Class::Unicode(ref x) => x.ranges().is_empty(),
1011
383k
            Class::Bytes(ref x) => x.ranges().is_empty(),
1012
        }
1013
3.80M
    }
1014
1015
    /// If this class consists of exactly one element (whether a codepoint or a
1016
    /// byte), then return it as a literal byte string.
1017
    ///
1018
    /// If this class is empty or contains more than one element, then `None`
1019
    /// is returned.
1020
3.78M
    pub fn literal(&self) -> Option<Vec<u8>> {
1021
3.78M
        match *self {
1022
3.40M
            Class::Unicode(ref x) => x.literal(),
1023
379k
            Class::Bytes(ref x) => x.literal(),
1024
        }
1025
3.78M
    }
1026
}
1027
1028
impl core::fmt::Debug for Class {
1029
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1030
        use crate::debug::Byte;
1031
1032
0
        let mut fmter = f.debug_set();
1033
0
        match *self {
1034
0
            Class::Unicode(ref cls) => {
1035
0
                for r in cls.ranges().iter() {
1036
0
                    fmter.entry(&(r.start..=r.end));
1037
0
                }
1038
            }
1039
0
            Class::Bytes(ref cls) => {
1040
0
                for r in cls.ranges().iter() {
1041
0
                    fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1042
0
                }
1043
            }
1044
        }
1045
0
        fmter.finish()
1046
0
    }
1047
}
1048
1049
/// A set of characters represented by Unicode scalar values.
1050
#[derive(Clone, Debug, Eq, PartialEq)]
1051
pub struct ClassUnicode {
1052
    set: IntervalSet<ClassUnicodeRange>,
1053
}
1054
1055
impl ClassUnicode {
1056
    /// Create a new class from a sequence of ranges.
1057
    ///
1058
    /// The given ranges do not need to be in any specific order, and ranges
1059
    /// may overlap. Ranges will automatically be sorted into a canonical
1060
    /// non-overlapping order.
1061
3.16M
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
3.16M
    where
1063
3.16M
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
3.16M
    {
1065
3.16M
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
3.16M
    }
<regex_syntax::hir::ClassUnicode>::new::<[regex_syntax::hir::ClassUnicodeRange; 1]>
Line
Count
Source
1061
1.20M
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
1.20M
    where
1063
1.20M
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
1.20M
    {
1065
1.20M
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
1.20M
    }
Unexecuted instantiation: <regex_syntax::hir::ClassUnicode>::new::<[regex_syntax::hir::ClassUnicodeRange; 2]>
<regex_syntax::hir::ClassUnicode>::new::<[regex_syntax::hir::ClassUnicodeRange; 3]>
Line
Count
Source
1061
319
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
319
    where
1063
319
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
319
    {
1065
319
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
319
    }
<regex_syntax::hir::ClassUnicode>::new::<alloc::vec::Vec<regex_syntax::hir::ClassUnicodeRange>>
Line
Count
Source
1061
1.95M
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
1.95M
    where
1063
1.95M
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
1.95M
    {
1065
1.95M
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
1.95M
    }
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<(u8, u8)>>, regex_syntax::hir::translate::ascii_class_as_chars::{closure#0}>, <regex_syntax::hir::translate::TranslatorI>::hir_ascii_unicode_class::{closure#0}>>
Line
Count
Source
1061
615
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
615
    where
1063
615
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
615
    {
1065
615
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
615
    }
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::ClassBytesRange>, <regex_syntax::hir::ClassBytes>::to_unicode_class::{closure#0}>>
Line
Count
Source
1061
2.11k
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
2.11k
    where
1063
2.11k
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
2.11k
    {
1065
2.11k
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
2.11k
    }
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<alloc::vec::into_iter::IntoIter<char>, <regex_syntax::hir::Hir>::alternation::{closure#0}>>
Line
Count
Source
1061
3.65k
    pub fn new<I>(ranges: I) -> ClassUnicode
1062
3.65k
    where
1063
3.65k
        I: IntoIterator<Item = ClassUnicodeRange>,
1064
3.65k
    {
1065
3.65k
        ClassUnicode { set: IntervalSet::new(ranges) }
1066
3.65k
    }
1067
1068
    /// Create a new class with no ranges.
1069
    ///
1070
    /// An empty class matches nothing. That is, it is equivalent to
1071
    /// [`Hir::fail`].
1072
330k
    pub fn empty() -> ClassUnicode {
1073
330k
        ClassUnicode::new(vec![])
1074
330k
    }
1075
1076
    /// Add a new range to this set.
1077
1.60M
    pub fn push(&mut self, range: ClassUnicodeRange) {
1078
1.60M
        self.set.push(range);
1079
1.60M
    }
1080
1081
    /// Return an iterator over all ranges in this class.
1082
    ///
1083
    /// The iterator yields ranges in ascending order.
1084
8.92M
    pub fn iter(&self) -> ClassUnicodeIter<'_> {
1085
8.92M
        ClassUnicodeIter(self.set.iter())
1086
8.92M
    }
1087
1088
    /// Return the underlying ranges as a slice.
1089
16.5M
    pub fn ranges(&self) -> &[ClassUnicodeRange] {
1090
16.5M
        self.set.intervals()
1091
16.5M
    }
1092
1093
    /// Expand this character class such that it contains all case folded
1094
    /// characters, according to Unicode's "simple" mapping. For example, if
1095
    /// this class consists of the range `a-z`, then applying case folding will
1096
    /// result in the class containing both the ranges `a-z` and `A-Z`.
1097
    ///
1098
    /// # Panics
1099
    ///
1100
    /// This routine panics when the case mapping data necessary for this
1101
    /// routine to complete is unavailable. This occurs when the `unicode-case`
1102
    /// feature is not enabled.
1103
    ///
1104
    /// Callers should prefer using `try_case_fold_simple` instead, which will
1105
    /// return an error instead of panicking.
1106
0
    pub fn case_fold_simple(&mut self) {
1107
0
        self.set
1108
0
            .case_fold_simple()
1109
0
            .expect("unicode-case feature must be enabled");
1110
0
    }
1111
1112
    /// Expand this character class such that it contains all case folded
1113
    /// characters, according to Unicode's "simple" mapping. For example, if
1114
    /// this class consists of the range `a-z`, then applying case folding will
1115
    /// result in the class containing both the ranges `a-z` and `A-Z`.
1116
    ///
1117
    /// # Error
1118
    ///
1119
    /// This routine returns an error when the case mapping data necessary
1120
    /// for this routine to complete is unavailable. This occurs when the
1121
    /// `unicode-case` feature is not enabled.
1122
1.63M
    pub fn try_case_fold_simple(
1123
1.63M
        &mut self,
1124
1.63M
    ) -> core::result::Result<(), CaseFoldError> {
1125
1.63M
        self.set.case_fold_simple()
1126
1.63M
    }
1127
1128
    /// Negate this character class.
1129
    ///
1130
    /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1131
    /// set, then it will not be in this set after negation.
1132
1.21M
    pub fn negate(&mut self) {
1133
1.21M
        self.set.negate();
1134
1.21M
    }
1135
1136
    /// Union this character class with the given character class, in place.
1137
194k
    pub fn union(&mut self, other: &ClassUnicode) {
1138
194k
        self.set.union(&other.set);
1139
194k
    }
1140
1141
    /// Intersect this character class with the given character class, in
1142
    /// place.
1143
2.84k
    pub fn intersect(&mut self, other: &ClassUnicode) {
1144
2.84k
        self.set.intersect(&other.set);
1145
2.84k
    }
1146
1147
    /// Subtract the given character class from this character class, in place.
1148
27.9k
    pub fn difference(&mut self, other: &ClassUnicode) {
1149
27.9k
        self.set.difference(&other.set);
1150
27.9k
    }
1151
1152
    /// Compute the symmetric difference of the given character classes, in
1153
    /// place.
1154
    ///
1155
    /// This computes the symmetric difference of two character classes. This
1156
    /// removes all elements in this class that are also in the given class,
1157
    /// but all adds all elements from the given class that aren't in this
1158
    /// class. That is, the class will contain all elements in either class,
1159
    /// but will not contain any elements that are in both classes.
1160
42.1k
    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1161
42.1k
        self.set.symmetric_difference(&other.set);
1162
42.1k
    }
1163
1164
    /// Returns true if and only if this character class will either match
1165
    /// nothing or only ASCII bytes. Stated differently, this returns false
1166
    /// if and only if this class contains a non-ASCII codepoint.
1167
8.47M
    pub fn is_ascii(&self) -> bool {
1168
8.47M
        self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1169
8.47M
    }
1170
1171
    /// Returns the length, in bytes, of the smallest string matched by this
1172
    /// character class.
1173
    ///
1174
    /// Returns `None` when the class is empty.
1175
3.39M
    pub fn minimum_len(&self) -> Option<usize> {
1176
3.39M
        let first = self.ranges().get(0)?;
1177
        // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1178
3.39M
        Some(first.start.len_utf8())
1179
3.39M
    }
1180
1181
    /// Returns the length, in bytes, of the longest string matched by this
1182
    /// character class.
1183
    ///
1184
    /// Returns `None` when the class is empty.
1185
3.39M
    pub fn maximum_len(&self) -> Option<usize> {
1186
3.39M
        let last = self.ranges().last()?;
1187
        // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1188
3.39M
        Some(last.end.len_utf8())
1189
3.39M
    }
1190
1191
    /// If this class consists of exactly one codepoint, then return it as
1192
    /// a literal byte string.
1193
    ///
1194
    /// If this class is empty or contains more than one codepoint, then `None`
1195
    /// is returned.
1196
3.40M
    pub fn literal(&self) -> Option<Vec<u8>> {
1197
3.40M
        let rs = self.ranges();
1198
3.40M
        if rs.len() == 1 && rs[0].start == rs[0].end {
1199
12.3k
            Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1200
        } else {
1201
3.39M
            None
1202
        }
1203
3.40M
    }
1204
1205
    /// If this class consists of only ASCII ranges, then return its
1206
    /// corresponding and equivalent byte class.
1207
5.86k
    pub fn to_byte_class(&self) -> Option<ClassBytes> {
1208
5.86k
        if !self.is_ascii() {
1209
598
            return None;
1210
5.26k
        }
1211
38.9k
        Some(ClassBytes::new(self.ranges().iter().map(|r| {
1212
38.9k
            // Since we are guaranteed that our codepoint range is ASCII, the
1213
38.9k
            // 'u8::try_from' calls below are guaranteed to be correct.
1214
38.9k
            ClassBytesRange {
1215
38.9k
                start: u8::try_from(r.start).unwrap(),
1216
38.9k
                end: u8::try_from(r.end).unwrap(),
1217
38.9k
            }
1218
38.9k
        })))
1219
5.86k
    }
1220
}
1221
1222
/// An iterator over all ranges in a Unicode character class.
1223
///
1224
/// The lifetime `'a` refers to the lifetime of the underlying class.
1225
#[derive(Debug)]
1226
pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1227
1228
impl<'a> Iterator for ClassUnicodeIter<'a> {
1229
    type Item = &'a ClassUnicodeRange;
1230
1231
55.0M
    fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1232
55.0M
        self.0.next()
1233
55.0M
    }
1234
}
1235
1236
/// A single range of characters represented by Unicode scalar values.
1237
///
1238
/// The range is closed. That is, the start and end of the range are included
1239
/// in the range.
1240
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1241
pub struct ClassUnicodeRange {
1242
    start: char,
1243
    end: char,
1244
}
1245
1246
impl core::fmt::Debug for ClassUnicodeRange {
1247
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1248
0
        let start = if !self.start.is_whitespace() && !self.start.is_control()
1249
        {
1250
0
            self.start.to_string()
1251
        } else {
1252
0
            format!("0x{:X}", u32::from(self.start))
1253
        };
1254
0
        let end = if !self.end.is_whitespace() && !self.end.is_control() {
1255
0
            self.end.to_string()
1256
        } else {
1257
0
            format!("0x{:X}", u32::from(self.end))
1258
        };
1259
0
        f.debug_struct("ClassUnicodeRange")
1260
0
            .field("start", &start)
1261
0
            .field("end", &end)
1262
0
            .finish()
1263
0
    }
1264
}
1265
1266
impl Interval for ClassUnicodeRange {
1267
    type Bound = char;
1268
1269
    #[inline]
1270
253M
    fn lower(&self) -> char {
1271
253M
        self.start
1272
253M
    }
1273
    #[inline]
1274
257M
    fn upper(&self) -> char {
1275
257M
        self.end
1276
257M
    }
1277
    #[inline]
1278
45.4M
    fn set_lower(&mut self, bound: char) {
1279
45.4M
        self.start = bound;
1280
45.4M
    }
1281
    #[inline]
1282
45.4M
    fn set_upper(&mut self, bound: char) {
1283
45.4M
        self.end = bound;
1284
45.4M
    }
1285
1286
    /// Apply simple case folding to this Unicode scalar value range.
1287
    ///
1288
    /// Additional ranges are appended to the given vector. Canonical ordering
1289
    /// is *not* maintained in the given vector.
1290
3.72M
    fn case_fold_simple(
1291
3.72M
        &self,
1292
3.72M
        ranges: &mut Vec<ClassUnicodeRange>,
1293
3.72M
    ) -> Result<(), unicode::CaseFoldError> {
1294
3.72M
        let mut folder = unicode::SimpleCaseFolder::new()?;
1295
3.72M
        if !folder.overlaps(self.start, self.end) {
1296
1.25M
            return Ok(());
1297
2.47M
        }
1298
2.47M
        let (start, end) = (u32::from(self.start), u32::from(self.end));
1299
1.86G
        for cp in (start..=end).filter_map(char::from_u32) {
1300
1.86G
            for &cp_folded in folder.mapping(cp) {
1301
11.4M
                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1302
11.4M
            }
1303
        }
1304
2.47M
        Ok(())
1305
3.72M
    }
1306
}
1307
1308
impl ClassUnicodeRange {
1309
    /// Create a new Unicode scalar value range for a character class.
1310
    ///
1311
    /// The returned range is always in a canonical form. That is, the range
1312
    /// returned always satisfies the invariant that `start <= end`.
1313
27.8M
    pub fn new(start: char, end: char) -> ClassUnicodeRange {
1314
27.8M
        ClassUnicodeRange::create(start, end)
1315
27.8M
    }
1316
1317
    /// Return the start of this range.
1318
    ///
1319
    /// The start of a range is always less than or equal to the end of the
1320
    /// range.
1321
45.4M
    pub fn start(&self) -> char {
1322
45.4M
        self.start
1323
45.4M
    }
1324
1325
    /// Return the end of this range.
1326
    ///
1327
    /// The end of a range is always greater than or equal to the start of the
1328
    /// range.
1329
45.4M
    pub fn end(&self) -> char {
1330
45.4M
        self.end
1331
45.4M
    }
1332
1333
    /// Returns the number of codepoints in this range.
1334
682k
    pub fn len(&self) -> usize {
1335
682k
        let diff = 1 + u32::from(self.end) - u32::from(self.start);
1336
682k
        // This is likely to panic in 16-bit targets since a usize can only fit
1337
682k
        // 2^16. It's not clear what to do here, other than to return an error
1338
682k
        // when building a Unicode class that contains a range whose length
1339
682k
        // overflows usize. (Which, to be honest, is probably quite common on
1340
682k
        // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1341
682k
        // would be impossible to build.)
1342
682k
        usize::try_from(diff).expect("char class len fits in usize")
1343
682k
    }
1344
}
1345
1346
/// A set of characters represented by arbitrary bytes.
1347
///
1348
/// Each byte corresponds to one character.
1349
#[derive(Clone, Debug, Eq, PartialEq)]
1350
pub struct ClassBytes {
1351
    set: IntervalSet<ClassBytesRange>,
1352
}
1353
1354
impl ClassBytes {
1355
    /// Create a new class from a sequence of ranges.
1356
    ///
1357
    /// The given ranges do not need to be in any specific order, and ranges
1358
    /// may overlap. Ranges will automatically be sorted into a canonical
1359
    /// non-overlapping order.
1360
1.23M
    pub fn new<I>(ranges: I) -> ClassBytes
1361
1.23M
    where
1362
1.23M
        I: IntoIterator<Item = ClassBytesRange>,
1363
1.23M
    {
1364
1.23M
        ClassBytes { set: IntervalSet::new(ranges) }
1365
1.23M
    }
<regex_syntax::hir::ClassBytes>::new::<[regex_syntax::hir::ClassBytesRange; 1]>
Line
Count
Source
1360
221k
    pub fn new<I>(ranges: I) -> ClassBytes
1361
221k
    where
1362
221k
        I: IntoIterator<Item = ClassBytesRange>,
1363
221k
    {
1364
221k
        ClassBytes { set: IntervalSet::new(ranges) }
1365
221k
    }
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<[regex_syntax::hir::ClassBytesRange; 2]>
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<[regex_syntax::hir::ClassBytesRange; 3]>
<regex_syntax::hir::ClassBytes>::new::<alloc::vec::Vec<regex_syntax::hir::ClassBytesRange>>
Line
Count
Source
1360
1.00M
    pub fn new<I>(ranges: I) -> ClassBytes
1361
1.00M
    where
1362
1.00M
        I: IntoIterator<Item = ClassBytesRange>,
1363
1.00M
    {
1364
1.00M
        ClassBytes { set: IntervalSet::new(ranges) }
1365
1.00M
    }
<regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<(u8, u8)>>, <regex_syntax::hir::translate::TranslatorI>::hir_ascii_byte_class::{closure#0}>>
Line
Count
Source
1360
238
    pub fn new<I>(ranges: I) -> ClassBytes
1361
238
    where
1362
238
        I: IntoIterator<Item = ClassBytesRange>,
1363
238
    {
1364
238
        ClassBytes { set: IntervalSet::new(ranges) }
1365
238
    }
<regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::ClassUnicodeRange>, <regex_syntax::hir::ClassUnicode>::to_byte_class::{closure#0}>>
Line
Count
Source
1360
5.26k
    pub fn new<I>(ranges: I) -> ClassBytes
1361
5.26k
    where
1362
5.26k
        I: IntoIterator<Item = ClassBytesRange>,
1363
5.26k
    {
1364
5.26k
        ClassBytes { set: IntervalSet::new(ranges) }
1365
5.26k
    }
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<alloc::vec::into_iter::IntoIter<u8>, <regex_syntax::hir::Hir>::alternation::{closure#1}>>
1366
1367
    /// Create a new class with no ranges.
1368
    ///
1369
    /// An empty class matches nothing. That is, it is equivalent to
1370
    /// [`Hir::fail`].
1371
818k
    pub fn empty() -> ClassBytes {
1372
818k
        ClassBytes::new(vec![])
1373
818k
    }
1374
1375
    /// Add a new range to this set.
1376
6.07M
    pub fn push(&mut self, range: ClassBytesRange) {
1377
6.07M
        self.set.push(range);
1378
6.07M
    }
1379
1380
    /// Return an iterator over all ranges in this class.
1381
    ///
1382
    /// The iterator yields ranges in ascending order.
1383
2.64M
    pub fn iter(&self) -> ClassBytesIter<'_> {
1384
2.64M
        ClassBytesIter(self.set.iter())
1385
2.64M
    }
1386
1387
    /// Return the underlying ranges as a slice.
1388
4.14M
    pub fn ranges(&self) -> &[ClassBytesRange] {
1389
4.14M
        self.set.intervals()
1390
4.14M
    }
1391
1392
    /// Expand this character class such that it contains all case folded
1393
    /// characters. For example, if this class consists of the range `a-z`,
1394
    /// then applying case folding will result in the class containing both the
1395
    /// ranges `a-z` and `A-Z`.
1396
    ///
1397
    /// Note that this only applies ASCII case folding, which is limited to the
1398
    /// characters `a-z` and `A-Z`.
1399
898k
    pub fn case_fold_simple(&mut self) {
1400
898k
        self.set.case_fold_simple().expect("ASCII case folding never fails");
1401
898k
    }
1402
1403
    /// Negate this byte class.
1404
    ///
1405
    /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1406
    /// will not be in this set after negation.
1407
274
    pub fn negate(&mut self) {
1408
274
        self.set.negate();
1409
274
    }
1410
1411
    /// Union this byte class with the given byte class, in place.
1412
743k
    pub fn union(&mut self, other: &ClassBytes) {
1413
743k
        self.set.union(&other.set);
1414
743k
    }
1415
1416
    /// Intersect this byte class with the given byte class, in place.
1417
1.53k
    pub fn intersect(&mut self, other: &ClassBytes) {
1418
1.53k
        self.set.intersect(&other.set);
1419
1.53k
    }
1420
1421
    /// Subtract the given byte class from this byte class, in place.
1422
20.2k
    pub fn difference(&mut self, other: &ClassBytes) {
1423
20.2k
        self.set.difference(&other.set);
1424
20.2k
    }
1425
1426
    /// Compute the symmetric difference of the given byte classes, in place.
1427
    ///
1428
    /// This computes the symmetric difference of two byte classes. This
1429
    /// removes all elements in this class that are also in the given class,
1430
    /// but all adds all elements from the given class that aren't in this
1431
    /// class. That is, the class will contain all elements in either class,
1432
    /// but will not contain any elements that are in both classes.
1433
32.7k
    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1434
32.7k
        self.set.symmetric_difference(&other.set);
1435
32.7k
    }
1436
1437
    /// Returns true if and only if this character class will either match
1438
    /// nothing or only ASCII bytes. Stated differently, this returns false
1439
    /// if and only if this class contains a non-ASCII byte.
1440
1.08M
    pub fn is_ascii(&self) -> bool {
1441
1.08M
        self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1442
1.08M
    }
1443
1444
    /// Returns the length, in bytes, of the smallest string matched by this
1445
    /// character class.
1446
    ///
1447
    /// Returns `None` when the class is empty.
1448
389k
    pub fn minimum_len(&self) -> Option<usize> {
1449
389k
        if self.ranges().is_empty() {
1450
10.9k
            None
1451
        } else {
1452
378k
            Some(1)
1453
        }
1454
389k
    }
1455
1456
    /// Returns the length, in bytes, of the longest string matched by this
1457
    /// character class.
1458
    ///
1459
    /// Returns `None` when the class is empty.
1460
389k
    pub fn maximum_len(&self) -> Option<usize> {
1461
389k
        if self.ranges().is_empty() {
1462
10.9k
            None
1463
        } else {
1464
378k
            Some(1)
1465
        }
1466
389k
    }
1467
1468
    /// If this class consists of exactly one byte, then return it as
1469
    /// a literal byte string.
1470
    ///
1471
    /// If this class is empty or contains more than one byte, then `None`
1472
    /// is returned.
1473
379k
    pub fn literal(&self) -> Option<Vec<u8>> {
1474
379k
        let rs = self.ranges();
1475
379k
        if rs.len() == 1 && rs[0].start == rs[0].end {
1476
1.11k
            Some(vec![rs[0].start])
1477
        } else {
1478
378k
            None
1479
        }
1480
379k
    }
1481
1482
    /// If this class consists of only ASCII ranges, then return its
1483
    /// corresponding and equivalent Unicode class.
1484
2.11k
    pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1485
2.11k
        if !self.is_ascii() {
1486
0
            return None;
1487
2.11k
        }
1488
21.6k
        Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1489
21.6k
            // Since we are guaranteed that our byte range is ASCII, the
1490
21.6k
            // 'char::from' calls below are correct and will not erroneously
1491
21.6k
            // convert a raw byte value into its corresponding codepoint.
1492
21.6k
            ClassUnicodeRange {
1493
21.6k
                start: char::from(r.start),
1494
21.6k
                end: char::from(r.end),
1495
21.6k
            }
1496
21.6k
        })))
1497
2.11k
    }
1498
}
1499
1500
/// An iterator over all ranges in a byte character class.
1501
///
1502
/// The lifetime `'a` refers to the lifetime of the underlying class.
1503
#[derive(Debug)]
1504
pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1505
1506
impl<'a> Iterator for ClassBytesIter<'a> {
1507
    type Item = &'a ClassBytesRange;
1508
1509
14.6M
    fn next(&mut self) -> Option<&'a ClassBytesRange> {
1510
14.6M
        self.0.next()
1511
14.6M
    }
1512
}
1513
1514
/// A single range of characters represented by arbitrary bytes.
1515
///
1516
/// The range is closed. That is, the start and end of the range are included
1517
/// in the range.
1518
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1519
pub struct ClassBytesRange {
1520
    start: u8,
1521
    end: u8,
1522
}
1523
1524
impl Interval for ClassBytesRange {
1525
    type Bound = u8;
1526
1527
    #[inline]
1528
572M
    fn lower(&self) -> u8 {
1529
572M
        self.start
1530
572M
    }
1531
    #[inline]
1532
573M
    fn upper(&self) -> u8 {
1533
573M
        self.end
1534
573M
    }
1535
    #[inline]
1536
66.6M
    fn set_lower(&mut self, bound: u8) {
1537
66.6M
        self.start = bound;
1538
66.6M
    }
1539
    #[inline]
1540
66.6M
    fn set_upper(&mut self, bound: u8) {
1541
66.6M
        self.end = bound;
1542
66.6M
    }
1543
1544
    /// Apply simple case folding to this byte range. Only ASCII case mappings
1545
    /// (for a-z) are applied.
1546
    ///
1547
    /// Additional ranges are appended to the given vector. Canonical ordering
1548
    /// is *not* maintained in the given vector.
1549
17.0M
    fn case_fold_simple(
1550
17.0M
        &self,
1551
17.0M
        ranges: &mut Vec<ClassBytesRange>,
1552
17.0M
    ) -> Result<(), unicode::CaseFoldError> {
1553
17.0M
        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1554
4.92M
            let lower = cmp::max(self.start, b'a');
1555
4.92M
            let upper = cmp::min(self.end, b'z');
1556
4.92M
            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1557
12.1M
        }
1558
17.0M
        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1559
5.12M
            let lower = cmp::max(self.start, b'A');
1560
5.12M
            let upper = cmp::min(self.end, b'Z');
1561
5.12M
            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1562
11.9M
        }
1563
17.0M
        Ok(())
1564
17.0M
    }
1565
}
1566
1567
impl ClassBytesRange {
1568
    /// Create a new byte range for a character class.
1569
    ///
1570
    /// The returned range is always in a canonical form. That is, the range
1571
    /// returned always satisfies the invariant that `start <= end`.
1572
50.6M
    pub fn new(start: u8, end: u8) -> ClassBytesRange {
1573
50.6M
        ClassBytesRange::create(start, end)
1574
50.6M
    }
1575
1576
    /// Return the start of this range.
1577
    ///
1578
    /// The start of a range is always less than or equal to the end of the
1579
    /// range.
1580
11.9M
    pub fn start(&self) -> u8 {
1581
11.9M
        self.start
1582
11.9M
    }
1583
1584
    /// Return the end of this range.
1585
    ///
1586
    /// The end of a range is always greater than or equal to the start of the
1587
    /// range.
1588
11.9M
    pub fn end(&self) -> u8 {
1589
11.9M
        self.end
1590
11.9M
    }
1591
1592
    /// Returns the number of bytes in this range.
1593
46.0k
    pub fn len(&self) -> usize {
1594
46.0k
        usize::from(self.end.checked_sub(self.start).unwrap())
1595
46.0k
            .checked_add(1)
1596
46.0k
            .unwrap()
1597
46.0k
    }
1598
}
1599
1600
impl core::fmt::Debug for ClassBytesRange {
1601
0
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1602
0
        f.debug_struct("ClassBytesRange")
1603
0
            .field("start", &crate::debug::Byte(self.start))
1604
0
            .field("end", &crate::debug::Byte(self.end))
1605
0
            .finish()
1606
0
    }
1607
}
1608
1609
/// The high-level intermediate representation for a look-around assertion.
1610
///
1611
/// An assertion match is always zero-length. Also called an "empty match."
1612
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1613
pub enum Look {
1614
    /// Match the beginning of text. Specifically, this matches at the starting
1615
    /// position of the input.
1616
    Start = 1 << 0,
1617
    /// Match the end of text. Specifically, this matches at the ending
1618
    /// position of the input.
1619
    End = 1 << 1,
1620
    /// Match the beginning of a line or the beginning of text. Specifically,
1621
    /// this matches at the starting position of the input, or at the position
1622
    /// immediately following a `\n` character.
1623
    StartLF = 1 << 2,
1624
    /// Match the end of a line or the end of text. Specifically, this matches
1625
    /// at the end position of the input, or at the position immediately
1626
    /// preceding a `\n` character.
1627
    EndLF = 1 << 3,
1628
    /// Match the beginning of a line or the beginning of text. Specifically,
1629
    /// this matches at the starting position of the input, or at the position
1630
    /// immediately following either a `\r` or `\n` character, but never after
1631
    /// a `\r` when a `\n` follows.
1632
    StartCRLF = 1 << 4,
1633
    /// Match the end of a line or the end of text. Specifically, this matches
1634
    /// at the end position of the input, or at the position immediately
1635
    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1636
    /// precedes it.
1637
    EndCRLF = 1 << 5,
1638
    /// Match an ASCII-only word boundary. That is, this matches a position
1639
    /// where the left adjacent character and right adjacent character
1640
    /// correspond to a word and non-word or a non-word and word character.
1641
    WordAscii = 1 << 6,
1642
    /// Match an ASCII-only negation of a word boundary.
1643
    WordAsciiNegate = 1 << 7,
1644
    /// Match a Unicode-aware word boundary. That is, this matches a position
1645
    /// where the left adjacent character and right adjacent character
1646
    /// correspond to a word and non-word or a non-word and word character.
1647
    WordUnicode = 1 << 8,
1648
    /// Match a Unicode-aware negation of a word boundary.
1649
    WordUnicodeNegate = 1 << 9,
1650
    /// Match the start of an ASCII-only word boundary. That is, this matches a
1651
    /// position at either the beginning of the haystack or where the previous
1652
    /// character is not a word character and the following character is a word
1653
    /// character.
1654
    WordStartAscii = 1 << 10,
1655
    /// Match the end of an ASCII-only word boundary. That is, this matches
1656
    /// a position at either the end of the haystack or where the previous
1657
    /// character is a word character and the following character is not a word
1658
    /// character.
1659
    WordEndAscii = 1 << 11,
1660
    /// Match the start of a Unicode word boundary. That is, this matches a
1661
    /// position at either the beginning of the haystack or where the previous
1662
    /// character is not a word character and the following character is a word
1663
    /// character.
1664
    WordStartUnicode = 1 << 12,
1665
    /// Match the end of a Unicode word boundary. That is, this matches a
1666
    /// position at either the end of the haystack or where the previous
1667
    /// character is a word character and the following character is not a word
1668
    /// character.
1669
    WordEndUnicode = 1 << 13,
1670
    /// Match the start half of an ASCII-only word boundary. That is, this
1671
    /// matches a position at either the beginning of the haystack or where the
1672
    /// previous character is not a word character.
1673
    WordStartHalfAscii = 1 << 14,
1674
    /// Match the end half of an ASCII-only word boundary. That is, this
1675
    /// matches a position at either the end of the haystack or where the
1676
    /// following character is not a word character.
1677
    WordEndHalfAscii = 1 << 15,
1678
    /// Match the start half of a Unicode word boundary. That is, this matches
1679
    /// a position at either the beginning of the haystack or where the
1680
    /// previous character is not a word character.
1681
    WordStartHalfUnicode = 1 << 16,
1682
    /// Match the end half of a Unicode word boundary. That is, this matches
1683
    /// a position at either the end of the haystack or where the following
1684
    /// character is not a word character.
1685
    WordEndHalfUnicode = 1 << 17,
1686
}
1687
1688
impl Look {
1689
    /// Flip the look-around assertion to its equivalent for reverse searches.
1690
    /// For example, `StartLF` gets translated to `EndLF`.
1691
    ///
1692
    /// Some assertions, such as `WordUnicode`, remain the same since they
1693
    /// match the same positions regardless of the direction of the search.
1694
    #[inline]
1695
0
    pub const fn reversed(self) -> Look {
1696
0
        match self {
1697
0
            Look::Start => Look::End,
1698
0
            Look::End => Look::Start,
1699
0
            Look::StartLF => Look::EndLF,
1700
0
            Look::EndLF => Look::StartLF,
1701
0
            Look::StartCRLF => Look::EndCRLF,
1702
0
            Look::EndCRLF => Look::StartCRLF,
1703
0
            Look::WordAscii => Look::WordAscii,
1704
0
            Look::WordAsciiNegate => Look::WordAsciiNegate,
1705
0
            Look::WordUnicode => Look::WordUnicode,
1706
0
            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1707
0
            Look::WordStartAscii => Look::WordEndAscii,
1708
0
            Look::WordEndAscii => Look::WordStartAscii,
1709
0
            Look::WordStartUnicode => Look::WordEndUnicode,
1710
0
            Look::WordEndUnicode => Look::WordStartUnicode,
1711
0
            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1712
0
            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1713
0
            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1714
0
            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1715
        }
1716
0
    }
1717
1718
    /// Return the underlying representation of this look-around enumeration
1719
    /// as an integer. Giving the return value to the [`Look::from_repr`]
1720
    /// constructor is guaranteed to return the same look-around variant that
1721
    /// one started with within a semver compatible release of this crate.
1722
    #[inline]
1723
2.85M
    pub const fn as_repr(self) -> u32 {
1724
2.85M
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725
2.85M
        // actual int.
1726
2.85M
        self as u32
1727
2.85M
    }
<regex_syntax::hir::Look>::as_repr
Line
Count
Source
1723
56.5k
    pub const fn as_repr(self) -> u32 {
1724
56.5k
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725
56.5k
        // actual int.
1726
56.5k
        self as u32
1727
56.5k
    }
Unexecuted instantiation: <regex_syntax::hir::Look>::as_repr
<regex_syntax::hir::Look>::as_repr
Line
Count
Source
1723
1.12M
    pub const fn as_repr(self) -> u32 {
1724
1.12M
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725
1.12M
        // actual int.
1726
1.12M
        self as u32
1727
1.12M
    }
<regex_syntax::hir::Look>::as_repr
Line
Count
Source
1723
1.67M
    pub const fn as_repr(self) -> u32 {
1724
1.67M
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725
1.67M
        // actual int.
1726
1.67M
        self as u32
1727
1.67M
    }
1728
1729
    /// Given the underlying representation of a `Look` value, return the
1730
    /// corresponding `Look` value if the representation is valid. Otherwise
1731
    /// `None` is returned.
1732
    #[inline]
1733
0
    pub const fn from_repr(repr: u32) -> Option<Look> {
1734
0
        match repr {
1735
0
            0b00_0000_0000_0000_0001 => Some(Look::Start),
1736
0
            0b00_0000_0000_0000_0010 => Some(Look::End),
1737
0
            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1738
0
            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1739
0
            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1740
0
            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1741
0
            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1742
0
            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1743
0
            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1744
0
            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1745
0
            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1746
0
            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1747
0
            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1748
0
            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1749
0
            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1750
0
            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1751
0
            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1752
0
            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1753
0
            _ => None,
1754
        }
1755
0
    }
1756
1757
    /// Returns a convenient single codepoint representation of this
1758
    /// look-around assertion. Each assertion is guaranteed to be represented
1759
    /// by a distinct character.
1760
    ///
1761
    /// This is useful for succinctly representing a look-around assertion in
1762
    /// human friendly but succinct output intended for a programmer working on
1763
    /// regex internals.
1764
    #[inline]
1765
0
    pub const fn as_char(self) -> char {
1766
0
        match self {
1767
0
            Look::Start => 'A',
1768
0
            Look::End => 'z',
1769
0
            Look::StartLF => '^',
1770
0
            Look::EndLF => '$',
1771
0
            Look::StartCRLF => 'r',
1772
0
            Look::EndCRLF => 'R',
1773
0
            Look::WordAscii => 'b',
1774
0
            Look::WordAsciiNegate => 'B',
1775
0
            Look::WordUnicode => '𝛃',
1776
0
            Look::WordUnicodeNegate => '𝚩',
1777
0
            Look::WordStartAscii => '<',
1778
0
            Look::WordEndAscii => '>',
1779
0
            Look::WordStartUnicode => '〈',
1780
0
            Look::WordEndUnicode => '〉',
1781
0
            Look::WordStartHalfAscii => '◁',
1782
0
            Look::WordEndHalfAscii => '▷',
1783
0
            Look::WordStartHalfUnicode => '◀',
1784
0
            Look::WordEndHalfUnicode => '▶',
1785
        }
1786
0
    }
1787
}
1788
1789
/// The high-level intermediate representation for a capturing group.
1790
///
1791
/// A capturing group always has an index and a child expression. It may
1792
/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1793
/// necessary.
1794
///
1795
/// Note that there is no explicit representation of a non-capturing group
1796
/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1797
/// the recursive structure of the `Hir` itself.
1798
#[derive(Clone, Debug, Eq, PartialEq)]
1799
pub struct Capture {
1800
    /// The capture index of the capture.
1801
    pub index: u32,
1802
    /// The name of the capture, if it exists.
1803
    pub name: Option<Box<str>>,
1804
    /// The expression inside the capturing group, which may be empty.
1805
    pub sub: Box<Hir>,
1806
}
1807
1808
/// The high-level intermediate representation of a repetition operator.
1809
///
1810
/// A repetition operator permits the repetition of an arbitrary
1811
/// sub-expression.
1812
#[derive(Clone, Debug, Eq, PartialEq)]
1813
pub struct Repetition {
1814
    /// The minimum range of the repetition.
1815
    ///
1816
    /// Note that special cases like `?`, `+` and `*` all get translated into
1817
    /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1818
    ///
1819
    /// When `min` is zero, this expression can match the empty string
1820
    /// regardless of what its sub-expression is.
1821
    pub min: u32,
1822
    /// The maximum range of the repetition.
1823
    ///
1824
    /// Note that when `max` is `None`, `min` acts as a lower bound but where
1825
    /// there is no upper bound. For something like `x{5}` where the min and
1826
    /// max are equivalent, `min` will be set to `5` and `max` will be set to
1827
    /// `Some(5)`.
1828
    pub max: Option<u32>,
1829
    /// Whether this repetition operator is greedy or not. A greedy operator
1830
    /// will match as much as it can. A non-greedy operator will match as
1831
    /// little as it can.
1832
    ///
1833
    /// Typically, operators are greedy by default and are only non-greedy when
1834
    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1835
    /// not. However, this can be inverted via the `U` "ungreedy" flag.
1836
    pub greedy: bool,
1837
    /// The expression being repeated.
1838
    pub sub: Box<Hir>,
1839
}
1840
1841
impl Repetition {
1842
    /// Returns a new repetition with the same `min`, `max` and `greedy`
1843
    /// values, but with its sub-expression replaced with the one given.
1844
249k
    pub fn with(&self, sub: Hir) -> Repetition {
1845
249k
        Repetition {
1846
249k
            min: self.min,
1847
249k
            max: self.max,
1848
249k
            greedy: self.greedy,
1849
249k
            sub: Box::new(sub),
1850
249k
        }
1851
249k
    }
1852
}
1853
1854
/// A type describing the different flavors of `.`.
1855
///
1856
/// This type is meant to be used with [`Hir::dot`], which is a convenience
1857
/// routine for building HIR values derived from the `.` regex.
1858
#[non_exhaustive]
1859
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1860
pub enum Dot {
1861
    /// Matches the UTF-8 encoding of any Unicode scalar value.
1862
    ///
1863
    /// This is equivalent to `(?su:.)` and also `\p{any}`.
1864
    AnyChar,
1865
    /// Matches any byte value.
1866
    ///
1867
    /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1868
    AnyByte,
1869
    /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1870
    /// `char` given.
1871
    ///
1872
    /// This is equivalent to using `(?u-s:.)` with the line terminator set
1873
    /// to a particular ASCII byte. (Because of peculiarities in the regex
1874
    /// engines, a line terminator must be a single byte. It follows that when
1875
    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1876
    /// value. That is, ti must be ASCII.)
1877
    ///
1878
    /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1879
    /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1880
    AnyCharExcept(char),
1881
    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1882
    ///
1883
    /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1884
    AnyCharExceptLF,
1885
    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1886
    /// and `\n`.
1887
    ///
1888
    /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1889
    AnyCharExceptCRLF,
1890
    /// Matches any byte value except for the `u8` given.
1891
    ///
1892
    /// This is equivalent to using `(?-us:.)` with the line terminator set
1893
    /// to a particular ASCII byte. (Because of peculiarities in the regex
1894
    /// engines, a line terminator must be a single byte. It follows that when
1895
    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1896
    /// value. That is, ti must be ASCII.)
1897
    ///
1898
    /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1899
    /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1900
    AnyByteExcept(u8),
1901
    /// Matches any byte value except for `\n`.
1902
    ///
1903
    /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1904
    AnyByteExceptLF,
1905
    /// Matches any byte value except for `\r` and `\n`.
1906
    ///
1907
    /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1908
    AnyByteExceptCRLF,
1909
}
1910
1911
/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1912
/// space but heap space proportional to the depth of the total `Hir`.
1913
impl Drop for Hir {
1914
37.2M
    fn drop(&mut self) {
1915
        use core::mem;
1916
1917
37.2M
        match *self.kind() {
1918
            HirKind::Empty
1919
            | HirKind::Literal(_)
1920
            | HirKind::Class(_)
1921
35.8M
            | HirKind::Look(_) => return,
1922
104k
            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1923
728k
            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1924
694k
                return
1925
            }
1926
435k
            HirKind::Concat(ref x) if x.is_empty() => return,
1927
97.3k
            HirKind::Alternation(ref x) if x.is_empty() => return,
1928
181k
            _ => {}
1929
181k
        }
1930
181k
1931
181k
        let mut stack = vec![mem::replace(self, Hir::empty())];
1932
17.0M
        while let Some(mut expr) = stack.pop() {
1933
16.8M
            match expr.kind {
1934
                HirKind::Empty
1935
                | HirKind::Literal(_)
1936
                | HirKind::Class(_)
1937
15.6M
                | HirKind::Look(_) => {}
1938
97.5k
                HirKind::Capture(ref mut x) => {
1939
97.5k
                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
1940
97.5k
                }
1941
669k
                HirKind::Repetition(ref mut x) => {
1942
669k
                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
1943
669k
                }
1944
325k
                HirKind::Concat(ref mut x) => {
1945
325k
                    stack.extend(x.drain(..));
1946
325k
                }
1947
63.7k
                HirKind::Alternation(ref mut x) => {
1948
63.7k
                    stack.extend(x.drain(..));
1949
63.7k
                }
1950
            }
1951
        }
1952
37.2M
    }
1953
}
1954
1955
/// A type that collects various properties of an HIR value.
1956
///
1957
/// Properties are always scalar values and represent meta data that is
1958
/// computed inductively on an HIR value. Properties are defined for all
1959
/// HIR values.
1960
///
1961
/// All methods on a `Properties` value take constant time and are meant to
1962
/// be cheap to call.
1963
#[derive(Clone, Debug, Eq, PartialEq)]
1964
pub struct Properties(Box<PropertiesI>);
1965
1966
/// The property definition. It is split out so that we can box it, and
1967
/// there by make `Properties` use less stack size. This is kind-of important
1968
/// because every HIR value has a `Properties` attached to it.
1969
///
1970
/// This does have the unfortunate consequence that creating any HIR value
1971
/// always leads to at least one alloc for properties, but this is generally
1972
/// true anyway (for pretty much all HirKinds except for look-arounds).
1973
#[derive(Clone, Debug, Eq, PartialEq)]
1974
struct PropertiesI {
1975
    minimum_len: Option<usize>,
1976
    maximum_len: Option<usize>,
1977
    look_set: LookSet,
1978
    look_set_prefix: LookSet,
1979
    look_set_suffix: LookSet,
1980
    look_set_prefix_any: LookSet,
1981
    look_set_suffix_any: LookSet,
1982
    utf8: bool,
1983
    explicit_captures_len: usize,
1984
    static_explicit_captures_len: Option<usize>,
1985
    literal: bool,
1986
    alternation_literal: bool,
1987
}
1988
1989
impl Properties {
1990
    /// Returns the length (in bytes) of the smallest string matched by this
1991
    /// HIR.
1992
    ///
1993
    /// A return value of `0` is possible and occurs when the HIR can match an
1994
    /// empty string.
1995
    ///
1996
    /// `None` is returned when there is no minimum length. This occurs in
1997
    /// precisely the cases where the HIR matches nothing. i.e., The language
1998
    /// the regex matches is empty. An example of such a regex is `\P{any}`.
1999
    #[inline]
2000
27.4M
    pub fn minimum_len(&self) -> Option<usize> {
2001
27.4M
        self.0.minimum_len
2002
27.4M
    }
<regex_syntax::hir::Properties>::minimum_len
Line
Count
Source
2000
68.4k
    pub fn minimum_len(&self) -> Option<usize> {
2001
68.4k
        self.0.minimum_len
2002
68.4k
    }
Unexecuted instantiation: <regex_syntax::hir::Properties>::minimum_len
<regex_syntax::hir::Properties>::minimum_len
Line
Count
Source
2000
9.70M
    pub fn minimum_len(&self) -> Option<usize> {
2001
9.70M
        self.0.minimum_len
2002
9.70M
    }
<regex_syntax::hir::Properties>::minimum_len
Line
Count
Source
2000
17.6M
    pub fn minimum_len(&self) -> Option<usize> {
2001
17.6M
        self.0.minimum_len
2002
17.6M
    }
2003
2004
    /// Returns the length (in bytes) of the longest string matched by this
2005
    /// HIR.
2006
    ///
2007
    /// A return value of `0` is possible and occurs when nothing longer than
2008
    /// the empty string is in the language described by this HIR.
2009
    ///
2010
    /// `None` is returned when there is no longest matching string. This
2011
    /// occurs when the HIR matches nothing or when there is no upper bound on
2012
    /// the length of matching strings. Example of such regexes are `\P{any}`
2013
    /// (matches nothing) and `a+` (has no upper bound).
2014
    #[inline]
2015
16.7M
    pub fn maximum_len(&self) -> Option<usize> {
2016
16.7M
        self.0.maximum_len
2017
16.7M
    }
<regex_syntax::hir::Properties>::maximum_len
Line
Count
Source
2015
1
    pub fn maximum_len(&self) -> Option<usize> {
2016
1
        self.0.maximum_len
2017
1
    }
Unexecuted instantiation: <regex_syntax::hir::Properties>::maximum_len
<regex_syntax::hir::Properties>::maximum_len
Line
Count
Source
2015
379k
    pub fn maximum_len(&self) -> Option<usize> {
2016
379k
        self.0.maximum_len
2017
379k
    }
<regex_syntax::hir::Properties>::maximum_len
Line
Count
Source
2015
16.3M
    pub fn maximum_len(&self) -> Option<usize> {
2016
16.3M
        self.0.maximum_len
2017
16.3M
    }
2018
2019
    /// Returns a set of all look-around assertions that appear at least once
2020
    /// in this HIR value.
2021
    #[inline]
2022
18.0M
    pub fn look_set(&self) -> LookSet {
2023
18.0M
        self.0.look_set
2024
18.0M
    }
<regex_syntax::hir::Properties>::look_set
Line
Count
Source
2022
343k
    pub fn look_set(&self) -> LookSet {
2023
343k
        self.0.look_set
2024
343k
    }
<regex_syntax::hir::Properties>::look_set
Line
Count
Source
2022
17.7M
    pub fn look_set(&self) -> LookSet {
2023
17.7M
        self.0.look_set
2024
17.7M
    }
2025
2026
    /// Returns a set of all look-around assertions that appear as a prefix for
2027
    /// this HIR value. That is, the set returned corresponds to the set of
2028
    /// assertions that must be passed before matching any bytes in a haystack.
2029
    ///
2030
    /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2031
    /// if and only if the HIR is fully anchored at the start.
2032
    #[inline]
2033
12.0M
    pub fn look_set_prefix(&self) -> LookSet {
2034
12.0M
        self.0.look_set_prefix
2035
12.0M
    }
<regex_syntax::hir::Properties>::look_set_prefix
Line
Count
Source
2033
56.5k
    pub fn look_set_prefix(&self) -> LookSet {
2034
56.5k
        self.0.look_set_prefix
2035
56.5k
    }
Unexecuted instantiation: <regex_syntax::hir::Properties>::look_set_prefix
<regex_syntax::hir::Properties>::look_set_prefix
Line
Count
Source
2033
538k
    pub fn look_set_prefix(&self) -> LookSet {
2034
538k
        self.0.look_set_prefix
2035
538k
    }
<regex_syntax::hir::Properties>::look_set_prefix
Line
Count
Source
2033
11.4M
    pub fn look_set_prefix(&self) -> LookSet {
2034
11.4M
        self.0.look_set_prefix
2035
11.4M
    }
2036
2037
    /// Returns a set of all look-around assertions that appear as a _possible_
2038
    /// prefix for this HIR value. That is, the set returned corresponds to the
2039
    /// set of assertions that _may_ be passed before matching any bytes in a
2040
    /// haystack.
2041
    ///
2042
    /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2043
    /// true if and only if it's possible for the regex to match through a
2044
    /// anchored assertion before consuming any input.
2045
    #[inline]
2046
12.2M
    pub fn look_set_prefix_any(&self) -> LookSet {
2047
12.2M
        self.0.look_set_prefix_any
2048
12.2M
    }
<regex_syntax::hir::Properties>::look_set_prefix_any
Line
Count
Source
2046
129k
    pub fn look_set_prefix_any(&self) -> LookSet {
2047
129k
        self.0.look_set_prefix_any
2048
129k
    }
<regex_syntax::hir::Properties>::look_set_prefix_any
Line
Count
Source
2046
12.1M
    pub fn look_set_prefix_any(&self) -> LookSet {
2047
12.1M
        self.0.look_set_prefix_any
2048
12.1M
    }
2049
2050
    /// Returns a set of all look-around assertions that appear as a suffix for
2051
    /// this HIR value. That is, the set returned corresponds to the set of
2052
    /// assertions that must be passed in order to be considered a match after
2053
    /// all other consuming HIR expressions.
2054
    ///
2055
    /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2056
    /// if and only if the HIR is fully anchored at the end.
2057
    #[inline]
2058
11.7M
    pub fn look_set_suffix(&self) -> LookSet {
2059
11.7M
        self.0.look_set_suffix
2060
11.7M
    }
<regex_syntax::hir::Properties>::look_set_suffix
Line
Count
Source
2058
4
    pub fn look_set_suffix(&self) -> LookSet {
2059
4
        self.0.look_set_suffix
2060
4
    }
Unexecuted instantiation: <regex_syntax::hir::Properties>::look_set_suffix
<regex_syntax::hir::Properties>::look_set_suffix
Line
Count
Source
2058
355k
    pub fn look_set_suffix(&self) -> LookSet {
2059
355k
        self.0.look_set_suffix
2060
355k
    }
<regex_syntax::hir::Properties>::look_set_suffix
Line
Count
Source
2058
11.4M
    pub fn look_set_suffix(&self) -> LookSet {
2059
11.4M
        self.0.look_set_suffix
2060
11.4M
    }
2061
2062
    /// Returns a set of all look-around assertions that appear as a _possible_
2063
    /// suffix for this HIR value. That is, the set returned corresponds to the
2064
    /// set of assertions that _may_ be passed before matching any bytes in a
2065
    /// haystack.
2066
    ///
2067
    /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2068
    /// true if and only if it's possible for the regex to match through a
2069
    /// anchored assertion at the end of a match without consuming any input.
2070
    #[inline]
2071
12.2M
    pub fn look_set_suffix_any(&self) -> LookSet {
2072
12.2M
        self.0.look_set_suffix_any
2073
12.2M
    }
<regex_syntax::hir::Properties>::look_set_suffix_any
Line
Count
Source
2071
129k
    pub fn look_set_suffix_any(&self) -> LookSet {
2072
129k
        self.0.look_set_suffix_any
2073
129k
    }
<regex_syntax::hir::Properties>::look_set_suffix_any
Line
Count
Source
2071
12.0M
    pub fn look_set_suffix_any(&self) -> LookSet {
2072
12.0M
        self.0.look_set_suffix_any
2073
12.0M
    }
2074
2075
    /// Return true if and only if the corresponding HIR will always match
2076
    /// valid UTF-8.
2077
    ///
2078
    /// When this returns false, then it is possible for this HIR expression to
2079
    /// match invalid UTF-8, including by matching between the code units of
2080
    /// a single UTF-8 encoded codepoint.
2081
    ///
2082
    /// Note that this returns true even when the corresponding HIR can match
2083
    /// the empty string. Since an empty string can technically appear between
2084
    /// UTF-8 code units, it is possible for a match to be reported that splits
2085
    /// a codepoint which could in turn be considered matching invalid UTF-8.
2086
    /// However, it is generally assumed that such empty matches are handled
2087
    /// specially by the search routine if it is absolutely required that
2088
    /// matches not split a codepoint.
2089
    ///
2090
    /// # Example
2091
    ///
2092
    /// This code example shows the UTF-8 property of a variety of patterns.
2093
    ///
2094
    /// ```
2095
    /// use regex_syntax::{ParserBuilder, parse};
2096
    ///
2097
    /// // Examples of 'is_utf8() == true'.
2098
    /// assert!(parse(r"a")?.properties().is_utf8());
2099
    /// assert!(parse(r"[^a]")?.properties().is_utf8());
2100
    /// assert!(parse(r".")?.properties().is_utf8());
2101
    /// assert!(parse(r"\W")?.properties().is_utf8());
2102
    /// assert!(parse(r"\b")?.properties().is_utf8());
2103
    /// assert!(parse(r"\B")?.properties().is_utf8());
2104
    /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2105
    /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2106
    /// // Unicode mode is enabled by default, and in
2107
    /// // that mode, all \x hex escapes are treated as
2108
    /// // codepoints. So this actually matches the UTF-8
2109
    /// // encoding of U+00FF.
2110
    /// assert!(parse(r"\xFF")?.properties().is_utf8());
2111
    ///
2112
    /// // Now we show examples of 'is_utf8() == false'.
2113
    /// // The only way to do this is to force the parser
2114
    /// // to permit invalid UTF-8, otherwise all of these
2115
    /// // would fail to parse!
2116
    /// let parse = |pattern| {
2117
    ///     ParserBuilder::new().utf8(false).build().parse(pattern)
2118
    /// };
2119
    /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2120
    /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2121
    /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2122
    /// // Conversely to the equivalent example above,
2123
    /// // when Unicode mode is disabled, \x hex escapes
2124
    /// // are treated as their raw byte values.
2125
    /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2126
    /// // Note that just because we disabled UTF-8 in the
2127
    /// // parser doesn't mean we still can't use Unicode.
2128
    /// // It is enabled by default, so \xFF is still
2129
    /// // equivalent to matching the UTF-8 encoding of
2130
    /// // U+00FF by default.
2131
    /// assert!(parse(r"\xFF")?.properties().is_utf8());
2132
    /// // Even though we use raw bytes that individually
2133
    /// // are not valid UTF-8, when combined together, the
2134
    /// // overall expression *does* match valid UTF-8!
2135
    /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2136
    ///
2137
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2138
    /// ```
2139
    #[inline]
2140
17.8M
    pub fn is_utf8(&self) -> bool {
2141
17.8M
        self.0.utf8
2142
17.8M
    }
<regex_syntax::hir::Properties>::is_utf8
Line
Count
Source
2140
129k
    pub fn is_utf8(&self) -> bool {
2141
129k
        self.0.utf8
2142
129k
    }
<regex_syntax::hir::Properties>::is_utf8
Line
Count
Source
2140
17.7M
    pub fn is_utf8(&self) -> bool {
2141
17.7M
        self.0.utf8
2142
17.7M
    }
2143
2144
    /// Returns the total number of explicit capturing groups in the
2145
    /// corresponding HIR.
2146
    ///
2147
    /// Note that this does not include the implicit capturing group
2148
    /// corresponding to the entire match that is typically included by regex
2149
    /// engines.
2150
    ///
2151
    /// # Example
2152
    ///
2153
    /// This method will return `0` for `a` and `1` for `(a)`:
2154
    ///
2155
    /// ```
2156
    /// use regex_syntax::parse;
2157
    ///
2158
    /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2159
    /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2160
    ///
2161
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2162
    /// ```
2163
    #[inline]
2164
18.1M
    pub fn explicit_captures_len(&self) -> usize {
2165
18.1M
        self.0.explicit_captures_len
2166
18.1M
    }
<regex_syntax::hir::Properties>::explicit_captures_len
Line
Count
Source
2164
347k
    pub fn explicit_captures_len(&self) -> usize {
2165
347k
        self.0.explicit_captures_len
2166
347k
    }
<regex_syntax::hir::Properties>::explicit_captures_len
Line
Count
Source
2164
17.8M
    pub fn explicit_captures_len(&self) -> usize {
2165
17.8M
        self.0.explicit_captures_len
2166
17.8M
    }
2167
2168
    /// Returns the total number of explicit capturing groups that appear in
2169
    /// every possible match.
2170
    ///
2171
    /// If the number of capture groups can vary depending on the match, then
2172
    /// this returns `None`. That is, a value is only returned when the number
2173
    /// of matching groups is invariant or "static."
2174
    ///
2175
    /// Note that this does not include the implicit capturing group
2176
    /// corresponding to the entire match.
2177
    ///
2178
    /// # Example
2179
    ///
2180
    /// This shows a few cases where a static number of capture groups is
2181
    /// available and a few cases where it is not.
2182
    ///
2183
    /// ```
2184
    /// use regex_syntax::parse;
2185
    ///
2186
    /// let len = |pattern| {
2187
    ///     parse(pattern).map(|h| {
2188
    ///         h.properties().static_explicit_captures_len()
2189
    ///     })
2190
    /// };
2191
    ///
2192
    /// assert_eq!(Some(0), len("a")?);
2193
    /// assert_eq!(Some(1), len("(a)")?);
2194
    /// assert_eq!(Some(1), len("(a)|(b)")?);
2195
    /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2196
    /// assert_eq!(None, len("(a)|b")?);
2197
    /// assert_eq!(None, len("a|(b)")?);
2198
    /// assert_eq!(None, len("(b)*")?);
2199
    /// assert_eq!(Some(1), len("(b)+")?);
2200
    ///
2201
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2202
    /// ```
2203
    #[inline]
2204
18.1M
    pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205
18.1M
        self.0.static_explicit_captures_len
2206
18.1M
    }
Unexecuted instantiation: <regex_syntax::hir::Properties>::static_explicit_captures_len
Unexecuted instantiation: <regex_syntax::hir::Properties>::static_explicit_captures_len
<regex_syntax::hir::Properties>::static_explicit_captures_len
Line
Count
Source
2204
259k
    pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205
259k
        self.0.static_explicit_captures_len
2206
259k
    }
<regex_syntax::hir::Properties>::static_explicit_captures_len
Line
Count
Source
2204
17.8M
    pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205
17.8M
        self.0.static_explicit_captures_len
2206
17.8M
    }
2207
2208
    /// Return true if and only if this HIR is a simple literal. This is
2209
    /// only true when this HIR expression is either itself a `Literal` or a
2210
    /// concatenation of only `Literal`s.
2211
    ///
2212
    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2213
    /// the empty string are not (even though they contain sub-expressions that
2214
    /// are literals).
2215
    #[inline]
2216
9.08M
    pub fn is_literal(&self) -> bool {
2217
9.08M
        self.0.literal
2218
9.08M
    }
<regex_syntax::hir::Properties>::is_literal
Line
Count
Source
2216
129k
    pub fn is_literal(&self) -> bool {
2217
129k
        self.0.literal
2218
129k
    }
<regex_syntax::hir::Properties>::is_literal
Line
Count
Source
2216
8.95M
    pub fn is_literal(&self) -> bool {
2217
8.95M
        self.0.literal
2218
8.95M
    }
2219
2220
    /// Return true if and only if this HIR is either a simple literal or an
2221
    /// alternation of simple literals. This is only
2222
    /// true when this HIR expression is either itself a `Literal` or a
2223
    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2224
    ///
2225
    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2226
    /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2227
    /// (even though that contain sub-expressions that are literals).
2228
    #[inline]
2229
552k
    pub fn is_alternation_literal(&self) -> bool {
2230
552k
        self.0.alternation_literal
2231
552k
    }
<regex_syntax::hir::Properties>::is_alternation_literal
Line
Count
Source
2229
69.3k
    pub fn is_alternation_literal(&self) -> bool {
2230
69.3k
        self.0.alternation_literal
2231
69.3k
    }
<regex_syntax::hir::Properties>::is_alternation_literal
Line
Count
Source
2229
483k
    pub fn is_alternation_literal(&self) -> bool {
2230
483k
        self.0.alternation_literal
2231
483k
    }
2232
2233
    /// Returns the total amount of heap memory usage, in bytes, used by this
2234
    /// `Properties` value.
2235
    #[inline]
2236
0
    pub fn memory_usage(&self) -> usize {
2237
0
        core::mem::size_of::<PropertiesI>()
2238
0
    }
Unexecuted instantiation: <regex_syntax::hir::Properties>::memory_usage
Unexecuted instantiation: <regex_syntax::hir::Properties>::memory_usage
2239
2240
    /// Returns a new set of properties that corresponds to the union of the
2241
    /// iterator of properties given.
2242
    ///
2243
    /// This is useful when one has multiple `Hir` expressions and wants
2244
    /// to combine them into a single alternation without constructing the
2245
    /// corresponding `Hir`. This routine provides a way of combining the
2246
    /// properties of each `Hir` expression into one set of properties
2247
    /// representing the union of those expressions.
2248
    ///
2249
    /// # Example: union with HIRs that never match
2250
    ///
2251
    /// This example shows that unioning properties together with one that
2252
    /// represents a regex that never matches will "poison" certain attributes,
2253
    /// like the minimum and maximum lengths.
2254
    ///
2255
    /// ```
2256
    /// use regex_syntax::{hir::Properties, parse};
2257
    ///
2258
    /// let hir1 = parse("ab?c?")?;
2259
    /// assert_eq!(Some(1), hir1.properties().minimum_len());
2260
    /// assert_eq!(Some(3), hir1.properties().maximum_len());
2261
    ///
2262
    /// let hir2 = parse(r"[a&&b]")?;
2263
    /// assert_eq!(None, hir2.properties().minimum_len());
2264
    /// assert_eq!(None, hir2.properties().maximum_len());
2265
    ///
2266
    /// let hir3 = parse(r"wxy?z?")?;
2267
    /// assert_eq!(Some(2), hir3.properties().minimum_len());
2268
    /// assert_eq!(Some(4), hir3.properties().maximum_len());
2269
    ///
2270
    /// let unioned = Properties::union([
2271
    ///   hir1.properties(),
2272
    ///   hir2.properties(),
2273
    ///   hir3.properties(),
2274
    /// ]);
2275
    /// assert_eq!(None, unioned.minimum_len());
2276
    /// assert_eq!(None, unioned.maximum_len());
2277
    ///
2278
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2279
    /// ```
2280
    ///
2281
    /// The maximum length can also be "poisoned" by a pattern that has no
2282
    /// upper bound on the length of a match. The minimum length remains
2283
    /// unaffected:
2284
    ///
2285
    /// ```
2286
    /// use regex_syntax::{hir::Properties, parse};
2287
    ///
2288
    /// let hir1 = parse("ab?c?")?;
2289
    /// assert_eq!(Some(1), hir1.properties().minimum_len());
2290
    /// assert_eq!(Some(3), hir1.properties().maximum_len());
2291
    ///
2292
    /// let hir2 = parse(r"a+")?;
2293
    /// assert_eq!(Some(1), hir2.properties().minimum_len());
2294
    /// assert_eq!(None, hir2.properties().maximum_len());
2295
    ///
2296
    /// let hir3 = parse(r"wxy?z?")?;
2297
    /// assert_eq!(Some(2), hir3.properties().minimum_len());
2298
    /// assert_eq!(Some(4), hir3.properties().maximum_len());
2299
    ///
2300
    /// let unioned = Properties::union([
2301
    ///   hir1.properties(),
2302
    ///   hir2.properties(),
2303
    ///   hir3.properties(),
2304
    /// ]);
2305
    /// assert_eq!(Some(1), unioned.minimum_len());
2306
    /// assert_eq!(None, unioned.maximum_len());
2307
    ///
2308
    /// # Ok::<(), Box<dyn std::error::Error>>(())
2309
    /// ```
2310
194k
    pub fn union<I, P>(props: I) -> Properties
2311
194k
    where
2312
194k
        I: IntoIterator<Item = P>,
2313
194k
        P: core::borrow::Borrow<Properties>,
2314
194k
    {
2315
194k
        let mut it = props.into_iter().peekable();
2316
        // While empty alternations aren't possible, we still behave as if they
2317
        // are. When we have an empty alternate, then clearly the look-around
2318
        // prefix and suffix is empty. Otherwise, it is the intersection of all
2319
        // prefixes and suffixes (respectively) of the branches.
2320
194k
        let fix = if it.peek().is_none() {
2321
0
            LookSet::empty()
2322
        } else {
2323
194k
            LookSet::full()
2324
        };
2325
        // And also, an empty alternate means we have 0 static capture groups,
2326
        // but we otherwise start with the number corresponding to the first
2327
        // alternate. If any subsequent alternate has a different number of
2328
        // static capture groups, then we overall have a variation and not a
2329
        // static number of groups.
2330
194k
        let static_explicit_captures_len =
2331
194k
            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
<regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, &regex_syntax::hir::Properties>::{closure#0}
Line
Count
Source
2331
129k
            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
<regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, &regex_syntax::hir::Properties>::{closure#0}
Line
Count
Source
2331
65.1k
            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2332
194k
        // The base case is an empty alternation, which matches nothing.
2333
194k
        // Note though that empty alternations aren't possible, because the
2334
194k
        // Hir::alternation smart constructor rewrites those as empty character
2335
194k
        // classes.
2336
194k
        let mut props = PropertiesI {
2337
194k
            minimum_len: None,
2338
194k
            maximum_len: None,
2339
194k
            look_set: LookSet::empty(),
2340
194k
            look_set_prefix: fix,
2341
194k
            look_set_suffix: fix,
2342
194k
            look_set_prefix_any: LookSet::empty(),
2343
194k
            look_set_suffix_any: LookSet::empty(),
2344
194k
            utf8: true,
2345
194k
            explicit_captures_len: 0,
2346
194k
            static_explicit_captures_len,
2347
194k
            literal: false,
2348
194k
            alternation_literal: true,
2349
194k
        };
2350
194k
        let (mut min_poisoned, mut max_poisoned) = (false, false);
2351
        // Handle properties that need to visit every child hir.
2352
11.2M
        for prop in it {
2353
11.0M
            let p = prop.borrow();
2354
11.0M
            props.look_set.set_union(p.look_set());
2355
11.0M
            props.look_set_prefix.set_intersect(p.look_set_prefix());
2356
11.0M
            props.look_set_suffix.set_intersect(p.look_set_suffix());
2357
11.0M
            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358
11.0M
            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359
11.0M
            props.utf8 = props.utf8 && p.is_utf8();
2360
11.0M
            props.explicit_captures_len = props
2361
11.0M
                .explicit_captures_len
2362
11.0M
                .saturating_add(p.explicit_captures_len());
2363
11.0M
            if props.static_explicit_captures_len
2364
11.0M
                != p.static_explicit_captures_len()
2365
134k
            {
2366
134k
                props.static_explicit_captures_len = None;
2367
10.9M
            }
2368
            props.alternation_literal =
2369
11.0M
                props.alternation_literal && p.is_literal();
2370
11.0M
            if !min_poisoned {
2371
11.0M
                if let Some(xmin) = p.minimum_len() {
2372
11.0M
                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
Unexecuted instantiation: <regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, &regex_syntax::hir::Properties>::{closure#1}
<regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, &regex_syntax::hir::Properties>::{closure#1}
Line
Count
Source
2372
10.8M
                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2373
223k
                        props.minimum_len = Some(xmin);
2374
10.8M
                    }
2375
2.05k
                } else {
2376
2.05k
                    props.minimum_len = None;
2377
2.05k
                    min_poisoned = true;
2378
2.05k
                }
2379
10.0k
            }
2380
11.0M
            if !max_poisoned {
2381
9.48M
                if let Some(xmax) = p.maximum_len() {
2382
9.42M
                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
Unexecuted instantiation: <regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, &regex_syntax::hir::Properties>::{closure#2}
<regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, &regex_syntax::hir::Properties>::{closure#2}
Line
Count
Source
2382
9.29M
                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2383
168k
                        props.maximum_len = Some(xmax);
2384
9.26M
                    }
2385
60.8k
                } else {
2386
60.8k
                    props.maximum_len = None;
2387
60.8k
                    max_poisoned = true;
2388
60.8k
                }
2389
1.60M
            }
2390
        }
2391
194k
        Properties(Box::new(props))
2392
194k
    }
<regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, &regex_syntax::hir::Properties>
Line
Count
Source
2310
129k
    pub fn union<I, P>(props: I) -> Properties
2311
129k
    where
2312
129k
        I: IntoIterator<Item = P>,
2313
129k
        P: core::borrow::Borrow<Properties>,
2314
129k
    {
2315
129k
        let mut it = props.into_iter().peekable();
2316
        // While empty alternations aren't possible, we still behave as if they
2317
        // are. When we have an empty alternate, then clearly the look-around
2318
        // prefix and suffix is empty. Otherwise, it is the intersection of all
2319
        // prefixes and suffixes (respectively) of the branches.
2320
129k
        let fix = if it.peek().is_none() {
2321
0
            LookSet::empty()
2322
        } else {
2323
129k
            LookSet::full()
2324
        };
2325
        // And also, an empty alternate means we have 0 static capture groups,
2326
        // but we otherwise start with the number corresponding to the first
2327
        // alternate. If any subsequent alternate has a different number of
2328
        // static capture groups, then we overall have a variation and not a
2329
        // static number of groups.
2330
129k
        let static_explicit_captures_len =
2331
129k
            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2332
129k
        // The base case is an empty alternation, which matches nothing.
2333
129k
        // Note though that empty alternations aren't possible, because the
2334
129k
        // Hir::alternation smart constructor rewrites those as empty character
2335
129k
        // classes.
2336
129k
        let mut props = PropertiesI {
2337
129k
            minimum_len: None,
2338
129k
            maximum_len: None,
2339
129k
            look_set: LookSet::empty(),
2340
129k
            look_set_prefix: fix,
2341
129k
            look_set_suffix: fix,
2342
129k
            look_set_prefix_any: LookSet::empty(),
2343
129k
            look_set_suffix_any: LookSet::empty(),
2344
129k
            utf8: true,
2345
129k
            explicit_captures_len: 0,
2346
129k
            static_explicit_captures_len,
2347
129k
            literal: false,
2348
129k
            alternation_literal: true,
2349
129k
        };
2350
129k
        let (mut min_poisoned, mut max_poisoned) = (false, false);
2351
        // Handle properties that need to visit every child hir.
2352
259k
        for prop in it {
2353
129k
            let p = prop.borrow();
2354
129k
            props.look_set.set_union(p.look_set());
2355
129k
            props.look_set_prefix.set_intersect(p.look_set_prefix());
2356
129k
            props.look_set_suffix.set_intersect(p.look_set_suffix());
2357
129k
            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358
129k
            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359
129k
            props.utf8 = props.utf8 && p.is_utf8();
2360
129k
            props.explicit_captures_len = props
2361
129k
                .explicit_captures_len
2362
129k
                .saturating_add(p.explicit_captures_len());
2363
129k
            if props.static_explicit_captures_len
2364
129k
                != p.static_explicit_captures_len()
2365
0
            {
2366
0
                props.static_explicit_captures_len = None;
2367
129k
            }
2368
            props.alternation_literal =
2369
129k
                props.alternation_literal && p.is_literal();
2370
129k
            if !min_poisoned {
2371
129k
                if let Some(xmin) = p.minimum_len() {
2372
128k
                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2373
128k
                        props.minimum_len = Some(xmin);
2374
128k
                    }
2375
1.47k
                } else {
2376
1.47k
                    props.minimum_len = None;
2377
1.47k
                    min_poisoned = true;
2378
1.47k
                }
2379
0
            }
2380
129k
            if !max_poisoned {
2381
129k
                if let Some(xmax) = p.maximum_len() {
2382
76.3k
                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2383
76.3k
                        props.maximum_len = Some(xmax);
2384
76.3k
                    }
2385
53.2k
                } else {
2386
53.2k
                    props.maximum_len = None;
2387
53.2k
                    max_poisoned = true;
2388
53.2k
                }
2389
0
            }
2390
        }
2391
129k
        Properties(Box::new(props))
2392
129k
    }
<regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, &regex_syntax::hir::Properties>
Line
Count
Source
2310
65.1k
    pub fn union<I, P>(props: I) -> Properties
2311
65.1k
    where
2312
65.1k
        I: IntoIterator<Item = P>,
2313
65.1k
        P: core::borrow::Borrow<Properties>,
2314
65.1k
    {
2315
65.1k
        let mut it = props.into_iter().peekable();
2316
        // While empty alternations aren't possible, we still behave as if they
2317
        // are. When we have an empty alternate, then clearly the look-around
2318
        // prefix and suffix is empty. Otherwise, it is the intersection of all
2319
        // prefixes and suffixes (respectively) of the branches.
2320
65.1k
        let fix = if it.peek().is_none() {
2321
0
            LookSet::empty()
2322
        } else {
2323
65.1k
            LookSet::full()
2324
        };
2325
        // And also, an empty alternate means we have 0 static capture groups,
2326
        // but we otherwise start with the number corresponding to the first
2327
        // alternate. If any subsequent alternate has a different number of
2328
        // static capture groups, then we overall have a variation and not a
2329
        // static number of groups.
2330
65.1k
        let static_explicit_captures_len =
2331
65.1k
            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2332
65.1k
        // The base case is an empty alternation, which matches nothing.
2333
65.1k
        // Note though that empty alternations aren't possible, because the
2334
65.1k
        // Hir::alternation smart constructor rewrites those as empty character
2335
65.1k
        // classes.
2336
65.1k
        let mut props = PropertiesI {
2337
65.1k
            minimum_len: None,
2338
65.1k
            maximum_len: None,
2339
65.1k
            look_set: LookSet::empty(),
2340
65.1k
            look_set_prefix: fix,
2341
65.1k
            look_set_suffix: fix,
2342
65.1k
            look_set_prefix_any: LookSet::empty(),
2343
65.1k
            look_set_suffix_any: LookSet::empty(),
2344
65.1k
            utf8: true,
2345
65.1k
            explicit_captures_len: 0,
2346
65.1k
            static_explicit_captures_len,
2347
65.1k
            literal: false,
2348
65.1k
            alternation_literal: true,
2349
65.1k
        };
2350
65.1k
        let (mut min_poisoned, mut max_poisoned) = (false, false);
2351
        // Handle properties that need to visit every child hir.
2352
11.0M
        for prop in it {
2353
10.9M
            let p = prop.borrow();
2354
10.9M
            props.look_set.set_union(p.look_set());
2355
10.9M
            props.look_set_prefix.set_intersect(p.look_set_prefix());
2356
10.9M
            props.look_set_suffix.set_intersect(p.look_set_suffix());
2357
10.9M
            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358
10.9M
            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359
10.9M
            props.utf8 = props.utf8 && p.is_utf8();
2360
10.9M
            props.explicit_captures_len = props
2361
10.9M
                .explicit_captures_len
2362
10.9M
                .saturating_add(p.explicit_captures_len());
2363
10.9M
            if props.static_explicit_captures_len
2364
10.9M
                != p.static_explicit_captures_len()
2365
134k
            {
2366
134k
                props.static_explicit_captures_len = None;
2367
10.8M
            }
2368
            props.alternation_literal =
2369
10.9M
                props.alternation_literal && p.is_literal();
2370
10.9M
            if !min_poisoned {
2371
10.9M
                if let Some(xmin) = p.minimum_len() {
2372
10.9M
                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2373
95.0k
                        props.minimum_len = Some(xmin);
2374
10.8M
                    }
2375
575
                } else {
2376
575
                    props.minimum_len = None;
2377
575
                    min_poisoned = true;
2378
575
                }
2379
10.0k
            }
2380
10.9M
            if !max_poisoned {
2381
9.35M
                if let Some(xmax) = p.maximum_len() {
2382
9.35M
                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2383
91.6k
                        props.maximum_len = Some(xmax);
2384
9.26M
                    }
2385
7.60k
                } else {
2386
7.60k
                    props.maximum_len = None;
2387
7.60k
                    max_poisoned = true;
2388
7.60k
                }
2389
1.60M
            }
2390
        }
2391
65.1k
        Properties(Box::new(props))
2392
65.1k
    }
2393
}
2394
2395
impl Properties {
2396
    /// Create a new set of HIR properties for an empty regex.
2397
21.1M
    fn empty() -> Properties {
2398
21.1M
        let inner = PropertiesI {
2399
21.1M
            minimum_len: Some(0),
2400
21.1M
            maximum_len: Some(0),
2401
21.1M
            look_set: LookSet::empty(),
2402
21.1M
            look_set_prefix: LookSet::empty(),
2403
21.1M
            look_set_suffix: LookSet::empty(),
2404
21.1M
            look_set_prefix_any: LookSet::empty(),
2405
21.1M
            look_set_suffix_any: LookSet::empty(),
2406
21.1M
            // It is debatable whether an empty regex always matches at valid
2407
21.1M
            // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2408
21.1M
            // it is clearly false. There are, for example, many empty strings
2409
21.1M
            // between the bytes encoding a '☃'.
2410
21.1M
            //
2411
21.1M
            // However, when Unicode mode is enabled, the fundamental atom
2412
21.1M
            // of matching is really a codepoint. And in that scenario, an
2413
21.1M
            // empty regex is defined to only match at valid UTF-8 boundaries
2414
21.1M
            // and to never split a codepoint. It just so happens that this
2415
21.1M
            // enforcement is somewhat tricky to do for regexes that match
2416
21.1M
            // the empty string inside regex engines themselves. It usually
2417
21.1M
            // requires some layer above the regex engine to filter out such
2418
21.1M
            // matches.
2419
21.1M
            //
2420
21.1M
            // In any case, 'true' is really the only coherent option. If it
2421
21.1M
            // were false, for example, then 'a*' would also need to be false
2422
21.1M
            // since it too can match the empty string.
2423
21.1M
            utf8: true,
2424
21.1M
            explicit_captures_len: 0,
2425
21.1M
            static_explicit_captures_len: Some(0),
2426
21.1M
            literal: false,
2427
21.1M
            alternation_literal: false,
2428
21.1M
        };
2429
21.1M
        Properties(Box::new(inner))
2430
21.1M
    }
2431
2432
    /// Create a new set of HIR properties for a literal regex.
2433
12.5M
    fn literal(lit: &Literal) -> Properties {
2434
12.5M
        let inner = PropertiesI {
2435
12.5M
            minimum_len: Some(lit.0.len()),
2436
12.5M
            maximum_len: Some(lit.0.len()),
2437
12.5M
            look_set: LookSet::empty(),
2438
12.5M
            look_set_prefix: LookSet::empty(),
2439
12.5M
            look_set_suffix: LookSet::empty(),
2440
12.5M
            look_set_prefix_any: LookSet::empty(),
2441
12.5M
            look_set_suffix_any: LookSet::empty(),
2442
12.5M
            utf8: core::str::from_utf8(&lit.0).is_ok(),
2443
12.5M
            explicit_captures_len: 0,
2444
12.5M
            static_explicit_captures_len: Some(0),
2445
12.5M
            literal: true,
2446
12.5M
            alternation_literal: true,
2447
12.5M
        };
2448
12.5M
        Properties(Box::new(inner))
2449
12.5M
    }
2450
2451
    /// Create a new set of HIR properties for a character class.
2452
3.78M
    fn class(class: &Class) -> Properties {
2453
3.78M
        let inner = PropertiesI {
2454
3.78M
            minimum_len: class.minimum_len(),
2455
3.78M
            maximum_len: class.maximum_len(),
2456
3.78M
            look_set: LookSet::empty(),
2457
3.78M
            look_set_prefix: LookSet::empty(),
2458
3.78M
            look_set_suffix: LookSet::empty(),
2459
3.78M
            look_set_prefix_any: LookSet::empty(),
2460
3.78M
            look_set_suffix_any: LookSet::empty(),
2461
3.78M
            utf8: class.is_utf8(),
2462
3.78M
            explicit_captures_len: 0,
2463
3.78M
            static_explicit_captures_len: Some(0),
2464
3.78M
            literal: false,
2465
3.78M
            alternation_literal: false,
2466
3.78M
        };
2467
3.78M
        Properties(Box::new(inner))
2468
3.78M
    }
2469
2470
    /// Create a new set of HIR properties for a look-around assertion.
2471
335k
    fn look(look: Look) -> Properties {
2472
335k
        let inner = PropertiesI {
2473
335k
            minimum_len: Some(0),
2474
335k
            maximum_len: Some(0),
2475
335k
            look_set: LookSet::singleton(look),
2476
335k
            look_set_prefix: LookSet::singleton(look),
2477
335k
            look_set_suffix: LookSet::singleton(look),
2478
335k
            look_set_prefix_any: LookSet::singleton(look),
2479
335k
            look_set_suffix_any: LookSet::singleton(look),
2480
335k
            // This requires a little explanation. Basically, we don't consider
2481
335k
            // matching an empty string to be equivalent to matching invalid
2482
335k
            // UTF-8, even though technically matching every empty string will
2483
335k
            // split the UTF-8 encoding of a single codepoint when treating a
2484
335k
            // UTF-8 encoded string as a sequence of bytes. Our defense here is
2485
335k
            // that in such a case, a codepoint should logically be treated as
2486
335k
            // the fundamental atom for matching, and thus the only valid match
2487
335k
            // points are between codepoints and not bytes.
2488
335k
            //
2489
335k
            // More practically, this is true here because it's also true
2490
335k
            // for 'Hir::empty()', otherwise something like 'a*' would be
2491
335k
            // considered to match invalid UTF-8. That in turn makes this
2492
335k
            // property borderline useless.
2493
335k
            utf8: true,
2494
335k
            explicit_captures_len: 0,
2495
335k
            static_explicit_captures_len: Some(0),
2496
335k
            literal: false,
2497
335k
            alternation_literal: false,
2498
335k
        };
2499
335k
        Properties(Box::new(inner))
2500
335k
    }
2501
2502
    /// Create a new set of HIR properties for a repetition.
2503
694k
    fn repetition(rep: &Repetition) -> Properties {
2504
694k
        let p = rep.sub.properties();
2505
694k
        let minimum_len = p.minimum_len().map(|child_min| {
2506
692k
            let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2507
692k
            child_min.saturating_mul(rep_min)
2508
694k
        });
2509
694k
        let maximum_len = rep.max.and_then(|rep_max| {
2510
494k
            let rep_max = usize::try_from(rep_max).ok()?;
2511
494k
            let child_max = p.maximum_len()?;
2512
435k
            child_max.checked_mul(rep_max)
2513
694k
        });
2514
694k
2515
694k
        let mut inner = PropertiesI {
2516
694k
            minimum_len,
2517
694k
            maximum_len,
2518
694k
            look_set: p.look_set(),
2519
694k
            look_set_prefix: LookSet::empty(),
2520
694k
            look_set_suffix: LookSet::empty(),
2521
694k
            look_set_prefix_any: p.look_set_prefix_any(),
2522
694k
            look_set_suffix_any: p.look_set_suffix_any(),
2523
694k
            utf8: p.is_utf8(),
2524
694k
            explicit_captures_len: p.explicit_captures_len(),
2525
694k
            static_explicit_captures_len: p.static_explicit_captures_len(),
2526
694k
            literal: false,
2527
694k
            alternation_literal: false,
2528
694k
        };
2529
694k
        // If the repetition operator can match the empty string, then its
2530
694k
        // lookset prefix and suffixes themselves remain empty since they are
2531
694k
        // no longer required to match.
2532
694k
        if rep.min > 0 {
2533
48.0k
            inner.look_set_prefix = p.look_set_prefix();
2534
48.0k
            inner.look_set_suffix = p.look_set_suffix();
2535
646k
        }
2536
        // If the static captures len of the sub-expression is not known or
2537
        // is greater than zero, then it automatically propagates to the
2538
        // repetition, regardless of the repetition. Otherwise, it might
2539
        // change, but only when the repetition can match 0 times.
2540
694k
        if rep.min == 0
2541
646k
            && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2542
        {
2543
            // If we require a match 0 times, then our captures len is
2544
            // guaranteed to be zero. Otherwise, if we *can* match the empty
2545
            // string, then it's impossible to know how many captures will be
2546
            // in the resulting match.
2547
32.7k
            if rep.max == Some(0) {
2548
0
                inner.static_explicit_captures_len = Some(0);
2549
32.7k
            } else {
2550
32.7k
                inner.static_explicit_captures_len = None;
2551
32.7k
            }
2552
662k
        }
2553
694k
        Properties(Box::new(inner))
2554
694k
    }
2555
2556
    /// Create a new set of HIR properties for a capture.
2557
99.7k
    fn capture(capture: &Capture) -> Properties {
2558
99.7k
        let p = capture.sub.properties();
2559
99.7k
        Properties(Box::new(PropertiesI {
2560
99.7k
            explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2561
99.7k
            static_explicit_captures_len: p
2562
99.7k
                .static_explicit_captures_len()
2563
99.7k
                .map(|len| len.saturating_add(1)),
2564
99.7k
            literal: false,
2565
99.7k
            alternation_literal: false,
2566
99.7k
            ..*p.0.clone()
2567
99.7k
        }))
2568
99.7k
    }
2569
2570
    /// Create a new set of HIR properties for a concatenation.
2571
382k
    fn concat(concat: &[Hir]) -> Properties {
2572
382k
        // The base case is an empty concatenation, which matches the empty
2573
382k
        // string. Note though that empty concatenations aren't possible,
2574
382k
        // because the Hir::concat smart constructor rewrites those as
2575
382k
        // Hir::empty.
2576
382k
        let mut props = PropertiesI {
2577
382k
            minimum_len: Some(0),
2578
382k
            maximum_len: Some(0),
2579
382k
            look_set: LookSet::empty(),
2580
382k
            look_set_prefix: LookSet::empty(),
2581
382k
            look_set_suffix: LookSet::empty(),
2582
382k
            look_set_prefix_any: LookSet::empty(),
2583
382k
            look_set_suffix_any: LookSet::empty(),
2584
382k
            utf8: true,
2585
382k
            explicit_captures_len: 0,
2586
382k
            static_explicit_captures_len: Some(0),
2587
382k
            literal: true,
2588
382k
            alternation_literal: true,
2589
382k
        };
2590
        // Handle properties that need to visit every child hir.
2591
6.05M
        for x in concat.iter() {
2592
6.05M
            let p = x.properties();
2593
6.05M
            props.look_set.set_union(p.look_set());
2594
6.05M
            props.utf8 = props.utf8 && p.is_utf8();
2595
6.05M
            props.explicit_captures_len = props
2596
6.05M
                .explicit_captures_len
2597
6.05M
                .saturating_add(p.explicit_captures_len());
2598
6.05M
            props.static_explicit_captures_len = p
2599
6.05M
                .static_explicit_captures_len()
2600
6.05M
                .and_then(|len1| {
2601
6.01M
                    Some((len1, props.static_explicit_captures_len?))
2602
6.05M
                })
2603
6.05M
                .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2604
6.05M
            props.literal = props.literal && p.is_literal();
2605
            props.alternation_literal =
2606
6.05M
                props.alternation_literal && p.is_alternation_literal();
2607
6.05M
            if let Some(minimum_len) = props.minimum_len {
2608
5.98M
                match p.minimum_len() {
2609
7.63k
                    None => props.minimum_len = None,
2610
5.97M
                    Some(len) => {
2611
5.97M
                        // We use saturating arithmetic here because the
2612
5.97M
                        // minimum is just a lower bound. We can't go any
2613
5.97M
                        // higher than what our number types permit.
2614
5.97M
                        props.minimum_len =
2615
5.97M
                            Some(minimum_len.saturating_add(len));
2616
5.97M
                    }
2617
                }
2618
70.6k
            }
2619
6.05M
            if let Some(maximum_len) = props.maximum_len {
2620
5.20M
                match p.maximum_len() {
2621
114k
                    None => props.maximum_len = None,
2622
5.09M
                    Some(len) => {
2623
5.09M
                        props.maximum_len = maximum_len.checked_add(len)
2624
                    }
2625
                }
2626
846k
            }
2627
        }
2628
        // Handle the prefix properties, which only requires visiting
2629
        // child exprs until one matches more than the empty string.
2630
382k
        let mut it = concat.iter();
2631
461k
        while let Some(x) = it.next() {
2632
458k
            props.look_set_prefix.set_union(x.properties().look_set_prefix());
2633
458k
            props
2634
458k
                .look_set_prefix_any
2635
458k
                .set_union(x.properties().look_set_prefix_any());
2636
458k
            if x.properties().maximum_len().map_or(true, |x| x > 0) {
2637
379k
                break;
2638
79.3k
            }
2639
        }
2640
        // Same thing for the suffix properties, but in reverse.
2641
382k
        let mut it = concat.iter().rev();
2642
420k
        while let Some(x) = it.next() {
2643
417k
            props.look_set_suffix.set_union(x.properties().look_set_suffix());
2644
417k
            props
2645
417k
                .look_set_suffix_any
2646
417k
                .set_union(x.properties().look_set_suffix_any());
2647
417k
            if x.properties().maximum_len().map_or(true, |x| x > 0) {
2648
379k
                break;
2649
38.5k
            }
2650
        }
2651
382k
        Properties(Box::new(props))
2652
382k
    }
2653
2654
    /// Create a new set of HIR properties for a concatenation.
2655
65.1k
    fn alternation(alts: &[Hir]) -> Properties {
2656
10.9M
        Properties::union(alts.iter().map(|hir| hir.properties()))
2657
65.1k
    }
2658
}
2659
2660
/// A set of look-around assertions.
2661
///
2662
/// This is useful for efficiently tracking look-around assertions. For
2663
/// example, an [`Hir`] provides properties that return `LookSet`s.
2664
#[derive(Clone, Copy, Default, Eq, PartialEq)]
2665
pub struct LookSet {
2666
    /// The underlying representation this set is exposed to make it possible
2667
    /// to store it somewhere efficiently. The representation is that
2668
    /// of a bitset, where each assertion occupies bit `i` where `i =
2669
    /// Look::as_repr()`.
2670
    ///
2671
    /// Note that users of this internal representation must permit the full
2672
    /// range of `u16` values to be represented. For example, even if the
2673
    /// current implementation only makes use of the 10 least significant bits,
2674
    /// it may use more bits in a future semver compatible release.
2675
    pub bits: u32,
2676
}
2677
2678
impl LookSet {
2679
    /// Create an empty set of look-around assertions.
2680
    #[inline]
2681
193M
    pub fn empty() -> LookSet {
2682
193M
        LookSet { bits: 0 }
2683
193M
    }
<regex_syntax::hir::LookSet>::empty
Line
Count
Source
2681
388k
    pub fn empty() -> LookSet {
2682
388k
        LookSet { bits: 0 }
2683
388k
    }
<regex_syntax::hir::LookSet>::empty
Line
Count
Source
2681
192M
    pub fn empty() -> LookSet {
2682
192M
        LookSet { bits: 0 }
2683
192M
    }
2684
2685
    /// Create a full set of look-around assertions.
2686
    ///
2687
    /// This set contains all possible look-around assertions.
2688
    #[inline]
2689
194k
    pub fn full() -> LookSet {
2690
194k
        LookSet { bits: !0 }
2691
194k
    }
<regex_syntax::hir::LookSet>::full
Line
Count
Source
2689
129k
    pub fn full() -> LookSet {
2690
129k
        LookSet { bits: !0 }
2691
129k
    }
<regex_syntax::hir::LookSet>::full
Line
Count
Source
2689
65.1k
    pub fn full() -> LookSet {
2690
65.1k
        LookSet { bits: !0 }
2691
65.1k
    }
2692
2693
    /// Create a look-around set containing the look-around assertion given.
2694
    ///
2695
    /// This is a convenience routine for creating an empty set and inserting
2696
    /// one look-around assertions.
2697
    #[inline]
2698
1.67M
    pub fn singleton(look: Look) -> LookSet {
2699
1.67M
        LookSet::empty().insert(look)
2700
1.67M
    }
2701
2702
    /// Returns the total number of look-around assertions in this set.
2703
    #[inline]
2704
131k
    pub fn len(self) -> usize {
2705
131k
        // OK because max value always fits in a u8, which in turn always
2706
131k
        // fits in a usize, regardless of target.
2707
131k
        usize::try_from(self.bits.count_ones()).unwrap()
2708
131k
    }
<regex_syntax::hir::LookSet>::len
Line
Count
Source
2704
131k
    pub fn len(self) -> usize {
2705
131k
        // OK because max value always fits in a u8, which in turn always
2706
131k
        // fits in a usize, regardless of target.
2707
131k
        usize::try_from(self.bits.count_ones()).unwrap()
2708
131k
    }
Unexecuted instantiation: <regex_syntax::hir::LookSet>::len
2709
2710
    /// Returns true if and only if this set is empty.
2711
    #[inline]
2712
131k
    pub fn is_empty(self) -> bool {
2713
131k
        self.len() == 0
2714
131k
    }
<regex_syntax::hir::LookSet>::is_empty
Line
Count
Source
2712
131k
    pub fn is_empty(self) -> bool {
2713
131k
        self.len() == 0
2714
131k
    }
Unexecuted instantiation: <regex_syntax::hir::LookSet>::is_empty
2715
2716
    /// Returns true if and only if the given look-around assertion is in this
2717
    /// set.
2718
    #[inline]
2719
1.17M
    pub fn contains(self, look: Look) -> bool {
2720
1.17M
        self.bits & look.as_repr() != 0
2721
1.17M
    }
<regex_syntax::hir::LookSet>::contains
Line
Count
Source
2719
56.5k
    pub fn contains(self, look: Look) -> bool {
2720
56.5k
        self.bits & look.as_repr() != 0
2721
56.5k
    }
Unexecuted instantiation: <regex_syntax::hir::LookSet>::contains
<regex_syntax::hir::LookSet>::contains
Line
Count
Source
2719
1.12M
    pub fn contains(self, look: Look) -> bool {
2720
1.12M
        self.bits & look.as_repr() != 0
2721
1.12M
    }
Unexecuted instantiation: <regex_syntax::hir::LookSet>::contains
2722
2723
    /// Returns true if and only if this set contains any anchor assertions.
2724
    /// This includes both "start/end of haystack" and "start/end of line."
2725
    #[inline]
2726
0
    pub fn contains_anchor(&self) -> bool {
2727
0
        self.contains_anchor_haystack() || self.contains_anchor_line()
2728
0
    }
2729
2730
    /// Returns true if and only if this set contains any "start/end of
2731
    /// haystack" anchors. This doesn't include "start/end of line" anchors.
2732
    #[inline]
2733
0
    pub fn contains_anchor_haystack(&self) -> bool {
2734
0
        self.contains(Look::Start) || self.contains(Look::End)
2735
0
    }
2736
2737
    /// Returns true if and only if this set contains any "start/end of line"
2738
    /// anchors. This doesn't include "start/end of haystack" anchors. This
2739
    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2740
    #[inline]
2741
0
    pub fn contains_anchor_line(&self) -> bool {
2742
0
        self.contains(Look::StartLF)
2743
0
            || self.contains(Look::EndLF)
2744
0
            || self.contains(Look::StartCRLF)
2745
0
            || self.contains(Look::EndCRLF)
2746
0
    }
2747
2748
    /// Returns true if and only if this set contains any "start/end of line"
2749
    /// anchors that only treat `\n` as line terminators. This does not include
2750
    /// haystack anchors or CRLF aware line anchors.
2751
    #[inline]
2752
0
    pub fn contains_anchor_lf(&self) -> bool {
2753
0
        self.contains(Look::StartLF) || self.contains(Look::EndLF)
2754
0
    }
2755
2756
    /// Returns true if and only if this set contains any "start/end of line"
2757
    /// anchors that are CRLF-aware. This doesn't include "start/end of
2758
    /// haystack" or "start/end of line-feed" anchors.
2759
    #[inline]
2760
0
    pub fn contains_anchor_crlf(&self) -> bool {
2761
0
        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2762
0
    }
2763
2764
    /// Returns true if and only if this set contains any word boundary or
2765
    /// negated word boundary assertions. This include both Unicode and ASCII
2766
    /// word boundaries.
2767
    #[inline]
2768
0
    pub fn contains_word(self) -> bool {
2769
0
        self.contains_word_unicode() || self.contains_word_ascii()
2770
0
    }
2771
2772
    /// Returns true if and only if this set contains any Unicode word boundary
2773
    /// or negated Unicode word boundary assertions.
2774
    #[inline]
2775
82.7k
    pub fn contains_word_unicode(self) -> bool {
2776
82.7k
        self.contains(Look::WordUnicode)
2777
82.4k
            || self.contains(Look::WordUnicodeNegate)
2778
81.5k
            || self.contains(Look::WordStartUnicode)
2779
81.4k
            || self.contains(Look::WordEndUnicode)
2780
80.3k
            || self.contains(Look::WordStartHalfUnicode)
2781
80.3k
            || self.contains(Look::WordEndHalfUnicode)
2782
82.7k
    }
<regex_syntax::hir::LookSet>::contains_word_unicode
Line
Count
Source
2775
82.7k
    pub fn contains_word_unicode(self) -> bool {
2776
82.7k
        self.contains(Look::WordUnicode)
2777
82.4k
            || self.contains(Look::WordUnicodeNegate)
2778
81.5k
            || self.contains(Look::WordStartUnicode)
2779
81.4k
            || self.contains(Look::WordEndUnicode)
2780
80.3k
            || self.contains(Look::WordStartHalfUnicode)
2781
80.3k
            || self.contains(Look::WordEndHalfUnicode)
2782
82.7k
    }
Unexecuted instantiation: <regex_syntax::hir::LookSet>::contains_word_unicode
2783
2784
    /// Returns true if and only if this set contains any ASCII word boundary
2785
    /// or negated ASCII word boundary assertions.
2786
    #[inline]
2787
0
    pub fn contains_word_ascii(self) -> bool {
2788
0
        self.contains(Look::WordAscii)
2789
0
            || self.contains(Look::WordAsciiNegate)
2790
0
            || self.contains(Look::WordStartAscii)
2791
0
            || self.contains(Look::WordEndAscii)
2792
0
            || self.contains(Look::WordStartHalfAscii)
2793
0
            || self.contains(Look::WordEndHalfAscii)
2794
0
    }
2795
2796
    /// Returns an iterator over all of the look-around assertions in this set.
2797
    #[inline]
2798
0
    pub fn iter(self) -> LookSetIter {
2799
0
        LookSetIter { set: self }
2800
0
    }
2801
2802
    /// Return a new set that is equivalent to the original, but with the given
2803
    /// assertion added to it. If the assertion is already in the set, then the
2804
    /// returned set is equivalent to the original.
2805
    #[inline]
2806
1.67M
    pub fn insert(self, look: Look) -> LookSet {
2807
1.67M
        LookSet { bits: self.bits | look.as_repr() }
2808
1.67M
    }
2809
2810
    /// Updates this set in place with the result of inserting the given
2811
    /// assertion into this set.
2812
    #[inline]
2813
0
    pub fn set_insert(&mut self, look: Look) {
2814
0
        *self = self.insert(look);
2815
0
    }
2816
2817
    /// Return a new set that is equivalent to the original, but with the given
2818
    /// assertion removed from it. If the assertion is not in the set, then the
2819
    /// returned set is equivalent to the original.
2820
    #[inline]
2821
0
    pub fn remove(self, look: Look) -> LookSet {
2822
0
        LookSet { bits: self.bits & !look.as_repr() }
2823
0
    }
2824
2825
    /// Updates this set in place with the result of removing the given
2826
    /// assertion from this set.
2827
    #[inline]
2828
0
    pub fn set_remove(&mut self, look: Look) {
2829
0
        *self = self.remove(look);
2830
0
    }
2831
2832
    /// Returns a new set that is the result of subtracting the given set from
2833
    /// this set.
2834
    #[inline]
2835
0
    pub fn subtract(self, other: LookSet) -> LookSet {
2836
0
        LookSet { bits: self.bits & !other.bits }
2837
0
    }
2838
2839
    /// Updates this set in place with the result of subtracting the given set
2840
    /// from this set.
2841
    #[inline]
2842
0
    pub fn set_subtract(&mut self, other: LookSet) {
2843
0
        *self = self.subtract(other);
2844
0
    }
2845
2846
    /// Returns a new set that is the union of this and the one given.
2847
    #[inline]
2848
41.0M
    pub fn union(self, other: LookSet) -> LookSet {
2849
41.0M
        LookSet { bits: self.bits | other.bits }
2850
41.0M
    }
<regex_syntax::hir::LookSet>::union
Line
Count
Source
2848
388k
    pub fn union(self, other: LookSet) -> LookSet {
2849
388k
        LookSet { bits: self.bits | other.bits }
2850
388k
    }
<regex_syntax::hir::LookSet>::union
Line
Count
Source
2848
40.7M
    pub fn union(self, other: LookSet) -> LookSet {
2849
40.7M
        LookSet { bits: self.bits | other.bits }
2850
40.7M
    }
2851
2852
    /// Updates this set in place with the result of unioning it with the one
2853
    /// given.
2854
    #[inline]
2855
41.0M
    pub fn set_union(&mut self, other: LookSet) {
2856
41.0M
        *self = self.union(other);
2857
41.0M
    }
<regex_syntax::hir::LookSet>::set_union
Line
Count
Source
2855
388k
    pub fn set_union(&mut self, other: LookSet) {
2856
388k
        *self = self.union(other);
2857
388k
    }
<regex_syntax::hir::LookSet>::set_union
Line
Count
Source
2855
40.7M
    pub fn set_union(&mut self, other: LookSet) {
2856
40.7M
        *self = self.union(other);
2857
40.7M
    }
2858
2859
    /// Returns a new set that is the intersection of this and the one given.
2860
    #[inline]
2861
22.1M
    pub fn intersect(self, other: LookSet) -> LookSet {
2862
22.1M
        LookSet { bits: self.bits & other.bits }
2863
22.1M
    }
<regex_syntax::hir::LookSet>::intersect
Line
Count
Source
2861
259k
    pub fn intersect(self, other: LookSet) -> LookSet {
2862
259k
        LookSet { bits: self.bits & other.bits }
2863
259k
    }
<regex_syntax::hir::LookSet>::intersect
Line
Count
Source
2861
21.9M
    pub fn intersect(self, other: LookSet) -> LookSet {
2862
21.9M
        LookSet { bits: self.bits & other.bits }
2863
21.9M
    }
2864
2865
    /// Updates this set in place with the result of intersecting it with the
2866
    /// one given.
2867
    #[inline]
2868
22.1M
    pub fn set_intersect(&mut self, other: LookSet) {
2869
22.1M
        *self = self.intersect(other);
2870
22.1M
    }
<regex_syntax::hir::LookSet>::set_intersect
Line
Count
Source
2868
259k
    pub fn set_intersect(&mut self, other: LookSet) {
2869
259k
        *self = self.intersect(other);
2870
259k
    }
<regex_syntax::hir::LookSet>::set_intersect
Line
Count
Source
2868
21.9M
    pub fn set_intersect(&mut self, other: LookSet) {
2869
21.9M
        *self = self.intersect(other);
2870
21.9M
    }
2871
2872
    /// Return a `LookSet` from the slice given as a native endian 32-bit
2873
    /// integer.
2874
    ///
2875
    /// # Panics
2876
    ///
2877
    /// This panics if `slice.len() < 4`.
2878
    #[inline]
2879
0
    pub fn read_repr(slice: &[u8]) -> LookSet {
2880
0
        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2881
0
        LookSet { bits }
2882
0
    }
2883
2884
    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2885
    /// of the slice given.
2886
    ///
2887
    /// # Panics
2888
    ///
2889
    /// This panics if `slice.len() < 4`.
2890
    #[inline]
2891
0
    pub fn write_repr(self, slice: &mut [u8]) {
2892
0
        let raw = self.bits.to_ne_bytes();
2893
0
        slice[0] = raw[0];
2894
0
        slice[1] = raw[1];
2895
0
        slice[2] = raw[2];
2896
0
        slice[3] = raw[3];
2897
0
    }
2898
}
2899
2900
impl core::fmt::Debug for LookSet {
2901
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2902
0
        if self.is_empty() {
2903
0
            return write!(f, "∅");
2904
0
        }
2905
0
        for look in self.iter() {
2906
0
            write!(f, "{}", look.as_char())?;
2907
        }
2908
0
        Ok(())
2909
0
    }
2910
}
2911
2912
/// An iterator over all look-around assertions in a [`LookSet`].
2913
///
2914
/// This iterator is created by [`LookSet::iter`].
2915
#[derive(Clone, Debug)]
2916
pub struct LookSetIter {
2917
    set: LookSet,
2918
}
2919
2920
impl Iterator for LookSetIter {
2921
    type Item = Look;
2922
2923
    #[inline]
2924
0
    fn next(&mut self) -> Option<Look> {
2925
0
        if self.set.is_empty() {
2926
0
            return None;
2927
0
        }
2928
0
        // We'll never have more than u8::MAX distinct look-around assertions,
2929
0
        // so 'bit' will always fit into a u16.
2930
0
        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2931
0
        let look = Look::from_repr(1 << bit)?;
2932
0
        self.set = self.set.remove(look);
2933
0
        Some(look)
2934
0
    }
2935
}
2936
2937
/// Given a sequence of HIR values where each value corresponds to a Unicode
2938
/// class (or an all-ASCII byte class), return a single Unicode class
2939
/// corresponding to the union of the classes found.
2940
66.9k
fn class_chars(hirs: &[Hir]) -> Option<Class> {
2941
66.9k
    let mut cls = ClassUnicode::new(vec![]);
2942
75.7k
    for hir in hirs.iter() {
2943
75.7k
        match *hir.kind() {
2944
6.70k
            HirKind::Class(Class::Unicode(ref cls2)) => {
2945
6.70k
                cls.union(cls2);
2946
6.70k
            }
2947
2.11k
            HirKind::Class(Class::Bytes(ref cls2)) => {
2948
2.11k
                cls.union(&cls2.to_unicode_class()?);
2949
            }
2950
66.8k
            _ => return None,
2951
        };
2952
    }
2953
78
    Some(Class::Unicode(cls))
2954
66.9k
}
2955
2956
/// Given a sequence of HIR values where each value corresponds to a byte class
2957
/// (or an all-ASCII Unicode class), return a single byte class corresponding
2958
/// to the union of the classes found.
2959
66.8k
fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2960
66.8k
    let mut cls = ClassBytes::new(vec![]);
2961
74.2k
    for hir in hirs.iter() {
2962
74.2k
        match *hir.kind() {
2963
5.86k
            HirKind::Class(Class::Unicode(ref cls2)) => {
2964
5.86k
                cls.union(&cls2.to_byte_class()?);
2965
            }
2966
2.09k
            HirKind::Class(Class::Bytes(ref cls2)) => {
2967
2.09k
                cls.union(cls2);
2968
2.09k
            }
2969
66.3k
            _ => return None,
2970
        };
2971
    }
2972
0
    Some(Class::Bytes(cls))
2973
66.8k
}
2974
2975
/// Given a sequence of HIR values where each value corresponds to a literal
2976
/// that is a single `char`, return that sequence of `char`s. Otherwise return
2977
/// None. No deduplication is done.
2978
70.6k
fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2979
70.6k
    let mut singletons = vec![];
2980
664k
    for hir in hirs.iter() {
2981
664k
        let literal = match *hir.kind() {
2982
617k
            HirKind::Literal(Literal(ref bytes)) => bytes,
2983
46.6k
            _ => return None,
2984
        };
2985
617k
        let ch = match crate::debug::utf8_decode(literal) {
2986
0
            None => return None,
2987
0
            Some(Err(_)) => return None,
2988
617k
            Some(Ok(ch)) => ch,
2989
617k
        };
2990
617k
        if literal.len() != ch.len_utf8() {
2991
20.3k
            return None;
2992
597k
        }
2993
597k
        singletons.push(ch);
2994
    }
2995
3.65k
    Some(singletons)
2996
70.6k
}
2997
2998
/// Given a sequence of HIR values where each value corresponds to a literal
2999
/// that is a single byte, return that sequence of bytes. Otherwise return
3000
/// None. No deduplication is done.
3001
66.9k
fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3002
66.9k
    let mut singletons = vec![];
3003
194k
    for hir in hirs.iter() {
3004
194k
        let literal = match *hir.kind() {
3005
148k
            HirKind::Literal(Literal(ref bytes)) => bytes,
3006
46.2k
            _ => return None,
3007
        };
3008
148k
        if literal.len() != 1 {
3009
20.7k
            return None;
3010
127k
        }
3011
127k
        singletons.push(literal[0]);
3012
    }
3013
0
    Some(singletons)
3014
66.9k
}
3015
3016
/// Looks for a common prefix in the list of alternation branches given. If one
3017
/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3018
/// Otherwise, the original given list of branches is returned unmodified.
3019
///
3020
/// This is not quite as good as it could be. Right now, it requires that
3021
/// all branches are 'Concat' expressions. It also doesn't do well with
3022
/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3023
/// 'foo(?:foo|bar)' because literals are flattened into their own special
3024
/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3025
/// instead of a string of bytes because of this. Otherwise, handling the
3026
/// current representation in this routine will be pretty gnarly. Sigh.)
3027
66.8k
fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3028
66.8k
    if hirs.len() <= 1 {
3029
0
        return Err(hirs);
3030
66.8k
    }
3031
66.8k
    let mut prefix = match hirs[0].kind() {
3032
18.0k
        HirKind::Concat(ref xs) => &**xs,
3033
48.8k
        _ => return Err(hirs),
3034
    };
3035
18.0k
    if prefix.is_empty() {
3036
0
        return Err(hirs);
3037
18.0k
    }
3038
25.3k
    for h in hirs.iter().skip(1) {
3039
25.3k
        let concat = match h.kind() {
3040
20.2k
            HirKind::Concat(ref xs) => xs,
3041
5.11k
            _ => return Err(hirs),
3042
        };
3043
20.2k
        let common_len = prefix
3044
20.2k
            .iter()
3045
20.2k
            .zip(concat.iter())
3046
29.5k
            .take_while(|(x, y)| x == y)
3047
20.2k
            .count();
3048
20.2k
        prefix = &prefix[..common_len];
3049
20.2k
        if prefix.is_empty() {
3050
11.1k
            return Err(hirs);
3051
9.07k
        }
3052
    }
3053
1.71k
    let len = prefix.len();
3054
1.71k
    assert_ne!(0, len);
3055
1.71k
    let mut prefix_concat = vec![];
3056
1.71k
    let mut suffix_alts = vec![];
3057
9.19k
    for h in hirs {
3058
7.48k
        let mut concat = match h.into_kind() {
3059
7.48k
            HirKind::Concat(xs) => xs,
3060
            // We required all sub-expressions to be
3061
            // concats above, so we're only here if we
3062
            // have a concat.
3063
0
            _ => unreachable!(),
3064
        };
3065
7.48k
        suffix_alts.push(Hir::concat(concat.split_off(len)));
3066
7.48k
        if prefix_concat.is_empty() {
3067
1.71k
            prefix_concat = concat;
3068
5.77k
        }
3069
    }
3070
1.71k
    let mut concat = prefix_concat;
3071
1.71k
    concat.push(Hir::alternation(suffix_alts));
3072
1.71k
    Ok(Hir::concat(concat))
3073
66.8k
}
3074
3075
#[cfg(test)]
3076
mod tests {
3077
    use super::*;
3078
3079
    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3080
        let ranges: Vec<ClassUnicodeRange> = ranges
3081
            .iter()
3082
            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3083
            .collect();
3084
        ClassUnicode::new(ranges)
3085
    }
3086
3087
    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3088
        let ranges: Vec<ClassBytesRange> =
3089
            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3090
        ClassBytes::new(ranges)
3091
    }
3092
3093
    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3094
        cls.iter().map(|x| (x.start(), x.end())).collect()
3095
    }
3096
3097
    #[cfg(feature = "unicode-case")]
3098
    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3099
        let mut cls_ = cls.clone();
3100
        cls_.case_fold_simple();
3101
        cls_
3102
    }
3103
3104
    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105
        let mut cls_ = cls1.clone();
3106
        cls_.union(cls2);
3107
        cls_
3108
    }
3109
3110
    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3111
        let mut cls_ = cls1.clone();
3112
        cls_.intersect(cls2);
3113
        cls_
3114
    }
3115
3116
    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3117
        let mut cls_ = cls1.clone();
3118
        cls_.difference(cls2);
3119
        cls_
3120
    }
3121
3122
    fn usymdifference(
3123
        cls1: &ClassUnicode,
3124
        cls2: &ClassUnicode,
3125
    ) -> ClassUnicode {
3126
        let mut cls_ = cls1.clone();
3127
        cls_.symmetric_difference(cls2);
3128
        cls_
3129
    }
3130
3131
    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3132
        let mut cls_ = cls.clone();
3133
        cls_.negate();
3134
        cls_
3135
    }
3136
3137
    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3138
        cls.iter().map(|x| (x.start(), x.end())).collect()
3139
    }
3140
3141
    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3142
        let mut cls_ = cls.clone();
3143
        cls_.case_fold_simple();
3144
        cls_
3145
    }
3146
3147
    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148
        let mut cls_ = cls1.clone();
3149
        cls_.union(cls2);
3150
        cls_
3151
    }
3152
3153
    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154
        let mut cls_ = cls1.clone();
3155
        cls_.intersect(cls2);
3156
        cls_
3157
    }
3158
3159
    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3160
        let mut cls_ = cls1.clone();
3161
        cls_.difference(cls2);
3162
        cls_
3163
    }
3164
3165
    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3166
        let mut cls_ = cls1.clone();
3167
        cls_.symmetric_difference(cls2);
3168
        cls_
3169
    }
3170
3171
    fn bnegate(cls: &ClassBytes) -> ClassBytes {
3172
        let mut cls_ = cls.clone();
3173
        cls_.negate();
3174
        cls_
3175
    }
3176
3177
    #[test]
3178
    fn class_range_canonical_unicode() {
3179
        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3180
        assert_eq!('\0', range.start());
3181
        assert_eq!('\u{00FF}', range.end());
3182
    }
3183
3184
    #[test]
3185
    fn class_range_canonical_bytes() {
3186
        let range = ClassBytesRange::new(b'\xFF', b'\0');
3187
        assert_eq!(b'\0', range.start());
3188
        assert_eq!(b'\xFF', range.end());
3189
    }
3190
3191
    #[test]
3192
    fn class_canonicalize_unicode() {
3193
        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3194
        let expected = vec![('a', 'c'), ('x', 'z')];
3195
        assert_eq!(expected, uranges(&cls));
3196
3197
        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3198
        let expected = vec![('a', 'c'), ('x', 'z')];
3199
        assert_eq!(expected, uranges(&cls));
3200
3201
        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3202
        let expected = vec![('w', 'z')];
3203
        assert_eq!(expected, uranges(&cls));
3204
3205
        let cls = uclass(&[
3206
            ('c', 'f'),
3207
            ('a', 'g'),
3208
            ('d', 'j'),
3209
            ('a', 'c'),
3210
            ('m', 'p'),
3211
            ('l', 's'),
3212
        ]);
3213
        let expected = vec![('a', 'j'), ('l', 's')];
3214
        assert_eq!(expected, uranges(&cls));
3215
3216
        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3217
        let expected = vec![('u', 'z')];
3218
        assert_eq!(expected, uranges(&cls));
3219
3220
        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3221
        let expected = vec![('\x00', '\u{10FFFF}')];
3222
        assert_eq!(expected, uranges(&cls));
3223
3224
        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3225
        let expected = vec![('a', 'b')];
3226
        assert_eq!(expected, uranges(&cls));
3227
    }
3228
3229
    #[test]
3230
    fn class_canonicalize_bytes() {
3231
        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3232
        let expected = vec![(b'a', b'c'), (b'x', b'z')];
3233
        assert_eq!(expected, branges(&cls));
3234
3235
        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3236
        let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237
        assert_eq!(expected, branges(&cls));
3238
3239
        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3240
        let expected = vec![(b'w', b'z')];
3241
        assert_eq!(expected, branges(&cls));
3242
3243
        let cls = bclass(&[
3244
            (b'c', b'f'),
3245
            (b'a', b'g'),
3246
            (b'd', b'j'),
3247
            (b'a', b'c'),
3248
            (b'm', b'p'),
3249
            (b'l', b's'),
3250
        ]);
3251
        let expected = vec![(b'a', b'j'), (b'l', b's')];
3252
        assert_eq!(expected, branges(&cls));
3253
3254
        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3255
        let expected = vec![(b'u', b'z')];
3256
        assert_eq!(expected, branges(&cls));
3257
3258
        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3259
        let expected = vec![(b'\x00', b'\xFF')];
3260
        assert_eq!(expected, branges(&cls));
3261
3262
        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3263
        let expected = vec![(b'a', b'b')];
3264
        assert_eq!(expected, branges(&cls));
3265
    }
3266
3267
    #[test]
3268
    #[cfg(feature = "unicode-case")]
3269
    fn class_case_fold_unicode() {
3270
        let cls = uclass(&[
3271
            ('C', 'F'),
3272
            ('A', 'G'),
3273
            ('D', 'J'),
3274
            ('A', 'C'),
3275
            ('M', 'P'),
3276
            ('L', 'S'),
3277
            ('c', 'f'),
3278
        ]);
3279
        let expected = uclass(&[
3280
            ('A', 'J'),
3281
            ('L', 'S'),
3282
            ('a', 'j'),
3283
            ('l', 's'),
3284
            ('\u{17F}', '\u{17F}'),
3285
        ]);
3286
        assert_eq!(expected, ucasefold(&cls));
3287
3288
        let cls = uclass(&[('A', 'Z')]);
3289
        let expected = uclass(&[
3290
            ('A', 'Z'),
3291
            ('a', 'z'),
3292
            ('\u{17F}', '\u{17F}'),
3293
            ('\u{212A}', '\u{212A}'),
3294
        ]);
3295
        assert_eq!(expected, ucasefold(&cls));
3296
3297
        let cls = uclass(&[('a', 'z')]);
3298
        let expected = uclass(&[
3299
            ('A', 'Z'),
3300
            ('a', 'z'),
3301
            ('\u{17F}', '\u{17F}'),
3302
            ('\u{212A}', '\u{212A}'),
3303
        ]);
3304
        assert_eq!(expected, ucasefold(&cls));
3305
3306
        let cls = uclass(&[('A', 'A'), ('_', '_')]);
3307
        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3308
        assert_eq!(expected, ucasefold(&cls));
3309
3310
        let cls = uclass(&[('A', 'A'), ('=', '=')]);
3311
        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3312
        assert_eq!(expected, ucasefold(&cls));
3313
3314
        let cls = uclass(&[('\x00', '\x10')]);
3315
        assert_eq!(cls, ucasefold(&cls));
3316
3317
        let cls = uclass(&[('k', 'k')]);
3318
        let expected =
3319
            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3320
        assert_eq!(expected, ucasefold(&cls));
3321
3322
        let cls = uclass(&[('@', '@')]);
3323
        assert_eq!(cls, ucasefold(&cls));
3324
    }
3325
3326
    #[test]
3327
    #[cfg(not(feature = "unicode-case"))]
3328
    fn class_case_fold_unicode_disabled() {
3329
        let mut cls = uclass(&[
3330
            ('C', 'F'),
3331
            ('A', 'G'),
3332
            ('D', 'J'),
3333
            ('A', 'C'),
3334
            ('M', 'P'),
3335
            ('L', 'S'),
3336
            ('c', 'f'),
3337
        ]);
3338
        assert!(cls.try_case_fold_simple().is_err());
3339
    }
3340
3341
    #[test]
3342
    #[should_panic]
3343
    #[cfg(not(feature = "unicode-case"))]
3344
    fn class_case_fold_unicode_disabled_panics() {
3345
        let mut cls = uclass(&[
3346
            ('C', 'F'),
3347
            ('A', 'G'),
3348
            ('D', 'J'),
3349
            ('A', 'C'),
3350
            ('M', 'P'),
3351
            ('L', 'S'),
3352
            ('c', 'f'),
3353
        ]);
3354
        cls.case_fold_simple();
3355
    }
3356
3357
    #[test]
3358
    fn class_case_fold_bytes() {
3359
        let cls = bclass(&[
3360
            (b'C', b'F'),
3361
            (b'A', b'G'),
3362
            (b'D', b'J'),
3363
            (b'A', b'C'),
3364
            (b'M', b'P'),
3365
            (b'L', b'S'),
3366
            (b'c', b'f'),
3367
        ]);
3368
        let expected =
3369
            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3370
        assert_eq!(expected, bcasefold(&cls));
3371
3372
        let cls = bclass(&[(b'A', b'Z')]);
3373
        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3374
        assert_eq!(expected, bcasefold(&cls));
3375
3376
        let cls = bclass(&[(b'a', b'z')]);
3377
        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378
        assert_eq!(expected, bcasefold(&cls));
3379
3380
        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3381
        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3382
        assert_eq!(expected, bcasefold(&cls));
3383
3384
        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3385
        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3386
        assert_eq!(expected, bcasefold(&cls));
3387
3388
        let cls = bclass(&[(b'\x00', b'\x10')]);
3389
        assert_eq!(cls, bcasefold(&cls));
3390
3391
        let cls = bclass(&[(b'k', b'k')]);
3392
        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3393
        assert_eq!(expected, bcasefold(&cls));
3394
3395
        let cls = bclass(&[(b'@', b'@')]);
3396
        assert_eq!(cls, bcasefold(&cls));
3397
    }
3398
3399
    #[test]
3400
    fn class_negate_unicode() {
3401
        let cls = uclass(&[('a', 'a')]);
3402
        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3403
        assert_eq!(expected, unegate(&cls));
3404
3405
        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3406
        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3407
        assert_eq!(expected, unegate(&cls));
3408
3409
        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3410
        let expected = uclass(&[
3411
            ('\x00', '\x60'),
3412
            ('\x64', '\x77'),
3413
            ('\x7B', '\u{10FFFF}'),
3414
        ]);
3415
        assert_eq!(expected, unegate(&cls));
3416
3417
        let cls = uclass(&[('\x00', 'a')]);
3418
        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3419
        assert_eq!(expected, unegate(&cls));
3420
3421
        let cls = uclass(&[('a', '\u{10FFFF}')]);
3422
        let expected = uclass(&[('\x00', '\x60')]);
3423
        assert_eq!(expected, unegate(&cls));
3424
3425
        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3426
        let expected = uclass(&[]);
3427
        assert_eq!(expected, unegate(&cls));
3428
3429
        let cls = uclass(&[]);
3430
        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3431
        assert_eq!(expected, unegate(&cls));
3432
3433
        let cls =
3434
            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3435
        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3436
        assert_eq!(expected, unegate(&cls));
3437
3438
        let cls = uclass(&[('\x00', '\u{D7FF}')]);
3439
        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3440
        assert_eq!(expected, unegate(&cls));
3441
3442
        let cls = uclass(&[('\x00', '\u{D7FE}')]);
3443
        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3444
        assert_eq!(expected, unegate(&cls));
3445
3446
        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3447
        let expected = uclass(&[('\x00', '\u{D7FF}')]);
3448
        assert_eq!(expected, unegate(&cls));
3449
3450
        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3451
        let expected = uclass(&[('\x00', '\u{E000}')]);
3452
        assert_eq!(expected, unegate(&cls));
3453
    }
3454
3455
    #[test]
3456
    fn class_negate_bytes() {
3457
        let cls = bclass(&[(b'a', b'a')]);
3458
        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3459
        assert_eq!(expected, bnegate(&cls));
3460
3461
        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3462
        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3463
        assert_eq!(expected, bnegate(&cls));
3464
3465
        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3466
        let expected = bclass(&[
3467
            (b'\x00', b'\x60'),
3468
            (b'\x64', b'\x77'),
3469
            (b'\x7B', b'\xFF'),
3470
        ]);
3471
        assert_eq!(expected, bnegate(&cls));
3472
3473
        let cls = bclass(&[(b'\x00', b'a')]);
3474
        let expected = bclass(&[(b'\x62', b'\xFF')]);
3475
        assert_eq!(expected, bnegate(&cls));
3476
3477
        let cls = bclass(&[(b'a', b'\xFF')]);
3478
        let expected = bclass(&[(b'\x00', b'\x60')]);
3479
        assert_eq!(expected, bnegate(&cls));
3480
3481
        let cls = bclass(&[(b'\x00', b'\xFF')]);
3482
        let expected = bclass(&[]);
3483
        assert_eq!(expected, bnegate(&cls));
3484
3485
        let cls = bclass(&[]);
3486
        let expected = bclass(&[(b'\x00', b'\xFF')]);
3487
        assert_eq!(expected, bnegate(&cls));
3488
3489
        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3490
        let expected = bclass(&[(b'\xFE', b'\xFE')]);
3491
        assert_eq!(expected, bnegate(&cls));
3492
    }
3493
3494
    #[test]
3495
    fn class_union_unicode() {
3496
        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3497
        let cls2 = uclass(&[('a', 'z')]);
3498
        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3499
        assert_eq!(expected, uunion(&cls1, &cls2));
3500
    }
3501
3502
    #[test]
3503
    fn class_union_bytes() {
3504
        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3505
        let cls2 = bclass(&[(b'a', b'z')]);
3506
        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3507
        assert_eq!(expected, bunion(&cls1, &cls2));
3508
    }
3509
3510
    #[test]
3511
    fn class_intersect_unicode() {
3512
        let cls1 = uclass(&[]);
3513
        let cls2 = uclass(&[('a', 'a')]);
3514
        let expected = uclass(&[]);
3515
        assert_eq!(expected, uintersect(&cls1, &cls2));
3516
3517
        let cls1 = uclass(&[('a', 'a')]);
3518
        let cls2 = uclass(&[('a', 'a')]);
3519
        let expected = uclass(&[('a', 'a')]);
3520
        assert_eq!(expected, uintersect(&cls1, &cls2));
3521
3522
        let cls1 = uclass(&[('a', 'a')]);
3523
        let cls2 = uclass(&[('b', 'b')]);
3524
        let expected = uclass(&[]);
3525
        assert_eq!(expected, uintersect(&cls1, &cls2));
3526
3527
        let cls1 = uclass(&[('a', 'a')]);
3528
        let cls2 = uclass(&[('a', 'c')]);
3529
        let expected = uclass(&[('a', 'a')]);
3530
        assert_eq!(expected, uintersect(&cls1, &cls2));
3531
3532
        let cls1 = uclass(&[('a', 'b')]);
3533
        let cls2 = uclass(&[('a', 'c')]);
3534
        let expected = uclass(&[('a', 'b')]);
3535
        assert_eq!(expected, uintersect(&cls1, &cls2));
3536
3537
        let cls1 = uclass(&[('a', 'b')]);
3538
        let cls2 = uclass(&[('b', 'c')]);
3539
        let expected = uclass(&[('b', 'b')]);
3540
        assert_eq!(expected, uintersect(&cls1, &cls2));
3541
3542
        let cls1 = uclass(&[('a', 'b')]);
3543
        let cls2 = uclass(&[('c', 'd')]);
3544
        let expected = uclass(&[]);
3545
        assert_eq!(expected, uintersect(&cls1, &cls2));
3546
3547
        let cls1 = uclass(&[('b', 'c')]);
3548
        let cls2 = uclass(&[('a', 'd')]);
3549
        let expected = uclass(&[('b', 'c')]);
3550
        assert_eq!(expected, uintersect(&cls1, &cls2));
3551
3552
        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3553
        let cls2 = uclass(&[('a', 'h')]);
3554
        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3555
        assert_eq!(expected, uintersect(&cls1, &cls2));
3556
3557
        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3558
        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559
        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3560
        assert_eq!(expected, uintersect(&cls1, &cls2));
3561
3562
        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3563
        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3564
        let expected = uclass(&[]);
3565
        assert_eq!(expected, uintersect(&cls1, &cls2));
3566
3567
        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3568
        let cls2 = uclass(&[('h', 'h')]);
3569
        let expected = uclass(&[('h', 'h')]);
3570
        assert_eq!(expected, uintersect(&cls1, &cls2));
3571
3572
        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3573
        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3574
        let expected = uclass(&[]);
3575
        assert_eq!(expected, uintersect(&cls1, &cls2));
3576
3577
        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3578
        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3579
        let expected = uclass(&[('b', 'f')]);
3580
        assert_eq!(expected, uintersect(&cls1, &cls2));
3581
    }
3582
3583
    #[test]
3584
    fn class_intersect_bytes() {
3585
        let cls1 = bclass(&[]);
3586
        let cls2 = bclass(&[(b'a', b'a')]);
3587
        let expected = bclass(&[]);
3588
        assert_eq!(expected, bintersect(&cls1, &cls2));
3589
3590
        let cls1 = bclass(&[(b'a', b'a')]);
3591
        let cls2 = bclass(&[(b'a', b'a')]);
3592
        let expected = bclass(&[(b'a', b'a')]);
3593
        assert_eq!(expected, bintersect(&cls1, &cls2));
3594
3595
        let cls1 = bclass(&[(b'a', b'a')]);
3596
        let cls2 = bclass(&[(b'b', b'b')]);
3597
        let expected = bclass(&[]);
3598
        assert_eq!(expected, bintersect(&cls1, &cls2));
3599
3600
        let cls1 = bclass(&[(b'a', b'a')]);
3601
        let cls2 = bclass(&[(b'a', b'c')]);
3602
        let expected = bclass(&[(b'a', b'a')]);
3603
        assert_eq!(expected, bintersect(&cls1, &cls2));
3604
3605
        let cls1 = bclass(&[(b'a', b'b')]);
3606
        let cls2 = bclass(&[(b'a', b'c')]);
3607
        let expected = bclass(&[(b'a', b'b')]);
3608
        assert_eq!(expected, bintersect(&cls1, &cls2));
3609
3610
        let cls1 = bclass(&[(b'a', b'b')]);
3611
        let cls2 = bclass(&[(b'b', b'c')]);
3612
        let expected = bclass(&[(b'b', b'b')]);
3613
        assert_eq!(expected, bintersect(&cls1, &cls2));
3614
3615
        let cls1 = bclass(&[(b'a', b'b')]);
3616
        let cls2 = bclass(&[(b'c', b'd')]);
3617
        let expected = bclass(&[]);
3618
        assert_eq!(expected, bintersect(&cls1, &cls2));
3619
3620
        let cls1 = bclass(&[(b'b', b'c')]);
3621
        let cls2 = bclass(&[(b'a', b'd')]);
3622
        let expected = bclass(&[(b'b', b'c')]);
3623
        assert_eq!(expected, bintersect(&cls1, &cls2));
3624
3625
        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3626
        let cls2 = bclass(&[(b'a', b'h')]);
3627
        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3628
        assert_eq!(expected, bintersect(&cls1, &cls2));
3629
3630
        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3631
        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632
        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3633
        assert_eq!(expected, bintersect(&cls1, &cls2));
3634
3635
        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3636
        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3637
        let expected = bclass(&[]);
3638
        assert_eq!(expected, bintersect(&cls1, &cls2));
3639
3640
        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3641
        let cls2 = bclass(&[(b'h', b'h')]);
3642
        let expected = bclass(&[(b'h', b'h')]);
3643
        assert_eq!(expected, bintersect(&cls1, &cls2));
3644
3645
        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3646
        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3647
        let expected = bclass(&[]);
3648
        assert_eq!(expected, bintersect(&cls1, &cls2));
3649
3650
        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3651
        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3652
        let expected = bclass(&[(b'b', b'f')]);
3653
        assert_eq!(expected, bintersect(&cls1, &cls2));
3654
    }
3655
3656
    #[test]
3657
    fn class_difference_unicode() {
3658
        let cls1 = uclass(&[('a', 'a')]);
3659
        let cls2 = uclass(&[('a', 'a')]);
3660
        let expected = uclass(&[]);
3661
        assert_eq!(expected, udifference(&cls1, &cls2));
3662
3663
        let cls1 = uclass(&[('a', 'a')]);
3664
        let cls2 = uclass(&[]);
3665
        let expected = uclass(&[('a', 'a')]);
3666
        assert_eq!(expected, udifference(&cls1, &cls2));
3667
3668
        let cls1 = uclass(&[]);
3669
        let cls2 = uclass(&[('a', 'a')]);
3670
        let expected = uclass(&[]);
3671
        assert_eq!(expected, udifference(&cls1, &cls2));
3672
3673
        let cls1 = uclass(&[('a', 'z')]);
3674
        let cls2 = uclass(&[('a', 'a')]);
3675
        let expected = uclass(&[('b', 'z')]);
3676
        assert_eq!(expected, udifference(&cls1, &cls2));
3677
3678
        let cls1 = uclass(&[('a', 'z')]);
3679
        let cls2 = uclass(&[('z', 'z')]);
3680
        let expected = uclass(&[('a', 'y')]);
3681
        assert_eq!(expected, udifference(&cls1, &cls2));
3682
3683
        let cls1 = uclass(&[('a', 'z')]);
3684
        let cls2 = uclass(&[('m', 'm')]);
3685
        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3686
        assert_eq!(expected, udifference(&cls1, &cls2));
3687
3688
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3689
        let cls2 = uclass(&[('a', 'z')]);
3690
        let expected = uclass(&[]);
3691
        assert_eq!(expected, udifference(&cls1, &cls2));
3692
3693
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3694
        let cls2 = uclass(&[('d', 'v')]);
3695
        let expected = uclass(&[('a', 'c')]);
3696
        assert_eq!(expected, udifference(&cls1, &cls2));
3697
3698
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3699
        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3700
        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3701
        assert_eq!(expected, udifference(&cls1, &cls2));
3702
3703
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3704
        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3705
        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3706
        assert_eq!(expected, udifference(&cls1, &cls2));
3707
3708
        let cls1 = uclass(&[('x', 'z')]);
3709
        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3710
        let expected = uclass(&[('x', 'z')]);
3711
        assert_eq!(expected, udifference(&cls1, &cls2));
3712
3713
        let cls1 = uclass(&[('a', 'z')]);
3714
        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3715
        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3716
        assert_eq!(expected, udifference(&cls1, &cls2));
3717
    }
3718
3719
    #[test]
3720
    fn class_difference_bytes() {
3721
        let cls1 = bclass(&[(b'a', b'a')]);
3722
        let cls2 = bclass(&[(b'a', b'a')]);
3723
        let expected = bclass(&[]);
3724
        assert_eq!(expected, bdifference(&cls1, &cls2));
3725
3726
        let cls1 = bclass(&[(b'a', b'a')]);
3727
        let cls2 = bclass(&[]);
3728
        let expected = bclass(&[(b'a', b'a')]);
3729
        assert_eq!(expected, bdifference(&cls1, &cls2));
3730
3731
        let cls1 = bclass(&[]);
3732
        let cls2 = bclass(&[(b'a', b'a')]);
3733
        let expected = bclass(&[]);
3734
        assert_eq!(expected, bdifference(&cls1, &cls2));
3735
3736
        let cls1 = bclass(&[(b'a', b'z')]);
3737
        let cls2 = bclass(&[(b'a', b'a')]);
3738
        let expected = bclass(&[(b'b', b'z')]);
3739
        assert_eq!(expected, bdifference(&cls1, &cls2));
3740
3741
        let cls1 = bclass(&[(b'a', b'z')]);
3742
        let cls2 = bclass(&[(b'z', b'z')]);
3743
        let expected = bclass(&[(b'a', b'y')]);
3744
        assert_eq!(expected, bdifference(&cls1, &cls2));
3745
3746
        let cls1 = bclass(&[(b'a', b'z')]);
3747
        let cls2 = bclass(&[(b'm', b'm')]);
3748
        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3749
        assert_eq!(expected, bdifference(&cls1, &cls2));
3750
3751
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3752
        let cls2 = bclass(&[(b'a', b'z')]);
3753
        let expected = bclass(&[]);
3754
        assert_eq!(expected, bdifference(&cls1, &cls2));
3755
3756
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3757
        let cls2 = bclass(&[(b'd', b'v')]);
3758
        let expected = bclass(&[(b'a', b'c')]);
3759
        assert_eq!(expected, bdifference(&cls1, &cls2));
3760
3761
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3762
        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3763
        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3764
        assert_eq!(expected, bdifference(&cls1, &cls2));
3765
3766
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3767
        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3768
        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3769
        assert_eq!(expected, bdifference(&cls1, &cls2));
3770
3771
        let cls1 = bclass(&[(b'x', b'z')]);
3772
        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3773
        let expected = bclass(&[(b'x', b'z')]);
3774
        assert_eq!(expected, bdifference(&cls1, &cls2));
3775
3776
        let cls1 = bclass(&[(b'a', b'z')]);
3777
        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3778
        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3779
        assert_eq!(expected, bdifference(&cls1, &cls2));
3780
    }
3781
3782
    #[test]
3783
    fn class_symmetric_difference_unicode() {
3784
        let cls1 = uclass(&[('a', 'm')]);
3785
        let cls2 = uclass(&[('g', 't')]);
3786
        let expected = uclass(&[('a', 'f'), ('n', 't')]);
3787
        assert_eq!(expected, usymdifference(&cls1, &cls2));
3788
    }
3789
3790
    #[test]
3791
    fn class_symmetric_difference_bytes() {
3792
        let cls1 = bclass(&[(b'a', b'm')]);
3793
        let cls2 = bclass(&[(b'g', b't')]);
3794
        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3795
        assert_eq!(expected, bsymdifference(&cls1, &cls2));
3796
    }
3797
3798
    // We use a thread with an explicit stack size to test that our destructor
3799
    // for Hir can handle arbitrarily sized expressions in constant stack
3800
    // space. In case we run on a platform without threads (WASM?), we limit
3801
    // this test to Windows/Unix.
3802
    #[test]
3803
    #[cfg(any(unix, windows))]
3804
    fn no_stack_overflow_on_drop() {
3805
        use std::thread;
3806
3807
        let run = || {
3808
            let mut expr = Hir::empty();
3809
            for _ in 0..100 {
3810
                expr = Hir::capture(Capture {
3811
                    index: 1,
3812
                    name: None,
3813
                    sub: Box::new(expr),
3814
                });
3815
                expr = Hir::repetition(Repetition {
3816
                    min: 0,
3817
                    max: Some(1),
3818
                    greedy: true,
3819
                    sub: Box::new(expr),
3820
                });
3821
3822
                expr = Hir {
3823
                    kind: HirKind::Concat(vec![expr]),
3824
                    props: Properties::empty(),
3825
                };
3826
                expr = Hir {
3827
                    kind: HirKind::Alternation(vec![expr]),
3828
                    props: Properties::empty(),
3829
                };
3830
            }
3831
            assert!(!matches!(*expr.kind(), HirKind::Empty));
3832
        };
3833
3834
        // We run our test on a thread with a small stack size so we can
3835
        // force the issue more easily.
3836
        //
3837
        // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3838
        // for context on the specific stack size chosen here.
3839
        thread::Builder::new()
3840
            .stack_size(16 << 10)
3841
            .spawn(run)
3842
            .unwrap()
3843
            .join()
3844
            .unwrap();
3845
    }
3846
3847
    #[test]
3848
    fn look_set_iter() {
3849
        let set = LookSet::empty();
3850
        assert_eq!(0, set.iter().count());
3851
3852
        let set = LookSet::full();
3853
        assert_eq!(18, set.iter().count());
3854
3855
        let set =
3856
            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3857
        assert_eq!(2, set.iter().count());
3858
3859
        let set = LookSet::empty().insert(Look::StartLF);
3860
        assert_eq!(1, set.iter().count());
3861
3862
        let set = LookSet::empty().insert(Look::WordAsciiNegate);
3863
        assert_eq!(1, set.iter().count());
3864
    }
3865
3866
    #[test]
3867
    fn look_set_debug() {
3868
        let res = format!("{:?}", LookSet::empty());
3869
        assert_eq!("∅", res);
3870
        let res = format!("{:?}", LookSet::full());
3871
        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3872
    }
3873
}