Coverage Report

Created: 2025-12-31 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.6.29/src/hir/mod.rs
Line
Count
Source
1
/*!
2
Defines a high-level intermediate representation for regular expressions.
3
*/
4
use std::char;
5
use std::cmp;
6
use std::error;
7
use std::fmt;
8
use std::result;
9
use std::u8;
10
11
use crate::ast::Span;
12
use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
13
use crate::unicode;
14
15
pub use crate::hir::visitor::{visit, Visitor};
16
pub use crate::unicode::CaseFoldError;
17
18
mod interval;
19
pub mod literal;
20
pub mod print;
21
pub mod translate;
22
mod visitor;
23
24
/// An error that can occur while translating an `Ast` to a `Hir`.
25
#[derive(Clone, Debug, Eq, PartialEq)]
26
pub struct Error {
27
    /// The kind of error.
28
    kind: ErrorKind,
29
    /// The original pattern that the translator's Ast was parsed from. Every
30
    /// span in an error is a valid range into this string.
31
    pattern: String,
32
    /// The span of this error, derived from the Ast given to the translator.
33
    span: Span,
34
}
35
36
impl Error {
37
    /// Return the type of this error.
38
0
    pub fn kind(&self) -> &ErrorKind {
39
0
        &self.kind
40
0
    }
41
42
    /// The original pattern string in which this error occurred.
43
    ///
44
    /// Every span reported by this error is reported in terms of this string.
45
0
    pub fn pattern(&self) -> &str {
46
0
        &self.pattern
47
0
    }
48
49
    /// Return the span at which this error occurred.
50
0
    pub fn span(&self) -> &Span {
51
0
        &self.span
52
0
    }
53
}
54
55
/// The type of an error that occurred while building an `Hir`.
56
#[derive(Clone, Debug, Eq, PartialEq)]
57
pub enum ErrorKind {
58
    /// This error occurs when a Unicode feature is used when Unicode
59
    /// support is disabled. For example `(?-u:\pL)` would trigger this error.
60
    UnicodeNotAllowed,
61
    /// This error occurs when translating a pattern that could match a byte
62
    /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63
    InvalidUtf8,
64
    /// This occurs when an unrecognized Unicode property name could not
65
    /// be found.
66
    UnicodePropertyNotFound,
67
    /// This occurs when an unrecognized Unicode property value could not
68
    /// be found.
69
    UnicodePropertyValueNotFound,
70
    /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71
    /// `\d`) could not be found. This can occur when the `unicode-perl`
72
    /// crate feature is not enabled.
73
    UnicodePerlClassNotFound,
74
    /// This occurs when the Unicode simple case mapping tables are not
75
    /// available, and the regular expression required Unicode aware case
76
    /// insensitivity.
77
    UnicodeCaseUnavailable,
78
    /// This occurs when the translator attempts to construct a character class
79
    /// that is empty.
80
    ///
81
    /// Note that this restriction in the translator may be removed in the
82
    /// future.
83
    EmptyClassNotAllowed,
84
    /// Hints that destructuring should not be exhaustive.
85
    ///
86
    /// This enum may grow additional variants, so this makes sure clients
87
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
88
    /// could break existing code.)
89
    #[doc(hidden)]
90
    __Nonexhaustive,
91
}
92
93
impl ErrorKind {
94
    // TODO: Remove this method entirely on the next breaking semver release.
95
    #[allow(deprecated)]
96
0
    fn description(&self) -> &str {
97
        use self::ErrorKind::*;
98
0
        match *self {
99
0
            UnicodeNotAllowed => "Unicode not allowed here",
100
0
            InvalidUtf8 => "pattern can match invalid UTF-8",
101
0
            UnicodePropertyNotFound => "Unicode property not found",
102
0
            UnicodePropertyValueNotFound => "Unicode property value not found",
103
            UnicodePerlClassNotFound => {
104
0
                "Unicode-aware Perl class not found \
105
0
                 (make sure the unicode-perl feature is enabled)"
106
            }
107
            UnicodeCaseUnavailable => {
108
0
                "Unicode-aware case insensitivity matching is not available \
109
0
                 (make sure the unicode-case feature is enabled)"
110
            }
111
0
            EmptyClassNotAllowed => "empty character classes are not allowed",
112
0
            __Nonexhaustive => unreachable!(),
113
        }
114
0
    }
115
}
116
117
impl error::Error for Error {
118
    // TODO: Remove this method entirely on the next breaking semver release.
119
    #[allow(deprecated)]
120
0
    fn description(&self) -> &str {
121
0
        self.kind.description()
122
0
    }
123
}
124
125
impl fmt::Display for Error {
126
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127
0
        crate::error::Formatter::from(self).fmt(f)
128
0
    }
129
}
130
131
impl fmt::Display for ErrorKind {
132
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133
        // TODO: Remove this on the next breaking semver release.
134
        #[allow(deprecated)]
135
0
        f.write_str(self.description())
136
0
    }
137
}
138
139
/// A high-level intermediate representation (HIR) for a regular expression.
140
///
141
/// The HIR of a regular expression represents an intermediate step between its
142
/// abstract syntax (a structured description of the concrete syntax) and
143
/// compiled byte codes. The purpose of HIR is to make regular expressions
144
/// easier to analyze. In particular, the AST is much more complex than the
145
/// HIR. For example, while an AST supports arbitrarily nested character
146
/// classes, the HIR will flatten all nested classes into a single set. The HIR
147
/// will also "compile away" every flag present in the concrete syntax. For
148
/// example, users of HIR expressions never need to worry about case folding;
149
/// it is handled automatically by the translator (e.g., by translating `(?i)A`
150
/// to `[aA]`).
151
///
152
/// If the HIR was produced by a translator that disallows invalid UTF-8, then
153
/// the HIR is guaranteed to match UTF-8 exclusively.
154
///
155
/// This type defines its own destructor that uses constant stack space and
156
/// heap space proportional to the size of the HIR.
157
///
158
/// The specific type of an HIR expression can be accessed via its `kind`
159
/// or `into_kind` methods. This extra level of indirection exists for two
160
/// reasons:
161
///
162
/// 1. Construction of an HIR expression *must* use the constructor methods
163
///    on this `Hir` type instead of building the `HirKind` values directly.
164
///    This permits construction to enforce invariants like "concatenations
165
///    always consist of two or more sub-expressions."
166
/// 2. Every HIR expression contains attributes that are defined inductively,
167
///    and can be computed cheaply during the construction process. For
168
///    example, one such attribute is whether the expression must match at the
169
///    beginning of the text.
170
///
171
/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172
/// expression pattern string, and uses constant stack space and heap space
173
/// proportional to the size of the `Hir`.
174
#[derive(Clone, Debug, Eq, PartialEq)]
175
pub struct Hir {
176
    /// The underlying HIR kind.
177
    kind: HirKind,
178
    /// Analysis info about this HIR, computed during construction.
179
    info: HirInfo,
180
}
181
182
/// The kind of an arbitrary `Hir` expression.
183
#[derive(Clone, Debug, Eq, PartialEq)]
184
pub enum HirKind {
185
    /// The empty regular expression, which matches everything, including the
186
    /// empty string.
187
    Empty,
188
    /// A single literal character that matches exactly this character.
189
    Literal(Literal),
190
    /// A single character class that matches any of the characters in the
191
    /// class. A class can either consist of Unicode scalar values as
192
    /// characters, or it can use bytes.
193
    Class(Class),
194
    /// An anchor assertion. An anchor assertion match always has zero length.
195
    Anchor(Anchor),
196
    /// A word boundary assertion, which may or may not be Unicode aware. A
197
    /// word boundary assertion match always has zero length.
198
    WordBoundary(WordBoundary),
199
    /// A repetition operation applied to a child expression.
200
    Repetition(Repetition),
201
    /// A possibly capturing group, which contains a child expression.
202
    Group(Group),
203
    /// A concatenation of expressions. A concatenation always has at least two
204
    /// child expressions.
205
    ///
206
    /// A concatenation matches only if each of its child expression matches
207
    /// one after the other.
208
    Concat(Vec<Hir>),
209
    /// An alternation of expressions. An alternation always has at least two
210
    /// child expressions.
211
    ///
212
    /// An alternation matches only if at least one of its child expression
213
    /// matches. If multiple expressions match, then the leftmost is preferred.
214
    Alternation(Vec<Hir>),
215
}
216
217
impl Hir {
218
    /// Returns a reference to the underlying HIR kind.
219
8.85k
    pub fn kind(&self) -> &HirKind {
220
8.85k
        &self.kind
221
8.85k
    }
222
223
    /// Consumes ownership of this HIR expression and returns its underlying
224
    /// `HirKind`.
225
0
    pub fn into_kind(mut self) -> HirKind {
226
        use std::mem;
227
0
        mem::replace(&mut self.kind, HirKind::Empty)
228
0
    }
229
230
    /// Returns an empty HIR expression.
231
    ///
232
    /// An empty HIR expression always matches, including the empty string.
233
675
    pub fn empty() -> Hir {
234
675
        let mut info = HirInfo::new();
235
675
        info.set_always_utf8(true);
236
675
        info.set_all_assertions(true);
237
675
        info.set_anchored_start(false);
238
675
        info.set_anchored_end(false);
239
675
        info.set_line_anchored_start(false);
240
675
        info.set_line_anchored_end(false);
241
675
        info.set_any_anchored_start(false);
242
675
        info.set_any_anchored_end(false);
243
675
        info.set_match_empty(true);
244
675
        info.set_literal(false);
245
675
        info.set_alternation_literal(false);
246
675
        Hir { kind: HirKind::Empty, info }
247
675
    }
248
249
    /// Creates a literal HIR expression.
250
    ///
251
    /// If the given literal has a `Byte` variant with an ASCII byte, then this
252
    /// method panics. This enforces the invariant that `Byte` variants are
253
    /// only used to express matching of invalid UTF-8.
254
465
    pub fn literal(lit: Literal) -> Hir {
255
465
        if let Literal::Byte(b) = lit {
256
0
            assert!(b > 0x7F);
257
465
        }
258
259
465
        let mut info = HirInfo::new();
260
465
        info.set_always_utf8(lit.is_unicode());
261
465
        info.set_all_assertions(false);
262
465
        info.set_anchored_start(false);
263
465
        info.set_anchored_end(false);
264
465
        info.set_line_anchored_start(false);
265
465
        info.set_line_anchored_end(false);
266
465
        info.set_any_anchored_start(false);
267
465
        info.set_any_anchored_end(false);
268
465
        info.set_match_empty(false);
269
465
        info.set_literal(true);
270
465
        info.set_alternation_literal(true);
271
465
        Hir { kind: HirKind::Literal(lit), info }
272
465
    }
273
274
    /// Creates a class HIR expression.
275
175
    pub fn class(class: Class) -> Hir {
276
175
        let mut info = HirInfo::new();
277
175
        info.set_always_utf8(class.is_always_utf8());
278
175
        info.set_all_assertions(false);
279
175
        info.set_anchored_start(false);
280
175
        info.set_anchored_end(false);
281
175
        info.set_line_anchored_start(false);
282
175
        info.set_line_anchored_end(false);
283
175
        info.set_any_anchored_start(false);
284
175
        info.set_any_anchored_end(false);
285
175
        info.set_match_empty(false);
286
175
        info.set_literal(false);
287
175
        info.set_alternation_literal(false);
288
175
        Hir { kind: HirKind::Class(class), info }
289
175
    }
290
291
    /// Creates an anchor assertion HIR expression.
292
30
    pub fn anchor(anchor: Anchor) -> Hir {
293
30
        let mut info = HirInfo::new();
294
30
        info.set_always_utf8(true);
295
30
        info.set_all_assertions(true);
296
30
        info.set_anchored_start(false);
297
30
        info.set_anchored_end(false);
298
30
        info.set_line_anchored_start(false);
299
30
        info.set_line_anchored_end(false);
300
30
        info.set_any_anchored_start(false);
301
30
        info.set_any_anchored_end(false);
302
30
        info.set_match_empty(true);
303
30
        info.set_literal(false);
304
30
        info.set_alternation_literal(false);
305
30
        if let Anchor::StartText = anchor {
306
15
            info.set_anchored_start(true);
307
15
            info.set_line_anchored_start(true);
308
15
            info.set_any_anchored_start(true);
309
15
        }
310
30
        if let Anchor::EndText = anchor {
311
15
            info.set_anchored_end(true);
312
15
            info.set_line_anchored_end(true);
313
15
            info.set_any_anchored_end(true);
314
15
        }
315
30
        if let Anchor::StartLine = anchor {
316
0
            info.set_line_anchored_start(true);
317
30
        }
318
30
        if let Anchor::EndLine = anchor {
319
0
            info.set_line_anchored_end(true);
320
30
        }
321
30
        Hir { kind: HirKind::Anchor(anchor), info }
322
30
    }
323
324
    /// Creates a word boundary assertion HIR expression.
325
0
    pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326
0
        let mut info = HirInfo::new();
327
0
        info.set_always_utf8(true);
328
0
        info.set_all_assertions(true);
329
0
        info.set_anchored_start(false);
330
0
        info.set_anchored_end(false);
331
0
        info.set_line_anchored_start(false);
332
0
        info.set_line_anchored_end(false);
333
0
        info.set_any_anchored_start(false);
334
0
        info.set_any_anchored_end(false);
335
0
        info.set_literal(false);
336
0
        info.set_alternation_literal(false);
337
        // A negated word boundary matches '', so that's fine. But \b does not
338
        // match \b, so why do we say it can match the empty string? Well,
339
        // because, if you search for \b against 'a', it will report [0, 0) and
340
        // [1, 1) as matches, and both of those matches correspond to the empty
341
        // string. Thus, only *certain* empty strings match \b, which similarly
342
        // applies to \B.
343
0
        info.set_match_empty(true);
344
        // Negated ASCII word boundaries can match invalid UTF-8.
345
0
        if let WordBoundary::AsciiNegate = word_boundary {
346
0
            info.set_always_utf8(false);
347
0
        }
348
0
        Hir { kind: HirKind::WordBoundary(word_boundary), info }
349
0
    }
350
351
    /// Creates a repetition HIR expression.
352
325
    pub fn repetition(rep: Repetition) -> Hir {
353
325
        let mut info = HirInfo::new();
354
325
        info.set_always_utf8(rep.hir.is_always_utf8());
355
325
        info.set_all_assertions(rep.hir.is_all_assertions());
356
        // If this operator can match the empty string, then it can never
357
        // be anchored.
358
325
        info.set_anchored_start(
359
325
            !rep.is_match_empty() && rep.hir.is_anchored_start(),
360
        );
361
325
        info.set_anchored_end(
362
325
            !rep.is_match_empty() && rep.hir.is_anchored_end(),
363
        );
364
325
        info.set_line_anchored_start(
365
325
            !rep.is_match_empty() && rep.hir.is_anchored_start(),
366
        );
367
325
        info.set_line_anchored_end(
368
325
            !rep.is_match_empty() && rep.hir.is_anchored_end(),
369
        );
370
325
        info.set_any_anchored_start(rep.hir.is_any_anchored_start());
371
325
        info.set_any_anchored_end(rep.hir.is_any_anchored_end());
372
325
        info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
373
325
        info.set_literal(false);
374
325
        info.set_alternation_literal(false);
375
325
        Hir { kind: HirKind::Repetition(rep), info }
376
325
    }
377
378
    /// Creates a group HIR expression.
379
90
    pub fn group(group: Group) -> Hir {
380
90
        let mut info = HirInfo::new();
381
90
        info.set_always_utf8(group.hir.is_always_utf8());
382
90
        info.set_all_assertions(group.hir.is_all_assertions());
383
90
        info.set_anchored_start(group.hir.is_anchored_start());
384
90
        info.set_anchored_end(group.hir.is_anchored_end());
385
90
        info.set_line_anchored_start(group.hir.is_line_anchored_start());
386
90
        info.set_line_anchored_end(group.hir.is_line_anchored_end());
387
90
        info.set_any_anchored_start(group.hir.is_any_anchored_start());
388
90
        info.set_any_anchored_end(group.hir.is_any_anchored_end());
389
90
        info.set_match_empty(group.hir.is_match_empty());
390
90
        info.set_literal(false);
391
90
        info.set_alternation_literal(false);
392
90
        Hir { kind: HirKind::Group(group), info }
393
90
    }
394
395
    /// Returns the concatenation of the given expressions.
396
    ///
397
    /// This flattens the concatenation as appropriate.
398
100
    pub fn concat(mut exprs: Vec<Hir>) -> Hir {
399
100
        match exprs.len() {
400
0
            0 => Hir::empty(),
401
0
            1 => exprs.pop().unwrap(),
402
            _ => {
403
100
                let mut info = HirInfo::new();
404
100
                info.set_always_utf8(true);
405
100
                info.set_all_assertions(true);
406
100
                info.set_any_anchored_start(false);
407
100
                info.set_any_anchored_end(false);
408
100
                info.set_match_empty(true);
409
100
                info.set_literal(true);
410
100
                info.set_alternation_literal(true);
411
412
                // Some attributes require analyzing all sub-expressions.
413
835
                for e in &exprs {
414
735
                    let x = info.is_always_utf8() && e.is_always_utf8();
415
735
                    info.set_always_utf8(x);
416
417
735
                    let x = info.is_all_assertions() && e.is_all_assertions();
418
735
                    info.set_all_assertions(x);
419
420
735
                    let x = info.is_any_anchored_start()
421
480
                        || e.is_any_anchored_start();
422
735
                    info.set_any_anchored_start(x);
423
424
735
                    let x =
425
735
                        info.is_any_anchored_end() || e.is_any_anchored_end();
426
735
                    info.set_any_anchored_end(x);
427
428
735
                    let x = info.is_match_empty() && e.is_match_empty();
429
735
                    info.set_match_empty(x);
430
431
735
                    let x = info.is_literal() && e.is_literal();
432
735
                    info.set_literal(x);
433
434
735
                    let x = info.is_alternation_literal()
435
295
                        && e.is_alternation_literal();
436
735
                    info.set_alternation_literal(x);
437
                }
438
                // Anchored attributes require something slightly more
439
                // sophisticated. Normally, WLOG, to determine whether an
440
                // expression is anchored to the start, we'd only need to check
441
                // the first expression of a concatenation. However,
442
                // expressions like `$\b^` are still anchored to the start,
443
                // but the first expression in the concatenation *isn't*
444
                // anchored to the start. So the "first" expression to look at
445
                // is actually one that is either not an assertion or is
446
                // specifically the StartText assertion.
447
100
                info.set_anchored_start(
448
100
                    exprs
449
100
                        .iter()
450
100
                        .take_while(|e| {
451
100
                            e.is_anchored_start() || e.is_all_assertions()
452
100
                        })
453
100
                        .any(|e| e.is_anchored_start()),
454
                );
455
                // Similarly for the end anchor, but in reverse.
456
100
                info.set_anchored_end(
457
100
                    exprs
458
100
                        .iter()
459
100
                        .rev()
460
100
                        .take_while(|e| {
461
100
                            e.is_anchored_end() || e.is_all_assertions()
462
100
                        })
463
100
                        .any(|e| e.is_anchored_end()),
464
                );
465
                // Repeat the process for line anchors.
466
100
                info.set_line_anchored_start(
467
100
                    exprs
468
100
                        .iter()
469
100
                        .take_while(|e| {
470
100
                            e.is_line_anchored_start() || e.is_all_assertions()
471
100
                        })
472
100
                        .any(|e| e.is_line_anchored_start()),
473
                );
474
100
                info.set_line_anchored_end(
475
100
                    exprs
476
100
                        .iter()
477
100
                        .rev()
478
100
                        .take_while(|e| {
479
100
                            e.is_line_anchored_end() || e.is_all_assertions()
480
100
                        })
481
100
                        .any(|e| e.is_line_anchored_end()),
482
                );
483
100
                Hir { kind: HirKind::Concat(exprs), info }
484
            }
485
        }
486
100
    }
487
488
    /// Returns the alternation of the given expressions.
489
    ///
490
    /// This flattens the alternation as appropriate.
491
10
    pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
492
10
        match exprs.len() {
493
0
            0 => Hir::empty(),
494
0
            1 => exprs.pop().unwrap(),
495
            _ => {
496
10
                let mut info = HirInfo::new();
497
10
                info.set_always_utf8(true);
498
10
                info.set_all_assertions(true);
499
10
                info.set_anchored_start(true);
500
10
                info.set_anchored_end(true);
501
10
                info.set_line_anchored_start(true);
502
10
                info.set_line_anchored_end(true);
503
10
                info.set_any_anchored_start(false);
504
10
                info.set_any_anchored_end(false);
505
10
                info.set_match_empty(false);
506
10
                info.set_literal(false);
507
10
                info.set_alternation_literal(true);
508
509
                // Some attributes require analyzing all sub-expressions.
510
40
                for e in &exprs {
511
30
                    let x = info.is_always_utf8() && e.is_always_utf8();
512
30
                    info.set_always_utf8(x);
513
514
30
                    let x = info.is_all_assertions() && e.is_all_assertions();
515
30
                    info.set_all_assertions(x);
516
517
30
                    let x = info.is_anchored_start() && e.is_anchored_start();
518
30
                    info.set_anchored_start(x);
519
520
30
                    let x = info.is_anchored_end() && e.is_anchored_end();
521
30
                    info.set_anchored_end(x);
522
523
30
                    let x = info.is_line_anchored_start()
524
10
                        && e.is_line_anchored_start();
525
30
                    info.set_line_anchored_start(x);
526
527
30
                    let x = info.is_line_anchored_end()
528
10
                        && e.is_line_anchored_end();
529
30
                    info.set_line_anchored_end(x);
530
531
30
                    let x = info.is_any_anchored_start()
532
30
                        || e.is_any_anchored_start();
533
30
                    info.set_any_anchored_start(x);
534
535
30
                    let x =
536
30
                        info.is_any_anchored_end() || e.is_any_anchored_end();
537
30
                    info.set_any_anchored_end(x);
538
539
30
                    let x = info.is_match_empty() || e.is_match_empty();
540
30
                    info.set_match_empty(x);
541
542
30
                    let x = info.is_alternation_literal() && e.is_literal();
543
30
                    info.set_alternation_literal(x);
544
                }
545
10
                Hir { kind: HirKind::Alternation(exprs), info }
546
            }
547
        }
548
10
    }
549
550
    /// Build an HIR expression for `.`.
551
    ///
552
    /// A `.` expression matches any character except for `\n`. To build an
553
    /// expression that matches any character, including `\n`, use the `any`
554
    /// method.
555
    ///
556
    /// If `bytes` is `true`, then this assumes characters are limited to a
557
    /// single byte.
558
5
    pub fn dot(bytes: bool) -> Hir {
559
5
        if bytes {
560
0
            let mut cls = ClassBytes::empty();
561
0
            cls.push(ClassBytesRange::new(b'\0', b'\x09'));
562
0
            cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
563
0
            Hir::class(Class::Bytes(cls))
564
        } else {
565
5
            let mut cls = ClassUnicode::empty();
566
5
            cls.push(ClassUnicodeRange::new('\0', '\x09'));
567
5
            cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
568
5
            Hir::class(Class::Unicode(cls))
569
        }
570
5
    }
571
572
    /// Build an HIR expression for `(?s).`.
573
    ///
574
    /// A `(?s).` expression matches any character, including `\n`. To build an
575
    /// expression that matches any character except for `\n`, then use the
576
    /// `dot` method.
577
    ///
578
    /// If `bytes` is `true`, then this assumes characters are limited to a
579
    /// single byte.
580
0
    pub fn any(bytes: bool) -> Hir {
581
0
        if bytes {
582
0
            let mut cls = ClassBytes::empty();
583
0
            cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
584
0
            Hir::class(Class::Bytes(cls))
585
        } else {
586
0
            let mut cls = ClassUnicode::empty();
587
0
            cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
588
0
            Hir::class(Class::Unicode(cls))
589
        }
590
0
    }
591
592
    /// Return true if and only if this HIR will always match valid UTF-8.
593
    ///
594
    /// When this returns false, then it is possible for this HIR expression
595
    /// to match invalid UTF-8.
596
1.19k
    pub fn is_always_utf8(&self) -> bool {
597
1.19k
        self.info.is_always_utf8()
598
1.19k
    }
599
600
    /// Returns true if and only if this entire HIR expression is made up of
601
    /// zero-width assertions.
602
    ///
603
    /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
604
    /// not `^a`.
605
880
    pub fn is_all_assertions(&self) -> bool {
606
880
        self.info.is_all_assertions()
607
880
    }
608
609
    /// Return true if and only if this HIR is required to match from the
610
    /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
611
    /// `^foo|^bar` but not `^foo|bar`.
612
395
    pub fn is_anchored_start(&self) -> bool {
613
395
        self.info.is_anchored_start()
614
395
    }
615
616
    /// Return true if and only if this HIR is required to match at the end
617
    /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
618
    /// `foo$|bar$` but not `foo$|bar`.
619
395
    pub fn is_anchored_end(&self) -> bool {
620
395
        self.info.is_anchored_end()
621
395
    }
622
623
    /// Return true if and only if this HIR is required to match from the
624
    /// beginning of text or the beginning of a line. This includes expressions
625
    /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
626
    /// but not `^foo|bar` or `(?m)^foo|bar`.
627
    ///
628
    /// Note that if `is_anchored_start` is `true`, then
629
    /// `is_line_anchored_start` will also be `true`. The reverse implication
630
    /// is not true. For example, `(?m)^foo` is line anchored, but not
631
    /// `is_anchored_start`.
632
215
    pub fn is_line_anchored_start(&self) -> bool {
633
215
        self.info.is_line_anchored_start()
634
215
    }
635
636
    /// Return true if and only if this HIR is required to match at the
637
    /// end of text or the end of a line. This includes expressions like
638
    /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
639
    /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
640
    ///
641
    /// Note that if `is_anchored_end` is `true`, then
642
    /// `is_line_anchored_end` will also be `true`. The reverse implication
643
    /// is not true. For example, `(?m)foo$` is line anchored, but not
644
    /// `is_anchored_end`.
645
215
    pub fn is_line_anchored_end(&self) -> bool {
646
215
        self.info.is_line_anchored_end()
647
215
    }
648
649
    /// Return true if and only if this HIR contains any sub-expression that
650
    /// is required to match at the beginning of text. Specifically, this
651
    /// returns true if the `^` symbol (when multiline mode is disabled) or the
652
    /// `\A` escape appear anywhere in the regex.
653
925
    pub fn is_any_anchored_start(&self) -> bool {
654
925
        self.info.is_any_anchored_start()
655
925
    }
656
657
    /// Return true if and only if this HIR contains any sub-expression that is
658
    /// required to match at the end of text. Specifically, this returns true
659
    /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
660
    /// appear anywhere in the regex.
661
1.18k
    pub fn is_any_anchored_end(&self) -> bool {
662
1.18k
        self.info.is_any_anchored_end()
663
1.18k
    }
664
665
    /// Return true if and only if the empty string is part of the language
666
    /// matched by this regular expression.
667
    ///
668
    /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
669
    /// and `\B`, but not `a` or `a+`.
670
360
    pub fn is_match_empty(&self) -> bool {
671
360
        self.info.is_match_empty()
672
360
    }
673
674
    /// Return true if and only if this HIR is a simple literal. This is only
675
    /// true when this HIR expression is either itself a `Literal` or a
676
    /// concatenation of only `Literal`s.
677
    ///
678
    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
679
    /// `` are not (even though that contain sub-expressions that are literals).
680
325
    pub fn is_literal(&self) -> bool {
681
325
        self.info.is_literal()
682
325
    }
683
684
    /// Return true if and only if this HIR is either a simple literal or an
685
    /// alternation of simple literals. This is only
686
    /// true when this HIR expression is either itself a `Literal` or a
687
    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
688
    ///
689
    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
690
    /// literals, but `f+`, `(foo)`, `foo()`, ``
691
    /// are not (even though that contain sub-expressions that are literals).
692
310
    pub fn is_alternation_literal(&self) -> bool {
693
310
        self.info.is_alternation_literal()
694
310
    }
695
}
696
697
impl HirKind {
698
    /// Return true if and only if this HIR is the empty regular expression.
699
    ///
700
    /// Note that this is not defined inductively. That is, it only tests if
701
    /// this kind is the `Empty` variant. To get the inductive definition,
702
    /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
703
735
    pub fn is_empty(&self) -> bool {
704
735
        match *self {
705
0
            HirKind::Empty => true,
706
735
            _ => false,
707
        }
708
735
    }
709
710
    /// Returns true if and only if this kind has any (including possibly
711
    /// empty) subexpressions.
712
695
    pub fn has_subexprs(&self) -> bool {
713
695
        match *self {
714
            HirKind::Empty
715
            | HirKind::Literal(_)
716
            | HirKind::Class(_)
717
            | HirKind::Anchor(_)
718
655
            | HirKind::WordBoundary(_) => false,
719
            HirKind::Group(_)
720
            | HirKind::Repetition(_)
721
            | HirKind::Concat(_)
722
40
            | HirKind::Alternation(_) => true,
723
        }
724
695
    }
725
}
726
727
/// Print a display representation of this Hir.
728
///
729
/// The result of this is a valid regular expression pattern string.
730
///
731
/// This implementation uses constant stack space and heap space proportional
732
/// to the size of the `Hir`.
733
impl fmt::Display for Hir {
734
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
735
        use crate::hir::print::Printer;
736
0
        Printer::new().print(self, f)
737
0
    }
738
}
739
740
/// The high-level intermediate representation of a literal.
741
///
742
/// A literal corresponds to a single character, where a character is either
743
/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
744
/// are preferred whenever possible. In particular, a `Byte` variant is only
745
/// ever produced when it could match invalid UTF-8.
746
#[derive(Clone, Debug, Eq, PartialEq)]
747
pub enum Literal {
748
    /// A single character represented by a Unicode scalar value.
749
    Unicode(char),
750
    /// A single character represented by an arbitrary byte.
751
    Byte(u8),
752
}
753
754
impl Literal {
755
    /// Returns true if and only if this literal corresponds to a Unicode
756
    /// scalar value.
757
465
    pub fn is_unicode(&self) -> bool {
758
0
        match *self {
759
465
            Literal::Unicode(_) => true,
760
0
            Literal::Byte(b) if b <= 0x7F => true,
761
0
            Literal::Byte(_) => false,
762
        }
763
465
    }
764
}
765
766
/// The high-level intermediate representation of a character class.
767
///
768
/// A character class corresponds to a set of characters. A character is either
769
/// defined by a Unicode scalar value or a byte. Unicode characters are used
770
/// by default, while bytes are used when Unicode mode (via the `u` flag) is
771
/// disabled.
772
///
773
/// A character class, regardless of its character type, is represented by a
774
/// sequence of non-overlapping non-adjacent ranges of characters.
775
///
776
/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
777
/// be produced even when it exclusively matches valid UTF-8. This is because
778
/// a `Bytes` variant represents an intention by the author of the regular
779
/// expression to disable Unicode mode, which in turn impacts the semantics of
780
/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
781
/// match the same set of strings.
782
#[derive(Clone, Debug, Eq, PartialEq)]
783
pub enum Class {
784
    /// A set of characters represented by Unicode scalar values.
785
    Unicode(ClassUnicode),
786
    /// A set of characters represented by arbitrary bytes (one byte per
787
    /// character).
788
    Bytes(ClassBytes),
789
}
790
791
impl Class {
792
    /// Apply Unicode simple case folding to this character class, in place.
793
    /// The character class will be expanded to include all simple case folded
794
    /// character variants.
795
    ///
796
    /// If this is a byte oriented character class, then this will be limited
797
    /// to the ASCII ranges `A-Z` and `a-z`.
798
0
    pub fn case_fold_simple(&mut self) {
799
0
        match *self {
800
0
            Class::Unicode(ref mut x) => x.case_fold_simple(),
801
0
            Class::Bytes(ref mut x) => x.case_fold_simple(),
802
        }
803
0
    }
804
805
    /// Negate this character class in place.
806
    ///
807
    /// After completion, this character class will contain precisely the
808
    /// characters that weren't previously in the class.
809
0
    pub fn negate(&mut self) {
810
0
        match *self {
811
0
            Class::Unicode(ref mut x) => x.negate(),
812
0
            Class::Bytes(ref mut x) => x.negate(),
813
        }
814
0
    }
815
816
    /// Returns true if and only if this character class will only ever match
817
    /// valid UTF-8.
818
    ///
819
    /// A character class can match invalid UTF-8 only when the following
820
    /// conditions are met:
821
    ///
822
    /// 1. The translator was configured to permit generating an expression
823
    ///    that can match invalid UTF-8. (By default, this is disabled.)
824
    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
825
    ///    syntax or in the parser builder. By default, Unicode mode is
826
    ///    enabled.
827
175
    pub fn is_always_utf8(&self) -> bool {
828
175
        match *self {
829
175
            Class::Unicode(_) => true,
830
0
            Class::Bytes(ref x) => x.is_all_ascii(),
831
        }
832
175
    }
833
}
834
835
/// A set of characters represented by Unicode scalar values.
836
#[derive(Clone, Debug, Eq, PartialEq)]
837
pub struct ClassUnicode {
838
    set: IntervalSet<ClassUnicodeRange>,
839
}
840
841
impl ClassUnicode {
842
    /// Create a new class from a sequence of ranges.
843
    ///
844
    /// The given ranges do not need to be in any specific order, and ranges
845
    /// may overlap.
846
325
    pub fn new<I>(ranges: I) -> ClassUnicode
847
325
    where
848
325
        I: IntoIterator<Item = ClassUnicodeRange>,
849
    {
850
325
        ClassUnicode { set: IntervalSet::new(ranges) }
851
325
    }
<regex_syntax::hir::ClassUnicode>::new::<alloc::vec::Vec<regex_syntax::hir::ClassUnicodeRange>>
Line
Count
Source
846
175
    pub fn new<I>(ranges: I) -> ClassUnicode
847
175
    where
848
175
        I: IntoIterator<Item = ClassUnicodeRange>,
849
    {
850
175
        ClassUnicode { set: IntervalSet::new(ranges) }
851
175
    }
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<(char, char)>, <regex_syntax::hir::translate::TranslatorI>::hir_ascii_unicode_class::{closure#0}>>
Line
Count
Source
846
150
    pub fn new<I>(ranges: I) -> ClassUnicode
847
150
    where
848
150
        I: IntoIterator<Item = ClassUnicodeRange>,
849
    {
850
150
        ClassUnicode { set: IntervalSet::new(ranges) }
851
150
    }
852
853
    /// Create a new class with no ranges.
854
175
    pub fn empty() -> ClassUnicode {
855
175
        ClassUnicode::new(vec![])
856
175
    }
857
858
    /// Add a new range to this set.
859
50
    pub fn push(&mut self, range: ClassUnicodeRange) {
860
50
        self.set.push(range);
861
50
    }
862
863
    /// Return an iterator over all ranges in this class.
864
    ///
865
    /// The iterator yields ranges in ascending order.
866
255
    pub fn iter(&self) -> ClassUnicodeIter<'_> {
867
255
        ClassUnicodeIter(self.set.iter())
868
255
    }
869
870
    /// Return the underlying ranges as a slice.
871
695
    pub fn ranges(&self) -> &[ClassUnicodeRange] {
872
695
        self.set.intervals()
873
695
    }
874
875
    /// Expand this character class such that it contains all case folded
876
    /// characters, according to Unicode's "simple" mapping. For example, if
877
    /// this class consists of the range `a-z`, then applying case folding will
878
    /// result in the class containing both the ranges `a-z` and `A-Z`.
879
    ///
880
    /// # Panics
881
    ///
882
    /// This routine panics when the case mapping data necessary for this
883
    /// routine to complete is unavailable. This occurs when the `unicode-case`
884
    /// feature is not enabled.
885
    ///
886
    /// Callers should prefer using `try_case_fold_simple` instead, which will
887
    /// return an error instead of panicking.
888
0
    pub fn case_fold_simple(&mut self) {
889
0
        self.set
890
0
            .case_fold_simple()
891
0
            .expect("unicode-case feature must be enabled");
892
0
    }
893
894
    /// Expand this character class such that it contains all case folded
895
    /// characters, according to Unicode's "simple" mapping. For example, if
896
    /// this class consists of the range `a-z`, then applying case folding will
897
    /// result in the class containing both the ranges `a-z` and `A-Z`.
898
    ///
899
    /// # Error
900
    ///
901
    /// This routine returns an error when the case mapping data necessary
902
    /// for this routine to complete is unavailable. This occurs when the
903
    /// `unicode-case` feature is not enabled.
904
0
    pub fn try_case_fold_simple(
905
0
        &mut self,
906
0
    ) -> result::Result<(), CaseFoldError> {
907
0
        self.set.case_fold_simple()
908
0
    }
909
910
    /// Negate this character class.
911
    ///
912
    /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
913
    /// set, then it will not be in this set after negation.
914
0
    pub fn negate(&mut self) {
915
0
        self.set.negate();
916
0
    }
917
918
    /// Union this character class with the given character class, in place.
919
150
    pub fn union(&mut self, other: &ClassUnicode) {
920
150
        self.set.union(&other.set);
921
150
    }
922
923
    /// Intersect this character class with the given character class, in
924
    /// place.
925
0
    pub fn intersect(&mut self, other: &ClassUnicode) {
926
0
        self.set.intersect(&other.set);
927
0
    }
928
929
    /// Subtract the given character class from this character class, in place.
930
0
    pub fn difference(&mut self, other: &ClassUnicode) {
931
0
        self.set.difference(&other.set);
932
0
    }
933
934
    /// Compute the symmetric difference of the given character classes, in
935
    /// place.
936
    ///
937
    /// This computes the symmetric difference of two character classes. This
938
    /// removes all elements in this class that are also in the given class,
939
    /// but all adds all elements from the given class that aren't in this
940
    /// class. That is, the class will contain all elements in either class,
941
    /// but will not contain any elements that are in both classes.
942
0
    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
943
0
        self.set.symmetric_difference(&other.set);
944
0
    }
945
946
    /// Returns true if and only if this character class will either match
947
    /// nothing or only ASCII bytes. Stated differently, this returns false
948
    /// if and only if this class contains a non-ASCII codepoint.
949
0
    pub fn is_all_ascii(&self) -> bool {
950
0
        self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
951
0
    }
952
}
953
954
/// An iterator over all ranges in a Unicode character class.
955
///
956
/// The lifetime `'a` refers to the lifetime of the underlying class.
957
#[derive(Debug)]
958
pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
959
960
impl<'a> Iterator for ClassUnicodeIter<'a> {
961
    type Item = &'a ClassUnicodeRange;
962
963
715
    fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
964
715
        self.0.next()
965
715
    }
966
}
967
968
/// A single range of characters represented by Unicode scalar values.
969
///
970
/// The range is closed. That is, the start and end of the range are included
971
/// in the range.
972
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
973
pub struct ClassUnicodeRange {
974
    start: char,
975
    end: char,
976
}
977
978
impl fmt::Debug for ClassUnicodeRange {
979
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
980
0
        let start = if !self.start.is_whitespace() && !self.start.is_control()
981
        {
982
0
            self.start.to_string()
983
        } else {
984
0
            format!("0x{:X}", self.start as u32)
985
        };
986
0
        let end = if !self.end.is_whitespace() && !self.end.is_control() {
987
0
            self.end.to_string()
988
        } else {
989
0
            format!("0x{:X}", self.end as u32)
990
        };
991
0
        f.debug_struct("ClassUnicodeRange")
992
0
            .field("start", &start)
993
0
            .field("end", &end)
994
0
            .finish()
995
0
    }
996
}
997
998
impl Interval for ClassUnicodeRange {
999
    type Bound = char;
1000
1001
    #[inline]
1002
2.58k
    fn lower(&self) -> char {
1003
2.58k
        self.start
1004
2.58k
    }
1005
    #[inline]
1006
2.58k
    fn upper(&self) -> char {
1007
2.58k
        self.end
1008
2.58k
    }
1009
    #[inline]
1010
1.23k
    fn set_lower(&mut self, bound: char) {
1011
1.23k
        self.start = bound;
1012
1.23k
    }
1013
    #[inline]
1014
1.23k
    fn set_upper(&mut self, bound: char) {
1015
1.23k
        self.end = bound;
1016
1.23k
    }
1017
1018
    /// Apply simple case folding to this Unicode scalar value range.
1019
    ///
1020
    /// Additional ranges are appended to the given vector. Canonical ordering
1021
    /// is *not* maintained in the given vector.
1022
0
    fn case_fold_simple(
1023
0
        &self,
1024
0
        ranges: &mut Vec<ClassUnicodeRange>,
1025
0
    ) -> Result<(), unicode::CaseFoldError> {
1026
0
        if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1027
0
            return Ok(());
1028
0
        }
1029
0
        let start = self.start as u32;
1030
0
        let end = (self.end as u32).saturating_add(1);
1031
0
        let mut next_simple_cp = None;
1032
0
        for cp in (start..end).filter_map(char::from_u32) {
1033
0
            if next_simple_cp.map_or(false, |next| cp < next) {
1034
0
                continue;
1035
0
            }
1036
0
            let it = match unicode::simple_fold(cp)? {
1037
0
                Ok(it) => it,
1038
0
                Err(next) => {
1039
0
                    next_simple_cp = next;
1040
0
                    continue;
1041
                }
1042
            };
1043
0
            for cp_folded in it {
1044
0
                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1045
0
            }
1046
        }
1047
0
        Ok(())
1048
0
    }
1049
}
1050
1051
impl ClassUnicodeRange {
1052
    /// Create a new Unicode scalar value range for a character class.
1053
    ///
1054
    /// The returned range is always in a canonical form. That is, the range
1055
    /// returned always satisfies the invariant that `start <= end`.
1056
775
    pub fn new(start: char, end: char) -> ClassUnicodeRange {
1057
775
        ClassUnicodeRange::create(start, end)
1058
775
    }
1059
1060
    /// Return the start of this range.
1061
    ///
1062
    /// The start of a range is always less than or equal to the end of the
1063
    /// range.
1064
945
    pub fn start(&self) -> char {
1065
945
        self.start
1066
945
    }
1067
1068
    /// Return the end of this range.
1069
    ///
1070
    /// The end of a range is always greater than or equal to the start of the
1071
    /// range.
1072
945
    pub fn end(&self) -> char {
1073
945
        self.end
1074
945
    }
1075
}
1076
1077
/// A set of characters represented by arbitrary bytes (where one byte
1078
/// corresponds to one character).
1079
#[derive(Clone, Debug, Eq, PartialEq)]
1080
pub struct ClassBytes {
1081
    set: IntervalSet<ClassBytesRange>,
1082
}
1083
1084
impl ClassBytes {
1085
    /// Create a new class from a sequence of ranges.
1086
    ///
1087
    /// The given ranges do not need to be in any specific order, and ranges
1088
    /// may overlap.
1089
0
    pub fn new<I>(ranges: I) -> ClassBytes
1090
0
    where
1091
0
        I: IntoIterator<Item = ClassBytesRange>,
1092
    {
1093
0
        ClassBytes { set: IntervalSet::new(ranges) }
1094
0
    }
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<alloc::vec::Vec<regex_syntax::hir::ClassBytesRange>>
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<(char, char)>, <regex_syntax::hir::translate::TranslatorI>::hir_ascii_byte_class::{closure#0}>>
1095
1096
    /// Create a new class with no ranges.
1097
0
    pub fn empty() -> ClassBytes {
1098
0
        ClassBytes::new(vec![])
1099
0
    }
1100
1101
    /// Add a new range to this set.
1102
0
    pub fn push(&mut self, range: ClassBytesRange) {
1103
0
        self.set.push(range);
1104
0
    }
1105
1106
    /// Return an iterator over all ranges in this class.
1107
    ///
1108
    /// The iterator yields ranges in ascending order.
1109
0
    pub fn iter(&self) -> ClassBytesIter<'_> {
1110
0
        ClassBytesIter(self.set.iter())
1111
0
    }
1112
1113
    /// Return the underlying ranges as a slice.
1114
0
    pub fn ranges(&self) -> &[ClassBytesRange] {
1115
0
        self.set.intervals()
1116
0
    }
1117
1118
    /// Expand this character class such that it contains all case folded
1119
    /// characters. For example, if this class consists of the range `a-z`,
1120
    /// then applying case folding will result in the class containing both the
1121
    /// ranges `a-z` and `A-Z`.
1122
    ///
1123
    /// Note that this only applies ASCII case folding, which is limited to the
1124
    /// characters `a-z` and `A-Z`.
1125
0
    pub fn case_fold_simple(&mut self) {
1126
0
        self.set.case_fold_simple().expect("ASCII case folding never fails");
1127
0
    }
1128
1129
    /// Negate this byte class.
1130
    ///
1131
    /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1132
    /// will not be in this set after negation.
1133
0
    pub fn negate(&mut self) {
1134
0
        self.set.negate();
1135
0
    }
1136
1137
    /// Union this byte class with the given byte class, in place.
1138
0
    pub fn union(&mut self, other: &ClassBytes) {
1139
0
        self.set.union(&other.set);
1140
0
    }
1141
1142
    /// Intersect this byte class with the given byte class, in place.
1143
0
    pub fn intersect(&mut self, other: &ClassBytes) {
1144
0
        self.set.intersect(&other.set);
1145
0
    }
1146
1147
    /// Subtract the given byte class from this byte class, in place.
1148
0
    pub fn difference(&mut self, other: &ClassBytes) {
1149
0
        self.set.difference(&other.set);
1150
0
    }
1151
1152
    /// Compute the symmetric difference of the given byte classes, in place.
1153
    ///
1154
    /// This computes the symmetric difference of two byte classes. This
1155
    /// removes all elements in this class that are also in the given class,
1156
    /// but all adds all elements from the given class that aren't in this
1157
    /// class. That is, the class will contain all elements in either class,
1158
    /// but will not contain any elements that are in both classes.
1159
0
    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1160
0
        self.set.symmetric_difference(&other.set);
1161
0
    }
1162
1163
    /// Returns true if and only if this character class will either match
1164
    /// nothing or only ASCII bytes. Stated differently, this returns false
1165
    /// if and only if this class contains a non-ASCII byte.
1166
0
    pub fn is_all_ascii(&self) -> bool {
1167
0
        self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1168
0
    }
1169
}
1170
1171
/// An iterator over all ranges in a byte character class.
1172
///
1173
/// The lifetime `'a` refers to the lifetime of the underlying class.
1174
#[derive(Debug)]
1175
pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1176
1177
impl<'a> Iterator for ClassBytesIter<'a> {
1178
    type Item = &'a ClassBytesRange;
1179
1180
0
    fn next(&mut self) -> Option<&'a ClassBytesRange> {
1181
0
        self.0.next()
1182
0
    }
1183
}
1184
1185
/// A single range of characters represented by arbitrary bytes.
1186
///
1187
/// The range is closed. That is, the start and end of the range are included
1188
/// in the range.
1189
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1190
pub struct ClassBytesRange {
1191
    start: u8,
1192
    end: u8,
1193
}
1194
1195
impl Interval for ClassBytesRange {
1196
    type Bound = u8;
1197
1198
    #[inline]
1199
0
    fn lower(&self) -> u8 {
1200
0
        self.start
1201
0
    }
1202
    #[inline]
1203
0
    fn upper(&self) -> u8 {
1204
0
        self.end
1205
0
    }
1206
    #[inline]
1207
0
    fn set_lower(&mut self, bound: u8) {
1208
0
        self.start = bound;
1209
0
    }
1210
    #[inline]
1211
0
    fn set_upper(&mut self, bound: u8) {
1212
0
        self.end = bound;
1213
0
    }
1214
1215
    /// Apply simple case folding to this byte range. Only ASCII case mappings
1216
    /// (for a-z) are applied.
1217
    ///
1218
    /// Additional ranges are appended to the given vector. Canonical ordering
1219
    /// is *not* maintained in the given vector.
1220
0
    fn case_fold_simple(
1221
0
        &self,
1222
0
        ranges: &mut Vec<ClassBytesRange>,
1223
0
    ) -> Result<(), unicode::CaseFoldError> {
1224
0
        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1225
0
            let lower = cmp::max(self.start, b'a');
1226
0
            let upper = cmp::min(self.end, b'z');
1227
0
            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1228
0
        }
1229
0
        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1230
0
            let lower = cmp::max(self.start, b'A');
1231
0
            let upper = cmp::min(self.end, b'Z');
1232
0
            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1233
0
        }
1234
0
        Ok(())
1235
0
    }
1236
}
1237
1238
impl ClassBytesRange {
1239
    /// Create a new byte range for a character class.
1240
    ///
1241
    /// The returned range is always in a canonical form. That is, the range
1242
    /// returned always satisfies the invariant that `start <= end`.
1243
0
    pub fn new(start: u8, end: u8) -> ClassBytesRange {
1244
0
        ClassBytesRange::create(start, end)
1245
0
    }
1246
1247
    /// Return the start of this range.
1248
    ///
1249
    /// The start of a range is always less than or equal to the end of the
1250
    /// range.
1251
0
    pub fn start(&self) -> u8 {
1252
0
        self.start
1253
0
    }
1254
1255
    /// Return the end of this range.
1256
    ///
1257
    /// The end of a range is always greater than or equal to the start of the
1258
    /// range.
1259
0
    pub fn end(&self) -> u8 {
1260
0
        self.end
1261
0
    }
1262
}
1263
1264
impl fmt::Debug for ClassBytesRange {
1265
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1266
0
        let mut debug = f.debug_struct("ClassBytesRange");
1267
0
        if self.start <= 0x7F {
1268
0
            debug.field("start", &(self.start as char));
1269
0
        } else {
1270
0
            debug.field("start", &self.start);
1271
0
        }
1272
0
        if self.end <= 0x7F {
1273
0
            debug.field("end", &(self.end as char));
1274
0
        } else {
1275
0
            debug.field("end", &self.end);
1276
0
        }
1277
0
        debug.finish()
1278
0
    }
1279
}
1280
1281
/// The high-level intermediate representation for an anchor assertion.
1282
///
1283
/// A matching anchor assertion is always zero-length.
1284
#[derive(Clone, Debug, Eq, PartialEq)]
1285
pub enum Anchor {
1286
    /// Match the beginning of a line or the beginning of text. Specifically,
1287
    /// this matches at the starting position of the input, or at the position
1288
    /// immediately following a `\n` character.
1289
    StartLine,
1290
    /// Match the end of a line or the end of text. Specifically,
1291
    /// this matches at the end position of the input, or at the position
1292
    /// immediately preceding a `\n` character.
1293
    EndLine,
1294
    /// Match the beginning of text. Specifically, this matches at the starting
1295
    /// position of the input.
1296
    StartText,
1297
    /// Match the end of text. Specifically, this matches at the ending
1298
    /// position of the input.
1299
    EndText,
1300
}
1301
1302
/// The high-level intermediate representation for a word-boundary assertion.
1303
///
1304
/// A matching word boundary assertion is always zero-length.
1305
#[derive(Clone, Debug, Eq, PartialEq)]
1306
pub enum WordBoundary {
1307
    /// Match a Unicode-aware word boundary. That is, this matches a position
1308
    /// where the left adjacent character and right adjacent character
1309
    /// correspond to a word and non-word or a non-word and word character.
1310
    Unicode,
1311
    /// Match a Unicode-aware negation of a word boundary.
1312
    UnicodeNegate,
1313
    /// Match an ASCII-only word boundary. That is, this matches a position
1314
    /// where the left adjacent character and right adjacent character
1315
    /// correspond to a word and non-word or a non-word and word character.
1316
    Ascii,
1317
    /// Match an ASCII-only negation of a word boundary.
1318
    AsciiNegate,
1319
}
1320
1321
impl WordBoundary {
1322
    /// Returns true if and only if this word boundary assertion is negated.
1323
0
    pub fn is_negated(&self) -> bool {
1324
0
        match *self {
1325
0
            WordBoundary::Unicode | WordBoundary::Ascii => false,
1326
0
            WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1327
        }
1328
0
    }
1329
}
1330
1331
/// The high-level intermediate representation for a group.
1332
///
1333
/// This represents one of three possible group types:
1334
///
1335
/// 1. A non-capturing group (e.g., `(?:expr)`).
1336
/// 2. A capturing group (e.g., `(expr)`).
1337
/// 3. A named capturing group (e.g., `(?P<name>expr)`).
1338
#[derive(Clone, Debug, Eq, PartialEq)]
1339
pub struct Group {
1340
    /// The kind of this group. If it is a capturing group, then the kind
1341
    /// contains the capture group index (and the name, if it is a named
1342
    /// group).
1343
    pub kind: GroupKind,
1344
    /// The expression inside the capturing group, which may be empty.
1345
    pub hir: Box<Hir>,
1346
}
1347
1348
/// The kind of group.
1349
#[derive(Clone, Debug, Eq, PartialEq)]
1350
pub enum GroupKind {
1351
    /// A normal unnamed capturing group.
1352
    ///
1353
    /// The value is the capture index of the group.
1354
    CaptureIndex(u32),
1355
    /// A named capturing group.
1356
    CaptureName {
1357
        /// The name of the group.
1358
        name: String,
1359
        /// The capture index of the group.
1360
        index: u32,
1361
    },
1362
    /// A non-capturing group.
1363
    NonCapturing,
1364
}
1365
1366
/// The high-level intermediate representation of a repetition operator.
1367
///
1368
/// A repetition operator permits the repetition of an arbitrary
1369
/// sub-expression.
1370
#[derive(Clone, Debug, Eq, PartialEq)]
1371
pub struct Repetition {
1372
    /// The kind of this repetition operator.
1373
    pub kind: RepetitionKind,
1374
    /// Whether this repetition operator is greedy or not. A greedy operator
1375
    /// will match as much as it can. A non-greedy operator will match as
1376
    /// little as it can.
1377
    ///
1378
    /// Typically, operators are greedy by default and are only non-greedy when
1379
    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1380
    /// not. However, this can be inverted via the `U` "ungreedy" flag.
1381
    pub greedy: bool,
1382
    /// The expression being repeated.
1383
    pub hir: Box<Hir>,
1384
}
1385
1386
impl Repetition {
1387
    /// Returns true if and only if this repetition operator makes it possible
1388
    /// to match the empty string.
1389
    ///
1390
    /// Note that this is not defined inductively. For example, while `a*`
1391
    /// will report `true`, `()+` will not, even though `()` matches the empty
1392
    /// string and one or more occurrences of something that matches the empty
1393
    /// string will always match the empty string. In order to get the
1394
    /// inductive definition, see the corresponding method on
1395
    /// [`Hir`](struct.Hir.html).
1396
1.62k
    pub fn is_match_empty(&self) -> bool {
1397
0
        match self.kind {
1398
450
            RepetitionKind::ZeroOrOne => true,
1399
875
            RepetitionKind::ZeroOrMore => true,
1400
300
            RepetitionKind::OneOrMore => false,
1401
0
            RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1402
0
            RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1403
0
            RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1404
        }
1405
1.62k
    }
1406
}
1407
1408
/// The kind of a repetition operator.
1409
#[derive(Clone, Debug, Eq, PartialEq)]
1410
pub enum RepetitionKind {
1411
    /// Matches a sub-expression zero or one times.
1412
    ZeroOrOne,
1413
    /// Matches a sub-expression zero or more times.
1414
    ZeroOrMore,
1415
    /// Matches a sub-expression one or more times.
1416
    OneOrMore,
1417
    /// Matches a sub-expression within a bounded range of times.
1418
    Range(RepetitionRange),
1419
}
1420
1421
/// The kind of a counted repetition operator.
1422
#[derive(Clone, Debug, Eq, PartialEq)]
1423
pub enum RepetitionRange {
1424
    /// Matches a sub-expression exactly this many times.
1425
    Exactly(u32),
1426
    /// Matches a sub-expression at least this many times.
1427
    AtLeast(u32),
1428
    /// Matches a sub-expression at least `m` times and at most `n` times.
1429
    Bounded(u32, u32),
1430
}
1431
1432
/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1433
/// space but heap space proportional to the depth of the total `Hir`.
1434
impl Drop for Hir {
1435
2.84k
    fn drop(&mut self) {
1436
        use std::mem;
1437
1438
2.84k
        match *self.kind() {
1439
            HirKind::Empty
1440
            | HirKind::Literal(_)
1441
            | HirKind::Class(_)
1442
            | HirKind::Anchor(_)
1443
1.91k
            | HirKind::WordBoundary(_) => return,
1444
200
            HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1445
495
            HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1446
220
            HirKind::Concat(ref x) if x.is_empty() => return,
1447
20
            HirKind::Alternation(ref x) if x.is_empty() => return,
1448
55
            _ => {}
1449
        }
1450
1451
55
        let mut stack = vec![mem::replace(self, Hir::empty())];
1452
2.15k
        while let Some(mut expr) = stack.pop() {
1453
2.10k
            match expr.kind {
1454
                HirKind::Empty
1455
                | HirKind::Literal(_)
1456
                | HirKind::Class(_)
1457
                | HirKind::Anchor(_)
1458
1.25k
                | HirKind::WordBoundary(_) => {}
1459
200
                HirKind::Group(ref mut x) => {
1460
200
                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
1461
200
                }
1462
420
                HirKind::Repetition(ref mut x) => {
1463
420
                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
1464
420
                }
1465
205
                HirKind::Concat(ref mut x) => {
1466
205
                    stack.extend(x.drain(..));
1467
205
                }
1468
20
                HirKind::Alternation(ref mut x) => {
1469
20
                    stack.extend(x.drain(..));
1470
20
                }
1471
            }
1472
        }
1473
2.84k
    }
1474
}
1475
1476
/// A type that documents various attributes of an HIR expression.
1477
///
1478
/// These attributes are typically defined inductively on the HIR.
1479
#[derive(Clone, Debug, Eq, PartialEq)]
1480
struct HirInfo {
1481
    /// Represent yes/no questions by a bitfield to conserve space, since
1482
    /// this is included in every HIR expression.
1483
    ///
1484
    /// If more attributes need to be added, it is OK to increase the size of
1485
    /// this as appropriate.
1486
    bools: u16,
1487
}
1488
1489
// A simple macro for defining bitfield accessors/mutators.
1490
macro_rules! define_bool {
1491
    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1492
11.8k
        fn $is_fn_name(&self) -> bool {
1493
11.8k
            self.bools & (0b1 << $bit) > 0
1494
11.8k
        }
<regex_syntax::hir::HirInfo>::is_literal
Line
Count
Source
1492
1.06k
        fn $is_fn_name(&self) -> bool {
1493
1.06k
            self.bools & (0b1 << $bit) > 0
1494
1.06k
        }
<regex_syntax::hir::HirInfo>::is_always_utf8
Line
Count
Source
1492
1.96k
        fn $is_fn_name(&self) -> bool {
1493
1.96k
            self.bools & (0b1 << $bit) > 0
1494
1.96k
        }
<regex_syntax::hir::HirInfo>::is_match_empty
Line
Count
Source
1492
1.12k
        fn $is_fn_name(&self) -> bool {
1493
1.12k
            self.bools & (0b1 << $bit) > 0
1494
1.12k
        }
<regex_syntax::hir::HirInfo>::is_anchored_start
Line
Count
Source
1492
425
        fn $is_fn_name(&self) -> bool {
1493
425
            self.bools & (0b1 << $bit) > 0
1494
425
        }
<regex_syntax::hir::HirInfo>::is_any_anchored_end
Line
Count
Source
1492
1.94k
        fn $is_fn_name(&self) -> bool {
1493
1.94k
            self.bools & (0b1 << $bit) > 0
1494
1.94k
        }
<regex_syntax::hir::HirInfo>::is_any_anchored_start
Line
Count
Source
1492
1.69k
        fn $is_fn_name(&self) -> bool {
1493
1.69k
            self.bools & (0b1 << $bit) > 0
1494
1.69k
        }
<regex_syntax::hir::HirInfo>::is_alternation_literal
Line
Count
Source
1492
1.07k
        fn $is_fn_name(&self) -> bool {
1493
1.07k
            self.bools & (0b1 << $bit) > 0
1494
1.07k
        }
<regex_syntax::hir::HirInfo>::is_line_anchored_start
Line
Count
Source
1492
245
        fn $is_fn_name(&self) -> bool {
1493
245
            self.bools & (0b1 << $bit) > 0
1494
245
        }
<regex_syntax::hir::HirInfo>::is_anchored_end
Line
Count
Source
1492
425
        fn $is_fn_name(&self) -> bool {
1493
425
            self.bools & (0b1 << $bit) > 0
1494
425
        }
<regex_syntax::hir::HirInfo>::is_all_assertions
Line
Count
Source
1492
1.64k
        fn $is_fn_name(&self) -> bool {
1493
1.64k
            self.bools & (0b1 << $bit) > 0
1494
1.64k
        }
<regex_syntax::hir::HirInfo>::is_line_anchored_end
Line
Count
Source
1492
245
        fn $is_fn_name(&self) -> bool {
1493
245
            self.bools & (0b1 << $bit) > 0
1494
245
        }
1495
1496
26.1k
        fn $set_fn_name(&mut self, yes: bool) {
1497
26.1k
            if yes {
1498
6.71k
                self.bools |= 1 << $bit;
1499
19.3k
            } else {
1500
19.3k
                self.bools &= !(1 << $bit);
1501
19.3k
            }
1502
26.1k
        }
<regex_syntax::hir::HirInfo>::set_literal
Line
Count
Source
1496
2.60k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.60k
            if yes {
1498
790
                self.bools |= 1 << $bit;
1499
1.81k
            } else {
1500
1.81k
                self.bools &= !(1 << $bit);
1501
1.81k
            }
1502
2.60k
        }
<regex_syntax::hir::HirInfo>::set_always_utf8
Line
Count
Source
1496
2.63k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.63k
            if yes {
1498
2.63k
                self.bools |= 1 << $bit;
1499
2.63k
            } else {
1500
0
                self.bools &= !(1 << $bit);
1501
0
            }
1502
2.63k
        }
<regex_syntax::hir::HirInfo>::set_match_empty
Line
Count
Source
1496
2.63k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.63k
            if yes {
1498
1.15k
                self.bools |= 1 << $bit;
1499
1.48k
            } else {
1500
1.48k
                self.bools &= !(1 << $bit);
1501
1.48k
            }
1502
2.63k
        }
<regex_syntax::hir::HirInfo>::set_anchored_end
Line
Count
Source
1496
1.91k
        fn $set_fn_name(&mut self, yes: bool) {
1497
1.91k
            if yes {
1498
40
                self.bools |= 1 << $bit;
1499
1.87k
            } else {
1500
1.87k
                self.bools &= !(1 << $bit);
1501
1.87k
            }
1502
1.91k
        }
<regex_syntax::hir::HirInfo>::set_all_assertions
Line
Count
Source
1496
2.63k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.63k
            if yes {
1498
830
                self.bools |= 1 << $bit;
1499
1.80k
            } else {
1500
1.80k
                self.bools &= !(1 << $bit);
1501
1.80k
            }
1502
2.63k
        }
<regex_syntax::hir::HirInfo>::set_anchored_start
Line
Count
Source
1496
1.91k
        fn $set_fn_name(&mut self, yes: bool) {
1497
1.91k
            if yes {
1498
40
                self.bools |= 1 << $bit;
1499
1.87k
            } else {
1500
1.87k
                self.bools &= !(1 << $bit);
1501
1.87k
            }
1502
1.91k
        }
<regex_syntax::hir::HirInfo>::set_any_anchored_end
Line
Count
Source
1496
2.65k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.65k
            if yes {
1498
30
                self.bools |= 1 << $bit;
1499
2.62k
            } else {
1500
2.62k
                self.bools &= !(1 << $bit);
1501
2.62k
            }
1502
2.65k
        }
<regex_syntax::hir::HirInfo>::set_line_anchored_end
Line
Count
Source
1496
1.91k
        fn $set_fn_name(&mut self, yes: bool) {
1497
1.91k
            if yes {
1498
40
                self.bools |= 1 << $bit;
1499
1.87k
            } else {
1500
1.87k
                self.bools &= !(1 << $bit);
1501
1.87k
            }
1502
1.91k
        }
<regex_syntax::hir::HirInfo>::set_any_anchored_start
Line
Count
Source
1496
2.65k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.65k
            if yes {
1498
285
                self.bools |= 1 << $bit;
1499
2.36k
            } else {
1500
2.36k
                self.bools &= !(1 << $bit);
1501
2.36k
            }
1502
2.65k
        }
<regex_syntax::hir::HirInfo>::set_alternation_literal
Line
Count
Source
1496
2.63k
        fn $set_fn_name(&mut self, yes: bool) {
1497
2.63k
            if yes {
1498
830
                self.bools |= 1 << $bit;
1499
1.80k
            } else {
1500
1.80k
                self.bools &= !(1 << $bit);
1501
1.80k
            }
1502
2.63k
        }
<regex_syntax::hir::HirInfo>::set_line_anchored_start
Line
Count
Source
1496
1.91k
        fn $set_fn_name(&mut self, yes: bool) {
1497
1.91k
            if yes {
1498
40
                self.bools |= 1 << $bit;
1499
1.87k
            } else {
1500
1.87k
                self.bools &= !(1 << $bit);
1501
1.87k
            }
1502
1.91k
        }
1503
    };
1504
}
1505
1506
impl HirInfo {
1507
1.87k
    fn new() -> HirInfo {
1508
1.87k
        HirInfo { bools: 0 }
1509
1.87k
    }
1510
1511
    define_bool!(0, is_always_utf8, set_always_utf8);
1512
    define_bool!(1, is_all_assertions, set_all_assertions);
1513
    define_bool!(2, is_anchored_start, set_anchored_start);
1514
    define_bool!(3, is_anchored_end, set_anchored_end);
1515
    define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1516
    define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1517
    define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1518
    define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1519
    define_bool!(8, is_match_empty, set_match_empty);
1520
    define_bool!(9, is_literal, set_literal);
1521
    define_bool!(10, is_alternation_literal, set_alternation_literal);
1522
}
1523
1524
#[cfg(test)]
1525
mod tests {
1526
    use super::*;
1527
1528
    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1529
        let ranges: Vec<ClassUnicodeRange> = ranges
1530
            .iter()
1531
            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1532
            .collect();
1533
        ClassUnicode::new(ranges)
1534
    }
1535
1536
    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1537
        let ranges: Vec<ClassBytesRange> =
1538
            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1539
        ClassBytes::new(ranges)
1540
    }
1541
1542
    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1543
        cls.iter().map(|x| (x.start(), x.end())).collect()
1544
    }
1545
1546
    #[cfg(feature = "unicode-case")]
1547
    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1548
        let mut cls_ = cls.clone();
1549
        cls_.case_fold_simple();
1550
        cls_
1551
    }
1552
1553
    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1554
        let mut cls_ = cls1.clone();
1555
        cls_.union(cls2);
1556
        cls_
1557
    }
1558
1559
    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1560
        let mut cls_ = cls1.clone();
1561
        cls_.intersect(cls2);
1562
        cls_
1563
    }
1564
1565
    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1566
        let mut cls_ = cls1.clone();
1567
        cls_.difference(cls2);
1568
        cls_
1569
    }
1570
1571
    fn usymdifference(
1572
        cls1: &ClassUnicode,
1573
        cls2: &ClassUnicode,
1574
    ) -> ClassUnicode {
1575
        let mut cls_ = cls1.clone();
1576
        cls_.symmetric_difference(cls2);
1577
        cls_
1578
    }
1579
1580
    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1581
        let mut cls_ = cls.clone();
1582
        cls_.negate();
1583
        cls_
1584
    }
1585
1586
    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1587
        cls.iter().map(|x| (x.start(), x.end())).collect()
1588
    }
1589
1590
    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1591
        let mut cls_ = cls.clone();
1592
        cls_.case_fold_simple();
1593
        cls_
1594
    }
1595
1596
    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1597
        let mut cls_ = cls1.clone();
1598
        cls_.union(cls2);
1599
        cls_
1600
    }
1601
1602
    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1603
        let mut cls_ = cls1.clone();
1604
        cls_.intersect(cls2);
1605
        cls_
1606
    }
1607
1608
    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1609
        let mut cls_ = cls1.clone();
1610
        cls_.difference(cls2);
1611
        cls_
1612
    }
1613
1614
    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1615
        let mut cls_ = cls1.clone();
1616
        cls_.symmetric_difference(cls2);
1617
        cls_
1618
    }
1619
1620
    fn bnegate(cls: &ClassBytes) -> ClassBytes {
1621
        let mut cls_ = cls.clone();
1622
        cls_.negate();
1623
        cls_
1624
    }
1625
1626
    #[test]
1627
    fn class_range_canonical_unicode() {
1628
        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1629
        assert_eq!('\0', range.start());
1630
        assert_eq!('\u{00FF}', range.end());
1631
    }
1632
1633
    #[test]
1634
    fn class_range_canonical_bytes() {
1635
        let range = ClassBytesRange::new(b'\xFF', b'\0');
1636
        assert_eq!(b'\0', range.start());
1637
        assert_eq!(b'\xFF', range.end());
1638
    }
1639
1640
    #[test]
1641
    fn class_canonicalize_unicode() {
1642
        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1643
        let expected = vec![('a', 'c'), ('x', 'z')];
1644
        assert_eq!(expected, uranges(&cls));
1645
1646
        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1647
        let expected = vec![('a', 'c'), ('x', 'z')];
1648
        assert_eq!(expected, uranges(&cls));
1649
1650
        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1651
        let expected = vec![('w', 'z')];
1652
        assert_eq!(expected, uranges(&cls));
1653
1654
        let cls = uclass(&[
1655
            ('c', 'f'),
1656
            ('a', 'g'),
1657
            ('d', 'j'),
1658
            ('a', 'c'),
1659
            ('m', 'p'),
1660
            ('l', 's'),
1661
        ]);
1662
        let expected = vec![('a', 'j'), ('l', 's')];
1663
        assert_eq!(expected, uranges(&cls));
1664
1665
        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1666
        let expected = vec![('u', 'z')];
1667
        assert_eq!(expected, uranges(&cls));
1668
1669
        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1670
        let expected = vec![('\x00', '\u{10FFFF}')];
1671
        assert_eq!(expected, uranges(&cls));
1672
1673
        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1674
        let expected = vec![('a', 'b')];
1675
        assert_eq!(expected, uranges(&cls));
1676
    }
1677
1678
    #[test]
1679
    fn class_canonicalize_bytes() {
1680
        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1681
        let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682
        assert_eq!(expected, branges(&cls));
1683
1684
        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1685
        let expected = vec![(b'a', b'c'), (b'x', b'z')];
1686
        assert_eq!(expected, branges(&cls));
1687
1688
        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1689
        let expected = vec![(b'w', b'z')];
1690
        assert_eq!(expected, branges(&cls));
1691
1692
        let cls = bclass(&[
1693
            (b'c', b'f'),
1694
            (b'a', b'g'),
1695
            (b'd', b'j'),
1696
            (b'a', b'c'),
1697
            (b'm', b'p'),
1698
            (b'l', b's'),
1699
        ]);
1700
        let expected = vec![(b'a', b'j'), (b'l', b's')];
1701
        assert_eq!(expected, branges(&cls));
1702
1703
        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1704
        let expected = vec![(b'u', b'z')];
1705
        assert_eq!(expected, branges(&cls));
1706
1707
        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1708
        let expected = vec![(b'\x00', b'\xFF')];
1709
        assert_eq!(expected, branges(&cls));
1710
1711
        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1712
        let expected = vec![(b'a', b'b')];
1713
        assert_eq!(expected, branges(&cls));
1714
    }
1715
1716
    #[test]
1717
    #[cfg(feature = "unicode-case")]
1718
    fn class_case_fold_unicode() {
1719
        let cls = uclass(&[
1720
            ('C', 'F'),
1721
            ('A', 'G'),
1722
            ('D', 'J'),
1723
            ('A', 'C'),
1724
            ('M', 'P'),
1725
            ('L', 'S'),
1726
            ('c', 'f'),
1727
        ]);
1728
        let expected = uclass(&[
1729
            ('A', 'J'),
1730
            ('L', 'S'),
1731
            ('a', 'j'),
1732
            ('l', 's'),
1733
            ('\u{17F}', '\u{17F}'),
1734
        ]);
1735
        assert_eq!(expected, ucasefold(&cls));
1736
1737
        let cls = uclass(&[('A', 'Z')]);
1738
        let expected = uclass(&[
1739
            ('A', 'Z'),
1740
            ('a', 'z'),
1741
            ('\u{17F}', '\u{17F}'),
1742
            ('\u{212A}', '\u{212A}'),
1743
        ]);
1744
        assert_eq!(expected, ucasefold(&cls));
1745
1746
        let cls = uclass(&[('a', 'z')]);
1747
        let expected = uclass(&[
1748
            ('A', 'Z'),
1749
            ('a', 'z'),
1750
            ('\u{17F}', '\u{17F}'),
1751
            ('\u{212A}', '\u{212A}'),
1752
        ]);
1753
        assert_eq!(expected, ucasefold(&cls));
1754
1755
        let cls = uclass(&[('A', 'A'), ('_', '_')]);
1756
        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1757
        assert_eq!(expected, ucasefold(&cls));
1758
1759
        let cls = uclass(&[('A', 'A'), ('=', '=')]);
1760
        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1761
        assert_eq!(expected, ucasefold(&cls));
1762
1763
        let cls = uclass(&[('\x00', '\x10')]);
1764
        assert_eq!(cls, ucasefold(&cls));
1765
1766
        let cls = uclass(&[('k', 'k')]);
1767
        let expected =
1768
            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1769
        assert_eq!(expected, ucasefold(&cls));
1770
1771
        let cls = uclass(&[('@', '@')]);
1772
        assert_eq!(cls, ucasefold(&cls));
1773
    }
1774
1775
    #[test]
1776
    #[cfg(not(feature = "unicode-case"))]
1777
    fn class_case_fold_unicode_disabled() {
1778
        let mut cls = uclass(&[
1779
            ('C', 'F'),
1780
            ('A', 'G'),
1781
            ('D', 'J'),
1782
            ('A', 'C'),
1783
            ('M', 'P'),
1784
            ('L', 'S'),
1785
            ('c', 'f'),
1786
        ]);
1787
        assert!(cls.try_case_fold_simple().is_err());
1788
    }
1789
1790
    #[test]
1791
    #[should_panic]
1792
    #[cfg(not(feature = "unicode-case"))]
1793
    fn class_case_fold_unicode_disabled_panics() {
1794
        let mut cls = uclass(&[
1795
            ('C', 'F'),
1796
            ('A', 'G'),
1797
            ('D', 'J'),
1798
            ('A', 'C'),
1799
            ('M', 'P'),
1800
            ('L', 'S'),
1801
            ('c', 'f'),
1802
        ]);
1803
        cls.case_fold_simple();
1804
    }
1805
1806
    #[test]
1807
    fn class_case_fold_bytes() {
1808
        let cls = bclass(&[
1809
            (b'C', b'F'),
1810
            (b'A', b'G'),
1811
            (b'D', b'J'),
1812
            (b'A', b'C'),
1813
            (b'M', b'P'),
1814
            (b'L', b'S'),
1815
            (b'c', b'f'),
1816
        ]);
1817
        let expected =
1818
            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1819
        assert_eq!(expected, bcasefold(&cls));
1820
1821
        let cls = bclass(&[(b'A', b'Z')]);
1822
        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823
        assert_eq!(expected, bcasefold(&cls));
1824
1825
        let cls = bclass(&[(b'a', b'z')]);
1826
        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1827
        assert_eq!(expected, bcasefold(&cls));
1828
1829
        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1830
        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1831
        assert_eq!(expected, bcasefold(&cls));
1832
1833
        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1834
        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1835
        assert_eq!(expected, bcasefold(&cls));
1836
1837
        let cls = bclass(&[(b'\x00', b'\x10')]);
1838
        assert_eq!(cls, bcasefold(&cls));
1839
1840
        let cls = bclass(&[(b'k', b'k')]);
1841
        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1842
        assert_eq!(expected, bcasefold(&cls));
1843
1844
        let cls = bclass(&[(b'@', b'@')]);
1845
        assert_eq!(cls, bcasefold(&cls));
1846
    }
1847
1848
    #[test]
1849
    fn class_negate_unicode() {
1850
        let cls = uclass(&[('a', 'a')]);
1851
        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1852
        assert_eq!(expected, unegate(&cls));
1853
1854
        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1855
        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1856
        assert_eq!(expected, unegate(&cls));
1857
1858
        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1859
        let expected = uclass(&[
1860
            ('\x00', '\x60'),
1861
            ('\x64', '\x77'),
1862
            ('\x7B', '\u{10FFFF}'),
1863
        ]);
1864
        assert_eq!(expected, unegate(&cls));
1865
1866
        let cls = uclass(&[('\x00', 'a')]);
1867
        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1868
        assert_eq!(expected, unegate(&cls));
1869
1870
        let cls = uclass(&[('a', '\u{10FFFF}')]);
1871
        let expected = uclass(&[('\x00', '\x60')]);
1872
        assert_eq!(expected, unegate(&cls));
1873
1874
        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1875
        let expected = uclass(&[]);
1876
        assert_eq!(expected, unegate(&cls));
1877
1878
        let cls = uclass(&[]);
1879
        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1880
        assert_eq!(expected, unegate(&cls));
1881
1882
        let cls =
1883
            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
1884
        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1885
        assert_eq!(expected, unegate(&cls));
1886
1887
        let cls = uclass(&[('\x00', '\u{D7FF}')]);
1888
        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1889
        assert_eq!(expected, unegate(&cls));
1890
1891
        let cls = uclass(&[('\x00', '\u{D7FE}')]);
1892
        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1893
        assert_eq!(expected, unegate(&cls));
1894
1895
        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1896
        let expected = uclass(&[('\x00', '\u{D7FF}')]);
1897
        assert_eq!(expected, unegate(&cls));
1898
1899
        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1900
        let expected = uclass(&[('\x00', '\u{E000}')]);
1901
        assert_eq!(expected, unegate(&cls));
1902
    }
1903
1904
    #[test]
1905
    fn class_negate_bytes() {
1906
        let cls = bclass(&[(b'a', b'a')]);
1907
        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1908
        assert_eq!(expected, bnegate(&cls));
1909
1910
        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1911
        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1912
        assert_eq!(expected, bnegate(&cls));
1913
1914
        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1915
        let expected = bclass(&[
1916
            (b'\x00', b'\x60'),
1917
            (b'\x64', b'\x77'),
1918
            (b'\x7B', b'\xFF'),
1919
        ]);
1920
        assert_eq!(expected, bnegate(&cls));
1921
1922
        let cls = bclass(&[(b'\x00', b'a')]);
1923
        let expected = bclass(&[(b'\x62', b'\xFF')]);
1924
        assert_eq!(expected, bnegate(&cls));
1925
1926
        let cls = bclass(&[(b'a', b'\xFF')]);
1927
        let expected = bclass(&[(b'\x00', b'\x60')]);
1928
        assert_eq!(expected, bnegate(&cls));
1929
1930
        let cls = bclass(&[(b'\x00', b'\xFF')]);
1931
        let expected = bclass(&[]);
1932
        assert_eq!(expected, bnegate(&cls));
1933
1934
        let cls = bclass(&[]);
1935
        let expected = bclass(&[(b'\x00', b'\xFF')]);
1936
        assert_eq!(expected, bnegate(&cls));
1937
1938
        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1939
        let expected = bclass(&[(b'\xFE', b'\xFE')]);
1940
        assert_eq!(expected, bnegate(&cls));
1941
    }
1942
1943
    #[test]
1944
    fn class_union_unicode() {
1945
        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1946
        let cls2 = uclass(&[('a', 'z')]);
1947
        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1948
        assert_eq!(expected, uunion(&cls1, &cls2));
1949
    }
1950
1951
    #[test]
1952
    fn class_union_bytes() {
1953
        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1954
        let cls2 = bclass(&[(b'a', b'z')]);
1955
        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1956
        assert_eq!(expected, bunion(&cls1, &cls2));
1957
    }
1958
1959
    #[test]
1960
    fn class_intersect_unicode() {
1961
        let cls1 = uclass(&[]);
1962
        let cls2 = uclass(&[('a', 'a')]);
1963
        let expected = uclass(&[]);
1964
        assert_eq!(expected, uintersect(&cls1, &cls2));
1965
1966
        let cls1 = uclass(&[('a', 'a')]);
1967
        let cls2 = uclass(&[('a', 'a')]);
1968
        let expected = uclass(&[('a', 'a')]);
1969
        assert_eq!(expected, uintersect(&cls1, &cls2));
1970
1971
        let cls1 = uclass(&[('a', 'a')]);
1972
        let cls2 = uclass(&[('b', 'b')]);
1973
        let expected = uclass(&[]);
1974
        assert_eq!(expected, uintersect(&cls1, &cls2));
1975
1976
        let cls1 = uclass(&[('a', 'a')]);
1977
        let cls2 = uclass(&[('a', 'c')]);
1978
        let expected = uclass(&[('a', 'a')]);
1979
        assert_eq!(expected, uintersect(&cls1, &cls2));
1980
1981
        let cls1 = uclass(&[('a', 'b')]);
1982
        let cls2 = uclass(&[('a', 'c')]);
1983
        let expected = uclass(&[('a', 'b')]);
1984
        assert_eq!(expected, uintersect(&cls1, &cls2));
1985
1986
        let cls1 = uclass(&[('a', 'b')]);
1987
        let cls2 = uclass(&[('b', 'c')]);
1988
        let expected = uclass(&[('b', 'b')]);
1989
        assert_eq!(expected, uintersect(&cls1, &cls2));
1990
1991
        let cls1 = uclass(&[('a', 'b')]);
1992
        let cls2 = uclass(&[('c', 'd')]);
1993
        let expected = uclass(&[]);
1994
        assert_eq!(expected, uintersect(&cls1, &cls2));
1995
1996
        let cls1 = uclass(&[('b', 'c')]);
1997
        let cls2 = uclass(&[('a', 'd')]);
1998
        let expected = uclass(&[('b', 'c')]);
1999
        assert_eq!(expected, uintersect(&cls1, &cls2));
2000
2001
        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2002
        let cls2 = uclass(&[('a', 'h')]);
2003
        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2004
        assert_eq!(expected, uintersect(&cls1, &cls2));
2005
2006
        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2007
        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2008
        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2009
        assert_eq!(expected, uintersect(&cls1, &cls2));
2010
2011
        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
2012
        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
2013
        let expected = uclass(&[]);
2014
        assert_eq!(expected, uintersect(&cls1, &cls2));
2015
2016
        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2017
        let cls2 = uclass(&[('h', 'h')]);
2018
        let expected = uclass(&[('h', 'h')]);
2019
        assert_eq!(expected, uintersect(&cls1, &cls2));
2020
2021
        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2022
        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2023
        let expected = uclass(&[]);
2024
        assert_eq!(expected, uintersect(&cls1, &cls2));
2025
2026
        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2027
        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2028
        let expected = uclass(&[('b', 'f')]);
2029
        assert_eq!(expected, uintersect(&cls1, &cls2));
2030
    }
2031
2032
    #[test]
2033
    fn class_intersect_bytes() {
2034
        let cls1 = bclass(&[]);
2035
        let cls2 = bclass(&[(b'a', b'a')]);
2036
        let expected = bclass(&[]);
2037
        assert_eq!(expected, bintersect(&cls1, &cls2));
2038
2039
        let cls1 = bclass(&[(b'a', b'a')]);
2040
        let cls2 = bclass(&[(b'a', b'a')]);
2041
        let expected = bclass(&[(b'a', b'a')]);
2042
        assert_eq!(expected, bintersect(&cls1, &cls2));
2043
2044
        let cls1 = bclass(&[(b'a', b'a')]);
2045
        let cls2 = bclass(&[(b'b', b'b')]);
2046
        let expected = bclass(&[]);
2047
        assert_eq!(expected, bintersect(&cls1, &cls2));
2048
2049
        let cls1 = bclass(&[(b'a', b'a')]);
2050
        let cls2 = bclass(&[(b'a', b'c')]);
2051
        let expected = bclass(&[(b'a', b'a')]);
2052
        assert_eq!(expected, bintersect(&cls1, &cls2));
2053
2054
        let cls1 = bclass(&[(b'a', b'b')]);
2055
        let cls2 = bclass(&[(b'a', b'c')]);
2056
        let expected = bclass(&[(b'a', b'b')]);
2057
        assert_eq!(expected, bintersect(&cls1, &cls2));
2058
2059
        let cls1 = bclass(&[(b'a', b'b')]);
2060
        let cls2 = bclass(&[(b'b', b'c')]);
2061
        let expected = bclass(&[(b'b', b'b')]);
2062
        assert_eq!(expected, bintersect(&cls1, &cls2));
2063
2064
        let cls1 = bclass(&[(b'a', b'b')]);
2065
        let cls2 = bclass(&[(b'c', b'd')]);
2066
        let expected = bclass(&[]);
2067
        assert_eq!(expected, bintersect(&cls1, &cls2));
2068
2069
        let cls1 = bclass(&[(b'b', b'c')]);
2070
        let cls2 = bclass(&[(b'a', b'd')]);
2071
        let expected = bclass(&[(b'b', b'c')]);
2072
        assert_eq!(expected, bintersect(&cls1, &cls2));
2073
2074
        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2075
        let cls2 = bclass(&[(b'a', b'h')]);
2076
        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2077
        assert_eq!(expected, bintersect(&cls1, &cls2));
2078
2079
        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2080
        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2081
        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2082
        assert_eq!(expected, bintersect(&cls1, &cls2));
2083
2084
        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2085
        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2086
        let expected = bclass(&[]);
2087
        assert_eq!(expected, bintersect(&cls1, &cls2));
2088
2089
        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2090
        let cls2 = bclass(&[(b'h', b'h')]);
2091
        let expected = bclass(&[(b'h', b'h')]);
2092
        assert_eq!(expected, bintersect(&cls1, &cls2));
2093
2094
        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2095
        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2096
        let expected = bclass(&[]);
2097
        assert_eq!(expected, bintersect(&cls1, &cls2));
2098
2099
        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2100
        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2101
        let expected = bclass(&[(b'b', b'f')]);
2102
        assert_eq!(expected, bintersect(&cls1, &cls2));
2103
    }
2104
2105
    #[test]
2106
    fn class_difference_unicode() {
2107
        let cls1 = uclass(&[('a', 'a')]);
2108
        let cls2 = uclass(&[('a', 'a')]);
2109
        let expected = uclass(&[]);
2110
        assert_eq!(expected, udifference(&cls1, &cls2));
2111
2112
        let cls1 = uclass(&[('a', 'a')]);
2113
        let cls2 = uclass(&[]);
2114
        let expected = uclass(&[('a', 'a')]);
2115
        assert_eq!(expected, udifference(&cls1, &cls2));
2116
2117
        let cls1 = uclass(&[]);
2118
        let cls2 = uclass(&[('a', 'a')]);
2119
        let expected = uclass(&[]);
2120
        assert_eq!(expected, udifference(&cls1, &cls2));
2121
2122
        let cls1 = uclass(&[('a', 'z')]);
2123
        let cls2 = uclass(&[('a', 'a')]);
2124
        let expected = uclass(&[('b', 'z')]);
2125
        assert_eq!(expected, udifference(&cls1, &cls2));
2126
2127
        let cls1 = uclass(&[('a', 'z')]);
2128
        let cls2 = uclass(&[('z', 'z')]);
2129
        let expected = uclass(&[('a', 'y')]);
2130
        assert_eq!(expected, udifference(&cls1, &cls2));
2131
2132
        let cls1 = uclass(&[('a', 'z')]);
2133
        let cls2 = uclass(&[('m', 'm')]);
2134
        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2135
        assert_eq!(expected, udifference(&cls1, &cls2));
2136
2137
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2138
        let cls2 = uclass(&[('a', 'z')]);
2139
        let expected = uclass(&[]);
2140
        assert_eq!(expected, udifference(&cls1, &cls2));
2141
2142
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2143
        let cls2 = uclass(&[('d', 'v')]);
2144
        let expected = uclass(&[('a', 'c')]);
2145
        assert_eq!(expected, udifference(&cls1, &cls2));
2146
2147
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2148
        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2149
        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2150
        assert_eq!(expected, udifference(&cls1, &cls2));
2151
2152
        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2153
        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2154
        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2155
        assert_eq!(expected, udifference(&cls1, &cls2));
2156
2157
        let cls1 = uclass(&[('x', 'z')]);
2158
        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2159
        let expected = uclass(&[('x', 'z')]);
2160
        assert_eq!(expected, udifference(&cls1, &cls2));
2161
2162
        let cls1 = uclass(&[('a', 'z')]);
2163
        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2164
        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2165
        assert_eq!(expected, udifference(&cls1, &cls2));
2166
    }
2167
2168
    #[test]
2169
    fn class_difference_bytes() {
2170
        let cls1 = bclass(&[(b'a', b'a')]);
2171
        let cls2 = bclass(&[(b'a', b'a')]);
2172
        let expected = bclass(&[]);
2173
        assert_eq!(expected, bdifference(&cls1, &cls2));
2174
2175
        let cls1 = bclass(&[(b'a', b'a')]);
2176
        let cls2 = bclass(&[]);
2177
        let expected = bclass(&[(b'a', b'a')]);
2178
        assert_eq!(expected, bdifference(&cls1, &cls2));
2179
2180
        let cls1 = bclass(&[]);
2181
        let cls2 = bclass(&[(b'a', b'a')]);
2182
        let expected = bclass(&[]);
2183
        assert_eq!(expected, bdifference(&cls1, &cls2));
2184
2185
        let cls1 = bclass(&[(b'a', b'z')]);
2186
        let cls2 = bclass(&[(b'a', b'a')]);
2187
        let expected = bclass(&[(b'b', b'z')]);
2188
        assert_eq!(expected, bdifference(&cls1, &cls2));
2189
2190
        let cls1 = bclass(&[(b'a', b'z')]);
2191
        let cls2 = bclass(&[(b'z', b'z')]);
2192
        let expected = bclass(&[(b'a', b'y')]);
2193
        assert_eq!(expected, bdifference(&cls1, &cls2));
2194
2195
        let cls1 = bclass(&[(b'a', b'z')]);
2196
        let cls2 = bclass(&[(b'm', b'm')]);
2197
        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2198
        assert_eq!(expected, bdifference(&cls1, &cls2));
2199
2200
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2201
        let cls2 = bclass(&[(b'a', b'z')]);
2202
        let expected = bclass(&[]);
2203
        assert_eq!(expected, bdifference(&cls1, &cls2));
2204
2205
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2206
        let cls2 = bclass(&[(b'd', b'v')]);
2207
        let expected = bclass(&[(b'a', b'c')]);
2208
        assert_eq!(expected, bdifference(&cls1, &cls2));
2209
2210
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2211
        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2212
        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2213
        assert_eq!(expected, bdifference(&cls1, &cls2));
2214
2215
        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2216
        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2217
        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2218
        assert_eq!(expected, bdifference(&cls1, &cls2));
2219
2220
        let cls1 = bclass(&[(b'x', b'z')]);
2221
        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2222
        let expected = bclass(&[(b'x', b'z')]);
2223
        assert_eq!(expected, bdifference(&cls1, &cls2));
2224
2225
        let cls1 = bclass(&[(b'a', b'z')]);
2226
        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2227
        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2228
        assert_eq!(expected, bdifference(&cls1, &cls2));
2229
    }
2230
2231
    #[test]
2232
    fn class_symmetric_difference_unicode() {
2233
        let cls1 = uclass(&[('a', 'm')]);
2234
        let cls2 = uclass(&[('g', 't')]);
2235
        let expected = uclass(&[('a', 'f'), ('n', 't')]);
2236
        assert_eq!(expected, usymdifference(&cls1, &cls2));
2237
    }
2238
2239
    #[test]
2240
    fn class_symmetric_difference_bytes() {
2241
        let cls1 = bclass(&[(b'a', b'm')]);
2242
        let cls2 = bclass(&[(b'g', b't')]);
2243
        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2244
        assert_eq!(expected, bsymdifference(&cls1, &cls2));
2245
    }
2246
2247
    #[test]
2248
    #[should_panic]
2249
    fn hir_byte_literal_non_ascii() {
2250
        Hir::literal(Literal::Byte(b'a'));
2251
    }
2252
2253
    // We use a thread with an explicit stack size to test that our destructor
2254
    // for Hir can handle arbitrarily sized expressions in constant stack
2255
    // space. In case we run on a platform without threads (WASM?), we limit
2256
    // this test to Windows/Unix.
2257
    #[test]
2258
    #[cfg(any(unix, windows))]
2259
    fn no_stack_overflow_on_drop() {
2260
        use std::thread;
2261
2262
        let run = || {
2263
            let mut expr = Hir::empty();
2264
            for _ in 0..100 {
2265
                expr = Hir::group(Group {
2266
                    kind: GroupKind::NonCapturing,
2267
                    hir: Box::new(expr),
2268
                });
2269
                expr = Hir::repetition(Repetition {
2270
                    kind: RepetitionKind::ZeroOrOne,
2271
                    greedy: true,
2272
                    hir: Box::new(expr),
2273
                });
2274
2275
                expr = Hir {
2276
                    kind: HirKind::Concat(vec![expr]),
2277
                    info: HirInfo::new(),
2278
                };
2279
                expr = Hir {
2280
                    kind: HirKind::Alternation(vec![expr]),
2281
                    info: HirInfo::new(),
2282
                };
2283
            }
2284
            assert!(!expr.kind.is_empty());
2285
        };
2286
2287
        // We run our test on a thread with a small stack size so we can
2288
        // force the issue more easily.
2289
        //
2290
        // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
2291
        // for context on the specific stack size chosen here.
2292
        thread::Builder::new()
2293
            .stack_size(16 << 10)
2294
            .spawn(run)
2295
            .unwrap()
2296
            .join()
2297
            .unwrap();
2298
    }
2299
}