Coverage Report

Created: 2025-07-11 07:02

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/hir/translate.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Defines a translator that converts an `Ast` to an `Hir`.
3
*/
4
5
use core::cell::{Cell, RefCell};
6
7
use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
8
9
use crate::{
10
    ast::{self, Ast, Span, Visitor},
11
    either::Either,
12
    hir::{self, Error, ErrorKind, Hir, HirKind},
13
    unicode::{self, ClassQuery},
14
};
15
16
type Result<T> = core::result::Result<T, Error>;
17
18
/// A builder for constructing an AST->HIR translator.
19
#[derive(Clone, Debug)]
20
pub struct TranslatorBuilder {
21
    utf8: bool,
22
    line_terminator: u8,
23
    flags: Flags,
24
}
25
26
impl Default for TranslatorBuilder {
27
0
    fn default() -> TranslatorBuilder {
28
0
        TranslatorBuilder::new()
29
0
    }
30
}
31
32
impl TranslatorBuilder {
33
    /// Create a new translator builder with a default configuration.
34
0
    pub fn new() -> TranslatorBuilder {
35
0
        TranslatorBuilder {
36
0
            utf8: true,
37
0
            line_terminator: b'\n',
38
0
            flags: Flags::default(),
39
0
        }
40
0
    }
41
42
    /// Build a translator using the current configuration.
43
0
    pub fn build(&self) -> Translator {
44
0
        Translator {
45
0
            stack: RefCell::new(vec![]),
46
0
            flags: Cell::new(self.flags),
47
0
            utf8: self.utf8,
48
0
            line_terminator: self.line_terminator,
49
0
        }
50
0
    }
51
52
    /// When disabled, translation will permit the construction of a regular
53
    /// expression that may match invalid UTF-8.
54
    ///
55
    /// When enabled (the default), the translator is guaranteed to produce an
56
    /// expression that, for non-empty matches, will only ever produce spans
57
    /// that are entirely valid UTF-8 (otherwise, the translator will return an
58
    /// error).
59
    ///
60
    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
61
    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
62
    /// syntax) will be allowed even though they can produce matches that split
63
    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
64
    /// matches, and it is expected that the regex engine itself must handle
65
    /// these cases if necessary (perhaps by suppressing any zero-width matches
66
    /// that split a codepoint).
67
0
    pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
68
0
        self.utf8 = yes;
69
0
        self
70
0
    }
71
72
    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
73
    ///
74
    /// Namely, instead of `.` (by default) matching everything except for `\n`,
75
    /// this will cause `.` to match everything except for the byte given.
76
    ///
77
    /// If `.` is used in a context where Unicode mode is enabled and this byte
78
    /// isn't ASCII, then an error will be returned. When Unicode mode is
79
    /// disabled, then any byte is permitted, but will return an error if UTF-8
80
    /// mode is enabled and it is a non-ASCII byte.
81
    ///
82
    /// In short, any ASCII value for a line terminator is always okay. But a
83
    /// non-ASCII byte might result in an error depending on whether Unicode
84
    /// mode or UTF-8 mode are enabled.
85
    ///
86
    /// Note that if `R` mode is enabled then it always takes precedence and
87
    /// the line terminator will be treated as `\r` and `\n` simultaneously.
88
    ///
89
    /// Note also that this *doesn't* impact the look-around assertions
90
    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
91
    /// configuration in the regex engine itself.
92
0
    pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
93
0
        self.line_terminator = byte;
94
0
        self
95
0
    }
96
97
    /// Enable or disable the case insensitive flag (`i`) by default.
98
0
    pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
99
0
        self.flags.case_insensitive = if yes { Some(true) } else { None };
100
0
        self
101
0
    }
102
103
    /// Enable or disable the multi-line matching flag (`m`) by default.
104
0
    pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
105
0
        self.flags.multi_line = if yes { Some(true) } else { None };
106
0
        self
107
0
    }
108
109
    /// Enable or disable the "dot matches any character" flag (`s`) by
110
    /// default.
111
0
    pub fn dot_matches_new_line(
112
0
        &mut self,
113
0
        yes: bool,
114
0
    ) -> &mut TranslatorBuilder {
115
0
        self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
116
0
        self
117
0
    }
118
119
    /// Enable or disable the CRLF mode flag (`R`) by default.
120
0
    pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
121
0
        self.flags.crlf = if yes { Some(true) } else { None };
122
0
        self
123
0
    }
124
125
    /// Enable or disable the "swap greed" flag (`U`) by default.
126
0
    pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
127
0
        self.flags.swap_greed = if yes { Some(true) } else { None };
128
0
        self
129
0
    }
130
131
    /// Enable or disable the Unicode flag (`u`) by default.
132
0
    pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
133
0
        self.flags.unicode = if yes { None } else { Some(false) };
134
0
        self
135
0
    }
136
}
137
138
/// A translator maps abstract syntax to a high level intermediate
139
/// representation.
140
///
141
/// A translator may benefit from reuse. That is, a translator can translate
142
/// many abstract syntax trees.
143
///
144
/// A `Translator` can be configured in more detail via a
145
/// [`TranslatorBuilder`].
146
#[derive(Clone, Debug)]
147
pub struct Translator {
148
    /// Our call stack, but on the heap.
149
    stack: RefCell<Vec<HirFrame>>,
150
    /// The current flag settings.
151
    flags: Cell<Flags>,
152
    /// Whether translation must reject HIR that can match invalid UTF-8.
153
    utf8: bool,
154
    /// The line terminator to use for `.`.
155
    line_terminator: u8,
156
}
157
158
impl Translator {
159
    /// Create a new translator using the default configuration.
160
0
    pub fn new() -> Translator {
161
0
        TranslatorBuilder::new().build()
162
0
    }
163
164
    /// Translate the given abstract syntax tree (AST) into a high level
165
    /// intermediate representation (HIR).
166
    ///
167
    /// If there was a problem doing the translation, then an HIR-specific
168
    /// error is returned.
169
    ///
170
    /// The original pattern string used to produce the `Ast` *must* also be
171
    /// provided. The translator does not use the pattern string during any
172
    /// correct translation, but is used for error reporting.
173
0
    pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
174
0
        ast::visit(ast, TranslatorI::new(self, pattern))
175
0
    }
176
}
177
178
/// An HirFrame is a single stack frame, represented explicitly, which is
179
/// created for each item in the Ast that we traverse.
180
///
181
/// Note that technically, this type doesn't represent our entire stack
182
/// frame. In particular, the Ast visitor represents any state associated with
183
/// traversing the Ast itself.
184
#[derive(Clone, Debug)]
185
enum HirFrame {
186
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
187
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
188
    /// step is complete.
189
    Expr(Hir),
190
    /// A literal that is being constructed, character by character, from the
191
    /// AST. We need this because the AST gives each individual character its
192
    /// own node. So as we see characters, we peek at the top-most HirFrame.
193
    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
194
    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
195
    Literal(Vec<u8>),
196
    /// A Unicode character class. This frame is mutated as we descend into
197
    /// the Ast of a character class (which is itself its own mini recursive
198
    /// structure).
199
    ClassUnicode(hir::ClassUnicode),
200
    /// A byte-oriented character class. This frame is mutated as we descend
201
    /// into the Ast of a character class (which is itself its own mini
202
    /// recursive structure).
203
    ///
204
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
205
    /// If `utf8` is enabled (the default), then a byte character is only
206
    /// permitted to match ASCII text.
207
    ClassBytes(hir::ClassBytes),
208
    /// This is pushed whenever a repetition is observed. After visiting every
209
    /// sub-expression in the repetition, the translator's stack is expected to
210
    /// have this sentinel at the top.
211
    ///
212
    /// This sentinel only exists to stop other things (like flattening
213
    /// literals) from reaching across repetition operators.
214
    Repetition,
215
    /// This is pushed on to the stack upon first seeing any kind of capture,
216
    /// indicated by parentheses (including non-capturing groups). It is popped
217
    /// upon leaving a group.
218
    Group {
219
        /// The old active flags when this group was opened.
220
        ///
221
        /// If this group sets flags, then the new active flags are set to the
222
        /// result of merging the old flags with the flags introduced by this
223
        /// group. If the group doesn't set any flags, then this is simply
224
        /// equivalent to whatever flags were set when the group was opened.
225
        ///
226
        /// When this group is popped, the active flags should be restored to
227
        /// the flags set here.
228
        ///
229
        /// The "active" flags correspond to whatever flags are set in the
230
        /// Translator.
231
        old_flags: Flags,
232
    },
233
    /// This is pushed whenever a concatenation is observed. After visiting
234
    /// every sub-expression in the concatenation, the translator's stack is
235
    /// popped until it sees a Concat frame.
236
    Concat,
237
    /// This is pushed whenever an alternation is observed. After visiting
238
    /// every sub-expression in the alternation, the translator's stack is
239
    /// popped until it sees an Alternation frame.
240
    Alternation,
241
    /// This is pushed immediately before each sub-expression in an
242
    /// alternation. This separates the branches of an alternation on the
243
    /// stack and prevents literal flattening from reaching across alternation
244
    /// branches.
245
    ///
246
    /// It is popped after each expression in a branch until an 'Alternation'
247
    /// frame is observed when doing a post visit on an alternation.
248
    AlternationBranch,
249
}
250
251
impl HirFrame {
252
    /// Assert that the current stack frame is an Hir expression and return it.
253
0
    fn unwrap_expr(self) -> Hir {
254
0
        match self {
255
0
            HirFrame::Expr(expr) => expr,
256
0
            HirFrame::Literal(lit) => Hir::literal(lit),
257
0
            _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
258
        }
259
0
    }
260
261
    /// Assert that the current stack frame is a Unicode class expression and
262
    /// return it.
263
0
    fn unwrap_class_unicode(self) -> hir::ClassUnicode {
264
0
        match self {
265
0
            HirFrame::ClassUnicode(cls) => cls,
266
0
            _ => panic!(
267
0
                "tried to unwrap Unicode class \
268
0
                 from HirFrame, got: {:?}",
269
0
                self
270
0
            ),
271
        }
272
0
    }
273
274
    /// Assert that the current stack frame is a byte class expression and
275
    /// return it.
276
0
    fn unwrap_class_bytes(self) -> hir::ClassBytes {
277
0
        match self {
278
0
            HirFrame::ClassBytes(cls) => cls,
279
0
            _ => panic!(
280
0
                "tried to unwrap byte class \
281
0
                 from HirFrame, got: {:?}",
282
0
                self
283
0
            ),
284
        }
285
0
    }
286
287
    /// Assert that the current stack frame is a repetition sentinel. If it
288
    /// isn't, then panic.
289
0
    fn unwrap_repetition(self) {
290
0
        match self {
291
0
            HirFrame::Repetition => {}
292
            _ => {
293
0
                panic!(
294
0
                    "tried to unwrap repetition from HirFrame, got: {:?}",
295
0
                    self
296
0
                )
297
            }
298
        }
299
0
    }
300
301
    /// Assert that the current stack frame is a group indicator and return
302
    /// its corresponding flags (the flags that were active at the time the
303
    /// group was entered).
304
0
    fn unwrap_group(self) -> Flags {
305
0
        match self {
306
0
            HirFrame::Group { old_flags } => old_flags,
307
            _ => {
308
0
                panic!("tried to unwrap group from HirFrame, got: {:?}", self)
309
            }
310
        }
311
0
    }
312
313
    /// Assert that the current stack frame is an alternation pipe sentinel. If
314
    /// it isn't, then panic.
315
0
    fn unwrap_alternation_pipe(self) {
316
0
        match self {
317
0
            HirFrame::AlternationBranch => {}
318
            _ => {
319
0
                panic!(
320
0
                    "tried to unwrap alt pipe from HirFrame, got: {:?}",
321
0
                    self
322
0
                )
323
            }
324
        }
325
0
    }
326
}
327
328
impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
329
    type Output = Hir;
330
    type Err = Error;
331
332
0
    fn finish(self) -> Result<Hir> {
333
0
        // ... otherwise, we should have exactly one HIR on the stack.
334
0
        assert_eq!(self.trans().stack.borrow().len(), 1);
335
0
        Ok(self.pop().unwrap().unwrap_expr())
336
0
    }
337
338
0
    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
339
0
        match *ast {
340
            Ast::ClassBracketed(_) => {
341
0
                if self.flags().unicode() {
342
0
                    let cls = hir::ClassUnicode::empty();
343
0
                    self.push(HirFrame::ClassUnicode(cls));
344
0
                } else {
345
0
                    let cls = hir::ClassBytes::empty();
346
0
                    self.push(HirFrame::ClassBytes(cls));
347
0
                }
348
            }
349
0
            Ast::Repetition(_) => self.push(HirFrame::Repetition),
350
0
            Ast::Group(ref x) => {
351
0
                let old_flags = x
352
0
                    .flags()
353
0
                    .map(|ast| self.set_flags(ast))
354
0
                    .unwrap_or_else(|| self.flags());
355
0
                self.push(HirFrame::Group { old_flags });
356
0
            }
357
0
            Ast::Concat(_) => {
358
0
                self.push(HirFrame::Concat);
359
0
            }
360
0
            Ast::Alternation(ref x) => {
361
0
                self.push(HirFrame::Alternation);
362
0
                if !x.asts.is_empty() {
363
0
                    self.push(HirFrame::AlternationBranch);
364
0
                }
365
            }
366
0
            _ => {}
367
        }
368
0
        Ok(())
369
0
    }
370
371
0
    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
372
0
        match *ast {
373
0
            Ast::Empty(_) => {
374
0
                self.push(HirFrame::Expr(Hir::empty()));
375
0
            }
376
0
            Ast::Flags(ref x) => {
377
0
                self.set_flags(&x.flags);
378
0
                // Flags in the AST are generally considered directives and
379
0
                // not actual sub-expressions. However, they can be used in
380
0
                // the concrete syntax like `((?i))`, and we need some kind of
381
0
                // indication of an expression there, and Empty is the correct
382
0
                // choice.
383
0
                //
384
0
                // There can also be things like `(?i)+`, but we rule those out
385
0
                // in the parser. In the future, we might allow them for
386
0
                // consistency's sake.
387
0
                self.push(HirFrame::Expr(Hir::empty()));
388
0
            }
389
0
            Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
390
0
                Either::Right(byte) => self.push_byte(byte),
391
0
                Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
392
0
                    None => self.push_char(ch),
393
0
                    Some(expr) => self.push(HirFrame::Expr(expr)),
394
                },
395
            },
396
0
            Ast::Dot(ref span) => {
397
0
                self.push(HirFrame::Expr(self.hir_dot(**span)?));
398
            }
399
0
            Ast::Assertion(ref x) => {
400
0
                self.push(HirFrame::Expr(self.hir_assertion(x)?));
401
            }
402
0
            Ast::ClassPerl(ref x) => {
403
0
                if self.flags().unicode() {
404
0
                    let cls = self.hir_perl_unicode_class(x)?;
405
0
                    let hcls = hir::Class::Unicode(cls);
406
0
                    self.push(HirFrame::Expr(Hir::class(hcls)));
407
                } else {
408
0
                    let cls = self.hir_perl_byte_class(x)?;
409
0
                    let hcls = hir::Class::Bytes(cls);
410
0
                    self.push(HirFrame::Expr(Hir::class(hcls)));
411
                }
412
            }
413
0
            Ast::ClassUnicode(ref x) => {
414
0
                let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
415
0
                self.push(HirFrame::Expr(Hir::class(cls)));
416
            }
417
0
            Ast::ClassBracketed(ref ast) => {
418
0
                if self.flags().unicode() {
419
0
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
420
0
                    self.unicode_fold_and_negate(
421
0
                        &ast.span,
422
0
                        ast.negated,
423
0
                        &mut cls,
424
0
                    )?;
425
0
                    let expr = Hir::class(hir::Class::Unicode(cls));
426
0
                    self.push(HirFrame::Expr(expr));
427
                } else {
428
0
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
429
0
                    self.bytes_fold_and_negate(
430
0
                        &ast.span,
431
0
                        ast.negated,
432
0
                        &mut cls,
433
0
                    )?;
434
0
                    let expr = Hir::class(hir::Class::Bytes(cls));
435
0
                    self.push(HirFrame::Expr(expr));
436
                }
437
            }
438
0
            Ast::Repetition(ref x) => {
439
0
                let expr = self.pop().unwrap().unwrap_expr();
440
0
                self.pop().unwrap().unwrap_repetition();
441
0
                self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
442
0
            }
443
0
            Ast::Group(ref x) => {
444
0
                let expr = self.pop().unwrap().unwrap_expr();
445
0
                let old_flags = self.pop().unwrap().unwrap_group();
446
0
                self.trans().flags.set(old_flags);
447
0
                self.push(HirFrame::Expr(self.hir_capture(x, expr)));
448
0
            }
449
            Ast::Concat(_) => {
450
0
                let mut exprs = vec![];
451
0
                while let Some(expr) = self.pop_concat_expr() {
452
0
                    if !matches!(*expr.kind(), HirKind::Empty) {
453
0
                        exprs.push(expr);
454
0
                    }
455
                }
456
0
                exprs.reverse();
457
0
                self.push(HirFrame::Expr(Hir::concat(exprs)));
458
            }
459
            Ast::Alternation(_) => {
460
0
                let mut exprs = vec![];
461
0
                while let Some(expr) = self.pop_alt_expr() {
462
0
                    self.pop().unwrap().unwrap_alternation_pipe();
463
0
                    exprs.push(expr);
464
0
                }
465
0
                exprs.reverse();
466
0
                self.push(HirFrame::Expr(Hir::alternation(exprs)));
467
            }
468
        }
469
0
        Ok(())
470
0
    }
471
472
0
    fn visit_alternation_in(&mut self) -> Result<()> {
473
0
        self.push(HirFrame::AlternationBranch);
474
0
        Ok(())
475
0
    }
476
477
0
    fn visit_class_set_item_pre(
478
0
        &mut self,
479
0
        ast: &ast::ClassSetItem,
480
0
    ) -> Result<()> {
481
0
        match *ast {
482
            ast::ClassSetItem::Bracketed(_) => {
483
0
                if self.flags().unicode() {
484
0
                    let cls = hir::ClassUnicode::empty();
485
0
                    self.push(HirFrame::ClassUnicode(cls));
486
0
                } else {
487
0
                    let cls = hir::ClassBytes::empty();
488
0
                    self.push(HirFrame::ClassBytes(cls));
489
0
                }
490
            }
491
            // We needn't handle the Union case here since the visitor will
492
            // do it for us.
493
0
            _ => {}
494
        }
495
0
        Ok(())
496
0
    }
497
498
0
    fn visit_class_set_item_post(
499
0
        &mut self,
500
0
        ast: &ast::ClassSetItem,
501
0
    ) -> Result<()> {
502
0
        match *ast {
503
0
            ast::ClassSetItem::Empty(_) => {}
504
0
            ast::ClassSetItem::Literal(ref x) => {
505
0
                if self.flags().unicode() {
506
0
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
507
0
                    cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
508
0
                    self.push(HirFrame::ClassUnicode(cls));
509
0
                } else {
510
0
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
511
0
                    let byte = self.class_literal_byte(x)?;
512
0
                    cls.push(hir::ClassBytesRange::new(byte, byte));
513
0
                    self.push(HirFrame::ClassBytes(cls));
514
                }
515
            }
516
0
            ast::ClassSetItem::Range(ref x) => {
517
0
                if self.flags().unicode() {
518
0
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
519
0
                    cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
520
0
                    self.push(HirFrame::ClassUnicode(cls));
521
0
                } else {
522
0
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
523
0
                    let start = self.class_literal_byte(&x.start)?;
524
0
                    let end = self.class_literal_byte(&x.end)?;
525
0
                    cls.push(hir::ClassBytesRange::new(start, end));
526
0
                    self.push(HirFrame::ClassBytes(cls));
527
                }
528
            }
529
0
            ast::ClassSetItem::Ascii(ref x) => {
530
0
                if self.flags().unicode() {
531
0
                    let xcls = self.hir_ascii_unicode_class(x)?;
532
0
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
533
0
                    cls.union(&xcls);
534
0
                    self.push(HirFrame::ClassUnicode(cls));
535
                } else {
536
0
                    let xcls = self.hir_ascii_byte_class(x)?;
537
0
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
538
0
                    cls.union(&xcls);
539
0
                    self.push(HirFrame::ClassBytes(cls));
540
                }
541
            }
542
0
            ast::ClassSetItem::Unicode(ref x) => {
543
0
                let xcls = self.hir_unicode_class(x)?;
544
0
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
545
0
                cls.union(&xcls);
546
0
                self.push(HirFrame::ClassUnicode(cls));
547
            }
548
0
            ast::ClassSetItem::Perl(ref x) => {
549
0
                if self.flags().unicode() {
550
0
                    let xcls = self.hir_perl_unicode_class(x)?;
551
0
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
552
0
                    cls.union(&xcls);
553
0
                    self.push(HirFrame::ClassUnicode(cls));
554
                } else {
555
0
                    let xcls = self.hir_perl_byte_class(x)?;
556
0
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
557
0
                    cls.union(&xcls);
558
0
                    self.push(HirFrame::ClassBytes(cls));
559
                }
560
            }
561
0
            ast::ClassSetItem::Bracketed(ref ast) => {
562
0
                if self.flags().unicode() {
563
0
                    let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
564
0
                    self.unicode_fold_and_negate(
565
0
                        &ast.span,
566
0
                        ast.negated,
567
0
                        &mut cls1,
568
0
                    )?;
569
570
0
                    let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
571
0
                    cls2.union(&cls1);
572
0
                    self.push(HirFrame::ClassUnicode(cls2));
573
                } else {
574
0
                    let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
575
0
                    self.bytes_fold_and_negate(
576
0
                        &ast.span,
577
0
                        ast.negated,
578
0
                        &mut cls1,
579
0
                    )?;
580
581
0
                    let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
582
0
                    cls2.union(&cls1);
583
0
                    self.push(HirFrame::ClassBytes(cls2));
584
                }
585
            }
586
            // This is handled automatically by the visitor.
587
0
            ast::ClassSetItem::Union(_) => {}
588
        }
589
0
        Ok(())
590
0
    }
591
592
0
    fn visit_class_set_binary_op_pre(
593
0
        &mut self,
594
0
        _op: &ast::ClassSetBinaryOp,
595
0
    ) -> Result<()> {
596
0
        if self.flags().unicode() {
597
0
            let cls = hir::ClassUnicode::empty();
598
0
            self.push(HirFrame::ClassUnicode(cls));
599
0
        } else {
600
0
            let cls = hir::ClassBytes::empty();
601
0
            self.push(HirFrame::ClassBytes(cls));
602
0
        }
603
0
        Ok(())
604
0
    }
605
606
0
    fn visit_class_set_binary_op_in(
607
0
        &mut self,
608
0
        _op: &ast::ClassSetBinaryOp,
609
0
    ) -> Result<()> {
610
0
        if self.flags().unicode() {
611
0
            let cls = hir::ClassUnicode::empty();
612
0
            self.push(HirFrame::ClassUnicode(cls));
613
0
        } else {
614
0
            let cls = hir::ClassBytes::empty();
615
0
            self.push(HirFrame::ClassBytes(cls));
616
0
        }
617
0
        Ok(())
618
0
    }
619
620
0
    fn visit_class_set_binary_op_post(
621
0
        &mut self,
622
0
        op: &ast::ClassSetBinaryOp,
623
0
    ) -> Result<()> {
624
        use crate::ast::ClassSetBinaryOpKind::*;
625
626
0
        if self.flags().unicode() {
627
0
            let mut rhs = self.pop().unwrap().unwrap_class_unicode();
628
0
            let mut lhs = self.pop().unwrap().unwrap_class_unicode();
629
0
            let mut cls = self.pop().unwrap().unwrap_class_unicode();
630
0
            if self.flags().case_insensitive() {
631
0
                rhs.try_case_fold_simple().map_err(|_| {
632
0
                    self.error(
633
0
                        op.rhs.span().clone(),
634
0
                        ErrorKind::UnicodeCaseUnavailable,
635
0
                    )
636
0
                })?;
637
0
                lhs.try_case_fold_simple().map_err(|_| {
638
0
                    self.error(
639
0
                        op.lhs.span().clone(),
640
0
                        ErrorKind::UnicodeCaseUnavailable,
641
0
                    )
642
0
                })?;
643
0
            }
644
0
            match op.kind {
645
0
                Intersection => lhs.intersect(&rhs),
646
0
                Difference => lhs.difference(&rhs),
647
0
                SymmetricDifference => lhs.symmetric_difference(&rhs),
648
            }
649
0
            cls.union(&lhs);
650
0
            self.push(HirFrame::ClassUnicode(cls));
651
        } else {
652
0
            let mut rhs = self.pop().unwrap().unwrap_class_bytes();
653
0
            let mut lhs = self.pop().unwrap().unwrap_class_bytes();
654
0
            let mut cls = self.pop().unwrap().unwrap_class_bytes();
655
0
            if self.flags().case_insensitive() {
656
0
                rhs.case_fold_simple();
657
0
                lhs.case_fold_simple();
658
0
            }
659
0
            match op.kind {
660
0
                Intersection => lhs.intersect(&rhs),
661
0
                Difference => lhs.difference(&rhs),
662
0
                SymmetricDifference => lhs.symmetric_difference(&rhs),
663
            }
664
0
            cls.union(&lhs);
665
0
            self.push(HirFrame::ClassBytes(cls));
666
        }
667
0
        Ok(())
668
0
    }
669
}
670
671
/// The internal implementation of a translator.
672
///
673
/// This type pairs a `Translator` with the original pattern string, which is
674
/// carried separately because it is not part of the translator's own state.
675
///
676
/// A `TranslatorI` exists for the time it takes to translate a single `Ast`.
677
#[derive(Clone, Debug)]
678
struct TranslatorI<'t, 'p> {
679
    // The translator whose frame stack and flags are read and updated by the
    // methods below.
    trans: &'t Translator,
680
    // The pattern text; it is copied into every `Error` built by `error`.
    pattern: &'p str,
681
}
682
683
impl<'t, 'p> TranslatorI<'t, 'p> {
684
    /// Build a new internal translator.
685
0
    fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
686
0
        TranslatorI { trans, pattern }
687
0
    }
688
689
    /// Return a reference to the underlying translator.
690
0
    fn trans(&self) -> &Translator {
691
0
        &self.trans
692
0
    }
693
694
    /// Push the given frame on to the call stack.
695
0
    fn push(&self, frame: HirFrame) {
696
0
        self.trans().stack.borrow_mut().push(frame);
697
0
    }
698
699
    /// Push the given literal char on to the call stack.
700
    ///
701
    /// If the top-most element of the stack is a literal, then the char
702
    /// is appended to the end of that literal. Otherwise, a new literal
703
    /// containing just the given char is pushed to the top of the stack.
704
0
    fn push_char(&self, ch: char) {
705
0
        let mut buf = [0; 4];
706
0
        let bytes = ch.encode_utf8(&mut buf).as_bytes();
707
0
        let mut stack = self.trans().stack.borrow_mut();
708
0
        if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
709
0
            literal.extend_from_slice(bytes);
710
0
        } else {
711
0
            stack.push(HirFrame::Literal(bytes.to_vec()));
712
0
        }
713
0
    }
714
715
    /// Push the given literal byte on to the call stack.
716
    ///
717
    /// If the top-most element of the stack is a literal, then the byte
718
    /// is appended to the end of that literal. Otherwise, a new literal
719
    /// containing just the given byte is pushed to the top of the stack.
720
0
    fn push_byte(&self, byte: u8) {
721
0
        let mut stack = self.trans().stack.borrow_mut();
722
0
        if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
723
0
            literal.push(byte);
724
0
        } else {
725
0
            stack.push(HirFrame::Literal(vec![byte]));
726
0
        }
727
0
    }
728
729
    /// Pop the top of the call stack. If the call stack is empty, return None.
730
0
    fn pop(&self) -> Option<HirFrame> {
731
0
        self.trans().stack.borrow_mut().pop()
732
0
    }
733
734
    /// Pop an HIR expression from the top of the stack for a concatenation.
735
    ///
736
    /// This returns None if the stack is empty or when a concat frame is seen.
737
    /// Otherwise, it panics if it could not find an HIR expression.
738
0
    fn pop_concat_expr(&self) -> Option<Hir> {
739
0
        let frame = self.pop()?;
740
0
        match frame {
741
0
            HirFrame::Concat => None,
742
0
            HirFrame::Expr(expr) => Some(expr),
743
0
            HirFrame::Literal(lit) => Some(Hir::literal(lit)),
744
            HirFrame::ClassUnicode(_) => {
745
0
                unreachable!("expected expr or concat, got Unicode class")
746
            }
747
            HirFrame::ClassBytes(_) => {
748
0
                unreachable!("expected expr or concat, got byte class")
749
            }
750
            HirFrame::Repetition => {
751
0
                unreachable!("expected expr or concat, got repetition")
752
            }
753
            HirFrame::Group { .. } => {
754
0
                unreachable!("expected expr or concat, got group")
755
            }
756
            HirFrame::Alternation => {
757
0
                unreachable!("expected expr or concat, got alt marker")
758
            }
759
            HirFrame::AlternationBranch => {
760
0
                unreachable!("expected expr or concat, got alt branch marker")
761
            }
762
        }
763
0
    }
764
765
    /// Pop an HIR expression from the top of the stack for an alternation.
766
    ///
767
    /// This returns None if the stack is empty or when an alternation frame is
768
    /// seen. Otherwise, it panics if it could not find an HIR expression.
769
0
    fn pop_alt_expr(&self) -> Option<Hir> {
770
0
        let frame = self.pop()?;
771
0
        match frame {
772
0
            HirFrame::Alternation => None,
773
0
            HirFrame::Expr(expr) => Some(expr),
774
0
            HirFrame::Literal(lit) => Some(Hir::literal(lit)),
775
            HirFrame::ClassUnicode(_) => {
776
0
                unreachable!("expected expr or alt, got Unicode class")
777
            }
778
            HirFrame::ClassBytes(_) => {
779
0
                unreachable!("expected expr or alt, got byte class")
780
            }
781
            HirFrame::Repetition => {
782
0
                unreachable!("expected expr or alt, got repetition")
783
            }
784
            HirFrame::Group { .. } => {
785
0
                unreachable!("expected expr or alt, got group")
786
            }
787
            HirFrame::Concat => {
788
0
                unreachable!("expected expr or alt, got concat marker")
789
            }
790
            HirFrame::AlternationBranch => {
791
0
                unreachable!("expected expr or alt, got alt branch marker")
792
            }
793
        }
794
0
    }
795
796
    /// Create a new error with the given span and error type.
797
0
    fn error(&self, span: Span, kind: ErrorKind) -> Error {
798
0
        Error { kind, pattern: self.pattern.to_string(), span }
799
0
    }
800
801
    /// Return a copy of the active flags.
802
0
    fn flags(&self) -> Flags {
803
0
        self.trans().flags.get()
804
0
    }
805
806
    /// Set the flags of this translator from the flags set in the given AST.
807
    /// Then, return the old flags.
808
0
    fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
809
0
        let old_flags = self.flags();
810
0
        let mut new_flags = Flags::from_ast(ast_flags);
811
0
        new_flags.merge(&old_flags);
812
0
        self.trans().flags.set(new_flags);
813
0
        old_flags
814
0
    }
815
816
    /// Convert an Ast literal to its scalar representation.
817
    ///
818
    /// When Unicode mode is enabled, then this always succeeds and returns a
819
    /// `char` (Unicode scalar value).
820
    ///
821
    /// When Unicode mode is disabled, then a `char` will still be returned
822
    /// whenever possible. A byte is returned only when invalid UTF-8 is
823
    /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
824
    /// will result in an error when invalid UTF-8 is not allowed.
825
0
    fn ast_literal_to_scalar(
826
0
        &self,
827
0
        lit: &ast::Literal,
828
0
    ) -> Result<Either<char, u8>> {
829
0
        if self.flags().unicode() {
830
0
            return Ok(Either::Left(lit.c));
831
0
        }
832
0
        let byte = match lit.byte() {
833
0
            None => return Ok(Either::Left(lit.c)),
834
0
            Some(byte) => byte,
835
0
        };
836
0
        if byte <= 0x7F {
837
0
            return Ok(Either::Left(char::try_from(byte).unwrap()));
838
0
        }
839
0
        if self.trans().utf8 {
840
0
            return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
841
0
        }
842
0
        Ok(Either::Right(byte))
843
0
    }
844
845
0
    fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
846
0
        if !self.flags().case_insensitive() {
847
0
            return Ok(None);
848
0
        }
849
0
        if self.flags().unicode() {
850
            // If case folding won't do anything, then don't bother trying.
851
0
            let map = unicode::SimpleCaseFolder::new()
852
0
                .map(|f| f.overlaps(c, c))
853
0
                .map_err(|_| {
854
0
                    self.error(span, ErrorKind::UnicodeCaseUnavailable)
855
0
                })?;
856
0
            if !map {
857
0
                return Ok(None);
858
0
            }
859
0
            let mut cls =
860
0
                hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
861
0
                    c, c,
862
0
                )]);
863
0
            cls.try_case_fold_simple().map_err(|_| {
864
0
                self.error(span, ErrorKind::UnicodeCaseUnavailable)
865
0
            })?;
866
0
            Ok(Some(Hir::class(hir::Class::Unicode(cls))))
867
        } else {
868
0
            if !c.is_ascii() {
869
0
                return Ok(None);
870
0
            }
871
0
            // If case folding won't do anything, then don't bother trying.
872
0
            match c {
873
0
                'A'..='Z' | 'a'..='z' => {}
874
0
                _ => return Ok(None),
875
            }
876
0
            let mut cls =
877
0
                hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
878
0
                    // OK because 'c.len_utf8() == 1' which in turn implies
879
0
                    // that 'c' is ASCII.
880
0
                    u8::try_from(c).unwrap(),
881
0
                    u8::try_from(c).unwrap(),
882
0
                )]);
883
0
            cls.case_fold_simple();
884
0
            Ok(Some(Hir::class(hir::Class::Bytes(cls))))
885
        }
886
0
    }
887
888
0
    fn hir_dot(&self, span: Span) -> Result<Hir> {
889
0
        let (utf8, lineterm, flags) =
890
0
            (self.trans().utf8, self.trans().line_terminator, self.flags());
891
0
        if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
892
0
            return Err(self.error(span, ErrorKind::InvalidUtf8));
893
0
        }
894
0
        let dot = if flags.dot_matches_new_line() {
895
0
            if flags.unicode() {
896
0
                hir::Dot::AnyChar
897
            } else {
898
0
                hir::Dot::AnyByte
899
            }
900
        } else {
901
0
            if flags.unicode() {
902
0
                if flags.crlf() {
903
0
                    hir::Dot::AnyCharExceptCRLF
904
                } else {
905
0
                    if !lineterm.is_ascii() {
906
0
                        return Err(
907
0
                            self.error(span, ErrorKind::InvalidLineTerminator)
908
0
                        );
909
0
                    }
910
0
                    hir::Dot::AnyCharExcept(char::from(lineterm))
911
                }
912
            } else {
913
0
                if flags.crlf() {
914
0
                    hir::Dot::AnyByteExceptCRLF
915
                } else {
916
0
                    hir::Dot::AnyByteExcept(lineterm)
917
                }
918
            }
919
        };
920
0
        Ok(Hir::dot(dot))
921
0
    }
922
923
0
    fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
924
0
        let unicode = self.flags().unicode();
925
0
        let multi_line = self.flags().multi_line();
926
0
        let crlf = self.flags().crlf();
927
0
        Ok(match asst.kind {
928
0
            ast::AssertionKind::StartLine => Hir::look(if multi_line {
929
0
                if crlf {
930
0
                    hir::Look::StartCRLF
931
                } else {
932
0
                    hir::Look::StartLF
933
                }
934
            } else {
935
0
                hir::Look::Start
936
            }),
937
0
            ast::AssertionKind::EndLine => Hir::look(if multi_line {
938
0
                if crlf {
939
0
                    hir::Look::EndCRLF
940
                } else {
941
0
                    hir::Look::EndLF
942
                }
943
            } else {
944
0
                hir::Look::End
945
            }),
946
0
            ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
947
0
            ast::AssertionKind::EndText => Hir::look(hir::Look::End),
948
0
            ast::AssertionKind::WordBoundary => Hir::look(if unicode {
949
0
                hir::Look::WordUnicode
950
            } else {
951
0
                hir::Look::WordAscii
952
            }),
953
0
            ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
954
0
                hir::Look::WordUnicodeNegate
955
            } else {
956
0
                hir::Look::WordAsciiNegate
957
            }),
958
            ast::AssertionKind::WordBoundaryStart
959
            | ast::AssertionKind::WordBoundaryStartAngle => {
960
0
                Hir::look(if unicode {
961
0
                    hir::Look::WordStartUnicode
962
                } else {
963
0
                    hir::Look::WordStartAscii
964
                })
965
            }
966
            ast::AssertionKind::WordBoundaryEnd
967
            | ast::AssertionKind::WordBoundaryEndAngle => {
968
0
                Hir::look(if unicode {
969
0
                    hir::Look::WordEndUnicode
970
                } else {
971
0
                    hir::Look::WordEndAscii
972
                })
973
            }
974
            ast::AssertionKind::WordBoundaryStartHalf => {
975
0
                Hir::look(if unicode {
976
0
                    hir::Look::WordStartHalfUnicode
977
                } else {
978
0
                    hir::Look::WordStartHalfAscii
979
                })
980
            }
981
0
            ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
982
0
                hir::Look::WordEndHalfUnicode
983
            } else {
984
0
                hir::Look::WordEndHalfAscii
985
            }),
986
        })
987
0
    }
988
989
0
    fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
990
0
        let (index, name) = match group.kind {
991
0
            ast::GroupKind::CaptureIndex(index) => (index, None),
992
0
            ast::GroupKind::CaptureName { ref name, .. } => {
993
0
                (name.index, Some(name.name.clone().into_boxed_str()))
994
            }
995
            // The HIR doesn't need to use non-capturing groups, since the way
996
            // in which the data type is defined handles this automatically.
997
0
            ast::GroupKind::NonCapturing(_) => return expr,
998
        };
999
0
        Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
1000
0
    }
1001
1002
0
    fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
1003
0
        let (min, max) = match rep.op.kind {
1004
0
            ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
1005
0
            ast::RepetitionKind::ZeroOrMore => (0, None),
1006
0
            ast::RepetitionKind::OneOrMore => (1, None),
1007
0
            ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
1008
0
                (m, Some(m))
1009
            }
1010
0
            ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
1011
0
                (m, None)
1012
            }
1013
            ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
1014
0
                m,
1015
0
                n,
1016
0
            )) => (m, Some(n)),
1017
        };
1018
0
        let greedy =
1019
0
            if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
1020
0
        Hir::repetition(hir::Repetition {
1021
0
            min,
1022
0
            max,
1023
0
            greedy,
1024
0
            sub: Box::new(expr),
1025
0
        })
1026
0
    }
1027
1028
0
    fn hir_unicode_class(
1029
0
        &self,
1030
0
        ast_class: &ast::ClassUnicode,
1031
0
    ) -> Result<hir::ClassUnicode> {
1032
        use crate::ast::ClassUnicodeKind::*;
1033
1034
0
        if !self.flags().unicode() {
1035
0
            return Err(
1036
0
                self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
1037
0
            );
1038
0
        }
1039
0
        let query = match ast_class.kind {
1040
0
            OneLetter(name) => ClassQuery::OneLetter(name),
1041
0
            Named(ref name) => ClassQuery::Binary(name),
1042
0
            NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
1043
0
                property_name: name,
1044
0
                property_value: value,
1045
0
            },
1046
        };
1047
0
        let mut result = self.convert_unicode_class_error(
1048
0
            &ast_class.span,
1049
0
            unicode::class(query),
1050
0
        );
1051
0
        if let Ok(ref mut class) = result {
1052
0
            self.unicode_fold_and_negate(
1053
0
                &ast_class.span,
1054
0
                ast_class.negated,
1055
0
                class,
1056
0
            )?;
1057
0
        }
1058
0
        result
1059
0
    }
1060
1061
0
    fn hir_ascii_unicode_class(
1062
0
        &self,
1063
0
        ast: &ast::ClassAscii,
1064
0
    ) -> Result<hir::ClassUnicode> {
1065
0
        let mut cls = hir::ClassUnicode::new(
1066
0
            ascii_class_as_chars(&ast.kind)
1067
0
                .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1068
0
        );
1069
0
        self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1070
0
        Ok(cls)
1071
0
    }
1072
1073
0
    fn hir_ascii_byte_class(
1074
0
        &self,
1075
0
        ast: &ast::ClassAscii,
1076
0
    ) -> Result<hir::ClassBytes> {
1077
0
        let mut cls = hir::ClassBytes::new(
1078
0
            ascii_class(&ast.kind)
1079
0
                .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1080
0
        );
1081
0
        self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1082
0
        Ok(cls)
1083
0
    }
1084
1085
0
    fn hir_perl_unicode_class(
1086
0
        &self,
1087
0
        ast_class: &ast::ClassPerl,
1088
0
    ) -> Result<hir::ClassUnicode> {
1089
        use crate::ast::ClassPerlKind::*;
1090
1091
0
        assert!(self.flags().unicode());
1092
0
        let result = match ast_class.kind {
1093
0
            Digit => unicode::perl_digit(),
1094
0
            Space => unicode::perl_space(),
1095
0
            Word => unicode::perl_word(),
1096
        };
1097
0
        let mut class =
1098
0
            self.convert_unicode_class_error(&ast_class.span, result)?;
1099
        // We needn't apply case folding here because the Perl Unicode classes
1100
        // are already closed under Unicode simple case folding.
1101
0
        if ast_class.negated {
1102
0
            class.negate();
1103
0
        }
1104
0
        Ok(class)
1105
0
    }
1106
1107
0
    fn hir_perl_byte_class(
1108
0
        &self,
1109
0
        ast_class: &ast::ClassPerl,
1110
0
    ) -> Result<hir::ClassBytes> {
1111
        use crate::ast::ClassPerlKind::*;
1112
1113
0
        assert!(!self.flags().unicode());
1114
0
        let mut class = match ast_class.kind {
1115
0
            Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
1116
0
            Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
1117
0
            Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
1118
        };
1119
        // We needn't apply case folding here because the Perl ASCII classes
1120
        // are already closed (under ASCII case folding).
1121
0
        if ast_class.negated {
1122
0
            class.negate();
1123
0
        }
1124
        // Negating a Perl byte class is likely to cause it to match invalid
1125
        // UTF-8. That's only OK if the translator is configured to allow such
1126
        // things.
1127
0
        if self.trans().utf8 && !class.is_ascii() {
1128
0
            return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
1129
0
        }
1130
0
        Ok(class)
1131
0
    }
1132
1133
    /// Converts the given Unicode specific error to an HIR translation error.
1134
    ///
1135
    /// The span given should approximate the position at which an error would
1136
    /// occur.
1137
0
    fn convert_unicode_class_error(
1138
0
        &self,
1139
0
        span: &Span,
1140
0
        result: core::result::Result<hir::ClassUnicode, unicode::Error>,
1141
0
    ) -> Result<hir::ClassUnicode> {
1142
0
        result.map_err(|err| {
1143
0
            let sp = span.clone();
1144
0
            match err {
1145
                unicode::Error::PropertyNotFound => {
1146
0
                    self.error(sp, ErrorKind::UnicodePropertyNotFound)
1147
                }
1148
                unicode::Error::PropertyValueNotFound => {
1149
0
                    self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
1150
                }
1151
                unicode::Error::PerlClassNotFound => {
1152
0
                    self.error(sp, ErrorKind::UnicodePerlClassNotFound)
1153
                }
1154
            }
1155
0
        })
1156
0
    }
1157
1158
0
    fn unicode_fold_and_negate(
1159
0
        &self,
1160
0
        span: &Span,
1161
0
        negated: bool,
1162
0
        class: &mut hir::ClassUnicode,
1163
0
    ) -> Result<()> {
1164
0
        // Note that we must apply case folding before negation!
1165
0
        // Consider `(?i)[^x]`. If we applied negation first, then
1166
0
        // the result would be the character class that matched any
1167
0
        // Unicode scalar value.
1168
0
        if self.flags().case_insensitive() {
1169
0
            class.try_case_fold_simple().map_err(|_| {
1170
0
                self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
1171
0
            })?;
1172
0
        }
1173
0
        if negated {
1174
0
            class.negate();
1175
0
        }
1176
0
        Ok(())
1177
0
    }
1178
1179
0
    fn bytes_fold_and_negate(
1180
0
        &self,
1181
0
        span: &Span,
1182
0
        negated: bool,
1183
0
        class: &mut hir::ClassBytes,
1184
0
    ) -> Result<()> {
1185
0
        // Note that we must apply case folding before negation!
1186
0
        // Consider `(?i)[^x]`. If we applied negation first, then
1187
0
        // the result would be the character class that matched any
1188
0
        // Unicode scalar value.
1189
0
        if self.flags().case_insensitive() {
1190
0
            class.case_fold_simple();
1191
0
        }
1192
0
        if negated {
1193
0
            class.negate();
1194
0
        }
1195
0
        if self.trans().utf8 && !class.is_ascii() {
1196
0
            return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
1197
0
        }
1198
0
        Ok(())
1199
0
    }
1200
1201
    /// Return a scalar byte value suitable for use as a literal in a byte
1202
    /// character class.
1203
0
    fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
1204
0
        match self.ast_literal_to_scalar(ast)? {
1205
0
            Either::Right(byte) => Ok(byte),
1206
0
            Either::Left(ch) => {
1207
0
                if ch.is_ascii() {
1208
0
                    Ok(u8::try_from(ch).unwrap())
1209
                } else {
1210
                    // We can't feasibly support Unicode in
1211
                    // byte oriented classes. Byte classes don't
1212
                    // do Unicode case folding.
1213
0
                    Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
1214
                }
1215
            }
1216
        }
1217
0
    }
1218
}
1219
1220
/// A translator's representation of a regular expression's flags at any given
1221
/// moment in time.
1222
///
1223
/// Each flag can be in one of three states: absent, present but disabled or
1224
/// present but enabled.
1225
#[derive(Clone, Copy, Debug, Default)]
1226
struct Flags {
1227
    case_insensitive: Option<bool>,
1228
    multi_line: Option<bool>,
1229
    dot_matches_new_line: Option<bool>,
1230
    swap_greed: Option<bool>,
1231
    unicode: Option<bool>,
1232
    crlf: Option<bool>,
1233
    // Note that `ignore_whitespace` is omitted here because it is handled
1234
    // entirely in the parser.
1235
}
1236
1237
impl Flags {
1238
0
    fn from_ast(ast: &ast::Flags) -> Flags {
1239
0
        let mut flags = Flags::default();
1240
0
        let mut enable = true;
1241
0
        for item in &ast.items {
1242
0
            match item.kind {
1243
0
                ast::FlagsItemKind::Negation => {
1244
0
                    enable = false;
1245
0
                }
1246
0
                ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1247
0
                    flags.case_insensitive = Some(enable);
1248
0
                }
1249
0
                ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1250
0
                    flags.multi_line = Some(enable);
1251
0
                }
1252
0
                ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1253
0
                    flags.dot_matches_new_line = Some(enable);
1254
0
                }
1255
0
                ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1256
0
                    flags.swap_greed = Some(enable);
1257
0
                }
1258
0
                ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1259
0
                    flags.unicode = Some(enable);
1260
0
                }
1261
0
                ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
1262
0
                    flags.crlf = Some(enable);
1263
0
                }
1264
0
                ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1265
            }
1266
        }
1267
0
        flags
1268
0
    }
1269
1270
0
    fn merge(&mut self, previous: &Flags) {
1271
0
        if self.case_insensitive.is_none() {
1272
0
            self.case_insensitive = previous.case_insensitive;
1273
0
        }
1274
0
        if self.multi_line.is_none() {
1275
0
            self.multi_line = previous.multi_line;
1276
0
        }
1277
0
        if self.dot_matches_new_line.is_none() {
1278
0
            self.dot_matches_new_line = previous.dot_matches_new_line;
1279
0
        }
1280
0
        if self.swap_greed.is_none() {
1281
0
            self.swap_greed = previous.swap_greed;
1282
0
        }
1283
0
        if self.unicode.is_none() {
1284
0
            self.unicode = previous.unicode;
1285
0
        }
1286
0
        if self.crlf.is_none() {
1287
0
            self.crlf = previous.crlf;
1288
0
        }
1289
0
    }
1290
1291
0
    fn case_insensitive(&self) -> bool {
1292
0
        self.case_insensitive.unwrap_or(false)
1293
0
    }
1294
1295
0
    fn multi_line(&self) -> bool {
1296
0
        self.multi_line.unwrap_or(false)
1297
0
    }
1298
1299
0
    fn dot_matches_new_line(&self) -> bool {
1300
0
        self.dot_matches_new_line.unwrap_or(false)
1301
0
    }
1302
1303
0
    fn swap_greed(&self) -> bool {
1304
0
        self.swap_greed.unwrap_or(false)
1305
0
    }
1306
1307
0
    fn unicode(&self) -> bool {
1308
0
        self.unicode.unwrap_or(true)
1309
0
    }
1310
1311
0
    fn crlf(&self) -> bool {
1312
0
        self.crlf.unwrap_or(false)
1313
0
    }
1314
}
1315
1316
0
/// Build the byte class holding exactly the ranges of the given ASCII class.
fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
    let mut ranges = Vec::new();
    for (start, end) in ascii_class(kind) {
        ranges.push(hir::ClassBytesRange::new(start, end));
    }
    hir::ClassBytes::new(ranges)
}
1322
1323
0
fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
1324
    use crate::ast::ClassAsciiKind::*;
1325
1326
0
    let slice: &'static [(u8, u8)] = match *kind {
1327
0
        Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1328
0
        Alpha => &[(b'A', b'Z'), (b'a', b'z')],
1329
0
        Ascii => &[(b'\x00', b'\x7F')],
1330
0
        Blank => &[(b'\t', b'\t'), (b' ', b' ')],
1331
0
        Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1332
0
        Digit => &[(b'0', b'9')],
1333
0
        Graph => &[(b'!', b'~')],
1334
0
        Lower => &[(b'a', b'z')],
1335
0
        Print => &[(b' ', b'~')],
1336
0
        Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1337
0
        Space => &[
1338
0
            (b'\t', b'\t'),
1339
0
            (b'\n', b'\n'),
1340
0
            (b'\x0B', b'\x0B'),
1341
0
            (b'\x0C', b'\x0C'),
1342
0
            (b'\r', b'\r'),
1343
0
            (b' ', b' '),
1344
0
        ],
1345
0
        Upper => &[(b'A', b'Z')],
1346
0
        Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1347
0
        Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1348
    };
1349
0
    slice.iter().copied()
1350
0
}
1351
1352
0
fn ascii_class_as_chars(
1353
0
    kind: &ast::ClassAsciiKind,
1354
0
) -> impl Iterator<Item = (char, char)> {
1355
0
    ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e)))
1356
0
}
1357
1358
#[cfg(test)]
1359
mod tests {
1360
    use crate::{
1361
        ast::{parse::ParserBuilder, Position},
1362
        hir::{Look, Properties},
1363
    };
1364
1365
    use super::*;
1366
1367
    // We create these errors to compare with real hir::Errors in the tests.
1368
    // We define equality between TestError and hir::Error to disregard the
1369
    // pattern string in hir::Error, which is annoying to provide in tests.
1370
    /// Expected-error fixture used by the tests: it carries only a span and
    /// an error kind, and is compared against `hir::Error` through the
    /// `PartialEq` impls in this module.
    #[derive(Clone, Debug)]
1371
    struct TestError {
1372
        // Span the error is expected to be reported at.
        span: Span,
1373
        // Expected kind of the error.
        kind: hir::ErrorKind,
1374
    }
1375
1376
    impl PartialEq<hir::Error> for TestError {
1377
        fn eq(&self, other: &hir::Error) -> bool {
1378
            self.span == other.span && self.kind == other.kind
1379
        }
1380
    }
1381
1382
    impl PartialEq<TestError> for hir::Error {
1383
        fn eq(&self, other: &TestError) -> bool {
1384
            self.span == other.span && self.kind == other.kind
1385
        }
1386
    }
1387
1388
    fn parse(pattern: &str) -> Ast {
1389
        ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
1390
    }
1391
1392
    fn t(pattern: &str) -> Hir {
1393
        TranslatorBuilder::new()
1394
            .utf8(true)
1395
            .build()
1396
            .translate(pattern, &parse(pattern))
1397
            .unwrap()
1398
    }
1399
1400
    fn t_err(pattern: &str) -> hir::Error {
1401
        TranslatorBuilder::new()
1402
            .utf8(true)
1403
            .build()
1404
            .translate(pattern, &parse(pattern))
1405
            .unwrap_err()
1406
    }
1407
1408
    fn t_bytes(pattern: &str) -> Hir {
1409
        TranslatorBuilder::new()
1410
            .utf8(false)
1411
            .build()
1412
            .translate(pattern, &parse(pattern))
1413
            .unwrap()
1414
    }
1415
1416
    fn props(pattern: &str) -> Properties {
1417
        t(pattern).properties().clone()
1418
    }
1419
1420
    fn props_bytes(pattern: &str) -> Properties {
1421
        t_bytes(pattern).properties().clone()
1422
    }
1423
1424
    fn hir_lit(s: &str) -> Hir {
1425
        hir_blit(s.as_bytes())
1426
    }
1427
1428
    fn hir_blit(s: &[u8]) -> Hir {
1429
        Hir::literal(s)
1430
    }
1431
1432
    fn hir_capture(index: u32, expr: Hir) -> Hir {
1433
        Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
1434
    }
1435
1436
    fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
1437
        Hir::capture(hir::Capture {
1438
            index,
1439
            name: Some(name.into()),
1440
            sub: Box::new(expr),
1441
        })
1442
    }
1443
1444
    fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1445
        Hir::repetition(hir::Repetition {
1446
            min: 0,
1447
            max: Some(1),
1448
            greedy,
1449
            sub: Box::new(expr),
1450
        })
1451
    }
1452
1453
    fn hir_star(greedy: bool, expr: Hir) -> Hir {
1454
        Hir::repetition(hir::Repetition {
1455
            min: 0,
1456
            max: None,
1457
            greedy,
1458
            sub: Box::new(expr),
1459
        })
1460
    }
1461
1462
    fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1463
        Hir::repetition(hir::Repetition {
1464
            min: 1,
1465
            max: None,
1466
            greedy,
1467
            sub: Box::new(expr),
1468
        })
1469
    }
1470
1471
    fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
1472
        Hir::repetition(hir::Repetition {
1473
            min,
1474
            max,
1475
            greedy,
1476
            sub: Box::new(expr),
1477
        })
1478
    }
1479
1480
    /// Shorthand for `Hir::alternation` over the given branches.
    fn hir_alt(alts: Vec<Hir>) -> Hir {
1481
        Hir::alternation(alts)
1482
    }
1483
1484
    /// Shorthand for `Hir::concat` over the given sub-expressions.
    fn hir_cat(exprs: Vec<Hir>) -> Hir {
1485
        Hir::concat(exprs)
1486
    }
1487
1488
    #[allow(dead_code)]
1489
    fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1490
        Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1491
    }
1492
1493
    #[allow(dead_code)]
1494
    fn hir_uclass_perl_word() -> Hir {
1495
        Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1496
    }
1497
1498
    fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
1499
        Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
1500
            ascii_class_as_chars(kind)
1501
                .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1502
        )))
1503
    }
1504
1505
    fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
1506
        Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
1507
            ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1508
        )))
1509
    }
1510
1511
    fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1512
        Hir::class(uclass(ranges))
1513
    }
1514
1515
    fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1516
        Hir::class(bclass(ranges))
1517
    }
1518
1519
    fn hir_case_fold(expr: Hir) -> Hir {
1520
        match expr.into_kind() {
1521
            HirKind::Class(mut cls) => {
1522
                cls.case_fold_simple();
1523
                Hir::class(cls)
1524
            }
1525
            _ => panic!("cannot case fold non-class Hir expr"),
1526
        }
1527
    }
1528
1529
    fn hir_negate(expr: Hir) -> Hir {
1530
        match expr.into_kind() {
1531
            HirKind::Class(mut cls) => {
1532
                cls.negate();
1533
                Hir::class(cls)
1534
            }
1535
            _ => panic!("cannot negate non-class Hir expr"),
1536
        }
1537
    }
1538
1539
    fn uclass(ranges: &[(char, char)]) -> hir::Class {
1540
        let ranges: Vec<hir::ClassUnicodeRange> = ranges
1541
            .iter()
1542
            .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1543
            .collect();
1544
        hir::Class::Unicode(hir::ClassUnicode::new(ranges))
1545
    }
1546
1547
    fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
1548
        let ranges: Vec<hir::ClassBytesRange> = ranges
1549
            .iter()
1550
            .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1551
            .collect();
1552
        hir::Class::Bytes(hir::ClassBytes::new(ranges))
1553
    }
1554
1555
    /// Applies simple case folding to `cls` and wraps the result in an
    /// `Hir`.
    #[cfg(feature = "unicode-case")]
    fn class_case_fold(cls: hir::Class) -> Hir {
        let mut folded = cls;
        folded.case_fold_simple();
        Hir::class(folded)
    }
1560
1561
    fn class_negate(mut cls: hir::Class) -> Hir {
1562
        cls.negate();
1563
        Hir::class(cls)
1564
    }
1565
1566
    #[allow(dead_code)]
1567
    fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1568
        use crate::hir::Class::{Bytes, Unicode};
1569
1570
        match (expr1.into_kind(), expr2.into_kind()) {
1571
            (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1572
                c1.union(&c2);
1573
                Hir::class(hir::Class::Unicode(c1))
1574
            }
1575
            (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1576
                c1.union(&c2);
1577
                Hir::class(hir::Class::Bytes(c1))
1578
            }
1579
            _ => panic!("cannot union non-class Hir exprs"),
1580
        }
1581
    }
1582
1583
    #[allow(dead_code)]
1584
    fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1585
        use crate::hir::Class::{Bytes, Unicode};
1586
1587
        match (expr1.into_kind(), expr2.into_kind()) {
1588
            (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1589
                c1.difference(&c2);
1590
                Hir::class(hir::Class::Unicode(c1))
1591
            }
1592
            (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1593
                c1.difference(&c2);
1594
                Hir::class(hir::Class::Bytes(c1))
1595
            }
1596
            _ => panic!("cannot difference non-class Hir exprs"),
1597
        }
1598
    }
1599
1600
    /// Shorthand for `Hir::look` with the given look-around assertion.
    fn hir_look(look: hir::Look) -> Hir {
1601
        Hir::look(look)
1602
    }
1603
1604
    /// Patterns built only from empty sub-expressions translate to the
    /// corresponding arrangement of `Hir::empty()` nodes.
    #[test]
    fn empty() {
        let e = Hir::empty;
        let cases = vec![
            ("", e()),
            ("(?i)", e()),
            ("()", hir_capture(1, e())),
            ("(?:)", e()),
            ("(?P<wat>)", hir_capture_name(1, "wat", e())),
            ("|", hir_alt(vec![e(), e()])),
            (
                "()|()",
                hir_alt(vec![hir_capture(1, e()), hir_capture(2, e())]),
            ),
            ("(|b)", hir_capture(1, hir_alt(vec![e(), hir_lit("b")]))),
            ("(a|)", hir_capture(1, hir_alt(vec![hir_lit("a"), e()]))),
            (
                "(a||c)",
                hir_capture(
                    1,
                    hir_alt(vec![hir_lit("a"), e(), hir_lit("c")]),
                ),
            ),
            ("(||)", hir_capture(1, hir_alt(vec![e(), e(), e()]))),
        ];
        for (pattern, expected) in cases {
            assert_eq!(t(pattern), expected);
        }
    }
1642
1643
    /// Literal patterns translate to literal `Hir`s; a non-UTF-8 byte is
    /// only accepted when the translator's `utf8` option is disabled.
    #[test]
    fn literal() {
        // UTF-8 mode: (pattern, expected literal text).
        let utf8_cases = vec![
            ("a", "a"),
            ("(?-u)a", "a"),
            ("☃", "☃"),
            ("abcd", "abcd"),
            ("(?-u)☃", "☃"),
        ];
        for (pattern, text) in utf8_cases {
            assert_eq!(t(pattern), hir_lit(text));
        }

        // Byte mode: (pattern, expected literal bytes).
        let byte_cases: Vec<(&str, &[u8])> = vec![
            ("(?-u)a", b"a"),
            ("(?-u)\x61", b"a"),
            (r"(?-u)\x61", b"a"),
            (r"(?-u)\xFF", b"\xFF"),
        ];
        for (pattern, bytes) in byte_cases {
            assert_eq!(t_bytes(pattern), hir_blit(bytes));
        }

        let invalid_utf8 = TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)),
        };
        assert_eq!(t_err(r"(?-u)\xFF"), invalid_utf8);
    }
1667
1668
    /// Case-insensitive literals translate to classes containing both cases
    /// of each letter; characters without a case variant stay literals.
    #[test]
    fn literal_case_insensitive() {
        // Expected class for one letter matched in either case.
        #[cfg(feature = "unicode-case")]
        let ufold = |up: char, low: char| hir_uclass(&[(up, up), (low, low)]);
        let bfold = |up: u8, low: u8| hir_bclass(&[(up, up), (low, low)]);

        #[cfg(feature = "unicode-case")]
        assert_eq!(t("(?i)a"), ufold('A', 'a'));
        #[cfg(feature = "unicode-case")]
        assert_eq!(t("(?i:a)"), ufold('A', 'a'));
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("a(?i)a(?-i)a"),
            hir_cat(vec![hir_lit("a"), ufold('A', 'a'), hir_lit("a")])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)ab@c"),
            hir_cat(vec![
                ufold('A', 'a'),
                ufold('B', 'b'),
                hir_lit("@"),
                ufold('C', 'c'),
            ])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)β"),
            hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
        );

        assert_eq!(t("(?i-u)a"), bfold(b'A', b'a'));
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?-u)a(?i)a(?-i)a"),
            hir_cat(vec![hir_lit("a"), bfold(b'A', b'a'), hir_lit("a")])
        );
        assert_eq!(
            t("(?i-u)ab@c"),
            hir_cat(vec![
                bfold(b'A', b'a'),
                bfold(b'B', b'b'),
                hir_lit("@"),
                bfold(b'C', b'c'),
            ])
        );

        assert_eq!(t_bytes("(?i-u)a"), bfold(b'A', b'a'));
        assert_eq!(t_bytes("(?i-u)\x61"), bfold(b'A', b'a'));
        assert_eq!(t_bytes(r"(?i-u)\x61"), bfold(b'A', b'a'));
        assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

        assert_eq!(t("(?i-u)β"), hir_lit("β"));
    }
1735
1736
    /// Checks the class produced for `.` under the `R`, `s` and `u` flags,
    /// and that non-Unicode `.` is rejected when `utf8` is enabled.
    #[test]
    fn dot() {
        const MAX: char = '\u{10FFFF}';

        let unicode_cases = vec![
            (".", vec![('\0', '\t'), ('\x0B', MAX)]),
            ("(?R).", vec![('\0', '\t'), ('\x0B', '\x0C'), ('\x0E', MAX)]),
            ("(?s).", vec![('\0', MAX)]),
            ("(?Rs).", vec![('\0', MAX)]),
        ];
        for (pattern, ranges) in unicode_cases {
            assert_eq!(t(pattern), hir_uclass(&ranges));
        }

        let byte_cases = vec![
            ("(?-u).", vec![(b'\0', b'\t'), (b'\x0B', b'\xFF')]),
            (
                "(?R-u).",
                vec![(b'\0', b'\t'), (b'\x0B', b'\x0C'), (b'\x0E', b'\xFF')],
            ),
            ("(?s-u).", vec![(b'\0', b'\xFF')]),
            ("(?Rs-u).", vec![(b'\0', b'\xFF')]),
        ];
        for (pattern, ranges) in byte_cases {
            assert_eq!(t_bytes(pattern), hir_bclass(&ranges));
        }

        // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
        // Each entry is the pattern plus the byte offset of its `.`; the
        // expected span covers exactly that one character on line 1.
        let rejected =
            vec![("(?-u).", 5), ("(?R-u).", 6), ("(?s-u).", 6), ("(?Rs-u).", 7)];
        for (pattern, offset) in rejected {
            let expected = TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(offset, 1, offset + 1),
                    Position::new(offset + 1, 1, offset + 2),
                ),
            };
            assert_eq!(t_err(pattern), expected);
        }
    }
1809
1810
    /// Anchors and word boundaries translate to the matching `Look`
    /// variant, depending on the `m` and `u` flags.
    #[test]
    fn assertions() {
        let cases = vec![
            ("^", Look::Start),
            ("$", Look::End),
            (r"\A", Look::Start),
            (r"\z", Look::End),
            ("(?m)^", Look::StartLF),
            ("(?m)$", Look::EndLF),
            (r"(?m)\A", Look::Start),
            (r"(?m)\z", Look::End),
            (r"\b", Look::WordUnicode),
            (r"\B", Look::WordUnicodeNegate),
            (r"(?-u)\b", Look::WordAscii),
            (r"(?-u)\B", Look::WordAsciiNegate),
        ];
        for (pattern, look) in cases {
            assert_eq!(t(pattern), hir_look(look));
        }
    }
1826
1827
    /// Capturing, named and non-capturing groups: capture indices count up
    /// from 1 in pattern order and non-capturing groups add no node.
    #[test]
    fn group() {
        let lit = hir_lit;
        let cap = |index: u32, text: &str| hir_capture(index, hir_lit(text));
        let named = |index: u32, name: &str, text: &str| {
            hir_capture_name(index, name, hir_lit(text))
        };

        let cases = vec![
            ("(a)", cap(1, "a")),
            ("(a)(b)", hir_cat(vec![cap(1, "a"), cap(2, "b")])),
            ("(a)|(b)", hir_alt(vec![cap(1, "a"), cap(2, "b")])),
            ("(?P<foo>)", hir_capture_name(1, "foo", Hir::empty())),
            ("(?P<foo>a)", named(1, "foo", "a")),
            (
                "(?P<foo>a)(?P<bar>b)",
                hir_cat(vec![named(1, "foo", "a"), named(2, "bar", "b")]),
            ),
            ("(?:)", Hir::empty()),
            ("(?:a)", lit("a")),
            ("(?:a)(b)", hir_cat(vec![lit("a"), cap(1, "b")])),
            (
                "(a)(?:b)(c)",
                hir_cat(vec![cap(1, "a"), lit("b"), cap(2, "c")]),
            ),
            (
                "(a)(?P<foo>b)(c)",
                hir_cat(vec![cap(1, "a"), named(2, "foo", "b"), cap(3, "c")]),
            ),
            ("()", hir_capture(1, Hir::empty())),
            ("((?i))", hir_capture(1, Hir::empty())),
            ("((?x))", hir_capture(1, Hir::empty())),
            ("(((?x)))", hir_capture(1, hir_capture(2, Hir::empty()))),
        ];
        for (pattern, expected) in cases {
            assert_eq!(t(pattern), expected);
        }
    }
1883
1884
    /// `^`, `$`, `\A` and `\z` under every combination of the `m` and `R`
    /// flags.
    #[test]
    fn line_anchors() {
        let cases = vec![
            // No flags.
            ("^", Look::Start),
            ("$", Look::End),
            (r"\A", Look::Start),
            (r"\z", Look::End),
            // Multi-line only.
            (r"(?m)\A", Look::Start),
            (r"(?m)\z", Look::End),
            ("(?m)^", Look::StartLF),
            ("(?m)$", Look::EndLF),
            // CRLF only.
            (r"(?R)\A", Look::Start),
            (r"(?R)\z", Look::End),
            ("(?R)^", Look::Start),
            ("(?R)$", Look::End),
            // CRLF and multi-line together.
            (r"(?Rm)\A", Look::Start),
            (r"(?Rm)\z", Look::End),
            ("(?Rm)^", Look::StartCRLF),
            ("(?Rm)$", Look::EndCRLF),
        ];
        for (pattern, look) in cases {
            assert_eq!(t(pattern), hir_look(look));
        }
    }
1906
1907
    /// Flag scoping: inline flags apply until the end of the enclosing
    /// group, and flags set on a group apply only inside it.
    #[test]
    fn flags() {
        // Building blocks shared by the expectations below.
        #[cfg(feature = "unicode-case")]
        let ua = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
        let ba = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);
        let a = || hir_lit("a");

        #[cfg(feature = "unicode-case")]
        assert_eq!(t("(?i:a)a"), hir_cat(vec![ua(), a()]));
        assert_eq!(t("(?i-u:a)β"), hir_cat(vec![ba(), hir_lit("β")]));
        assert_eq!(t("(?:(?i-u)a)b"), hir_cat(vec![ba(), hir_lit("b")]));
        assert_eq!(
            t("((?i-u)a)b"),
            hir_cat(vec![hir_capture(1, ba()), hir_lit("b")])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(t("(?i)(?-i:a)a"), hir_cat(vec![a(), ua()]));
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?im)a^"),
            hir_cat(vec![ua(), hir_look(hir::Look::StartLF)])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?im)a^(?i-m)a^"),
            hir_cat(vec![
                ua(),
                hir_look(hir::Look::StartLF),
                ua(),
                hir_look(hir::Look::Start),
            ])
        );
        assert_eq!(
            t("(?U)a*a*?(?-U)a*a*?"),
            hir_cat(vec![
                hir_star(false, a()),
                hir_star(true, a()),
                hir_star(true, a()),
                hir_star(false, a()),
            ])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?:a(?i)a)a"),
            hir_cat(vec![hir_cat(vec![a(), ua()]), a()])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)(?:a(?-i)a)a"),
            hir_cat(vec![hir_cat(vec![ua(), a()]), ua()])
        );
    }
1994
1995
    /// A run of escaped meta characters translates to one literal made of
    /// the unescaped characters.
    #[test]
    fn escape() {
        let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
        let unescaped = r"\.+*?()|[]{}^$#";
        assert_eq!(t(pattern), hir_lit(unescaped));
    }
2002
2003
    /// Repetition operators, counted repetitions, their lazy forms, and how
    /// they bind relative to concatenation, groups and alternation.
    #[test]
    fn repetition() {
        let a = || hir_lit("a");
        let b = || hir_lit("b");

        let cases = vec![
            ("a?", hir_quest(true, a())),
            ("a*", hir_star(true, a())),
            ("a+", hir_plus(true, a())),
            ("a??", hir_quest(false, a())),
            ("a*?", hir_star(false, a())),
            ("a+?", hir_plus(false, a())),
            ("a{1}", hir_range(true, 1, Some(1), a())),
            ("a{1,}", hir_range(true, 1, None, a())),
            ("a{1,2}", hir_range(true, 1, Some(2), a())),
            ("a{1}?", hir_range(false, 1, Some(1), a())),
            ("a{1,}?", hir_range(false, 1, None, a())),
            ("a{1,2}?", hir_range(false, 1, Some(2), a())),
            ("ab?", hir_cat(vec![a(), hir_quest(true, b())])),
            ("(ab)?", hir_quest(true, hir_capture(1, hir_lit("ab")))),
            ("a|b?", hir_alt(vec![a(), hir_quest(true, b())])),
        ];
        for (pattern, expected) in cases {
            assert_eq!(t(pattern), expected);
        }
    }
2029
2030
    /// Concatenations and alternations of look-around assertions, with and
    /// without (nested) capture groups.
    #[test]
    fn cat_alt() {
        let start = || hir_look(hir::Look::Start);
        let end = || hir_look(hir::Look::End);
        let word = || hir_look(hir::Look::WordUnicode);
        let not_word = || hir_look(hir::Look::WordUnicodeNegate);

        // The three concatenations that recur in the expectations below.
        let start_end = || hir_cat(vec![start(), end()]);
        let end_word = || hir_cat(vec![end(), word()]);
        let word_not_word = || hir_cat(vec![word(), not_word()]);

        let cases = vec![
            ("(^$)", hir_capture(1, start_end())),
            ("^|$", hir_alt(vec![start(), end()])),
            (r"^|$|\b", hir_alt(vec![start(), end(), word()])),
            (
                r"^$|$\b|\b\B",
                hir_alt(vec![start_end(), end_word(), word_not_word()]),
            ),
            ("(^|$)", hir_capture(1, hir_alt(vec![start(), end()]))),
            (
                r"(^|$|\b)",
                hir_capture(1, hir_alt(vec![start(), end(), word()])),
            ),
            (
                r"(^$|$\b|\b\B)",
                hir_capture(
                    1,
                    hir_alt(vec![start_end(), end_word(), word_not_word()]),
                ),
            ),
            (
                r"(^$|($\b|(\b\B)))",
                hir_capture(
                    1,
                    hir_alt(vec![
                        start_end(),
                        hir_capture(
                            2,
                            hir_alt(vec![
                                end_word(),
                                hir_capture(3, word_not_word()),
                            ]),
                        ),
                    ]),
                ),
            ),
        ];
        for (pattern, expected) in cases {
            assert_eq!(t(pattern), expected);
        }
    }
2081
2082
    // Tests the HIR transformation of things like '[a-z]|[A-Z]' into
2083
    // '[A-Za-z]'. In other words, an alternation of just classes is always
2084
    // equivalent to a single class corresponding to the union of the branches
2085
    // in that class. (Unless some branches match invalid UTF-8 and others
2086
    // match non-ASCII Unicode.)
2087
    #[test]
    fn cat_class_flattened() {
        let actual = t(r"[a-z]|[A-Z]");
        assert_eq!(actual, hir_uclass(&[('A', 'Z'), ('a', 'z')]));

        // Combining all of the letter properties should give us the one giant
        // letter property.
        #[cfg(feature = "unicode-gencat")]
        {
            let actual = t(r"(?x)
                \p{Lowercase_Letter}
                |\p{Uppercase_Letter}
                |\p{Titlecase_Letter}
                |\p{Modifier_Letter}
                |\p{Other_Letter}
            ");
            assert_eq!(actual, hir_uclass_query(ClassQuery::Binary("letter")));
        }

        // Byte classes that can truly match invalid UTF-8 cannot be combined
        // with Unicode classes.
        let actual = t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]");
        let separate_branches = hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            hir_bclass(&[(b'\x90', b'\xFF')]),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ]);
        assert_eq!(actual, separate_branches);

        // Byte classes on their own can be combined, even if some are ASCII
        // and others are invalid UTF-8.
        let actual = t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]");
        let single_class =
            hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]);
        assert_eq!(actual, single_class);
    }
2120
2121
    /// Every POSIX-style ASCII class, plus negation, case folding, the
    /// byte-oriented variants and the errors for negated byte classes in
    /// UTF-8 mode.
    #[test]
    fn class_ascii() {
        use crate::ast::ClassAsciiKind as Kind;

        let plain = vec![
            ("[[:alnum:]]", Kind::Alnum),
            ("[[:alpha:]]", Kind::Alpha),
            ("[[:ascii:]]", Kind::Ascii),
            ("[[:blank:]]", Kind::Blank),
            ("[[:cntrl:]]", Kind::Cntrl),
            ("[[:digit:]]", Kind::Digit),
            ("[[:graph:]]", Kind::Graph),
            ("[[:lower:]]", Kind::Lower),
            ("[[:print:]]", Kind::Print),
            ("[[:punct:]]", Kind::Punct),
            ("[[:space:]]", Kind::Space),
            ("[[:upper:]]", Kind::Upper),
            ("[[:word:]]", Kind::Word),
            ("[[:xdigit:]]", Kind::Xdigit),
        ];
        for (pattern, kind) in plain {
            assert_eq!(t(pattern), hir_ascii_uclass(&kind));
        }

        assert_eq!(
            t("[[:^lower:]]"),
            hir_negate(hir_ascii_uclass(&Kind::Lower))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[[:lower:]]"),
            hir_uclass(&[
                ('A', 'Z'),
                ('a', 'z'),
                ('\u{17F}', '\u{17F}'),
                ('\u{212A}', '\u{212A}'),
            ])
        );

        assert_eq!(t("(?-u)[[:lower:]]"), hir_ascii_bclass(&Kind::Lower));
        assert_eq!(
            t("(?i-u)[[:lower:]]"),
            hir_case_fold(hir_ascii_bclass(&Kind::Lower))
        );

        // (pattern, start offset, end offset) of the rejected class; both
        // positions are on line 1, so the column is the offset plus one.
        let rejected =
            vec![("(?-u)[[:^lower:]]", 6, 16), ("(?i-u)[[:^lower:]]", 7, 17)];
        for (pattern, start, end) in rejected {
            let expected = TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(start, 1, start + 1),
                    Position::new(end, 1, end + 1),
                ),
            };
            assert_eq!(t_err(pattern), expected);
        }
    }
2225
2226
    /// Several ASCII classes inside one bracketed class are unioned.
    #[test]
    fn class_ascii_multiple() {
        // See: https://github.com/rust-lang/regex/issues/680
        let actual = t("[[:alnum:][:^ascii:]]");
        let expected = hir_union(
            hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
            hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
        );
        assert_eq!(actual, expected);

        let actual = t_bytes("(?-u)[[:alnum:][:^ascii:]]");
        let expected = hir_union(
            hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
            hir_bclass(&[(0x80, 0xFF)]),
        );
        assert_eq!(actual, expected);
    }
2244
2245
    // The Perl classes `\d`, `\s` and `\w` (and their negations) translate to
    // the matching Unicode class queries; the expected value is the same with
    // and without the `i` flag.
    #[test]
    #[cfg(feature = "unicode-perl")]
    fn class_perl_unicode() {
        // Builders for the expected (non-negated) classes.
        let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
        let space = || hir_uclass_query(ClassQuery::Binary("space"));

        // Unicode
        assert_eq!(t(r"\d"), digit());
        assert_eq!(t(r"\s"), space());
        assert_eq!(t(r"\w"), hir_uclass_perl_word());
        #[cfg(feature = "unicode-case")]
        {
            assert_eq!(t(r"(?i)\d"), digit());
            assert_eq!(t(r"(?i)\s"), space());
            assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
        }

        // Unicode, negated
        assert_eq!(t(r"\D"), hir_negate(digit()));
        assert_eq!(t(r"\S"), hir_negate(space()));
        assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
        #[cfg(feature = "unicode-case")]
        {
            assert_eq!(t(r"(?i)\D"), hir_negate(digit()));
            assert_eq!(t(r"(?i)\S"), hir_negate(space()));
            assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
        }
    }
2288
2289
    // With Unicode disabled, the Perl classes translate to ASCII byte classes.
    // Their negations translate only via `t_bytes`; `t_err` shows that the
    // same patterns are rejected with `InvalidUtf8` otherwise.
    #[test]
    fn class_perl_ascii() {
        // ASCII only
        for (pattern, kind) in [
            (r"(?-u)\d", ast::ClassAsciiKind::Digit),
            (r"(?-u)\s", ast::ClassAsciiKind::Space),
            (r"(?-u)\w", ast::ClassAsciiKind::Word),
            (r"(?i-u)\d", ast::ClassAsciiKind::Digit),
            (r"(?i-u)\s", ast::ClassAsciiKind::Space),
            (r"(?i-u)\w", ast::ClassAsciiKind::Word),
        ] {
            assert_eq!(t(pattern), hir_ascii_bclass(&kind));
        }

        // ASCII only, negated
        for (pattern, kind) in [
            (r"(?-u)\D", ast::ClassAsciiKind::Digit),
            (r"(?-u)\S", ast::ClassAsciiKind::Space),
            (r"(?-u)\W", ast::ClassAsciiKind::Word),
            (r"(?i-u)\D", ast::ClassAsciiKind::Digit),
            (r"(?i-u)\S", ast::ClassAsciiKind::Space),
            (r"(?i-u)\W", ast::ClassAsciiKind::Word),
        ] {
            assert_eq!(
                t_bytes(pattern),
                hir_negate(hir_ascii_bclass(&kind))
            );
        }

        // ASCII only, negated, with UTF-8 mode enabled.
        // In this case, negating any Perl class results in an error because
        // all such classes can match invalid UTF-8.
        //
        // Each entry is (pattern, start offset, end offset) of the reported
        // span; everything is on line 1, so the column is `offset + 1`.
        for (pattern, start, end) in [
            (r"(?-u)\D", 5, 7),
            (r"(?-u)\S", 5, 7),
            (r"(?-u)\W", 5, 7),
            (r"(?i-u)\D", 6, 8),
            (r"(?i-u)\S", 6, 8),
            (r"(?i-u)\W", 6, 8),
        ] {
            assert_eq!(
                t_err(pattern),
                TestError {
                    kind: hir::ErrorKind::InvalidUtf8,
                    span: Span::new(
                        Position::new(start, 1, start + 1),
                        Position::new(end, 1, end + 1),
                    ),
                },
            );
        }
    }
2407
2408
    // Without the `unicode-perl` feature, `\w` must fail to translate with
    // `UnicodePerlClassNotFound`.
    #[test]
    #[cfg(not(feature = "unicode-perl"))]
    fn class_perl_word_disabled() {
        // The reported span covers the whole two-character pattern.
        let whole_pattern =
            Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
        let expected = TestError {
            span: whole_pattern,
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
        };
        assert_eq!(t_err(r"\w"), expected);
    }
2422
2423
    // With both `unicode-perl` and `unicode-bool` disabled, `\s` must fail to
    // translate with `UnicodePerlClassNotFound`.
    #[test]
    #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
    fn class_perl_space_disabled() {
        // The reported span covers the whole two-character pattern.
        let whole_pattern =
            Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
        let expected = TestError {
            span: whole_pattern,
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
        };
        assert_eq!(t_err(r"\s"), expected);
    }
2437
2438
    // With both `unicode-perl` and `unicode-gencat` disabled, `\d` must fail
    // to translate with `UnicodePerlClassNotFound`.
    #[test]
    #[cfg(all(
        not(feature = "unicode-perl"),
        not(feature = "unicode-gencat")
    ))]
    fn class_perl_digit_disabled() {
        // The reported span covers the whole two-character pattern.
        let whole_pattern =
            Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
        let expected = TestError {
            span: whole_pattern,
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
        };
        assert_eq!(t_err(r"\d"), expected);
    }
2455
2456
    // `\p`/`\P` general category queries: the accepted spellings, negation,
    // the special `any`/`assigned`/`ascii` classes, and the errors reported
    // for unknown names or when Unicode is disabled.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn class_unicode_gencat() {
        // Each pattern must translate to the class query named beside it.
        for (pattern, class_name) in [
            (r"\pZ", "Z"),
            (r"\pz", "Z"),
            (r"\p{Separator}", "Z"),
            (r"\p{se      PaRa ToR}", "Z"),
            (r"\p{gc:Separator}", "Z"),
            (r"\p{gc=Separator}", "Z"),
            (r"\p{Other}", "Other"),
            (r"\pC", "Other"),
        ] {
            assert_eq!(
                t(pattern),
                hir_uclass_query(ClassQuery::Binary(class_name))
            );
        }

        // Negated spellings of the separator category.
        for pattern in [r"\PZ", r"\P{separator}", r"\P{gc!=separator}"] {
            assert_eq!(
                t(pattern),
                hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
            );
        }

        // Special classes, with and without the `gc:` prefix.
        for (pattern, class_name) in [
            (r"\p{any}", "Any"),
            (r"\p{assigned}", "Assigned"),
            (r"\p{ascii}", "ASCII"),
            (r"\p{gc:any}", "Any"),
            (r"\p{gc:assigned}", "Assigned"),
            (r"\p{gc:ascii}", "ASCII"),
        ] {
            assert_eq!(
                t(pattern),
                hir_uclass_query(ClassQuery::Binary(class_name))
            );
        }

        // Error cases: (pattern, expected kind, span start, span end). All
        // spans are on line 1, so the column is `offset + 1`.
        for (pattern, kind, start, end) in [
            (r"(?-u)\pZ", hir::ErrorKind::UnicodeNotAllowed, 5, 8),
            (r"(?-u)\p{Separator}", hir::ErrorKind::UnicodeNotAllowed, 5, 18),
            (r"\pE", hir::ErrorKind::UnicodePropertyNotFound, 0, 3),
            (r"\p{Foo}", hir::ErrorKind::UnicodePropertyNotFound, 0, 7),
            (
                r"\p{gc:Foo}",
                hir::ErrorKind::UnicodePropertyValueNotFound,
                0,
                10,
            ),
        ] {
            assert_eq!(
                t_err(pattern),
                TestError {
                    kind,
                    span: Span::new(
                        Position::new(start, 1, start + 1),
                        Position::new(end, 1, end + 1),
                    ),
                }
            );
        }
    }
2569
2570
    // Without the `unicode-gencat` feature, general category queries must
    // fail with `UnicodePropertyNotFound`.
    #[test]
    #[cfg(not(feature = "unicode-gencat"))]
    fn class_unicode_gencat_disabled() {
        // (pattern, end offset of the reported span); every span starts at
        // offset 0 on line 1.
        for (pattern, end) in [(r"\p{Separator}", 13), (r"\p{Any}", 7)] {
            assert_eq!(
                t_err(pattern),
                TestError {
                    kind: hir::ErrorKind::UnicodePropertyNotFound,
                    span: Span::new(
                        Position::new(0, 1, 1),
                        Position::new(end, 1, end + 1),
                    ),
                }
            );
        }
    }
2595
2596
    // Script queries: `\p{Greek}` (plain, case-folded, negated) and the error
    // reported for unknown `sc`/`scx` values.
    #[test]
    #[cfg(feature = "unicode-script")]
    fn class_unicode_script() {
        let greek = || hir_uclass_query(ClassQuery::Binary("Greek"));

        assert_eq!(t(r"\p{Greek}"), greek());
        #[cfg(feature = "unicode-case")]
        {
            assert_eq!(t(r"(?i)\p{Greek}"), hir_case_fold(greek()));
            assert_eq!(
                t(r"(?i)\P{Greek}"),
                hir_negate(hir_case_fold(greek()))
            );
        }

        // (pattern, end offset of the reported span); every span starts at
        // offset 0 on line 1.
        for (pattern, end) in [(r"\p{sc:Foo}", 10), (r"\p{scx:Foo}", 11)] {
            assert_eq!(
                t_err(pattern),
                TestError {
                    kind: hir::ErrorKind::UnicodePropertyValueNotFound,
                    span: Span::new(
                        Position::new(0, 1, 1),
                        Position::new(end, 1, end + 1),
                    ),
                }
            );
        }
    }
2637
2638
    // Without the `unicode-script` feature, script queries must fail with
    // `UnicodePropertyNotFound`.
    #[test]
    #[cfg(not(feature = "unicode-script"))]
    fn class_unicode_script_disabled() {
        // (pattern, end offset of the reported span); every span starts at
        // offset 0 on line 1.
        for (pattern, end) in [(r"\p{Greek}", 9), (r"\p{scx:Greek}", 13)] {
            assert_eq!(
                t_err(pattern),
                TestError {
                    kind: hir::ErrorKind::UnicodePropertyNotFound,
                    span: Span::new(
                        Position::new(0, 1, 1),
                        Position::new(end, 1, end + 1),
                    ),
                }
            );
        }
    }
2663
2664
    // An unknown value for the `age` property must be reported as
    // `UnicodePropertyValueNotFound`.
    #[test]
    #[cfg(feature = "unicode-age")]
    fn class_unicode_age() {
        // The reported span covers the whole eleven-character pattern.
        let whole_pattern =
            Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
        let expected = TestError {
            span: whole_pattern,
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
        };
        assert_eq!(t_err(r"\p{age:Foo}"), expected);
    }
2678
2679
    // Negating `any` must translate to a Unicode class with no ranges.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn class_unicode_any_empty() {
        let empty_class = hir_uclass(&[]);
        assert_eq!(t(r"\P{any}"), empty_class);
    }
2684
2685
    // Without the `unicode-age` feature, an `age` query must fail with
    // `UnicodePropertyNotFound`.
    #[test]
    #[cfg(not(feature = "unicode-age"))]
    fn class_unicode_age_disabled() {
        // The reported span covers the whole eleven-character pattern.
        let whole_pattern =
            Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
        let expected = TestError {
            span: whole_pattern,
            kind: hir::ErrorKind::UnicodePropertyNotFound,
        };
        assert_eq!(t_err(r"\p{age:3.0}"), expected);
    }
2699
2700
    // Translation of bracketed classes: literals and ranges, nested Perl and
    // Unicode classes, byte mode, case folding, negation, and a handful of
    // punctuation edge cases.
    #[test]
    fn class_bracketed() {
        // Builders for expected values that recur below. The feature-gated
        // ones carry a gate that covers every assertion using them.
        let urange = |lo: char, hi: char| hir_uclass(&[(lo, hi)]);
        let brange = |lo: u8, hi: u8| hir_bclass(&[(lo, hi)]);
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
        #[cfg(feature = "unicode-gencat")]
        let separator = || hir_uclass_query(ClassQuery::Binary("separator"));
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        let greek_folded =
            || hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")));

        assert_eq!(t("[a]"), hir_lit("a"));
        assert_eq!(t("[ab]"), urange('a', 'b'));
        assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
        for (pattern, lo, hi) in [
            ("[a-z]", 'a', 'z'),
            ("[a-fd-h]", 'a', 'h'),
            ("[a-fg-m]", 'a', 'm'),
            (r"[\x00]", '\0', '\0'),
            (r"[\n]", '\n', '\n'),
            ("[\n]", '\n', '\n'),
        ] {
            assert_eq!(t(pattern), urange(lo, hi));
        }
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[\d]"), digit());
        #[cfg(feature = "unicode-gencat")]
        {
            assert_eq!(t(r"[\pZ]"), separator());
            assert_eq!(t(r"[\p{separator}]"), separator());
        }
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[^\D]"), digit());
        #[cfg(feature = "unicode-gencat")]
        {
            assert_eq!(t(r"[^\PZ]"), separator());
            assert_eq!(t(r"[^\P{separator}]"), separator());
        }
        #[cfg(all(
            feature = "unicode-case",
            any(feature = "unicode-perl", feature = "unicode-gencat")
        ))]
        assert_eq!(t(r"(?i)[^\D]"), digit());
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(t(r"(?i)[^\P{greek}]"), greek_folded());

        // Byte mode.
        assert_eq!(t("(?-u)[a]"), brange(b'a', b'a'));
        assert_eq!(t(r"(?-u)[\x00]"), brange(b'\0', b'\0'));
        assert_eq!(t_bytes(r"(?-u)[\xFF]"), brange(b'\xFF', b'\xFF'));

        // Case insensitive.
        #[cfg(feature = "unicode-case")]
        {
            assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
            assert_eq!(
                t("(?i)[k]"),
                hir_uclass(&[
                    ('K', 'K'),
                    ('k', 'k'),
                    ('\u{212A}', '\u{212A}'),
                ])
            );
            assert_eq!(
                t("(?i)[β]"),
                hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
            );
        }
        assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

        // Negated.
        assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
        assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
        assert_eq!(
            t_bytes("(?-u)[^a]"),
            class_negate(bclass(&[(b'a', b'a')]))
        );
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[^\d]"), hir_negate(digit()));
        #[cfg(feature = "unicode-gencat")]
        {
            assert_eq!(t(r"[^\pZ]"), hir_negate(separator()));
            assert_eq!(t(r"[^\p{separator}]"), hir_negate(separator()));
        }
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        {
            assert_eq!(t(r"(?i)[^\p{greek}]"), hir_negate(greek_folded()));
            assert_eq!(t(r"(?i)[\P{greek}]"), hir_negate(greek_folded()));
        }

        // Test some weird cases.
        for (pattern, lo, hi) in [
            (r"[\[]", '[', '['),
            (r"[&]", '&', '&'),
            (r"[\&]", '&', '&'),
            (r"[\&\&]", '&', '&'),
            (r"[\x00-&]", '\0', '&'),
            (r"[&-\xFF]", '&', '\u{FF}'),
            (r"[~]", '~', '~'),
            (r"[\~]", '~', '~'),
            (r"[\~\~]", '~', '~'),
            (r"[\x00-~]", '\0', '~'),
            (r"[~-\xFF]", '~', '\u{FF}'),
            (r"[-]", '-', '-'),
            (r"[\-]", '-', '-'),
            (r"[\-\-]", '-', '-'),
            (r"[\x00-\-]", '\0', '-'),
            (r"[\--\xFF]", '-', '\u{FF}'),
        ] {
            assert_eq!(t(pattern), urange(lo, hi));
        }

        assert_eq!(
            t_err("(?-u)[^a]"),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(5, 1, 6),
                    Position::new(9, 1, 10)
                ),
            }
        );
        #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
        {
            assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]));
            assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]));
        }
    }
2839
2840
    // Items listed side by side in a bracketed class translate to the union
    // of the individual items, also when nested, negated or case-folded.
    #[test]
    fn class_bracketed_union() {
        // Builders for the expected sub-classes. Each one is gated on the
        // features required by the assertions that use it.
        #[cfg(feature = "unicode-gencat")]
        let separator = || hir_uclass_query(ClassQuery::Binary("separator"));
        #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
        let greek_or_separator = || {
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                separator(),
            )
        };
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        let age_3_0 = || {
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            })
        };
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        let age_greek_separator =
            || hir_union(age_3_0(), greek_or_separator());

        assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[a\pZb]"),
            hir_union(hir_uclass(&[('a', 'b')]), separator())
        );
        #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
        assert_eq!(t(r"[\pZ\p{Greek}]"), greek_or_separator());
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        {
            assert_eq!(t(r"[\p{age:3.0}\pZ\p{Greek}]"), age_greek_separator());
            assert_eq!(
                t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
                hir_union(
                    age_3_0(),
                    hir_union(
                        hir_uclass_query(ClassQuery::Binary("cyrillic")),
                        greek_or_separator()
                    )
                )
            );
        }

        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-case",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
            hir_case_fold(age_greek_separator())
        );
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(age_greek_separator())
        );
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-case",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_case_fold(age_greek_separator()))
        );
    }
2956
2957
    // A negated class nested inside another bracketed class is unioned with
    // the outer items, both when the outer class is negated and when it is
    // not.
    #[test]
    fn class_bracketed_nested() {
        let not_c = || class_negate(uclass(&[('c', 'c')]));
        let only_c = || hir_uclass(&[('c', 'c')]);

        assert_eq!(t(r"[a[^c]]"), not_c());
        assert_eq!(t(r"[a-b[^c]]"), not_c());
        assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));

        assert_eq!(t(r"[^a[^c]]"), only_c());
        assert_eq!(t(r"[^a-b[^c]]"), only_c());

        // The same patterns with case folding enabled.
        #[cfg(feature = "unicode-case")]
        {
            let not_c_folded =
                || hir_negate(class_case_fold(uclass(&[('c', 'c')])));
            let c_folded = || hir_uclass(&[('C', 'C'), ('c', 'c')]);

            assert_eq!(t(r"(?i)[a[^c]]"), not_c_folded());
            assert_eq!(t(r"(?i)[a-b[^c]]"), not_c_folded());

            assert_eq!(t(r"(?i)[^a[^c]]"), c_folded());
            assert_eq!(t(r"(?i)[^a-b[^c]]"), c_folded());
        }

        assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]));
        #[cfg(feature = "unicode-case")]
        assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]));
    }
2989
2990
    // The `&&` intersection operator inside bracketed classes, in Unicode and
    // byte mode, with and without case folding, plus punctuation edge cases
    // and operator precedence.
    //
    // Every table entry is (pattern, low end, high end) of the single range
    // the translated class is expected to contain.
    #[test]
    fn class_bracketed_intersect() {
        // Unicode mode.
        for (pattern, lo, hi) in [
            ("[abc&&b-c]", 'b', 'c'),
            ("[abc&&[b-c]]", 'b', 'c'),
            ("[[abc]&&[b-c]]", 'b', 'c'),
            ("[a-z&&b-y&&c-x]", 'c', 'x'),
            ("[c-da-b&&a-d]", 'a', 'd'),
            ("[a-d&&c-da-b]", 'a', 'd'),
            (r"[a-z&&a-c]", 'a', 'c'),
            (r"[[a-z&&a-c]]", 'a', 'c'),
        ] {
            assert_eq!(t(pattern), hir_uclass(&[(lo, hi)]));
        }
        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

        // Byte mode.
        for (pattern, lo, hi) in [
            ("(?-u)[abc&&b-c]", b'b', b'c'),
            ("(?-u)[abc&&[b-c]]", b'b', b'c'),
            ("(?-u)[[abc]&&[b-c]]", b'b', b'c'),
            ("(?-u)[a-z&&b-y&&c-x]", b'c', b'x'),
            ("(?-u)[c-da-b&&a-d]", b'a', b'd'),
            ("(?-u)[a-d&&c-da-b]", b'a', b'd'),
        ] {
            assert_eq!(t(pattern), hir_bclass(&[(lo, hi)]));
        }

        // Unicode mode, case insensitive.
        #[cfg(feature = "unicode-case")]
        for (pattern, lo, hi) in [
            ("(?i)[abc&&b-c]", 'b', 'c'),
            ("(?i)[abc&&[b-c]]", 'b', 'c'),
            ("(?i)[[abc]&&[b-c]]", 'b', 'c'),
            ("(?i)[a-z&&b-y&&c-x]", 'c', 'x'),
            ("(?i)[c-da-b&&a-d]", 'a', 'd'),
            ("(?i)[a-d&&c-da-b]", 'a', 'd'),
        ] {
            assert_eq!(t(pattern), hir_case_fold(hir_uclass(&[(lo, hi)])));
        }

        // Byte mode, case insensitive.
        for (pattern, lo, hi) in [
            ("(?i-u)[abc&&b-c]", b'b', b'c'),
            ("(?i-u)[abc&&[b-c]]", b'b', b'c'),
            ("(?i-u)[[abc]&&[b-c]]", b'b', b'c'),
            ("(?i-u)[a-z&&b-y&&c-x]", b'c', b'x'),
            ("(?i-u)[c-da-b&&a-d]", b'a', b'd'),
            ("(?i-u)[a-d&&c-da-b]", b'a', b'd'),
        ] {
            assert_eq!(t(pattern), hir_case_fold(hir_bclass(&[(lo, hi)])));
        }

        // In `[a^]`, `^` does not need to be escaped, so it makes sense that
        // `^` is also allowed to be unescaped after `&&`.
        let caret = hir_uclass(&[('^', '^')]);
        assert_eq!(t(r"[\^&&^]"), caret);
        // `]` needs to be escaped after `&&` since it's not at start of class.
        let close_bracket = hir_uclass(&[(']', ']')]);
        assert_eq!(t(r"[]&&\]]"), close_bracket);
        assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
        for pattern in [r"[\&&&&]", r"[\&&&\&]"] {
            assert_eq!(t(pattern), hir_uclass(&[('&', '&')]));
        }
        // Test precedence.
        let a_to_w_without_c_to_g = hir_uclass(&[('a', 'b'), ('h', 'w')]);
        assert_eq!(t(r"[a-w&&[^c-g]z]"), a_to_w_without_c_to_g);
    }
3079
3080
    // Checks how class negation (`^`) combines with the intersection
    // operator (`&&`) inside bracketed classes, for Unicode classes and for
    // `(?-u)` byte classes.
    #[test]
    fn class_bracketed_intersect_negate() {
        // Unicode classes.
        #[cfg(feature = "unicode-perl")]
        {
            let not_digit =
                hir_negate(hir_uclass_query(ClassQuery::Binary("digit")));
            assert_eq!(t(r"[^\w&&\d]"), not_digit);
        }
        {
            let not_a_to_c = hir_negate(hir_uclass(&[('a', 'c')]));
            assert_eq!(t(r"[^[a-z&&a-c]]"), not_a_to_c);
        }
        #[cfg(feature = "unicode-perl")]
        {
            let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
            assert_eq!(t(r"[^[\w&&\d]]"), hir_negate(digit()));
            // A double negation yields the plain intersection again.
            assert_eq!(t(r"[^[^\w&&\d]]"), digit());
            assert_eq!(
                t(r"[[[^\w]&&[^\d]]]"),
                hir_negate(hir_uclass_perl_word())
            );
        }

        // Byte classes.
        let ascii_digit = || hir_ascii_bclass(&ast::ClassAsciiKind::Digit);
        #[cfg(feature = "unicode-perl")]
        assert_eq!(t_bytes(r"(?-u)[^\w&&\d]"), hir_negate(ascii_digit()));
        assert_eq!(
            t_bytes(r"(?-u)[^[a-z&&a-c]]"),
            hir_negate(hir_bclass(&[(b'a', b'c')]))
        );
        assert_eq!(t_bytes(r"(?-u)[^[\w&&\d]]"), hir_negate(ascii_digit()));
        assert_eq!(t_bytes(r"(?-u)[^[^\w&&\d]]"), ascii_digit());
        assert_eq!(
            t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
        );
    }
3123
3124
    // Checks the class difference operator (`--`) for a Unicode class and for
    // a `(?-u)` byte class.
    #[test]
    fn class_bracketed_difference() {
        #[cfg(feature = "unicode-gencat")]
        {
            let letters = hir_uclass_query(ClassQuery::Binary("letter"));
            let ascii = hir_uclass(&[('\0', '\x7F')]);
            let expected = hir_difference(letters, ascii);
            assert_eq!(t(r"[\pL--[:ascii:]]"), expected);
        }

        let ascii_upper = hir_bclass(&[(b'A', b'Z')]);
        assert_eq!(t(r"(?-u)[[:alpha:]--[:lower:]]"), ascii_upper);
    }
3140
3141
    // Checks the symmetric difference operator (`~~`) for Unicode classes and
    // for `(?-u)` byte classes.
    #[test]
    fn class_bracketed_symmetric_difference() {
        #[cfg(feature = "unicode-script")]
        {
            // The ranges expected in the symmetric difference of `sc:Greek`
            // and `scx:Greek`.
            let expected_ranges = [
                ('·', '·'),
                ('\u{0300}', '\u{0301}'),
                ('\u{0304}', '\u{0304}'),
                ('\u{0306}', '\u{0306}'),
                ('\u{0308}', '\u{0308}'),
                ('\u{0313}', '\u{0313}'),
                ('\u{0342}', '\u{0342}'),
                ('\u{0345}', '\u{0345}'),
                ('ʹ', 'ʹ'),
                ('\u{1DC0}', '\u{1DC1}'),
                ('⁝', '⁝'),
            ];
            assert_eq!(
                t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
                hir_uclass(&expected_ranges)
            );
        }

        // `a-g` and `c-j` overlap on `c-g`, which leaves `a-b` and `h-j`.
        let unicode_expected = hir_uclass(&[('a', 'b'), ('h', 'j')]);
        assert_eq!(t(r"[a-g~~c-j]"), unicode_expected);

        let bytes_expected = hir_bclass(&[(b'a', b'b'), (b'h', b'j')]);
        assert_eq!(t(r"(?-u)[a-g~~c-j]"), bytes_expected);
    }
3180
3181
    // Checks translation of patterns written in `(?x)` mode, where whitespace
    // and `# ...` comments appear between (and inside) pattern constructs.
    #[test]
    fn ignore_whitespace() {
        assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));

        // Different spellings of a hex escape that all translate to `S`.
        let spellings_of_s = [
            r"(?x)\x { 53 }",
            r"(?x)\x # comment
{ # comment
    53 # comment
} #comment",
            r"(?x)\x 53",
            r"(?x)\x # comment
        53 # comment",
            r"(?x)\x5 3",
        ];
        for pattern in spellings_of_s {
            assert_eq!(t(pattern), hir_lit("S"));
        }

        #[cfg(feature = "unicode-gencat")]
        {
            let separator = hir_uclass_query(ClassQuery::Binary("separator"));
            assert_eq!(
                t(r"(?x)\p # comment
{ # comment
    Separator # comment
} # comment"),
                separator
            );
        }

        let a_five_to_ten = hir_range(true, 5, Some(10), hir_lit("a"));
        assert_eq!(
            t(r"(?x)a # comment
{ # comment
    5 # comment
    , # comment
    10 # comment
} # comment"),
            a_five_to_ten
        );

        // An escaped space is kept as a literal space.
        assert_eq!(t(r"(?x)a\  # hi there"), hir_lit("a "));
    }
3222
3223
    // Checks `Properties::is_utf8` for patterns translated via `props_bytes`.
    #[test]
    fn analysis_is_utf8() {
        // Patterns that must be reported as UTF-8.
        let utf8_patterns = [
            r"a",
            r"ab",
            r"(?-u)a",
            r"(?-u)ab",
            r"\xFF",
            r"\xFF\xFF",
            r"[^a]",
            r"[^a][^a]",
            r"\b",
            r"\B",
            r"(?-u)\b",
            r"(?-u)\B",
        ];
        for pattern in utf8_patterns {
            assert!(props_bytes(pattern).is_utf8(), "pattern: {:?}", pattern);
        }

        // Patterns that must not be reported as UTF-8.
        let non_utf8_patterns = [
            r"(?-u)\xFF",
            r"(?-u)\xFF\xFF",
            r"(?-u)[^a]",
            r"(?-u)[^a][^a]",
        ];
        for pattern in non_utf8_patterns {
            assert!(!props_bytes(pattern).is_utf8(), "pattern: {:?}", pattern);
        }
    }
3245
3246
    // Checks `Properties::explicit_captures_len` against a table of
    // `(expected count, pattern)` pairs.
    #[test]
    fn analysis_captures_len() {
        let cases = [
            (0, r"a"),
            (0, r"(?:a)"),
            (0, r"(?i-u:a)"),
            (0, r"(?i-u)a"),
            (1, r"(a)"),
            (1, r"(?P<foo>a)"),
            (1, r"()"),
            (1, r"()a"),
            (1, r"(a)+"),
            (2, r"(a)(b)"),
            (2, r"(a)|(b)"),
            (2, r"((a))"),
            (1, r"([a&&b])"),
        ];
        for (expected, pattern) in cases {
            assert_eq!(
                expected,
                props(pattern).explicit_captures_len(),
                "pattern: {:?}",
                pattern
            );
        }
    }
3262
3263
    // Checks `Properties::static_explicit_captures_len` against a table of
    // `(expected, pattern)` pairs.
    #[test]
    fn analysis_static_captures_len() {
        let cases = [
            (Some(0), r""),
            (Some(0), r"foo|bar"),
            (None, r"(foo)|bar"),
            (None, r"foo|(bar)"),
            (Some(1), r"(foo|bar)"),
            (Some(1), r"(a|b|c|d|e|f)"),
            (Some(1), r"(a)|(b)|(c)|(d)|(e)|(f)"),
            (Some(2), r"(a)(b)|(c)(d)|(e)(f)"),
            (Some(6), r"(a)(b)(c)(d)(e)(f)"),
            (Some(3), r"(a)(b)(extra)|(a)(b)()"),
            (Some(3), r"(a)(b)((?:extra)?)"),
            (None, r"(a)(b)(extra)?"),
            (Some(1), r"(foo)|(bar)"),
            (Some(2), r"(foo)(bar)"),
            (Some(2), r"(foo)+(bar)"),
            (None, r"(foo)*(bar)"),
            (Some(0), r"(foo)?{0}"),
            (None, r"(foo)?{1}"),
            (Some(1), r"(foo){1}"),
            (Some(1), r"(foo){1,}"),
            (Some(1), r"(foo){1,}?"),
            (None, r"(foo){1,}??"),
            (None, r"(foo){0,}"),
            (Some(1), r"(foo)(?:bar)"),
            (Some(2), r"(foo(?:bar)+)(?:baz(boo))"),
            (Some(2), r"(?P<bar>foo)(?:bar)(bal|loon)"),
            (
                Some(2),
                r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#,
            ),
        ];
        for (expected, pattern) in cases {
            assert_eq!(
                expected,
                props(pattern).static_explicit_captures_len(),
                "pattern: {:?}",
                pattern
            );
        }
    }
3297
3298
    // For each pattern, checks that its look set is non-empty and that its
    // minimum length matches the expected value.
    #[test]
    fn analysis_is_all_assertions() {
        let cases = [
            // Positive examples: minimum length is zero.
            (r"\b", 0),
            (r"\B", 0),
            (r"^", 0),
            (r"$", 0),
            (r"\A", 0),
            (r"\z", 0),
            (r"$^\z\A\b\B", 0),
            (r"$|^|\z|\A|\b|\B", 0),
            (r"^$|$^", 0),
            (r"((\b)+())*^", 0),
            // Negative example: the literal contributes to the minimum length.
            (r"^a", 1),
        ];
        for (pattern, min_len) in cases {
            let p = props(pattern);
            assert!(!p.look_set().is_empty(), "pattern: {:?}", pattern);
            assert_eq!(
                p.minimum_len(),
                Some(min_len),
                "pattern: {:?}",
                pattern
            );
        }
    }
3346
3347
    // Checks that `look_set_prefix_any` reports the ASCII word boundary for a
    // pattern whose prefix is an alternation between `\b` and `_`.
    #[test]
    fn analysis_look_set_prefix_any() {
        let pattern = r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))";
        let prefix_looks = props(pattern).look_set_prefix_any();
        assert!(prefix_looks.contains(Look::WordAscii));
    }
3352
3353
    // Checks which patterns are reported as anchored. A pattern counts as
    // anchored at the start when `look_set_prefix()` contains `Look::Start`,
    // and as anchored at the end when `look_set_suffix()` contains
    // `Look::End`.
    #[test]
    fn analysis_is_anchored() {
        let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
        let is_end = |p| props(p).look_set_suffix().contains(Look::End);

        // Positive examples.
        assert!(is_start(r"^"));
        assert!(is_end(r"$"));

        assert!(is_start(r"^^"));
        assert!(is_end(r"$$"));

        assert!(is_start(r"^$"));
        assert!(is_end(r"^$"));

        assert!(is_start(r"^foo"));
        assert!(is_end(r"foo$"));

        assert!(is_start(r"^foo|^bar"));
        assert!(is_end(r"foo$|bar$"));

        assert!(is_start(r"^(foo|bar)"));
        assert!(is_end(r"(foo|bar)$"));

        assert!(is_start(r"^+"));
        assert!(is_end(r"$+"));
        assert!(is_start(r"^++"));
        assert!(is_end(r"$++"));
        assert!(is_start(r"(^)+"));
        assert!(is_end(r"($)+"));

        // `$^` must be anchored on both sides, so check each side once
        // instead of checking the start side twice.
        assert!(is_start(r"$^"));
        assert!(is_end(r"$^"));
        assert!(is_start(r"$^|^$"));
        assert!(is_end(r"$^|^$"));

        assert!(is_start(r"\b^"));
        assert!(is_end(r"$\b"));
        assert!(is_start(r"^(?m:^)"));
        assert!(is_end(r"(?m:$)$"));
        assert!(is_start(r"(?m:^)^"));
        assert!(is_end(r"$(?m:$)"));

        // Negative examples.
        assert!(!is_start(r"(?m)^"));
        assert!(!is_end(r"(?m)$"));
        assert!(!is_start(r"(?m:^$)|$^"));
        assert!(!is_end(r"(?m:^$)|$^"));
        assert!(!is_start(r"$^|(?m:^$)"));
        assert!(!is_end(r"$^|(?m:^$)"));

        assert!(!is_start(r"a^"));
        assert!(!is_start(r"$a"));

        assert!(!is_end(r"a^"));
        assert!(!is_end(r"$a"));

        assert!(!is_start(r"^foo|bar"));
        assert!(!is_end(r"foo|bar$"));

        // A repetition that may match zero times does not anchor.
        assert!(!is_start(r"^*"));
        assert!(!is_end(r"$*"));
        assert!(!is_start(r"^*+"));
        assert!(!is_end(r"$*+"));
        assert!(!is_start(r"^+*"));
        assert!(!is_end(r"$+*"));
        assert!(!is_start(r"(^)*"));
        assert!(!is_end(r"($)*"));
    }
3422
3423
    // Checks whether the full look set of a pattern contains `Look::Start` or
    // `Look::End`, using a table of `(pattern, look, expected)` entries.
    #[test]
    fn analysis_is_any_anchored() {
        let cases = [
            // Positive examples.
            (r"^", Look::Start, true),
            (r"$", Look::End, true),
            (r"\A", Look::Start, true),
            (r"\z", Look::End, true),
            // Negative examples.
            (r"(?m)^", Look::Start, false),
            (r"(?m)$", Look::End, false),
            (r"$", Look::Start, false),
            (r"^", Look::End, false),
        ];
        for (pattern, look, expected) in cases {
            assert_eq!(
                expected,
                props(pattern).look_set().contains(look),
                "pattern: {:?}",
                pattern
            );
        }
    }
3440
3441
    // Checks which patterns have a minimum length of zero (i.e. can match the
    // empty string) according to `Properties::minimum_len`.
    #[test]
    fn analysis_can_empty() {
        // Positive examples.
        let empty_before_gencat = [
            r"",
            r"()",
            r"()*",
            r"()+",
            r"()?",
            r"a*",
            r"a?",
            r"a{0}",
            r"a{0,}",
            r"a{0,1}",
            r"a{0,10}",
        ];
        for pattern in empty_before_gencat {
            assert_eq!(Some(0), props_bytes(pattern).minimum_len());
        }
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(Some(0), props_bytes(r"\pL*").minimum_len());
        let empty_after_gencat = [
            r"a*|b",
            r"b|a*",
            r"a|",
            r"|a",
            r"a||b",
            r"a*a?(abcd)*",
            r"^",
            r"$",
            r"(?m)^",
            r"(?m)$",
            r"\A",
            r"\z",
            r"\B",
            r"(?-u)\B",
            r"\b",
            r"(?-u)\b",
        ];
        for pattern in empty_after_gencat {
            assert_eq!(Some(0), props_bytes(pattern).minimum_len());
        }

        // Negative examples.
        let non_empty_before_gencat = [
            r"a+",
            r"a{1}",
            r"a{1,}",
            r"a{1,2}",
            r"a{1,10}",
            r"b|a",
            r"a*a+(abcd)*",
        ];
        for pattern in non_empty_before_gencat {
            assert_ne!(Some(0), props_bytes(pattern).minimum_len());
        }
        #[cfg(feature = "unicode-gencat")]
        assert_ne!(Some(0), props_bytes(r"\P{any}").minimum_len());
        for pattern in [r"[a--a]", r"[a&&b]"] {
            assert_ne!(Some(0), props_bytes(pattern).minimum_len());
        }
    }
3491
3492
    // Checks `Properties::is_literal` for patterns that should and should not
    // be reported as a literal.
    #[test]
    fn analysis_is_literal() {
        // Positive examples.
        let literals = [
            r"a",
            r"ab",
            r"abc",
            r"(?m)abc",
            r"(?:a)",
            r"foo(?:a)",
            r"(?:a)foo",
            r"[a]",
        ];
        for pattern in literals {
            assert!(props(pattern).is_literal(), "pattern: {:?}", pattern);
        }

        // Negative examples.
        let non_literals = [
            r"",
            r"^",
            r"a|b",
            r"(a)",
            r"a+",
            r"foo(a)",
            r"(a)foo",
            r"[ab]",
        ];
        for pattern in non_literals {
            assert!(!props(pattern).is_literal(), "pattern: {:?}", pattern);
        }
    }
3514
3515
    // Checks `Properties::is_alternation_literal` for patterns that should
    // and should not be reported as an alternation of literals.
    #[test]
    fn analysis_is_alternation_literal() {
        // Positive examples.
        let alternation_literals = [
            r"a",
            r"ab",
            r"abc",
            r"(?m)abc",
            r"foo|bar",
            r"foo|bar|baz",
            r"[a]",
            r"(?:ab)|cd",
            r"ab|(?:cd)",
        ];
        for pattern in alternation_literals {
            assert!(
                props(pattern).is_alternation_literal(),
                "pattern: {:?}",
                pattern
            );
        }

        // Negative examples.
        let others = [
            r"",
            r"^",
            r"(a)",
            r"a+",
            r"foo(a)",
            r"(a)foo",
            r"[ab]",
            r"[ab]|b",
            r"a|[ab]",
            r"(a)|b",
            r"a|(b)",
            r"a|b",
            r"a|b|c",
            r"[a]|b",
            r"a|[b]",
            r"(?:a)|b",
            r"a|(?:b)",
            r"(?:z|xx)@|xx",
        ];
        for pattern in others {
            assert!(
                !props(pattern).is_alternation_literal(),
                "pattern: {:?}",
                pattern
            );
        }
    }
3548
3549
    // This tests that the smart Hir::repetition constructor performs some
    // basic simplifications.
    #[test]
    fn smart_repetition() {
        let cases = [
            (r"a{0}", Hir::empty()),
            (r"a{1}", hir_lit("a")),
            (r"\B{32111}", hir_look(hir::Look::WordUnicodeNegate)),
        ];
        for (pattern, expected) in cases {
            assert_eq!(t(pattern), expected);
        }
    }
3557
3558
    // This tests that the smart Hir::concat constructor flattens nested
    // groups and merges adjacent literals in the way we expect.
    #[test]
    fn smart_concat() {
        let merged = [
            ("", Hir::empty()),
            ("(?:)", Hir::empty()),
            ("abc", hir_lit("abc")),
            ("(?:foo)(?:bar)", hir_lit("foobar")),
            ("quux(?:foo)(?:bar)baz", hir_lit("quuxfoobarbaz")),
        ];
        for (pattern, expected) in merged {
            assert_eq!(t(pattern), expected);
        }

        // An anchor in the middle splits the literal in two, regardless of
        // how the surrounding non-capturing groups are nested.
        let split_at_anchor = || {
            hir_cat(vec![
                hir_lit("foobar"),
                hir_look(hir::Look::Start),
                hir_lit("bazquux"),
            ])
        };
        assert_eq!(t("foo(?:bar^baz)quux"), split_at_anchor());
        assert_eq!(t("foo(?:ba(?:r^b)az)quux"), split_at_anchor());
    }
3584
3585
    // This tests that the smart Hir::alternation constructor simplifies the
    // given exprs in a way we expect.
    #[test]
    fn smart_alternation() {
        // Builds an alternation whose branches are the given literals.
        let lit_alt = |words: &[&str]| {
            hir_alt(words.iter().map(|word| hir_lit(word)).collect())
        };
        // The class `[A-Z]`.
        let upper = || hir_uclass(&[('A', 'Z')]);

        // Nested alternations are flattened.
        assert_eq!(t("(?:foo)|(?:bar)"), lit_alt(&["foo", "bar"]));
        assert_eq!(
            t("quux|(?:abc|def|xyz)|baz"),
            lit_alt(&["quux", "abc", "def", "xyz", "baz"])
        );
        assert_eq!(
            t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
            lit_alt(&["quux", "abc", "def", "mno", "xyz", "baz"])
        );

        // Single-character branches become a class.
        assert_eq!(
            t("a|b|c|d|e|f|x|y|z"),
            hir_uclass(&[('a', 'f'), ('x', 'z')]),
        );

        // Tests that we lift common prefixes out of an alternation.
        assert_eq!(
            t("[A-Z]foo|[A-Z]quux"),
            hir_cat(vec![upper(), lit_alt(&["foo", "quux"])]),
        );
        assert_eq!(
            t("[A-Z][A-Z]|[A-Z]quux"),
            hir_cat(vec![upper(), hir_alt(vec![upper(), hir_lit("quux")])]),
        );
        assert_eq!(
            t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
            hir_cat(vec![
                upper(),
                upper(),
                hir_alt(vec![Hir::empty(), hir_lit("quux")]),
            ]),
        );
        assert_eq!(
            t("[A-Z]foo|[A-Z]foobar"),
            hir_cat(vec![upper(), lit_alt(&["foo", "foobar"])]),
        );
    }
3649
3650
    // A hand-built alternation whose only branch is an empty concatenation
    // must translate to the empty HIR.
    #[test]
    fn regression_alt_empty_concat() {
        use crate::ast::{self, Ast};

        let origin = Position::new(0, 0, 0);
        let span = Span::splat(origin);
        let empty_concat = Ast::concat(ast::Concat { span, asts: vec![] });
        let ast = Ast::alternation(ast::Alternation {
            span,
            asts: vec![empty_concat],
        });

        let translated = Translator::new().translate("", &ast);
        assert_eq!(Ok(Hir::empty()), translated);
    }
3663
3664
    // A hand-built concatenation wrapping an alternation with no branches
    // must translate to `Hir::fail()`.
    #[test]
    fn regression_empty_alt() {
        use crate::ast::{self, Ast};

        let origin = Position::new(0, 0, 0);
        let span = Span::splat(origin);
        let empty_alt =
            Ast::alternation(ast::Alternation { span, asts: vec![] });
        let ast = Ast::concat(ast::Concat { span, asts: vec![empty_alt] });

        let translated = Translator::new().translate("", &ast);
        assert_eq!(Ok(Hir::fail()), translated);
    }
3680
3681
    // A hand-built concatenation wrapping an alternation with a single `.`
    // branch must translate to just that dot.
    #[test]
    fn regression_singleton_alt() {
        use crate::{
            ast::{self, Ast},
            hir::Dot,
        };

        let origin = Position::new(0, 0, 0);
        let span = Span::splat(origin);
        let dot_only_alt = Ast::alternation(ast::Alternation {
            span,
            asts: vec![Ast::dot(span)],
        });
        let ast = Ast::concat(ast::Concat { span, asts: vec![dot_only_alt] });

        let translated = Translator::new().translate("", &ast);
        assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), translated);
    }
3700
3701
    // Regression test for a fuzzer-found pattern, parsed with whitespace
    // ignored and translated with greediness swapped.
    //
    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
    #[test]
    fn regression_fuzz_match() {
        let pat = "[(\u{6} \0-\u{afdf5}]  \0 ";

        let mut parser_builder = ParserBuilder::new();
        parser_builder.octal(false).ignore_whitespace(true);
        let ast = parser_builder.build().parse(pat).unwrap();

        let mut translator_builder = TranslatorBuilder::new();
        translator_builder
            .utf8(true)
            .case_insensitive(false)
            .multi_line(false)
            .dot_matches_new_line(false)
            .swap_greed(true)
            .unicode(true);
        let hir = translator_builder.build().translate(pat, &ast).unwrap();

        let expected = Hir::concat(vec![
            hir_uclass(&[('\0', '\u{afdf5}')]),
            hir_lit("\0"),
        ]);
        assert_eq!(hir, expected);
    }
3729
3730
    // Regression test for a fuzzer-found pattern; translating it only needs
    // to complete without panicking.
    //
    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
    #[cfg(feature = "unicode")]
    #[test]
    fn regression_fuzz_difference1() {
        let fuzz_pattern = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
        // The resulting HIR is irrelevant, so discard it.
        drop(t(fuzz_pattern));
    }
3737
3738
    // Regression test for a fuzzer-found pattern; translating it only needs
    // to complete without panicking.
    //
    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
    #[test]
    fn regression_fuzz_char_decrement1() {
        let fuzz_pattern = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
        // The resulting HIR is irrelevant, so discard it.
        drop(t(fuzz_pattern));
    }
3744
}