/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/hir/translate.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Defines a translator that converts an `Ast` to an `Hir`. |
3 | | */ |
4 | | |
5 | | use core::cell::{Cell, RefCell}; |
6 | | |
7 | | use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; |
8 | | |
9 | | use crate::{ |
10 | | ast::{self, Ast, Span, Visitor}, |
11 | | either::Either, |
12 | | hir::{self, Error, ErrorKind, Hir, HirKind}, |
13 | | unicode::{self, ClassQuery}, |
14 | | }; |
15 | | |
/// A `Result` alias whose error type is the HIR [`Error`] imported by this
/// module.
type Result<T> = core::result::Result<T, Error>;
17 | | |
/// A builder for constructing an AST->HIR translator.
///
/// Every setting has a chainable setter; `build` produces a [`Translator`]
/// that carries a copy of the current configuration.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether UTF-8 mode is enabled. `new` sets this to `true`.
    utf8: bool,
    /// The line terminator byte used for `.`. `new` sets this to `b'\n'`.
    line_terminator: u8,
    /// The flag settings that a built translator starts out with.
    flags: Flags,
}
25 | | |
26 | | impl Default for TranslatorBuilder { |
27 | 0 | fn default() -> TranslatorBuilder { |
28 | 0 | TranslatorBuilder::new() |
29 | 0 | } |
30 | | } |
31 | | |
32 | | impl TranslatorBuilder { |
33 | | /// Create a new translator builder with a default c onfiguration. |
34 | 0 | pub fn new() -> TranslatorBuilder { |
35 | 0 | TranslatorBuilder { |
36 | 0 | utf8: true, |
37 | 0 | line_terminator: b'\n', |
38 | 0 | flags: Flags::default(), |
39 | 0 | } |
40 | 0 | } |
41 | | |
42 | | /// Build a translator using the current configuration. |
43 | 0 | pub fn build(&self) -> Translator { |
44 | 0 | Translator { |
45 | 0 | stack: RefCell::new(vec![]), |
46 | 0 | flags: Cell::new(self.flags), |
47 | 0 | utf8: self.utf8, |
48 | 0 | line_terminator: self.line_terminator, |
49 | 0 | } |
50 | 0 | } |
51 | | |
52 | | /// When disabled, translation will permit the construction of a regular |
53 | | /// expression that may match invalid UTF-8. |
54 | | /// |
55 | | /// When enabled (the default), the translator is guaranteed to produce an |
56 | | /// expression that, for non-empty matches, will only ever produce spans |
57 | | /// that are entirely valid UTF-8 (otherwise, the translator will return an |
58 | | /// error). |
59 | | /// |
60 | | /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even |
61 | | /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete |
62 | | /// syntax) will be allowed even though they can produce matches that split |
63 | | /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" |
64 | | /// matches, and it is expected that the regex engine itself must handle |
65 | | /// these cases if necessary (perhaps by suppressing any zero-width matches |
66 | | /// that split a codepoint). |
67 | 0 | pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { |
68 | 0 | self.utf8 = yes; |
69 | 0 | self |
70 | 0 | } |
71 | | |
72 | | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. |
73 | | /// |
74 | | /// Namely, instead of `.` (by default) matching everything except for `\n`, |
75 | | /// this will cause `.` to match everything except for the byte given. |
76 | | /// |
77 | | /// If `.` is used in a context where Unicode mode is enabled and this byte |
78 | | /// isn't ASCII, then an error will be returned. When Unicode mode is |
79 | | /// disabled, then any byte is permitted, but will return an error if UTF-8 |
80 | | /// mode is enabled and it is a non-ASCII byte. |
81 | | /// |
82 | | /// In short, any ASCII value for a line terminator is always okay. But a |
83 | | /// non-ASCII byte might result in an error depending on whether Unicode |
84 | | /// mode or UTF-8 mode are enabled. |
85 | | /// |
86 | | /// Note that if `R` mode is enabled then it always takes precedence and |
87 | | /// the line terminator will be treated as `\r` and `\n` simultaneously. |
88 | | /// |
89 | | /// Note also that this *doesn't* impact the look-around assertions |
90 | | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional |
91 | | /// configuration in the regex engine itself. |
92 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { |
93 | 0 | self.line_terminator = byte; |
94 | 0 | self |
95 | 0 | } |
96 | | |
97 | | /// Enable or disable the case insensitive flag (`i`) by default. |
98 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { |
99 | 0 | self.flags.case_insensitive = if yes { Some(true) } else { None }; |
100 | 0 | self |
101 | 0 | } |
102 | | |
103 | | /// Enable or disable the multi-line matching flag (`m`) by default. |
104 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { |
105 | 0 | self.flags.multi_line = if yes { Some(true) } else { None }; |
106 | 0 | self |
107 | 0 | } |
108 | | |
109 | | /// Enable or disable the "dot matches any character" flag (`s`) by |
110 | | /// default. |
111 | 0 | pub fn dot_matches_new_line( |
112 | 0 | &mut self, |
113 | 0 | yes: bool, |
114 | 0 | ) -> &mut TranslatorBuilder { |
115 | 0 | self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; |
116 | 0 | self |
117 | 0 | } |
118 | | |
119 | | /// Enable or disable the CRLF mode flag (`R`) by default. |
120 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { |
121 | 0 | self.flags.crlf = if yes { Some(true) } else { None }; |
122 | 0 | self |
123 | 0 | } |
124 | | |
125 | | /// Enable or disable the "swap greed" flag (`U`) by default. |
126 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { |
127 | 0 | self.flags.swap_greed = if yes { Some(true) } else { None }; |
128 | 0 | self |
129 | 0 | } |
130 | | |
131 | | /// Enable or disable the Unicode flag (`u`) by default. |
132 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { |
133 | 0 | self.flags.unicode = if yes { None } else { Some(false) }; |
134 | 0 | self |
135 | 0 | } |
136 | | } |
137 | | |
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    flags: Cell<Flags>,
    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
    utf8: bool,
    /// The line terminator to use for `.`.
    line_terminator: u8,
}
157 | | |
158 | | impl Translator { |
159 | | /// Create a new translator using the default configuration. |
160 | 0 | pub fn new() -> Translator { |
161 | 0 | TranslatorBuilder::new().build() |
162 | 0 | } |
163 | | |
164 | | /// Translate the given abstract syntax tree (AST) into a high level |
165 | | /// intermediate representation (HIR). |
166 | | /// |
167 | | /// If there was a problem doing the translation, then an HIR-specific |
168 | | /// error is returned. |
169 | | /// |
170 | | /// The original pattern string used to produce the `Ast` *must* also be |
171 | | /// provided. The translator does not use the pattern string during any |
172 | | /// correct translation, but is used for error reporting. |
173 | 0 | pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> { |
174 | 0 | ast::visit(ast, TranslatorI::new(self, pattern)) |
175 | 0 | } |
176 | | } |
177 | | |
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
///
/// Frames fall into two groups: frames that carry data (`Expr`, `Literal`,
/// `ClassUnicode`, `ClassBytes` and `Group`) and pure sentinels that only
/// mark a position on the stack (`Repetition`, `Concat`, `Alternation` and
/// `AlternationBranch`).
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A literal that is being constructed, character by character, from the
    /// AST. We need this because the AST gives each individual character its
    /// own node. So as we see characters, we peek at the top-most HirFrame.
    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
    Literal(Vec<u8>),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `utf8` is enabled (the default), then a byte character is only
    /// permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed whenever a repetition is observed. After visiting every
    /// sub-expression in the repetition, the translator's stack is expected to
    /// have this sentinel at the top.
    ///
    /// This sentinel only exists to stop other things (like flattening
    /// literals) from reaching across repetition operators.
    Repetition,
    /// This is pushed on to the stack upon first seeing any kind of capture,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group. If the group doesn't set any flags, then this is simply
        /// equivalent to whatever flags were set when the group was opened.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Flags,
    },
    /// This is pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
    /// This is pushed immediately before each sub-expression in an
    /// alternation. This separates the branches of an alternation on the
    /// stack and prevents literal flattening from reaching across alternation
    /// branches.
    ///
    /// It is popped after each expression in a branch until an 'Alternation'
    /// frame is observed when doing a post visit on an alternation.
    AlternationBranch,
}
250 | | |
251 | | impl HirFrame { |
252 | | /// Assert that the current stack frame is an Hir expression and return it. |
253 | 0 | fn unwrap_expr(self) -> Hir { |
254 | 0 | match self { |
255 | 0 | HirFrame::Expr(expr) => expr, |
256 | 0 | HirFrame::Literal(lit) => Hir::literal(lit), |
257 | 0 | _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), |
258 | | } |
259 | 0 | } |
260 | | |
261 | | /// Assert that the current stack frame is a Unicode class expression and |
262 | | /// return it. |
263 | 0 | fn unwrap_class_unicode(self) -> hir::ClassUnicode { |
264 | 0 | match self { |
265 | 0 | HirFrame::ClassUnicode(cls) => cls, |
266 | 0 | _ => panic!( |
267 | 0 | "tried to unwrap Unicode class \ |
268 | 0 | from HirFrame, got: {:?}", |
269 | 0 | self |
270 | 0 | ), |
271 | | } |
272 | 0 | } |
273 | | |
274 | | /// Assert that the current stack frame is a byte class expression and |
275 | | /// return it. |
276 | 0 | fn unwrap_class_bytes(self) -> hir::ClassBytes { |
277 | 0 | match self { |
278 | 0 | HirFrame::ClassBytes(cls) => cls, |
279 | 0 | _ => panic!( |
280 | 0 | "tried to unwrap byte class \ |
281 | 0 | from HirFrame, got: {:?}", |
282 | 0 | self |
283 | 0 | ), |
284 | | } |
285 | 0 | } |
286 | | |
287 | | /// Assert that the current stack frame is a repetition sentinel. If it |
288 | | /// isn't, then panic. |
289 | 0 | fn unwrap_repetition(self) { |
290 | 0 | match self { |
291 | 0 | HirFrame::Repetition => {} |
292 | | _ => { |
293 | 0 | panic!( |
294 | 0 | "tried to unwrap repetition from HirFrame, got: {:?}", |
295 | 0 | self |
296 | 0 | ) |
297 | | } |
298 | | } |
299 | 0 | } |
300 | | |
301 | | /// Assert that the current stack frame is a group indicator and return |
302 | | /// its corresponding flags (the flags that were active at the time the |
303 | | /// group was entered). |
304 | 0 | fn unwrap_group(self) -> Flags { |
305 | 0 | match self { |
306 | 0 | HirFrame::Group { old_flags } => old_flags, |
307 | | _ => { |
308 | 0 | panic!("tried to unwrap group from HirFrame, got: {:?}", self) |
309 | | } |
310 | | } |
311 | 0 | } |
312 | | |
313 | | /// Assert that the current stack frame is an alternation pipe sentinel. If |
314 | | /// it isn't, then panic. |
315 | 0 | fn unwrap_alternation_pipe(self) { |
316 | 0 | match self { |
317 | 0 | HirFrame::AlternationBranch => {} |
318 | | _ => { |
319 | 0 | panic!( |
320 | 0 | "tried to unwrap alt pipe from HirFrame, got: {:?}", |
321 | 0 | self |
322 | 0 | ) |
323 | | } |
324 | | } |
325 | 0 | } |
326 | | } |
327 | | |
328 | | impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { |
329 | | type Output = Hir; |
330 | | type Err = Error; |
331 | | |
332 | 0 | fn finish(self) -> Result<Hir> { |
333 | 0 | // ... otherwise, we should have exactly one HIR on the stack. |
334 | 0 | assert_eq!(self.trans().stack.borrow().len(), 1); |
335 | 0 | Ok(self.pop().unwrap().unwrap_expr()) |
336 | 0 | } |
337 | | |
338 | 0 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { |
339 | 0 | match *ast { |
340 | | Ast::ClassBracketed(_) => { |
341 | 0 | if self.flags().unicode() { |
342 | 0 | let cls = hir::ClassUnicode::empty(); |
343 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
344 | 0 | } else { |
345 | 0 | let cls = hir::ClassBytes::empty(); |
346 | 0 | self.push(HirFrame::ClassBytes(cls)); |
347 | 0 | } |
348 | | } |
349 | 0 | Ast::Repetition(_) => self.push(HirFrame::Repetition), |
350 | 0 | Ast::Group(ref x) => { |
351 | 0 | let old_flags = x |
352 | 0 | .flags() |
353 | 0 | .map(|ast| self.set_flags(ast)) |
354 | 0 | .unwrap_or_else(|| self.flags()); |
355 | 0 | self.push(HirFrame::Group { old_flags }); |
356 | 0 | } |
357 | 0 | Ast::Concat(_) => { |
358 | 0 | self.push(HirFrame::Concat); |
359 | 0 | } |
360 | 0 | Ast::Alternation(ref x) => { |
361 | 0 | self.push(HirFrame::Alternation); |
362 | 0 | if !x.asts.is_empty() { |
363 | 0 | self.push(HirFrame::AlternationBranch); |
364 | 0 | } |
365 | | } |
366 | 0 | _ => {} |
367 | | } |
368 | 0 | Ok(()) |
369 | 0 | } |
370 | | |
371 | 0 | fn visit_post(&mut self, ast: &Ast) -> Result<()> { |
372 | 0 | match *ast { |
373 | 0 | Ast::Empty(_) => { |
374 | 0 | self.push(HirFrame::Expr(Hir::empty())); |
375 | 0 | } |
376 | 0 | Ast::Flags(ref x) => { |
377 | 0 | self.set_flags(&x.flags); |
378 | 0 | // Flags in the AST are generally considered directives and |
379 | 0 | // not actual sub-expressions. However, they can be used in |
380 | 0 | // the concrete syntax like `((?i))`, and we need some kind of |
381 | 0 | // indication of an expression there, and Empty is the correct |
382 | 0 | // choice. |
383 | 0 | // |
384 | 0 | // There can also be things like `(?i)+`, but we rule those out |
385 | 0 | // in the parser. In the future, we might allow them for |
386 | 0 | // consistency sake. |
387 | 0 | self.push(HirFrame::Expr(Hir::empty())); |
388 | 0 | } |
389 | 0 | Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { |
390 | 0 | Either::Right(byte) => self.push_byte(byte), |
391 | 0 | Either::Left(ch) => match self.case_fold_char(x.span, ch)? { |
392 | 0 | None => self.push_char(ch), |
393 | 0 | Some(expr) => self.push(HirFrame::Expr(expr)), |
394 | | }, |
395 | | }, |
396 | 0 | Ast::Dot(ref span) => { |
397 | 0 | self.push(HirFrame::Expr(self.hir_dot(**span)?)); |
398 | | } |
399 | 0 | Ast::Assertion(ref x) => { |
400 | 0 | self.push(HirFrame::Expr(self.hir_assertion(x)?)); |
401 | | } |
402 | 0 | Ast::ClassPerl(ref x) => { |
403 | 0 | if self.flags().unicode() { |
404 | 0 | let cls = self.hir_perl_unicode_class(x)?; |
405 | 0 | let hcls = hir::Class::Unicode(cls); |
406 | 0 | self.push(HirFrame::Expr(Hir::class(hcls))); |
407 | | } else { |
408 | 0 | let cls = self.hir_perl_byte_class(x)?; |
409 | 0 | let hcls = hir::Class::Bytes(cls); |
410 | 0 | self.push(HirFrame::Expr(Hir::class(hcls))); |
411 | | } |
412 | | } |
413 | 0 | Ast::ClassUnicode(ref x) => { |
414 | 0 | let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); |
415 | 0 | self.push(HirFrame::Expr(Hir::class(cls))); |
416 | | } |
417 | 0 | Ast::ClassBracketed(ref ast) => { |
418 | 0 | if self.flags().unicode() { |
419 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
420 | 0 | self.unicode_fold_and_negate( |
421 | 0 | &ast.span, |
422 | 0 | ast.negated, |
423 | 0 | &mut cls, |
424 | 0 | )?; |
425 | 0 | let expr = Hir::class(hir::Class::Unicode(cls)); |
426 | 0 | self.push(HirFrame::Expr(expr)); |
427 | | } else { |
428 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
429 | 0 | self.bytes_fold_and_negate( |
430 | 0 | &ast.span, |
431 | 0 | ast.negated, |
432 | 0 | &mut cls, |
433 | 0 | )?; |
434 | 0 | let expr = Hir::class(hir::Class::Bytes(cls)); |
435 | 0 | self.push(HirFrame::Expr(expr)); |
436 | | } |
437 | | } |
438 | 0 | Ast::Repetition(ref x) => { |
439 | 0 | let expr = self.pop().unwrap().unwrap_expr(); |
440 | 0 | self.pop().unwrap().unwrap_repetition(); |
441 | 0 | self.push(HirFrame::Expr(self.hir_repetition(x, expr))); |
442 | 0 | } |
443 | 0 | Ast::Group(ref x) => { |
444 | 0 | let expr = self.pop().unwrap().unwrap_expr(); |
445 | 0 | let old_flags = self.pop().unwrap().unwrap_group(); |
446 | 0 | self.trans().flags.set(old_flags); |
447 | 0 | self.push(HirFrame::Expr(self.hir_capture(x, expr))); |
448 | 0 | } |
449 | | Ast::Concat(_) => { |
450 | 0 | let mut exprs = vec![]; |
451 | 0 | while let Some(expr) = self.pop_concat_expr() { |
452 | 0 | if !matches!(*expr.kind(), HirKind::Empty) { |
453 | 0 | exprs.push(expr); |
454 | 0 | } |
455 | | } |
456 | 0 | exprs.reverse(); |
457 | 0 | self.push(HirFrame::Expr(Hir::concat(exprs))); |
458 | | } |
459 | | Ast::Alternation(_) => { |
460 | 0 | let mut exprs = vec![]; |
461 | 0 | while let Some(expr) = self.pop_alt_expr() { |
462 | 0 | self.pop().unwrap().unwrap_alternation_pipe(); |
463 | 0 | exprs.push(expr); |
464 | 0 | } |
465 | 0 | exprs.reverse(); |
466 | 0 | self.push(HirFrame::Expr(Hir::alternation(exprs))); |
467 | | } |
468 | | } |
469 | 0 | Ok(()) |
470 | 0 | } |
471 | | |
472 | 0 | fn visit_alternation_in(&mut self) -> Result<()> { |
473 | 0 | self.push(HirFrame::AlternationBranch); |
474 | 0 | Ok(()) |
475 | 0 | } |
476 | | |
477 | 0 | fn visit_class_set_item_pre( |
478 | 0 | &mut self, |
479 | 0 | ast: &ast::ClassSetItem, |
480 | 0 | ) -> Result<()> { |
481 | 0 | match *ast { |
482 | | ast::ClassSetItem::Bracketed(_) => { |
483 | 0 | if self.flags().unicode() { |
484 | 0 | let cls = hir::ClassUnicode::empty(); |
485 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
486 | 0 | } else { |
487 | 0 | let cls = hir::ClassBytes::empty(); |
488 | 0 | self.push(HirFrame::ClassBytes(cls)); |
489 | 0 | } |
490 | | } |
491 | | // We needn't handle the Union case here since the visitor will |
492 | | // do it for us. |
493 | 0 | _ => {} |
494 | | } |
495 | 0 | Ok(()) |
496 | 0 | } |
497 | | |
498 | 0 | fn visit_class_set_item_post( |
499 | 0 | &mut self, |
500 | 0 | ast: &ast::ClassSetItem, |
501 | 0 | ) -> Result<()> { |
502 | 0 | match *ast { |
503 | 0 | ast::ClassSetItem::Empty(_) => {} |
504 | 0 | ast::ClassSetItem::Literal(ref x) => { |
505 | 0 | if self.flags().unicode() { |
506 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
507 | 0 | cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); |
508 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
509 | 0 | } else { |
510 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
511 | 0 | let byte = self.class_literal_byte(x)?; |
512 | 0 | cls.push(hir::ClassBytesRange::new(byte, byte)); |
513 | 0 | self.push(HirFrame::ClassBytes(cls)); |
514 | | } |
515 | | } |
516 | 0 | ast::ClassSetItem::Range(ref x) => { |
517 | 0 | if self.flags().unicode() { |
518 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
519 | 0 | cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); |
520 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
521 | 0 | } else { |
522 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
523 | 0 | let start = self.class_literal_byte(&x.start)?; |
524 | 0 | let end = self.class_literal_byte(&x.end)?; |
525 | 0 | cls.push(hir::ClassBytesRange::new(start, end)); |
526 | 0 | self.push(HirFrame::ClassBytes(cls)); |
527 | | } |
528 | | } |
529 | 0 | ast::ClassSetItem::Ascii(ref x) => { |
530 | 0 | if self.flags().unicode() { |
531 | 0 | let xcls = self.hir_ascii_unicode_class(x)?; |
532 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
533 | 0 | cls.union(&xcls); |
534 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
535 | | } else { |
536 | 0 | let xcls = self.hir_ascii_byte_class(x)?; |
537 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
538 | 0 | cls.union(&xcls); |
539 | 0 | self.push(HirFrame::ClassBytes(cls)); |
540 | | } |
541 | | } |
542 | 0 | ast::ClassSetItem::Unicode(ref x) => { |
543 | 0 | let xcls = self.hir_unicode_class(x)?; |
544 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
545 | 0 | cls.union(&xcls); |
546 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
547 | | } |
548 | 0 | ast::ClassSetItem::Perl(ref x) => { |
549 | 0 | if self.flags().unicode() { |
550 | 0 | let xcls = self.hir_perl_unicode_class(x)?; |
551 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
552 | 0 | cls.union(&xcls); |
553 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
554 | | } else { |
555 | 0 | let xcls = self.hir_perl_byte_class(x)?; |
556 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
557 | 0 | cls.union(&xcls); |
558 | 0 | self.push(HirFrame::ClassBytes(cls)); |
559 | | } |
560 | | } |
561 | 0 | ast::ClassSetItem::Bracketed(ref ast) => { |
562 | 0 | if self.flags().unicode() { |
563 | 0 | let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); |
564 | 0 | self.unicode_fold_and_negate( |
565 | 0 | &ast.span, |
566 | 0 | ast.negated, |
567 | 0 | &mut cls1, |
568 | 0 | )?; |
569 | | |
570 | 0 | let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); |
571 | 0 | cls2.union(&cls1); |
572 | 0 | self.push(HirFrame::ClassUnicode(cls2)); |
573 | | } else { |
574 | 0 | let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); |
575 | 0 | self.bytes_fold_and_negate( |
576 | 0 | &ast.span, |
577 | 0 | ast.negated, |
578 | 0 | &mut cls1, |
579 | 0 | )?; |
580 | | |
581 | 0 | let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); |
582 | 0 | cls2.union(&cls1); |
583 | 0 | self.push(HirFrame::ClassBytes(cls2)); |
584 | | } |
585 | | } |
586 | | // This is handled automatically by the visitor. |
587 | 0 | ast::ClassSetItem::Union(_) => {} |
588 | | } |
589 | 0 | Ok(()) |
590 | 0 | } |
591 | | |
592 | 0 | fn visit_class_set_binary_op_pre( |
593 | 0 | &mut self, |
594 | 0 | _op: &ast::ClassSetBinaryOp, |
595 | 0 | ) -> Result<()> { |
596 | 0 | if self.flags().unicode() { |
597 | 0 | let cls = hir::ClassUnicode::empty(); |
598 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
599 | 0 | } else { |
600 | 0 | let cls = hir::ClassBytes::empty(); |
601 | 0 | self.push(HirFrame::ClassBytes(cls)); |
602 | 0 | } |
603 | 0 | Ok(()) |
604 | 0 | } |
605 | | |
606 | 0 | fn visit_class_set_binary_op_in( |
607 | 0 | &mut self, |
608 | 0 | _op: &ast::ClassSetBinaryOp, |
609 | 0 | ) -> Result<()> { |
610 | 0 | if self.flags().unicode() { |
611 | 0 | let cls = hir::ClassUnicode::empty(); |
612 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
613 | 0 | } else { |
614 | 0 | let cls = hir::ClassBytes::empty(); |
615 | 0 | self.push(HirFrame::ClassBytes(cls)); |
616 | 0 | } |
617 | 0 | Ok(()) |
618 | 0 | } |
619 | | |
620 | 0 | fn visit_class_set_binary_op_post( |
621 | 0 | &mut self, |
622 | 0 | op: &ast::ClassSetBinaryOp, |
623 | 0 | ) -> Result<()> { |
624 | | use crate::ast::ClassSetBinaryOpKind::*; |
625 | | |
626 | 0 | if self.flags().unicode() { |
627 | 0 | let mut rhs = self.pop().unwrap().unwrap_class_unicode(); |
628 | 0 | let mut lhs = self.pop().unwrap().unwrap_class_unicode(); |
629 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
630 | 0 | if self.flags().case_insensitive() { |
631 | 0 | rhs.try_case_fold_simple().map_err(|_| { |
632 | 0 | self.error( |
633 | 0 | op.rhs.span().clone(), |
634 | 0 | ErrorKind::UnicodeCaseUnavailable, |
635 | 0 | ) |
636 | 0 | })?; |
637 | 0 | lhs.try_case_fold_simple().map_err(|_| { |
638 | 0 | self.error( |
639 | 0 | op.lhs.span().clone(), |
640 | 0 | ErrorKind::UnicodeCaseUnavailable, |
641 | 0 | ) |
642 | 0 | })?; |
643 | 0 | } |
644 | 0 | match op.kind { |
645 | 0 | Intersection => lhs.intersect(&rhs), |
646 | 0 | Difference => lhs.difference(&rhs), |
647 | 0 | SymmetricDifference => lhs.symmetric_difference(&rhs), |
648 | | } |
649 | 0 | cls.union(&lhs); |
650 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
651 | | } else { |
652 | 0 | let mut rhs = self.pop().unwrap().unwrap_class_bytes(); |
653 | 0 | let mut lhs = self.pop().unwrap().unwrap_class_bytes(); |
654 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
655 | 0 | if self.flags().case_insensitive() { |
656 | 0 | rhs.case_fold_simple(); |
657 | 0 | lhs.case_fold_simple(); |
658 | 0 | } |
659 | 0 | match op.kind { |
660 | 0 | Intersection => lhs.intersect(&rhs), |
661 | 0 | Difference => lhs.difference(&rhs), |
662 | 0 | SymmetricDifference => lhs.symmetric_difference(&rhs), |
663 | | } |
664 | 0 | cls.union(&lhs); |
665 | 0 | self.push(HirFrame::ClassBytes(cls)); |
666 | | } |
667 | 0 | Ok(()) |
668 | 0 | } |
669 | | } |
670 | | |
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
    /// The translator whose stack and flags are used during translation.
    trans: &'t Translator,
    /// The original pattern string. It is copied into any error created via
    /// `error`.
    pattern: &'p str,
}
682 | | |
683 | | impl<'t, 'p> TranslatorI<'t, 'p> { |
684 | | /// Build a new internal translator. |
685 | 0 | fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { |
686 | 0 | TranslatorI { trans, pattern } |
687 | 0 | } |
688 | | |
689 | | /// Return a reference to the underlying translator. |
690 | 0 | fn trans(&self) -> &Translator { |
691 | 0 | &self.trans |
692 | 0 | } |
693 | | |
694 | | /// Push the given frame on to the call stack. |
695 | 0 | fn push(&self, frame: HirFrame) { |
696 | 0 | self.trans().stack.borrow_mut().push(frame); |
697 | 0 | } |
698 | | |
699 | | /// Push the given literal char on to the call stack. |
700 | | /// |
701 | | /// If the top-most element of the stack is a literal, then the char |
702 | | /// is appended to the end of that literal. Otherwise, a new literal |
703 | | /// containing just the given char is pushed to the top of the stack. |
704 | 0 | fn push_char(&self, ch: char) { |
705 | 0 | let mut buf = [0; 4]; |
706 | 0 | let bytes = ch.encode_utf8(&mut buf).as_bytes(); |
707 | 0 | let mut stack = self.trans().stack.borrow_mut(); |
708 | 0 | if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { |
709 | 0 | literal.extend_from_slice(bytes); |
710 | 0 | } else { |
711 | 0 | stack.push(HirFrame::Literal(bytes.to_vec())); |
712 | 0 | } |
713 | 0 | } |
714 | | |
715 | | /// Push the given literal byte on to the call stack. |
716 | | /// |
717 | | /// If the top-most element of the stack is a literal, then the byte |
718 | | /// is appended to the end of that literal. Otherwise, a new literal |
719 | | /// containing just the given byte is pushed to the top of the stack. |
720 | 0 | fn push_byte(&self, byte: u8) { |
721 | 0 | let mut stack = self.trans().stack.borrow_mut(); |
722 | 0 | if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { |
723 | 0 | literal.push(byte); |
724 | 0 | } else { |
725 | 0 | stack.push(HirFrame::Literal(vec![byte])); |
726 | 0 | } |
727 | 0 | } |
728 | | |
729 | | /// Pop the top of the call stack. If the call stack is empty, return None. |
730 | 0 | fn pop(&self) -> Option<HirFrame> { |
731 | 0 | self.trans().stack.borrow_mut().pop() |
732 | 0 | } |
733 | | |
734 | | /// Pop an HIR expression from the top of the stack for a concatenation. |
735 | | /// |
736 | | /// This returns None if the stack is empty or when a concat frame is seen. |
737 | | /// Otherwise, it panics if it could not find an HIR expression. |
738 | 0 | fn pop_concat_expr(&self) -> Option<Hir> { |
739 | 0 | let frame = self.pop()?; |
740 | 0 | match frame { |
741 | 0 | HirFrame::Concat => None, |
742 | 0 | HirFrame::Expr(expr) => Some(expr), |
743 | 0 | HirFrame::Literal(lit) => Some(Hir::literal(lit)), |
744 | | HirFrame::ClassUnicode(_) => { |
745 | 0 | unreachable!("expected expr or concat, got Unicode class") |
746 | | } |
747 | | HirFrame::ClassBytes(_) => { |
748 | 0 | unreachable!("expected expr or concat, got byte class") |
749 | | } |
750 | | HirFrame::Repetition => { |
751 | 0 | unreachable!("expected expr or concat, got repetition") |
752 | | } |
753 | | HirFrame::Group { .. } => { |
754 | 0 | unreachable!("expected expr or concat, got group") |
755 | | } |
756 | | HirFrame::Alternation => { |
757 | 0 | unreachable!("expected expr or concat, got alt marker") |
758 | | } |
759 | | HirFrame::AlternationBranch => { |
760 | 0 | unreachable!("expected expr or concat, got alt branch marker") |
761 | | } |
762 | | } |
763 | 0 | } |
764 | | |
765 | | /// Pop an HIR expression from the top of the stack for an alternation. |
766 | | /// |
767 | | /// This returns None if the stack is empty or when an alternation frame is |
768 | | /// seen. Otherwise, it panics if it could not find an HIR expression. |
769 | 0 | fn pop_alt_expr(&self) -> Option<Hir> { |
770 | 0 | let frame = self.pop()?; |
771 | 0 | match frame { |
772 | 0 | HirFrame::Alternation => None, |
773 | 0 | HirFrame::Expr(expr) => Some(expr), |
774 | 0 | HirFrame::Literal(lit) => Some(Hir::literal(lit)), |
775 | | HirFrame::ClassUnicode(_) => { |
776 | 0 | unreachable!("expected expr or alt, got Unicode class") |
777 | | } |
778 | | HirFrame::ClassBytes(_) => { |
779 | 0 | unreachable!("expected expr or alt, got byte class") |
780 | | } |
781 | | HirFrame::Repetition => { |
782 | 0 | unreachable!("expected expr or alt, got repetition") |
783 | | } |
784 | | HirFrame::Group { .. } => { |
785 | 0 | unreachable!("expected expr or alt, got group") |
786 | | } |
787 | | HirFrame::Concat => { |
788 | 0 | unreachable!("expected expr or alt, got concat marker") |
789 | | } |
790 | | HirFrame::AlternationBranch => { |
791 | 0 | unreachable!("expected expr or alt, got alt branch marker") |
792 | | } |
793 | | } |
794 | 0 | } |
795 | | |
796 | | /// Create a new error with the given span and error type. |
797 | 0 | fn error(&self, span: Span, kind: ErrorKind) -> Error { |
798 | 0 | Error { kind, pattern: self.pattern.to_string(), span } |
799 | 0 | } |
800 | | |
801 | | /// Return a copy of the active flags. |
802 | 0 | fn flags(&self) -> Flags { |
803 | 0 | self.trans().flags.get() |
804 | 0 | } |
805 | | |
806 | | /// Set the flags of this translator from the flags set in the given AST. |
807 | | /// Then, return the old flags. |
808 | 0 | fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { |
809 | 0 | let old_flags = self.flags(); |
810 | 0 | let mut new_flags = Flags::from_ast(ast_flags); |
811 | 0 | new_flags.merge(&old_flags); |
812 | 0 | self.trans().flags.set(new_flags); |
813 | 0 | old_flags |
814 | 0 | } |
815 | | |
816 | | /// Convert an Ast literal to its scalar representation. |
817 | | /// |
818 | | /// When Unicode mode is enabled, then this always succeeds and returns a |
819 | | /// `char` (Unicode scalar value). |
820 | | /// |
821 | | /// When Unicode mode is disabled, then a `char` will still be returned |
822 | | /// whenever possible. A byte is returned only when invalid UTF-8 is |
823 | | /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte |
824 | | /// will result in an error when invalid UTF-8 is not allowed. |
825 | 0 | fn ast_literal_to_scalar( |
826 | 0 | &self, |
827 | 0 | lit: &ast::Literal, |
828 | 0 | ) -> Result<Either<char, u8>> { |
829 | 0 | if self.flags().unicode() { |
830 | 0 | return Ok(Either::Left(lit.c)); |
831 | 0 | } |
832 | 0 | let byte = match lit.byte() { |
833 | 0 | None => return Ok(Either::Left(lit.c)), |
834 | 0 | Some(byte) => byte, |
835 | 0 | }; |
836 | 0 | if byte <= 0x7F { |
837 | 0 | return Ok(Either::Left(char::try_from(byte).unwrap())); |
838 | 0 | } |
839 | 0 | if self.trans().utf8 { |
840 | 0 | return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); |
841 | 0 | } |
842 | 0 | Ok(Either::Right(byte)) |
843 | 0 | } |
844 | | |
845 | 0 | fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> { |
846 | 0 | if !self.flags().case_insensitive() { |
847 | 0 | return Ok(None); |
848 | 0 | } |
849 | 0 | if self.flags().unicode() { |
850 | | // If case folding won't do anything, then don't bother trying. |
851 | 0 | let map = unicode::SimpleCaseFolder::new() |
852 | 0 | .map(|f| f.overlaps(c, c)) |
853 | 0 | .map_err(|_| { |
854 | 0 | self.error(span, ErrorKind::UnicodeCaseUnavailable) |
855 | 0 | })?; |
856 | 0 | if !map { |
857 | 0 | return Ok(None); |
858 | 0 | } |
859 | 0 | let mut cls = |
860 | 0 | hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( |
861 | 0 | c, c, |
862 | 0 | )]); |
863 | 0 | cls.try_case_fold_simple().map_err(|_| { |
864 | 0 | self.error(span, ErrorKind::UnicodeCaseUnavailable) |
865 | 0 | })?; |
866 | 0 | Ok(Some(Hir::class(hir::Class::Unicode(cls)))) |
867 | | } else { |
868 | 0 | if !c.is_ascii() { |
869 | 0 | return Ok(None); |
870 | 0 | } |
871 | 0 | // If case folding won't do anything, then don't bother trying. |
872 | 0 | match c { |
873 | 0 | 'A'..='Z' | 'a'..='z' => {} |
874 | 0 | _ => return Ok(None), |
875 | | } |
876 | 0 | let mut cls = |
877 | 0 | hir::ClassBytes::new(vec![hir::ClassBytesRange::new( |
878 | 0 | // OK because 'c.len_utf8() == 1' which in turn implies |
879 | 0 | // that 'c' is ASCII. |
880 | 0 | u8::try_from(c).unwrap(), |
881 | 0 | u8::try_from(c).unwrap(), |
882 | 0 | )]); |
883 | 0 | cls.case_fold_simple(); |
884 | 0 | Ok(Some(Hir::class(hir::Class::Bytes(cls)))) |
885 | | } |
886 | 0 | } |
887 | | |
888 | 0 | fn hir_dot(&self, span: Span) -> Result<Hir> { |
889 | 0 | let (utf8, lineterm, flags) = |
890 | 0 | (self.trans().utf8, self.trans().line_terminator, self.flags()); |
891 | 0 | if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { |
892 | 0 | return Err(self.error(span, ErrorKind::InvalidUtf8)); |
893 | 0 | } |
894 | 0 | let dot = if flags.dot_matches_new_line() { |
895 | 0 | if flags.unicode() { |
896 | 0 | hir::Dot::AnyChar |
897 | | } else { |
898 | 0 | hir::Dot::AnyByte |
899 | | } |
900 | | } else { |
901 | 0 | if flags.unicode() { |
902 | 0 | if flags.crlf() { |
903 | 0 | hir::Dot::AnyCharExceptCRLF |
904 | | } else { |
905 | 0 | if !lineterm.is_ascii() { |
906 | 0 | return Err( |
907 | 0 | self.error(span, ErrorKind::InvalidLineTerminator) |
908 | 0 | ); |
909 | 0 | } |
910 | 0 | hir::Dot::AnyCharExcept(char::from(lineterm)) |
911 | | } |
912 | | } else { |
913 | 0 | if flags.crlf() { |
914 | 0 | hir::Dot::AnyByteExceptCRLF |
915 | | } else { |
916 | 0 | hir::Dot::AnyByteExcept(lineterm) |
917 | | } |
918 | | } |
919 | | }; |
920 | 0 | Ok(Hir::dot(dot)) |
921 | 0 | } |
922 | | |
923 | 0 | fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { |
924 | 0 | let unicode = self.flags().unicode(); |
925 | 0 | let multi_line = self.flags().multi_line(); |
926 | 0 | let crlf = self.flags().crlf(); |
927 | 0 | Ok(match asst.kind { |
928 | 0 | ast::AssertionKind::StartLine => Hir::look(if multi_line { |
929 | 0 | if crlf { |
930 | 0 | hir::Look::StartCRLF |
931 | | } else { |
932 | 0 | hir::Look::StartLF |
933 | | } |
934 | | } else { |
935 | 0 | hir::Look::Start |
936 | | }), |
937 | 0 | ast::AssertionKind::EndLine => Hir::look(if multi_line { |
938 | 0 | if crlf { |
939 | 0 | hir::Look::EndCRLF |
940 | | } else { |
941 | 0 | hir::Look::EndLF |
942 | | } |
943 | | } else { |
944 | 0 | hir::Look::End |
945 | | }), |
946 | 0 | ast::AssertionKind::StartText => Hir::look(hir::Look::Start), |
947 | 0 | ast::AssertionKind::EndText => Hir::look(hir::Look::End), |
948 | 0 | ast::AssertionKind::WordBoundary => Hir::look(if unicode { |
949 | 0 | hir::Look::WordUnicode |
950 | | } else { |
951 | 0 | hir::Look::WordAscii |
952 | | }), |
953 | 0 | ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { |
954 | 0 | hir::Look::WordUnicodeNegate |
955 | | } else { |
956 | 0 | hir::Look::WordAsciiNegate |
957 | | }), |
958 | | ast::AssertionKind::WordBoundaryStart |
959 | | | ast::AssertionKind::WordBoundaryStartAngle => { |
960 | 0 | Hir::look(if unicode { |
961 | 0 | hir::Look::WordStartUnicode |
962 | | } else { |
963 | 0 | hir::Look::WordStartAscii |
964 | | }) |
965 | | } |
966 | | ast::AssertionKind::WordBoundaryEnd |
967 | | | ast::AssertionKind::WordBoundaryEndAngle => { |
968 | 0 | Hir::look(if unicode { |
969 | 0 | hir::Look::WordEndUnicode |
970 | | } else { |
971 | 0 | hir::Look::WordEndAscii |
972 | | }) |
973 | | } |
974 | | ast::AssertionKind::WordBoundaryStartHalf => { |
975 | 0 | Hir::look(if unicode { |
976 | 0 | hir::Look::WordStartHalfUnicode |
977 | | } else { |
978 | 0 | hir::Look::WordStartHalfAscii |
979 | | }) |
980 | | } |
981 | 0 | ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { |
982 | 0 | hir::Look::WordEndHalfUnicode |
983 | | } else { |
984 | 0 | hir::Look::WordEndHalfAscii |
985 | | }), |
986 | | }) |
987 | 0 | } |
988 | | |
989 | 0 | fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { |
990 | 0 | let (index, name) = match group.kind { |
991 | 0 | ast::GroupKind::CaptureIndex(index) => (index, None), |
992 | 0 | ast::GroupKind::CaptureName { ref name, .. } => { |
993 | 0 | (name.index, Some(name.name.clone().into_boxed_str())) |
994 | | } |
995 | | // The HIR doesn't need to use non-capturing groups, since the way |
996 | | // in which the data type is defined handles this automatically. |
997 | 0 | ast::GroupKind::NonCapturing(_) => return expr, |
998 | | }; |
999 | 0 | Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) |
1000 | 0 | } |
1001 | | |
1002 | 0 | fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { |
1003 | 0 | let (min, max) = match rep.op.kind { |
1004 | 0 | ast::RepetitionKind::ZeroOrOne => (0, Some(1)), |
1005 | 0 | ast::RepetitionKind::ZeroOrMore => (0, None), |
1006 | 0 | ast::RepetitionKind::OneOrMore => (1, None), |
1007 | 0 | ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { |
1008 | 0 | (m, Some(m)) |
1009 | | } |
1010 | 0 | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { |
1011 | 0 | (m, None) |
1012 | | } |
1013 | | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( |
1014 | 0 | m, |
1015 | 0 | n, |
1016 | 0 | )) => (m, Some(n)), |
1017 | | }; |
1018 | 0 | let greedy = |
1019 | 0 | if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; |
1020 | 0 | Hir::repetition(hir::Repetition { |
1021 | 0 | min, |
1022 | 0 | max, |
1023 | 0 | greedy, |
1024 | 0 | sub: Box::new(expr), |
1025 | 0 | }) |
1026 | 0 | } |
1027 | | |
1028 | 0 | fn hir_unicode_class( |
1029 | 0 | &self, |
1030 | 0 | ast_class: &ast::ClassUnicode, |
1031 | 0 | ) -> Result<hir::ClassUnicode> { |
1032 | | use crate::ast::ClassUnicodeKind::*; |
1033 | | |
1034 | 0 | if !self.flags().unicode() { |
1035 | 0 | return Err( |
1036 | 0 | self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) |
1037 | 0 | ); |
1038 | 0 | } |
1039 | 0 | let query = match ast_class.kind { |
1040 | 0 | OneLetter(name) => ClassQuery::OneLetter(name), |
1041 | 0 | Named(ref name) => ClassQuery::Binary(name), |
1042 | 0 | NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { |
1043 | 0 | property_name: name, |
1044 | 0 | property_value: value, |
1045 | 0 | }, |
1046 | | }; |
1047 | 0 | let mut result = self.convert_unicode_class_error( |
1048 | 0 | &ast_class.span, |
1049 | 0 | unicode::class(query), |
1050 | 0 | ); |
1051 | 0 | if let Ok(ref mut class) = result { |
1052 | 0 | self.unicode_fold_and_negate( |
1053 | 0 | &ast_class.span, |
1054 | 0 | ast_class.negated, |
1055 | 0 | class, |
1056 | 0 | )?; |
1057 | 0 | } |
1058 | 0 | result |
1059 | 0 | } |
1060 | | |
1061 | 0 | fn hir_ascii_unicode_class( |
1062 | 0 | &self, |
1063 | 0 | ast: &ast::ClassAscii, |
1064 | 0 | ) -> Result<hir::ClassUnicode> { |
1065 | 0 | let mut cls = hir::ClassUnicode::new( |
1066 | 0 | ascii_class_as_chars(&ast.kind) |
1067 | 0 | .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), |
1068 | 0 | ); |
1069 | 0 | self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; |
1070 | 0 | Ok(cls) |
1071 | 0 | } |
1072 | | |
1073 | 0 | fn hir_ascii_byte_class( |
1074 | 0 | &self, |
1075 | 0 | ast: &ast::ClassAscii, |
1076 | 0 | ) -> Result<hir::ClassBytes> { |
1077 | 0 | let mut cls = hir::ClassBytes::new( |
1078 | 0 | ascii_class(&ast.kind) |
1079 | 0 | .map(|(s, e)| hir::ClassBytesRange::new(s, e)), |
1080 | 0 | ); |
1081 | 0 | self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; |
1082 | 0 | Ok(cls) |
1083 | 0 | } |
1084 | | |
1085 | 0 | fn hir_perl_unicode_class( |
1086 | 0 | &self, |
1087 | 0 | ast_class: &ast::ClassPerl, |
1088 | 0 | ) -> Result<hir::ClassUnicode> { |
1089 | | use crate::ast::ClassPerlKind::*; |
1090 | | |
1091 | 0 | assert!(self.flags().unicode()); |
1092 | 0 | let result = match ast_class.kind { |
1093 | 0 | Digit => unicode::perl_digit(), |
1094 | 0 | Space => unicode::perl_space(), |
1095 | 0 | Word => unicode::perl_word(), |
1096 | | }; |
1097 | 0 | let mut class = |
1098 | 0 | self.convert_unicode_class_error(&ast_class.span, result)?; |
1099 | | // We needn't apply case folding here because the Perl Unicode classes |
1100 | | // are already closed under Unicode simple case folding. |
1101 | 0 | if ast_class.negated { |
1102 | 0 | class.negate(); |
1103 | 0 | } |
1104 | 0 | Ok(class) |
1105 | 0 | } |
1106 | | |
1107 | 0 | fn hir_perl_byte_class( |
1108 | 0 | &self, |
1109 | 0 | ast_class: &ast::ClassPerl, |
1110 | 0 | ) -> Result<hir::ClassBytes> { |
1111 | | use crate::ast::ClassPerlKind::*; |
1112 | | |
1113 | 0 | assert!(!self.flags().unicode()); |
1114 | 0 | let mut class = match ast_class.kind { |
1115 | 0 | Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), |
1116 | 0 | Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), |
1117 | 0 | Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), |
1118 | | }; |
1119 | | // We needn't apply case folding here because the Perl ASCII classes |
1120 | | // are already closed (under ASCII case folding). |
1121 | 0 | if ast_class.negated { |
1122 | 0 | class.negate(); |
1123 | 0 | } |
1124 | | // Negating a Perl byte class is likely to cause it to match invalid |
1125 | | // UTF-8. That's only OK if the translator is configured to allow such |
1126 | | // things. |
1127 | 0 | if self.trans().utf8 && !class.is_ascii() { |
1128 | 0 | return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); |
1129 | 0 | } |
1130 | 0 | Ok(class) |
1131 | 0 | } |
1132 | | |
1133 | | /// Converts the given Unicode specific error to an HIR translation error. |
1134 | | /// |
1135 | | /// The span given should approximate the position at which an error would |
1136 | | /// occur. |
1137 | 0 | fn convert_unicode_class_error( |
1138 | 0 | &self, |
1139 | 0 | span: &Span, |
1140 | 0 | result: core::result::Result<hir::ClassUnicode, unicode::Error>, |
1141 | 0 | ) -> Result<hir::ClassUnicode> { |
1142 | 0 | result.map_err(|err| { |
1143 | 0 | let sp = span.clone(); |
1144 | 0 | match err { |
1145 | | unicode::Error::PropertyNotFound => { |
1146 | 0 | self.error(sp, ErrorKind::UnicodePropertyNotFound) |
1147 | | } |
1148 | | unicode::Error::PropertyValueNotFound => { |
1149 | 0 | self.error(sp, ErrorKind::UnicodePropertyValueNotFound) |
1150 | | } |
1151 | | unicode::Error::PerlClassNotFound => { |
1152 | 0 | self.error(sp, ErrorKind::UnicodePerlClassNotFound) |
1153 | | } |
1154 | | } |
1155 | 0 | }) |
1156 | 0 | } |
1157 | | |
1158 | 0 | fn unicode_fold_and_negate( |
1159 | 0 | &self, |
1160 | 0 | span: &Span, |
1161 | 0 | negated: bool, |
1162 | 0 | class: &mut hir::ClassUnicode, |
1163 | 0 | ) -> Result<()> { |
1164 | 0 | // Note that we must apply case folding before negation! |
1165 | 0 | // Consider `(?i)[^x]`. If we applied negation first, then |
1166 | 0 | // the result would be the character class that matched any |
1167 | 0 | // Unicode scalar value. |
1168 | 0 | if self.flags().case_insensitive() { |
1169 | 0 | class.try_case_fold_simple().map_err(|_| { |
1170 | 0 | self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) |
1171 | 0 | })?; |
1172 | 0 | } |
1173 | 0 | if negated { |
1174 | 0 | class.negate(); |
1175 | 0 | } |
1176 | 0 | Ok(()) |
1177 | 0 | } |
1178 | | |
1179 | 0 | fn bytes_fold_and_negate( |
1180 | 0 | &self, |
1181 | 0 | span: &Span, |
1182 | 0 | negated: bool, |
1183 | 0 | class: &mut hir::ClassBytes, |
1184 | 0 | ) -> Result<()> { |
1185 | 0 | // Note that we must apply case folding before negation! |
1186 | 0 | // Consider `(?i)[^x]`. If we applied negation first, then |
1187 | 0 | // the result would be the character class that matched any |
1188 | 0 | // Unicode scalar value. |
1189 | 0 | if self.flags().case_insensitive() { |
1190 | 0 | class.case_fold_simple(); |
1191 | 0 | } |
1192 | 0 | if negated { |
1193 | 0 | class.negate(); |
1194 | 0 | } |
1195 | 0 | if self.trans().utf8 && !class.is_ascii() { |
1196 | 0 | return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); |
1197 | 0 | } |
1198 | 0 | Ok(()) |
1199 | 0 | } |
1200 | | |
1201 | | /// Return a scalar byte value suitable for use as a literal in a byte |
1202 | | /// character class. |
1203 | 0 | fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { |
1204 | 0 | match self.ast_literal_to_scalar(ast)? { |
1205 | 0 | Either::Right(byte) => Ok(byte), |
1206 | 0 | Either::Left(ch) => { |
1207 | 0 | if ch.is_ascii() { |
1208 | 0 | Ok(u8::try_from(ch).unwrap()) |
1209 | | } else { |
1210 | | // We can't feasibly support Unicode in |
1211 | | // byte oriented classes. Byte classes don't |
1212 | | // do Unicode case folding. |
1213 | 0 | Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) |
1214 | | } |
1215 | | } |
1216 | | } |
1217 | 0 | } |
1218 | | } |
1219 | | |
1220 | | /// A translator's representation of a regular expression's flags at any given |
1221 | | /// moment in time. |
1222 | | /// |
1223 | | /// Each flag can be in one of three states: absent, present but disabled or |
1224 | | /// present but enabled. |
1225 | | #[derive(Clone, Copy, Debug, Default)] |
1226 | | struct Flags { |
1227 | | case_insensitive: Option<bool>, |
1228 | | multi_line: Option<bool>, |
1229 | | dot_matches_new_line: Option<bool>, |
1230 | | swap_greed: Option<bool>, |
1231 | | unicode: Option<bool>, |
1232 | | crlf: Option<bool>, |
1233 | | // Note that `ignore_whitespace` is omitted here because it is handled |
1234 | | // entirely in the parser. |
1235 | | } |
1236 | | |
1237 | | impl Flags { |
1238 | 0 | fn from_ast(ast: &ast::Flags) -> Flags { |
1239 | 0 | let mut flags = Flags::default(); |
1240 | 0 | let mut enable = true; |
1241 | 0 | for item in &ast.items { |
1242 | 0 | match item.kind { |
1243 | 0 | ast::FlagsItemKind::Negation => { |
1244 | 0 | enable = false; |
1245 | 0 | } |
1246 | 0 | ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { |
1247 | 0 | flags.case_insensitive = Some(enable); |
1248 | 0 | } |
1249 | 0 | ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { |
1250 | 0 | flags.multi_line = Some(enable); |
1251 | 0 | } |
1252 | 0 | ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { |
1253 | 0 | flags.dot_matches_new_line = Some(enable); |
1254 | 0 | } |
1255 | 0 | ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { |
1256 | 0 | flags.swap_greed = Some(enable); |
1257 | 0 | } |
1258 | 0 | ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { |
1259 | 0 | flags.unicode = Some(enable); |
1260 | 0 | } |
1261 | 0 | ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { |
1262 | 0 | flags.crlf = Some(enable); |
1263 | 0 | } |
1264 | 0 | ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} |
1265 | | } |
1266 | | } |
1267 | 0 | flags |
1268 | 0 | } |
1269 | | |
1270 | 0 | fn merge(&mut self, previous: &Flags) { |
1271 | 0 | if self.case_insensitive.is_none() { |
1272 | 0 | self.case_insensitive = previous.case_insensitive; |
1273 | 0 | } |
1274 | 0 | if self.multi_line.is_none() { |
1275 | 0 | self.multi_line = previous.multi_line; |
1276 | 0 | } |
1277 | 0 | if self.dot_matches_new_line.is_none() { |
1278 | 0 | self.dot_matches_new_line = previous.dot_matches_new_line; |
1279 | 0 | } |
1280 | 0 | if self.swap_greed.is_none() { |
1281 | 0 | self.swap_greed = previous.swap_greed; |
1282 | 0 | } |
1283 | 0 | if self.unicode.is_none() { |
1284 | 0 | self.unicode = previous.unicode; |
1285 | 0 | } |
1286 | 0 | if self.crlf.is_none() { |
1287 | 0 | self.crlf = previous.crlf; |
1288 | 0 | } |
1289 | 0 | } |
1290 | | |
1291 | 0 | fn case_insensitive(&self) -> bool { |
1292 | 0 | self.case_insensitive.unwrap_or(false) |
1293 | 0 | } |
1294 | | |
1295 | 0 | fn multi_line(&self) -> bool { |
1296 | 0 | self.multi_line.unwrap_or(false) |
1297 | 0 | } |
1298 | | |
1299 | 0 | fn dot_matches_new_line(&self) -> bool { |
1300 | 0 | self.dot_matches_new_line.unwrap_or(false) |
1301 | 0 | } |
1302 | | |
1303 | 0 | fn swap_greed(&self) -> bool { |
1304 | 0 | self.swap_greed.unwrap_or(false) |
1305 | 0 | } |
1306 | | |
1307 | 0 | fn unicode(&self) -> bool { |
1308 | 0 | self.unicode.unwrap_or(true) |
1309 | 0 | } |
1310 | | |
1311 | 0 | fn crlf(&self) -> bool { |
1312 | 0 | self.crlf.unwrap_or(false) |
1313 | 0 | } |
1314 | | } |
1315 | | |
1316 | 0 | fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { |
1317 | 0 | let ranges: Vec<_> = ascii_class(kind) |
1318 | 0 | .map(|(s, e)| hir::ClassBytesRange::new(s, e)) |
1319 | 0 | .collect(); |
1320 | 0 | hir::ClassBytes::new(ranges) |
1321 | 0 | } |
1322 | | |
1323 | 0 | fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> { |
1324 | | use crate::ast::ClassAsciiKind::*; |
1325 | | |
1326 | 0 | let slice: &'static [(u8, u8)] = match *kind { |
1327 | 0 | Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], |
1328 | 0 | Alpha => &[(b'A', b'Z'), (b'a', b'z')], |
1329 | 0 | Ascii => &[(b'\x00', b'\x7F')], |
1330 | 0 | Blank => &[(b'\t', b'\t'), (b' ', b' ')], |
1331 | 0 | Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], |
1332 | 0 | Digit => &[(b'0', b'9')], |
1333 | 0 | Graph => &[(b'!', b'~')], |
1334 | 0 | Lower => &[(b'a', b'z')], |
1335 | 0 | Print => &[(b' ', b'~')], |
1336 | 0 | Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], |
1337 | 0 | Space => &[ |
1338 | 0 | (b'\t', b'\t'), |
1339 | 0 | (b'\n', b'\n'), |
1340 | 0 | (b'\x0B', b'\x0B'), |
1341 | 0 | (b'\x0C', b'\x0C'), |
1342 | 0 | (b'\r', b'\r'), |
1343 | 0 | (b' ', b' '), |
1344 | 0 | ], |
1345 | 0 | Upper => &[(b'A', b'Z')], |
1346 | 0 | Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], |
1347 | 0 | Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], |
1348 | | }; |
1349 | 0 | slice.iter().copied() |
1350 | 0 | } |
1351 | | |
1352 | 0 | fn ascii_class_as_chars( |
1353 | 0 | kind: &ast::ClassAsciiKind, |
1354 | 0 | ) -> impl Iterator<Item = (char, char)> { |
1355 | 0 | ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) |
1356 | 0 | } |
1357 | | |
1358 | | #[cfg(test)] |
1359 | | mod tests { |
1360 | | use crate::{ |
1361 | | ast::{parse::ParserBuilder, Position}, |
1362 | | hir::{Look, Properties}, |
1363 | | }; |
1364 | | |
1365 | | use super::*; |
1366 | | |
1367 | | // We create these errors to compare with real hir::Errors in the tests. |
1368 | | // We define equality between TestError and hir::Error to disregard the |
1369 | | // pattern string in hir::Error, which is annoying to provide in tests. |
1370 | | #[derive(Clone, Debug)] |
1371 | | struct TestError { |
1372 | | span: Span, |
1373 | | kind: hir::ErrorKind, |
1374 | | } |
1375 | | |
1376 | | impl PartialEq<hir::Error> for TestError { |
1377 | | fn eq(&self, other: &hir::Error) -> bool { |
1378 | | self.span == other.span && self.kind == other.kind |
1379 | | } |
1380 | | } |
1381 | | |
1382 | | impl PartialEq<TestError> for hir::Error { |
1383 | | fn eq(&self, other: &TestError) -> bool { |
1384 | | self.span == other.span && self.kind == other.kind |
1385 | | } |
1386 | | } |
1387 | | |
1388 | | fn parse(pattern: &str) -> Ast { |
1389 | | ParserBuilder::new().octal(true).build().parse(pattern).unwrap() |
1390 | | } |
1391 | | |
1392 | | fn t(pattern: &str) -> Hir { |
1393 | | TranslatorBuilder::new() |
1394 | | .utf8(true) |
1395 | | .build() |
1396 | | .translate(pattern, &parse(pattern)) |
1397 | | .unwrap() |
1398 | | } |
1399 | | |
1400 | | fn t_err(pattern: &str) -> hir::Error { |
1401 | | TranslatorBuilder::new() |
1402 | | .utf8(true) |
1403 | | .build() |
1404 | | .translate(pattern, &parse(pattern)) |
1405 | | .unwrap_err() |
1406 | | } |
1407 | | |
1408 | | fn t_bytes(pattern: &str) -> Hir { |
1409 | | TranslatorBuilder::new() |
1410 | | .utf8(false) |
1411 | | .build() |
1412 | | .translate(pattern, &parse(pattern)) |
1413 | | .unwrap() |
1414 | | } |
1415 | | |
1416 | | fn props(pattern: &str) -> Properties { |
1417 | | t(pattern).properties().clone() |
1418 | | } |
1419 | | |
1420 | | fn props_bytes(pattern: &str) -> Properties { |
1421 | | t_bytes(pattern).properties().clone() |
1422 | | } |
1423 | | |
1424 | | fn hir_lit(s: &str) -> Hir { |
1425 | | hir_blit(s.as_bytes()) |
1426 | | } |
1427 | | |
1428 | | fn hir_blit(s: &[u8]) -> Hir { |
1429 | | Hir::literal(s) |
1430 | | } |
1431 | | |
1432 | | fn hir_capture(index: u32, expr: Hir) -> Hir { |
1433 | | Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) |
1434 | | } |
1435 | | |
1436 | | fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { |
1437 | | Hir::capture(hir::Capture { |
1438 | | index, |
1439 | | name: Some(name.into()), |
1440 | | sub: Box::new(expr), |
1441 | | }) |
1442 | | } |
1443 | | |
1444 | | fn hir_quest(greedy: bool, expr: Hir) -> Hir { |
1445 | | Hir::repetition(hir::Repetition { |
1446 | | min: 0, |
1447 | | max: Some(1), |
1448 | | greedy, |
1449 | | sub: Box::new(expr), |
1450 | | }) |
1451 | | } |
1452 | | |
1453 | | fn hir_star(greedy: bool, expr: Hir) -> Hir { |
1454 | | Hir::repetition(hir::Repetition { |
1455 | | min: 0, |
1456 | | max: None, |
1457 | | greedy, |
1458 | | sub: Box::new(expr), |
1459 | | }) |
1460 | | } |
1461 | | |
1462 | | fn hir_plus(greedy: bool, expr: Hir) -> Hir { |
1463 | | Hir::repetition(hir::Repetition { |
1464 | | min: 1, |
1465 | | max: None, |
1466 | | greedy, |
1467 | | sub: Box::new(expr), |
1468 | | }) |
1469 | | } |
1470 | | |
1471 | | fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir { |
1472 | | Hir::repetition(hir::Repetition { |
1473 | | min, |
1474 | | max, |
1475 | | greedy, |
1476 | | sub: Box::new(expr), |
1477 | | }) |
1478 | | } |
1479 | | |
1480 | | fn hir_alt(alts: Vec<Hir>) -> Hir { |
1481 | | Hir::alternation(alts) |
1482 | | } |
1483 | | |
1484 | | fn hir_cat(exprs: Vec<Hir>) -> Hir { |
1485 | | Hir::concat(exprs) |
1486 | | } |
1487 | | |
1488 | | #[allow(dead_code)] |
1489 | | fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { |
1490 | | Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) |
1491 | | } |
1492 | | |
1493 | | #[allow(dead_code)] |
1494 | | fn hir_uclass_perl_word() -> Hir { |
1495 | | Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) |
1496 | | } |
1497 | | |
1498 | | fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { |
1499 | | Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( |
1500 | | ascii_class_as_chars(kind) |
1501 | | .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), |
1502 | | ))) |
1503 | | } |
1504 | | |
1505 | | fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { |
1506 | | Hir::class(hir::Class::Bytes(hir::ClassBytes::new( |
1507 | | ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), |
1508 | | ))) |
1509 | | } |
1510 | | |
1511 | | fn hir_uclass(ranges: &[(char, char)]) -> Hir { |
1512 | | Hir::class(uclass(ranges)) |
1513 | | } |
1514 | | |
1515 | | fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { |
1516 | | Hir::class(bclass(ranges)) |
1517 | | } |
1518 | | |
1519 | | fn hir_case_fold(expr: Hir) -> Hir { |
1520 | | match expr.into_kind() { |
1521 | | HirKind::Class(mut cls) => { |
1522 | | cls.case_fold_simple(); |
1523 | | Hir::class(cls) |
1524 | | } |
1525 | | _ => panic!("cannot case fold non-class Hir expr"), |
1526 | | } |
1527 | | } |
1528 | | |
1529 | | fn hir_negate(expr: Hir) -> Hir { |
1530 | | match expr.into_kind() { |
1531 | | HirKind::Class(mut cls) => { |
1532 | | cls.negate(); |
1533 | | Hir::class(cls) |
1534 | | } |
1535 | | _ => panic!("cannot negate non-class Hir expr"), |
1536 | | } |
1537 | | } |
1538 | | |
1539 | | fn uclass(ranges: &[(char, char)]) -> hir::Class { |
1540 | | let ranges: Vec<hir::ClassUnicodeRange> = ranges |
1541 | | .iter() |
1542 | | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
1543 | | .collect(); |
1544 | | hir::Class::Unicode(hir::ClassUnicode::new(ranges)) |
1545 | | } |
1546 | | |
1547 | | fn bclass(ranges: &[(u8, u8)]) -> hir::Class { |
1548 | | let ranges: Vec<hir::ClassBytesRange> = ranges |
1549 | | .iter() |
1550 | | .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) |
1551 | | .collect(); |
1552 | | hir::Class::Bytes(hir::ClassBytes::new(ranges)) |
1553 | | } |
1554 | | |
/// Apply simple case folding to `cls` and wrap it in a class expression.
#[cfg(feature = "unicode-case")]
fn class_case_fold(cls: hir::Class) -> Hir {
    let mut folded = cls;
    folded.case_fold_simple();
    Hir::class(folded)
}
1560 | | |
1561 | | fn class_negate(mut cls: hir::Class) -> Hir { |
1562 | | cls.negate(); |
1563 | | Hir::class(cls) |
1564 | | } |
1565 | | |
1566 | | #[allow(dead_code)] |
1567 | | fn hir_union(expr1: Hir, expr2: Hir) -> Hir { |
1568 | | use crate::hir::Class::{Bytes, Unicode}; |
1569 | | |
1570 | | match (expr1.into_kind(), expr2.into_kind()) { |
1571 | | (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { |
1572 | | c1.union(&c2); |
1573 | | Hir::class(hir::Class::Unicode(c1)) |
1574 | | } |
1575 | | (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { |
1576 | | c1.union(&c2); |
1577 | | Hir::class(hir::Class::Bytes(c1)) |
1578 | | } |
1579 | | _ => panic!("cannot union non-class Hir exprs"), |
1580 | | } |
1581 | | } |
1582 | | |
1583 | | #[allow(dead_code)] |
1584 | | fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { |
1585 | | use crate::hir::Class::{Bytes, Unicode}; |
1586 | | |
1587 | | match (expr1.into_kind(), expr2.into_kind()) { |
1588 | | (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { |
1589 | | c1.difference(&c2); |
1590 | | Hir::class(hir::Class::Unicode(c1)) |
1591 | | } |
1592 | | (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { |
1593 | | c1.difference(&c2); |
1594 | | Hir::class(hir::Class::Bytes(c1)) |
1595 | | } |
1596 | | _ => panic!("cannot difference non-class Hir exprs"), |
1597 | | } |
1598 | | } |
1599 | | |
1600 | | fn hir_look(look: hir::Look) -> Hir { |
1601 | | Hir::look(look) |
1602 | | } |
1603 | | |
// Patterns made up of empty sub-expressions: bare flags, empty groups and
// alternations with empty branches.
#[test]
fn empty() {
    let none = Hir::empty;

    assert_eq!(t(""), none());
    assert_eq!(t("(?i)"), none());
    assert_eq!(t("()"), hir_capture(1, none()));
    assert_eq!(t("(?:)"), none());
    assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", none()));
    assert_eq!(t("|"), hir_alt(vec![none(), none()]));

    let two_groups =
        hir_alt(vec![hir_capture(1, none()), hir_capture(2, none())]);
    assert_eq!(t("()|()"), two_groups);

    let empty_then_b = hir_alt(vec![none(), hir_lit("b")]);
    assert_eq!(t("(|b)"), hir_capture(1, empty_then_b));

    let a_then_empty = hir_alt(vec![hir_lit("a"), none()]);
    assert_eq!(t("(a|)"), hir_capture(1, a_then_empty));

    let a_empty_c = hir_alt(vec![hir_lit("a"), none(), hir_lit("c")]);
    assert_eq!(t("(a||c)"), hir_capture(1, a_empty_c));

    let three_empty = hir_alt(vec![none(), none(), none()]);
    assert_eq!(t("(||)"), hir_capture(1, three_empty));
}
1642 | | |
// Plain literals with and without Unicode mode, plus the error raised for
// a non-ASCII byte when the `utf8` option is enabled.
#[test]
fn literal() {
    let lit_a = || hir_lit("a");
    let snowman = || hir_lit("☃");

    assert_eq!(t("a"), lit_a());
    assert_eq!(t("(?-u)a"), lit_a());
    assert_eq!(t("☃"), snowman());
    assert_eq!(t("abcd"), hir_lit("abcd"));

    assert_eq!(t_bytes("(?-u)a"), lit_a());
    assert_eq!(t_bytes("(?-u)\x61"), lit_a());
    assert_eq!(t_bytes(r"(?-u)\x61"), lit_a());
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    assert_eq!(t("(?-u)☃"), snowman());

    let expected = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)),
    };
    assert_eq!(t_err(r"(?-u)\xFF"), expected);
}
1667 | | |
// Literals under the `i` flag. Letters become two-member classes (Unicode
// or byte, depending on the `u` flag); characters without another case
// remain literals.
#[test]
fn literal_case_insensitive() {
    // Expected class for one letter in both cases, per flavor.
    #[cfg(feature = "unicode-case")]
    let uni = |upper: char, lower: char| {
        hir_uclass(&[(upper, upper), (lower, lower)])
    };
    let byte = |upper: u8, lower: u8| {
        hir_bclass(&[(upper, upper), (lower, lower)])
    };

    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)a"), uni('A', 'a'));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)"), uni('A', 'a'));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), uni('A', 'a'), hir_lit("a")])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            uni('A', 'a'),
            uni('B', 'b'),
            hir_lit("@"),
            uni('C', 'c'),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );

    assert_eq!(t("(?i-u)a"), byte(b'A', b'a'));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), byte(b'A', b'a'), hir_lit("a")])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            byte(b'A', b'a'),
            byte(b'B', b'b'),
            hir_lit("@"),
            byte(b'C', b'c'),
        ])
    );

    assert_eq!(t_bytes("(?i-u)a"), byte(b'A', b'a'));
    assert_eq!(t_bytes("(?i-u)\x61"), byte(b'A', b'a'));
    assert_eq!(t_bytes(r"(?i-u)\x61"), byte(b'A', b'a'));
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    assert_eq!(t("(?i-u)β"), hir_lit("β"),);
}
1735 | | |
// Checks the classes produced for `.` under the `R` and `s` flags, in both
// Unicode and `(?-u)` form, and that `t_err` rejects the `(?-u)` forms.
#[test]
fn dot() {
    let every_char = || hir_uclass(&[('\0', '\u{10FFFF}')]);
    let every_byte = || hir_bclass(&[(b'\0', b'\xFF')]);

    assert_eq!(
        t("."),
        hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
    );
    assert_eq!(
        t("(?R)."),
        hir_uclass(&[
            ('\0', '\t'),
            ('\x0B', '\x0C'),
            ('\x0E', '\u{10FFFF}'),
        ])
    );
    assert_eq!(t("(?s)."), every_char());
    assert_eq!(t("(?Rs)."), every_char());

    assert_eq!(
        t_bytes("(?-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
    );
    assert_eq!(
        t_bytes("(?R-u)."),
        hir_bclass(&[
            (b'\0', b'\t'),
            (b'\x0B', b'\x0C'),
            (b'\x0E', b'\xFF'),
        ])
    );
    assert_eq!(t_bytes("(?s-u)."), every_byte());
    assert_eq!(t_bytes("(?Rs-u)."), every_byte());

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    // Each entry is (pattern, byte offset of the `.`); the reported span
    // covers exactly that one character on line 1.
    let rejected =
        [("(?-u).", 5), ("(?R-u).", 6), ("(?s-u).", 6), ("(?Rs-u).", 7)];
    for (pattern, offset) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(offset, 1, offset + 1),
                    Position::new(offset + 1, 1, offset + 2)
                ),
            }
        );
    }
}
1809 | | |
// Checks which `hir::Look` each anchor and word-boundary assertion maps to.
#[test]
fn assertions() {
    let cases = [
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        (r"\b", hir::Look::WordUnicode),
        (r"\B", hir::Look::WordUnicodeNegate),
        (r"(?-u)\b", hir::Look::WordAscii),
        (r"(?-u)\B", hir::Look::WordAsciiNegate),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look));
    }
}
1826 | | |
// Checks translation of capturing, named and non-capturing groups, including
// the capture indices assigned to each.
#[test]
fn group() {
    // Unnamed capture group wrapping a literal.
    let cap = |index, text: &str| hir_capture(index, hir_lit(text));
    // Named capture group wrapping a literal.
    let named = |index, name: &str, text: &str| {
        hir_capture_name(index, name, hir_lit(text))
    };

    assert_eq!(t("(a)"), cap(1, "a"));
    assert_eq!(t("(a)(b)"), hir_cat(vec![cap(1, "a"), cap(2, "b")]));
    assert_eq!(t("(a)|(b)"), hir_alt(vec![cap(1, "a"), cap(2, "b")]));

    assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty()));
    assert_eq!(t("(?P<foo>a)"), named(1, "foo", "a"));
    assert_eq!(
        t("(?P<foo>a)(?P<bar>b)"),
        hir_cat(vec![named(1, "foo", "a"), named(2, "bar", "b")])
    );

    assert_eq!(t("(?:)"), Hir::empty());
    assert_eq!(t("(?:a)"), hir_lit("a"));
    assert_eq!(t("(?:a)(b)"), hir_cat(vec![hir_lit("a"), cap(1, "b")]));
    assert_eq!(
        t("(a)(?:b)(c)"),
        hir_cat(vec![cap(1, "a"), hir_lit("b"), cap(2, "c")])
    );
    assert_eq!(
        t("(a)(?P<foo>b)(c)"),
        hir_cat(vec![cap(1, "a"), named(2, "foo", "b"), cap(3, "c")])
    );

    // Groups holding nothing, or only a flag directive, capture the empty
    // expression.
    for pattern in ["()", "((?i))", "((?x))"] {
        assert_eq!(t(pattern), hir_capture(1, Hir::empty()));
    }
    assert_eq!(
        t("(((?x)))"),
        hir_capture(1, hir_capture(2, Hir::empty()))
    );
}
1883 | | |
// Checks the `hir::Look` chosen for `^`, `$`, `\A` and `\z` under each
// combination of the `m` and `R` flags.
#[test]
fn line_anchors() {
    let cases = [
        // No flags.
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        // `m` only.
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        // `R` only.
        (r"(?R)\A", hir::Look::Start),
        (r"(?R)\z", hir::Look::End),
        ("(?R)^", hir::Look::Start),
        ("(?R)$", hir::Look::End),
        // `R` and `m` together.
        (r"(?Rm)\A", hir::Look::Start),
        (r"(?Rm)\z", hir::Look::End),
        ("(?Rm)^", hir::Look::StartCRLF),
        ("(?Rm)$", hir::Look::EndCRLF),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look));
    }
}
1906 | | |
// Checks how inline flag groups and flag directives scope over the
// expressions that follow them.
#[test]
fn flags() {
    // Expected translation of a case-insensitive `a` with Unicode enabled.
    #[cfg(feature = "unicode-case")]
    let folded_a = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
    // Expected translation of a case-insensitive `a` with Unicode disabled.
    let ascii_folded_a = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);

    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)a"), hir_cat(vec![folded_a(), hir_lit("a")]));
    assert_eq!(
        t("(?i-u:a)β"),
        hir_cat(vec![ascii_folded_a(), hir_lit("β")])
    );
    assert_eq!(
        t("(?:(?i-u)a)b"),
        hir_cat(vec![ascii_folded_a(), hir_lit("b")])
    );
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![hir_capture(1, ascii_folded_a()), hir_lit("b")])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?-i:a)a"),
        hir_cat(vec![hir_lit("a"), folded_a()])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![folded_a(), hir_look(hir::Look::StartLF)])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            folded_a(),
            hir_look(hir::Look::StartLF),
            folded_a(),
            hir_look(hir::Look::Start),
        ])
    );
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(false, hir_lit("a")),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![
            hir_cat(vec![hir_lit("a"), folded_a()]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![
            hir_cat(vec![folded_a(), hir_lit("a")]),
            folded_a(),
        ])
    );
}
1994 | | |
// Checks that a run of backslash-escaped metacharacters translates to one
// literal holding the unescaped characters.
#[test]
fn escape() {
    let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
    let unescaped = r"\.+*?()|[]{}^$#";
    assert_eq!(t(pattern), hir_lit(unescaped));
}
2002 | | |
// Checks translation of `?`, `*`, `+` and counted repetitions, with and
// without a trailing `?`, and how repetition binds next to concatenation,
// groups and alternation.
#[test]
fn repetition() {
    let a = || hir_lit("a");
    let optional_b = || hir_quest(true, hir_lit("b"));

    // The boolean passed to each helper is `true` for the plain operator
    // and `false` for the `?`-suffixed form.
    assert_eq!(t("a?"), hir_quest(true, a()));
    assert_eq!(t("a*"), hir_star(true, a()));
    assert_eq!(t("a+"), hir_plus(true, a()));
    assert_eq!(t("a??"), hir_quest(false, a()));
    assert_eq!(t("a*?"), hir_star(false, a()));
    assert_eq!(t("a+?"), hir_plus(false, a()));

    // (pattern, boolean flag, minimum, optional maximum)
    let counted = [
        ("a{1}", true, 1, Some(1)),
        ("a{1,}", true, 1, None),
        ("a{1,2}", true, 1, Some(2)),
        ("a{1}?", false, 1, Some(1)),
        ("a{1,}?", false, 1, None),
        ("a{1,2}?", false, 1, Some(2)),
    ];
    for (pattern, flag, min, max) in counted {
        assert_eq!(t(pattern), hir_range(flag, min, max, a()));
    }

    assert_eq!(t("ab?"), hir_cat(vec![a(), optional_b()]));
    assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
    assert_eq!(t("a|b?"), hir_alt(vec![a(), optional_b()]));
}
2029 | | |
// Checks how concatenations and alternations of look-around assertions
// nest, with and without enclosing capture groups.
#[test]
fn cat_alt() {
    let start = || hir_look(hir::Look::Start);
    let end = || hir_look(hir::Look::End);
    let word = || hir_look(hir::Look::WordUnicode);
    let not_word = || hir_look(hir::Look::WordUnicodeNegate);

    let start_end = || hir_cat(vec![start(), end()]);
    let end_word = || hir_cat(vec![end(), word()]);
    let word_not_word = || hir_cat(vec![word(), not_word()]);
    // Expected translation of `^$|$\b|\b\B`.
    let three_branches =
        || hir_alt(vec![start_end(), end_word(), word_not_word()]);

    assert_eq!(t("(^$)"), hir_capture(1, start_end()));
    assert_eq!(t("^|$"), hir_alt(vec![start(), end()]));
    assert_eq!(t(r"^|$|\b"), hir_alt(vec![start(), end(), word()]));
    assert_eq!(t(r"^$|$\b|\b\B"), three_branches());
    assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![start(), end()])));
    assert_eq!(
        t(r"(^|$|\b)"),
        hir_capture(1, hir_alt(vec![start(), end(), word()]))
    );
    assert_eq!(t(r"(^$|$\b|\b\B)"), hir_capture(1, three_branches()));
    assert_eq!(
        t(r"(^$|($\b|(\b\B)))"),
        hir_capture(
            1,
            hir_alt(vec![
                start_end(),
                hir_capture(
                    2,
                    hir_alt(vec![
                        end_word(),
                        hir_capture(3, word_not_word()),
                    ])
                ),
            ])
        )
    );
}
2081 | | |
// Tests that an alternation made up only of classes, such as '[a-z]|[A-Z]',
// is translated to the single class '[A-Za-z]', i.e. the union of its
// branches. The exception is when some branches match invalid UTF-8 and
// others match non-ASCII Unicode: those stay a real alternation.
#[test]
fn cat_class_flattened() {
    let actual = t(r"[a-z]|[A-Z]");
    assert_eq!(actual, hir_uclass(&[('A', 'Z'), ('a', 'z')]));

    // Combining all of the letter properties should give us the one giant
    // letter property.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)
            \p{Lowercase_Letter}
            |\p{Uppercase_Letter}
            |\p{Titlecase_Letter}
            |\p{Modifier_Letter}
            |\p{Other_Letter}
        "),
        hir_uclass_query(ClassQuery::Binary("letter"))
    );

    // Byte classes that can truly match invalid UTF-8 cannot be combined
    // with Unicode classes.
    let high_bytes = || hir_bclass(&[(b'\x90', b'\xFF')]);
    assert_eq!(
        t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
        hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            high_bytes(),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ])
    );

    // Byte classes on their own can be combined, even if some are ASCII
    // and others are invalid UTF-8.
    assert_eq!(
        t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')])
    );
}
2120 | | |
// Checks translation of every POSIX-style ASCII class name, plus the
// negated, case-insensitive and `(?-u)` forms of `[[:lower:]]`.
#[test]
fn class_ascii() {
    let named_classes = [
        ("[[:alnum:]]", ast::ClassAsciiKind::Alnum),
        ("[[:alpha:]]", ast::ClassAsciiKind::Alpha),
        ("[[:ascii:]]", ast::ClassAsciiKind::Ascii),
        ("[[:blank:]]", ast::ClassAsciiKind::Blank),
        ("[[:cntrl:]]", ast::ClassAsciiKind::Cntrl),
        ("[[:digit:]]", ast::ClassAsciiKind::Digit),
        ("[[:graph:]]", ast::ClassAsciiKind::Graph),
        ("[[:lower:]]", ast::ClassAsciiKind::Lower),
        ("[[:print:]]", ast::ClassAsciiKind::Print),
        ("[[:punct:]]", ast::ClassAsciiKind::Punct),
        ("[[:space:]]", ast::ClassAsciiKind::Space),
        ("[[:upper:]]", ast::ClassAsciiKind::Upper),
        ("[[:word:]]", ast::ClassAsciiKind::Word),
        ("[[:xdigit:]]", ast::ClassAsciiKind::Xdigit),
    ];
    for (pattern, kind) in named_classes {
        assert_eq!(t(pattern), hir_ascii_uclass(&kind));
    }

    let lower = ast::ClassAsciiKind::Lower;

    assert_eq!(t("[[:^lower:]]"), hir_negate(hir_ascii_uclass(&lower)));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ])
    );

    assert_eq!(t("(?-u)[[:lower:]]"), hir_ascii_bclass(&lower));
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_ascii_bclass(&lower))
    );

    // The negated `(?-u)` forms are expected to fail with `InvalidUtf8`.
    // Columns are one more than the offsets because everything is on line 1.
    let invalid_utf8 = |start: usize, end: usize| TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(
            Position::new(start, 1, start + 1),
            Position::new(end, 1, end + 1),
        ),
    };
    assert_eq!(t_err("(?-u)[[:^lower:]]"), invalid_utf8(6, 16));
    assert_eq!(t_err("(?i-u)[[:^lower:]]"), invalid_utf8(7, 17));
}
2225 | | |
// Checks that two ASCII classes inside one bracketed class are unioned.
#[test]
fn class_ascii_multiple() {
    // See: https://github.com/rust-lang/regex/issues/680
    let unicode_actual = t("[[:alnum:][:^ascii:]]");
    let unicode_expected = hir_union(
        hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
        hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
    );
    assert_eq!(unicode_actual, unicode_expected);

    let bytes_actual = t_bytes("(?-u)[[:alnum:][:^ascii:]]");
    let bytes_expected = hir_union(
        hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
        hir_bclass(&[(0x80, 0xFF)]),
    );
    assert_eq!(bytes_actual, bytes_expected);
}
2244 | | |
// Checks translation of `\d`, `\s`, `\w` and their negations with Unicode
// enabled, with and without the `i` flag.
#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl_unicode() {
    let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
    let space = || hir_uclass_query(ClassQuery::Binary("space"));

    // Unicode
    assert_eq!(t(r"\d"), digit());
    assert_eq!(t(r"\s"), space());
    assert_eq!(t(r"\w"), hir_uclass_perl_word());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\d"), digit());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\s"), space());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());

    // Unicode, negated
    assert_eq!(t(r"\D"), hir_negate(digit()));
    assert_eq!(t(r"\S"), hir_negate(space()));
    assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\D"), hir_negate(digit()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\S"), hir_negate(space()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
}
2288 | | |
// Checks translation of `\d`, `\s`, `\w` and their negations with Unicode
// disabled, including the errors expected from `t_err` for the negations.
#[test]
fn class_perl_ascii() {
    // ASCII only
    let plain = [
        (r"(?-u)\d", ast::ClassAsciiKind::Digit),
        (r"(?-u)\s", ast::ClassAsciiKind::Space),
        (r"(?-u)\w", ast::ClassAsciiKind::Word),
        (r"(?i-u)\d", ast::ClassAsciiKind::Digit),
        (r"(?i-u)\s", ast::ClassAsciiKind::Space),
        (r"(?i-u)\w", ast::ClassAsciiKind::Word),
    ];
    for (pattern, kind) in plain {
        assert_eq!(t(pattern), hir_ascii_bclass(&kind));
    }

    // ASCII only, negated
    let negated = [
        (r"(?-u)\D", ast::ClassAsciiKind::Digit),
        (r"(?-u)\S", ast::ClassAsciiKind::Space),
        (r"(?-u)\W", ast::ClassAsciiKind::Word),
        (r"(?i-u)\D", ast::ClassAsciiKind::Digit),
        (r"(?i-u)\S", ast::ClassAsciiKind::Space),
        (r"(?i-u)\W", ast::ClassAsciiKind::Word),
    ];
    for (pattern, kind) in negated {
        assert_eq!(t_bytes(pattern), hir_negate(hir_ascii_bclass(&kind)));
    }

    // ASCII only, negated, with UTF-8 mode enabled.
    // In this case, negating any Perl class results in an error because
    // all such classes can match invalid UTF-8.
    //
    // Each entry is (pattern, byte offset of the two-byte escape); the
    // reported span covers exactly that escape on line 1.
    let rejected = [
        (r"(?-u)\D", 5),
        (r"(?-u)\S", 5),
        (r"(?-u)\W", 5),
        (r"(?i-u)\D", 6),
        (r"(?i-u)\S", 6),
        (r"(?i-u)\W", 6),
    ];
    for (pattern, offset) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(offset, 1, offset + 1),
                    Position::new(offset + 2, 1, offset + 3),
                ),
            },
        );
    }
}
2407 | | |
// Without the `unicode-perl` feature, `\w` is expected to fail with
// `UnicodePerlClassNotFound` spanning the whole escape.
#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
    let actual = t_err(r"\w");
    let whole_escape =
        Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_escape,
    };
    assert_eq!(actual, expected);
}
2422 | | |
// Without both `unicode-perl` and `unicode-bool`, `\s` is expected to fail
// with `UnicodePerlClassNotFound` spanning the whole escape.
#[test]
#[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
fn class_perl_space_disabled() {
    let actual = t_err(r"\s");
    let whole_escape =
        Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_escape,
    };
    assert_eq!(actual, expected);
}
2437 | | |
// Without both `unicode-perl` and `unicode-gencat`, `\d` is expected to fail
// with `UnicodePerlClassNotFound` spanning the whole escape.
#[test]
#[cfg(all(
    not(feature = "unicode-perl"),
    not(feature = "unicode-gencat")
))]
fn class_perl_digit_disabled() {
    let actual = t_err(r"\d");
    let whole_escape =
        Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_escape,
    };
    assert_eq!(actual, expected);
}
2455 | | |
// Checks the accepted spellings of general-category classes, the special
// `any`/`assigned`/`ascii` names, and the errors for disallowed or unknown
// properties.
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_gencat() {
    let separator = || hir_uclass_query(ClassQuery::Binary("Z"));

    // Every spelling below is expected to give the same class.
    let separator_spellings = [
        r"\pZ",
        r"\pz",
        r"\p{Separator}",
        r"\p{se PaRa ToR}",
        r"\p{gc:Separator}",
        r"\p{gc=Separator}",
    ];
    for pattern in separator_spellings {
        assert_eq!(t(pattern), separator());
    }
    for pattern in [r"\p{Other}", r"\pC"] {
        assert_eq!(
            t(pattern),
            hir_uclass_query(ClassQuery::Binary("Other"))
        );
    }

    for pattern in [r"\PZ", r"\P{separator}", r"\P{gc!=separator}"] {
        assert_eq!(t(pattern), hir_negate(separator()));
    }

    // (pattern, name passed to the class query)
    let special_names = [
        (r"\p{any}", "Any"),
        (r"\p{assigned}", "Assigned"),
        (r"\p{ascii}", "ASCII"),
        (r"\p{gc:any}", "Any"),
        (r"\p{gc:assigned}", "Assigned"),
        (r"\p{gc:ascii}", "ASCII"),
    ];
    for (pattern, name) in special_names {
        assert_eq!(t(pattern), hir_uclass_query(ClassQuery::Binary(name)));
    }

    // Builds the expected error for a span on line 1, where each column is
    // one more than its offset.
    let failure = |kind: hir::ErrorKind, start: usize, end: usize| {
        TestError {
            kind,
            span: Span::new(
                Position::new(start, 1, start + 1),
                Position::new(end, 1, end + 1),
            ),
        }
    };
    assert_eq!(
        t_err(r"(?-u)\pZ"),
        failure(hir::ErrorKind::UnicodeNotAllowed, 5, 8)
    );
    assert_eq!(
        t_err(r"(?-u)\p{Separator}"),
        failure(hir::ErrorKind::UnicodeNotAllowed, 5, 18)
    );
    assert_eq!(
        t_err(r"\pE"),
        failure(hir::ErrorKind::UnicodePropertyNotFound, 0, 3)
    );
    assert_eq!(
        t_err(r"\p{Foo}"),
        failure(hir::ErrorKind::UnicodePropertyNotFound, 0, 7)
    );
    assert_eq!(
        t_err(r"\p{gc:Foo}"),
        failure(hir::ErrorKind::UnicodePropertyValueNotFound, 0, 10)
    );
}
2569 | | |
// Without the `unicode-gencat` feature, general-category lookups are
// expected to fail with `UnicodePropertyNotFound` spanning the whole class.
#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
    // (pattern, end offset of the pattern)
    let cases = [(r"\p{Separator}", 13), (r"\p{Any}", 7)];
    for (pattern, end) in cases {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2595 | | |
// Checks script classes, their case-insensitive and negated forms, and the
// error for unknown `sc`/`scx` values.
#[test]
#[cfg(feature = "unicode-script")]
fn class_unicode_script() {
    assert_eq!(
        t(r"\p{Greek}"),
        hir_uclass_query(ClassQuery::Binary("Greek"))
    );

    // Expected translation of the Greek class under `(?i)`.
    #[cfg(feature = "unicode-case")]
    let folded_greek =
        || hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\p{Greek}"), folded_greek());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\P{Greek}"), hir_negate(folded_greek()));

    // (pattern, end offset of the pattern)
    let unknown_values = [(r"\p{sc:Foo}", 10), (r"\p{scx:Foo}", 11)];
    for (pattern, end) in unknown_values {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2637 | | |
// Without the `unicode-script` feature, script lookups are expected to fail
// with `UnicodePropertyNotFound` spanning the whole class.
#[test]
#[cfg(not(feature = "unicode-script"))]
fn class_unicode_script_disabled() {
    // (pattern, end offset of the pattern)
    let cases = [(r"\p{Greek}", 9), (r"\p{scx:Greek}", 13)];
    for (pattern, end) in cases {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2663 | | |
// With the `unicode-age` feature, an unknown age value is expected to fail
// with `UnicodePropertyValueNotFound` spanning the whole class.
#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
    let actual = t_err(r"\p{age:Foo}");
    let whole_class =
        Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyValueNotFound,
        span: whole_class,
    };
    assert_eq!(actual, expected);
}
2678 | | |
// Checks that `\P{any}` translates to a Unicode class with no ranges.
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
    let actual = t(r"\P{any}");
    let no_ranges = hir_uclass(&[]);
    assert_eq!(actual, no_ranges);
}
2684 | | |
// Without the `unicode-age` feature, an age lookup is expected to fail with
// `UnicodePropertyNotFound` spanning the whole class.
#[test]
#[cfg(not(feature = "unicode-age"))]
fn class_unicode_age_disabled() {
    let actual = t_err(r"\p{age:3.0}");
    let whole_class =
        Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyNotFound,
        span: whole_class,
    };
    assert_eq!(actual, expected);
}
2699 | | |
// Exercises translation of bracketed character classes: plain items and
// ranges, escapes, Perl/Unicode property items, byte classes under `(?-u)`,
// case folding under `(?i)`, negation, and characters that are special
// inside brackets (`[`, `&`, `~`, `-`).
2700 | | #[test] |
2701 | | fn class_bracketed() { |
// A class holding exactly one character is expected to come out as a
// literal rather than as a class.
2702 | | assert_eq!(t("[a]"), hir_lit("a")); |
2703 | | assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')])); |
2704 | | assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')]))); |
2705 | | assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); |
// Overlapping (`a-f`, `d-h`) and adjacent (`a-f`, `g-m`) ranges are expected
// to be merged into a single range.
2706 | | assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); |
2707 | | assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); |
2708 | | assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); |
// Both the escape sequence `\n` and a raw newline character are accepted.
2709 | | assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); |
2710 | | assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); |
// Perl and Unicode property escapes used as the only class item.
2711 | | #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] |
2712 | | assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); |
2713 | | #[cfg(feature = "unicode-gencat")] |
2714 | | assert_eq!( |
2715 | | t(r"[\pZ]"), |
2716 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2717 | | ); |
2718 | | #[cfg(feature = "unicode-gencat")] |
2719 | | assert_eq!( |
2720 | | t(r"[\p{separator}]"), |
2721 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2722 | | ); |
// A negated class around a negated item cancels out: the expected value is
// the plain, un-negated class.
2723 | | #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] |
2724 | | assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); |
2725 | | #[cfg(feature = "unicode-gencat")] |
2726 | | assert_eq!( |
2727 | | t(r"[^\PZ]"), |
2728 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2729 | | ); |
2730 | | #[cfg(feature = "unicode-gencat")] |
2731 | | assert_eq!( |
2732 | | t(r"[^\P{separator}]"), |
2733 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2734 | | ); |
2735 | | #[cfg(all( |
2736 | | feature = "unicode-case", |
2737 | | any(feature = "unicode-perl", feature = "unicode-gencat") |
2738 | | ))] |
2739 | | assert_eq!( |
2740 | | t(r"(?i)[^\D]"), |
2741 | | hir_uclass_query(ClassQuery::Binary("digit")) |
2742 | | ); |
2743 | | #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] |
2744 | | assert_eq!( |
2745 | | t(r"(?i)[^\P{greek}]"), |
2746 | | hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) |
2747 | | ); |
2748 | | |
// With Unicode mode switched off (`(?-u)`) the expected values are byte
// classes. The `\xFF` case goes through `t_bytes` instead of `t`.
2749 | | assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); |
2750 | | assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); |
2751 | | assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); |
2752 | | |
// Case-insensitive classes: the expected ranges list every case variant,
// e.g. `[k]` also picks up U+212A (KELVIN SIGN) in Unicode mode, but only
// `K` and `k` in `(?i-u)` byte mode.
2753 | | #[cfg(feature = "unicode-case")] |
2754 | | assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); |
2755 | | #[cfg(feature = "unicode-case")] |
2756 | | assert_eq!( |
2757 | | t("(?i)[k]"), |
2758 | | hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) |
2759 | | ); |
2760 | | #[cfg(feature = "unicode-case")] |
2761 | | assert_eq!( |
2762 | | t("(?i)[β]"), |
2763 | | hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) |
2764 | | ); |
2765 | | assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); |
2766 | | |
// Negated classes. With `(?i)` the expected value folds case first and then
// negates (see the two `greek` cases at the end of this group).
2767 | | assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')]))); |
2768 | | assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')]))); |
2769 | | assert_eq!( |
2770 | | t_bytes("(?-u)[^a]"), |
2771 | | class_negate(bclass(&[(b'a', b'a')])) |
2772 | | ); |
2773 | | #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] |
2774 | | assert_eq!( |
2775 | | t(r"[^\d]"), |
2776 | | hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) |
2777 | | ); |
2778 | | #[cfg(feature = "unicode-gencat")] |
2779 | | assert_eq!( |
2780 | | t(r"[^\pZ]"), |
2781 | | hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) |
2782 | | ); |
2783 | | #[cfg(feature = "unicode-gencat")] |
2784 | | assert_eq!( |
2785 | | t(r"[^\p{separator}]"), |
2786 | | hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) |
2787 | | ); |
2788 | | #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] |
2789 | | assert_eq!( |
2790 | | t(r"(?i)[^\p{greek}]"), |
2791 | | hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( |
2792 | | "greek" |
2793 | | )))) |
2794 | | ); |
2795 | | #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] |
2796 | | assert_eq!( |
2797 | | t(r"(?i)[\P{greek}]"), |
2798 | | hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( |
2799 | | "greek" |
2800 | | )))) |
2801 | | ); |
2802 | | |
2803 | | // Test some weird cases. |
2804 | | assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); |
2805 | | |
// `&`, `~` and `-` are also set-operator characters inside brackets; each is
// checked bare, escaped, doubled-and-escaped, and as either end of a range.
2806 | | assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); |
2807 | | assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); |
2808 | | assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); |
2809 | | assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); |
2810 | | assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); |
2811 | | |
2812 | | assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); |
2813 | | assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); |
2814 | | assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); |
2815 | | assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); |
2816 | | assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); |
2817 | | |
2818 | | assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); |
2819 | | assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); |
2820 | | assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); |
2821 | | assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); |
2822 | | assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); |
2823 | | |
// `t_err` rejects the negated byte class with `InvalidUtf8`, whereas
// `t_bytes` accepted the very same pattern earlier in this test. The expected
// span 5..9 is the `[^a]` part that follows the 5-byte `(?-u)` prefix.
// NOTE(review): presumably `t`/`t_err` require valid UTF-8 matches and
// `t_bytes` does not -- confirm against the helper definitions.
2824 | | assert_eq!( |
2825 | | t_err("(?-u)[^a]"), |
2826 | | TestError { |
2827 | | kind: hir::ErrorKind::InvalidUtf8, |
2828 | | span: Span::new( |
2829 | | Position::new(5, 1, 6), |
2830 | | Position::new(9, 1, 10) |
2831 | | ), |
2832 | | } |
2833 | | ); |
// Negating `[\s\S]` is expected to yield a class with no ranges, in both
// Unicode and byte mode.
2834 | | #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] |
2835 | | assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); |
2836 | | #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] |
2837 | | assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); |
2838 | | } |
2839 | | |
// Checks that several items written side by side inside one bracketed class
// (including nested bracketed classes) are unioned together, and how that
// union combines with `(?i)` and a leading `^`. Most cases are gated on the
// Unicode table features they need.
2840 | | #[test] |
2841 | | fn class_bracketed_union() { |
2842 | | assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); |
2843 | | #[cfg(feature = "unicode-gencat")] |
2844 | | assert_eq!( |
2845 | | t(r"[a\pZb]"), |
2846 | | hir_union( |
2847 | | hir_uclass(&[('a', 'b')]), |
2848 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2849 | | ) |
2850 | | ); |
2851 | | #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] |
2852 | | assert_eq!( |
2853 | | t(r"[\pZ\p{Greek}]"), |
2854 | | hir_union( |
2855 | | hir_uclass_query(ClassQuery::Binary("greek")), |
2856 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2857 | | ) |
2858 | | ); |
2859 | | #[cfg(all( |
2860 | | feature = "unicode-age", |
2861 | | feature = "unicode-gencat", |
2862 | | feature = "unicode-script" |
2863 | | ))] |
2864 | | assert_eq!( |
2865 | | t(r"[\p{age:3.0}\pZ\p{Greek}]"), |
2866 | | hir_union( |
2867 | | hir_uclass_query(ClassQuery::ByValue { |
2868 | | property_name: "age", |
2869 | | property_value: "3.0", |
2870 | | }), |
2871 | | hir_union( |
2872 | | hir_uclass_query(ClassQuery::Binary("greek")), |
2873 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2874 | | ) |
2875 | | ) |
2876 | | ); |
// Nested brackets contribute to the same union: the expected value is a
// plain union of the four property classes.
2877 | | #[cfg(all( |
2878 | | feature = "unicode-age", |
2879 | | feature = "unicode-gencat", |
2880 | | feature = "unicode-script" |
2881 | | ))] |
2882 | | assert_eq!( |
2883 | | t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), |
2884 | | hir_union( |
2885 | | hir_uclass_query(ClassQuery::ByValue { |
2886 | | property_name: "age", |
2887 | | property_value: "3.0", |
2888 | | }), |
2889 | | hir_union( |
2890 | | hir_uclass_query(ClassQuery::Binary("cyrillic")), |
2891 | | hir_union( |
2892 | | hir_uclass_query(ClassQuery::Binary("greek")), |
2893 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2894 | | ) |
2895 | | ) |
2896 | | ) |
2897 | | ); |
2898 | | |
// Under `(?i)` the expected value is the case fold of the whole union.
2899 | | #[cfg(all( |
2900 | | feature = "unicode-age", |
2901 | | feature = "unicode-case", |
2902 | | feature = "unicode-gencat", |
2903 | | feature = "unicode-script" |
2904 | | ))] |
2905 | | assert_eq!( |
2906 | | t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), |
2907 | | hir_case_fold(hir_union( |
2908 | | hir_uclass_query(ClassQuery::ByValue { |
2909 | | property_name: "age", |
2910 | | property_value: "3.0", |
2911 | | }), |
2912 | | hir_union( |
2913 | | hir_uclass_query(ClassQuery::Binary("greek")), |
2914 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2915 | | ) |
2916 | | )) |
2917 | | ); |
// A leading `^` negates the whole union; combined with `(?i)` the expected
// value folds case first and negates last.
2918 | | #[cfg(all( |
2919 | | feature = "unicode-age", |
2920 | | feature = "unicode-gencat", |
2921 | | feature = "unicode-script" |
2922 | | ))] |
2923 | | assert_eq!( |
2924 | | t(r"[^\p{age:3.0}\pZ\p{Greek}]"), |
2925 | | hir_negate(hir_union( |
2926 | | hir_uclass_query(ClassQuery::ByValue { |
2927 | | property_name: "age", |
2928 | | property_value: "3.0", |
2929 | | }), |
2930 | | hir_union( |
2931 | | hir_uclass_query(ClassQuery::Binary("greek")), |
2932 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2933 | | ) |
2934 | | )) |
2935 | | ); |
2936 | | #[cfg(all( |
2937 | | feature = "unicode-age", |
2938 | | feature = "unicode-case", |
2939 | | feature = "unicode-gencat", |
2940 | | feature = "unicode-script" |
2941 | | ))] |
2942 | | assert_eq!( |
2943 | | t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), |
2944 | | hir_negate(hir_case_fold(hir_union( |
2945 | | hir_uclass_query(ClassQuery::ByValue { |
2946 | | property_name: "age", |
2947 | | property_value: "3.0", |
2948 | | }), |
2949 | | hir_union( |
2950 | | hir_uclass_query(ClassQuery::Binary("greek")), |
2951 | | hir_uclass_query(ClassQuery::Binary("separator")) |
2952 | | ) |
2953 | | ))) |
2954 | | ); |
2955 | | } |
2956 | | |
// Checks classes that contain a nested negated class such as `[a[^c]]`.
// The nested `[^c]` is negated on its own and then unioned with the outer
// items, so `[a[^c]]` is "everything except c" and `[a-c[^c]]` is the
// negation of the empty class.
2957 | | #[test] |
2958 | | fn class_bracketed_nested() { |
2959 | | assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); |
2960 | | assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); |
2961 | | assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[]))); |
2962 | | |
// Negating the outer class as well leaves only what the union did not
// cover: just `c` here.
2963 | | assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); |
2964 | | assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); |
2965 | | |
// Same shapes under `(?i)`; these need the `unicode-case` feature.
2966 | | #[cfg(feature = "unicode-case")] |
2967 | | assert_eq!( |
2968 | | t(r"(?i)[a[^c]]"), |
2969 | | hir_negate(class_case_fold(uclass(&[('c', 'c')]))) |
2970 | | ); |
2971 | | #[cfg(feature = "unicode-case")] |
2972 | | assert_eq!( |
2973 | | t(r"(?i)[a-b[^c]]"), |
2974 | | hir_negate(class_case_fold(uclass(&[('c', 'c')]))) |
2975 | | ); |
2976 | | |
2977 | | #[cfg(feature = "unicode-case")] |
2978 | | assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); |
2979 | | #[cfg(feature = "unicode-case")] |
2980 | | assert_eq!( |
2981 | | t(r"(?i)[^a-b[^c]]"), |
2982 | | hir_uclass(&[('C', 'C'), ('c', 'c')]) |
2983 | | ); |
2984 | | |
// When the outer range already contains `c`, the union covers everything
// and its negation is expected to be a class with no ranges.
2985 | | assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); |
2986 | | #[cfg(feature = "unicode-case")] |
2987 | | assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); |
2988 | | } |
2989 | | |
// Checks the `&&` intersection operator inside bracketed classes. The same
// six patterns are run in four modes: Unicode, `(?-u)` bytes, `(?i)` Unicode
// (needs `unicode-case`) and `(?i-u)` bytes. Escaping corner cases and
// operator precedence are checked at the end.
2990 | | #[test] |
2991 | | fn class_bracketed_intersect() { |
2992 | | assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); |
2993 | | assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); |
2994 | | assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); |
2995 | | assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); |
2996 | | assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); |
2997 | | assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); |
2998 | | assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); |
2999 | | assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); |
3000 | | assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); |
3001 | | |
// Byte-mode variants.
3002 | | assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); |
3003 | | assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); |
3004 | | assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); |
3005 | | assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); |
3006 | | assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); |
3007 | | assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); |
3008 | | |
// Case-insensitive Unicode variants: the expected value is the case fold of
// the intersection result.
3009 | | #[cfg(feature = "unicode-case")] |
3010 | | assert_eq!( |
3011 | | t("(?i)[abc&&b-c]"), |
3012 | | hir_case_fold(hir_uclass(&[('b', 'c')])) |
3013 | | ); |
3014 | | #[cfg(feature = "unicode-case")] |
3015 | | assert_eq!( |
3016 | | t("(?i)[abc&&[b-c]]"), |
3017 | | hir_case_fold(hir_uclass(&[('b', 'c')])) |
3018 | | ); |
3019 | | #[cfg(feature = "unicode-case")] |
3020 | | assert_eq!( |
3021 | | t("(?i)[[abc]&&[b-c]]"), |
3022 | | hir_case_fold(hir_uclass(&[('b', 'c')])) |
3023 | | ); |
3024 | | #[cfg(feature = "unicode-case")] |
3025 | | assert_eq!( |
3026 | | t("(?i)[a-z&&b-y&&c-x]"), |
3027 | | hir_case_fold(hir_uclass(&[('c', 'x')])) |
3028 | | ); |
3029 | | #[cfg(feature = "unicode-case")] |
3030 | | assert_eq!( |
3031 | | t("(?i)[c-da-b&&a-d]"), |
3032 | | hir_case_fold(hir_uclass(&[('a', 'd')])) |
3033 | | ); |
3034 | | #[cfg(feature = "unicode-case")] |
3035 | | assert_eq!( |
3036 | | t("(?i)[a-d&&c-da-b]"), |
3037 | | hir_case_fold(hir_uclass(&[('a', 'd')])) |
3038 | | ); |
3039 | | |
// Case-insensitive byte-mode variants (no Unicode feature required).
3040 | | assert_eq!( |
3041 | | t("(?i-u)[abc&&b-c]"), |
3042 | | hir_case_fold(hir_bclass(&[(b'b', b'c')])) |
3043 | | ); |
3044 | | assert_eq!( |
3045 | | t("(?i-u)[abc&&[b-c]]"), |
3046 | | hir_case_fold(hir_bclass(&[(b'b', b'c')])) |
3047 | | ); |
3048 | | assert_eq!( |
3049 | | t("(?i-u)[[abc]&&[b-c]]"), |
3050 | | hir_case_fold(hir_bclass(&[(b'b', b'c')])) |
3051 | | ); |
3052 | | assert_eq!( |
3053 | | t("(?i-u)[a-z&&b-y&&c-x]"), |
3054 | | hir_case_fold(hir_bclass(&[(b'c', b'x')])) |
3055 | | ); |
3056 | | assert_eq!( |
3057 | | t("(?i-u)[c-da-b&&a-d]"), |
3058 | | hir_case_fold(hir_bclass(&[(b'a', b'd')])) |
3059 | | ); |
3060 | | assert_eq!( |
3061 | | t("(?i-u)[a-d&&c-da-b]"), |
3062 | | hir_case_fold(hir_bclass(&[(b'a', b'd')])) |
3063 | | ); |
3064 | | |
3065 | | // In `[a^]`, `^` does not need to be escaped, so it makes sense that |
3066 | | // `^` is also allowed to be unescaped after `&&`. |
3067 | | assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); |
3068 | | // `]` needs to be escaped after `&&` since it's not at start of class. |
3069 | | assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); |
3070 | | assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); |
3071 | | assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); |
3072 | | assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); |
3073 | | // Test precedence. |
// The expected result has no `z`, so `[^c-g]z` is unioned first and only
// then intersected with `a-w`: juxtaposition binds tighter than `&&`.
3074 | | assert_eq!( |
3075 | | t(r"[a-w&&[^c-g]z]"), |
3076 | | hir_uclass(&[('a', 'b'), ('h', 'w')]) |
3077 | | ); |
3078 | | } |
3079 | | |
// Checks how `&&` intersection interacts with negation, both when the
// leading `^` belongs to the class that contains the intersection and when
// the operands themselves are negated. The first half uses Unicode classes,
// the second half repeats the patterns in `(?-u)` byte mode via `t_bytes`.
3080 | | #[test] |
3081 | | fn class_bracketed_intersect_negate() { |
// The leading `^` applies to the whole intersection `\w&&\d`, so the
// expected value is the negated digit class.
3082 | | #[cfg(feature = "unicode-perl")] |
3083 | | assert_eq!( |
3084 | | t(r"[^\w&&\d]"), |
3085 | | hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) |
3086 | | ); |
3087 | | assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); |
3088 | | #[cfg(feature = "unicode-perl")] |
3089 | | assert_eq!( |
3090 | | t(r"[^[\w&&\d]]"), |
3091 | | hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) |
3092 | | ); |
// Two negations cancel out.
3093 | | #[cfg(feature = "unicode-perl")] |
3094 | | assert_eq!( |
3095 | | t(r"[^[^\w&&\d]]"), |
3096 | | hir_uclass_query(ClassQuery::Binary("digit")) |
3097 | | ); |
// Intersecting "not a word char" with "not a digit" is expected to be just
// "not a word char".
3098 | | #[cfg(feature = "unicode-perl")] |
3099 | | assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); |
3100 | | |
// Byte-mode counterparts; the expected values use the ASCII digit/word
// classes.
3101 | | #[cfg(feature = "unicode-perl")] |
3102 | | assert_eq!( |
3103 | | t_bytes(r"(?-u)[^\w&&\d]"), |
3104 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) |
3105 | | ); |
3106 | | assert_eq!( |
3107 | | t_bytes(r"(?-u)[^[a-z&&a-c]]"), |
3108 | | hir_negate(hir_bclass(&[(b'a', b'c')])) |
3109 | | ); |
3110 | | assert_eq!( |
3111 | | t_bytes(r"(?-u)[^[\w&&\d]]"), |
3112 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) |
3113 | | ); |
3114 | | assert_eq!( |
3115 | | t_bytes(r"(?-u)[^[^\w&&\d]]"), |
3116 | | hir_ascii_bclass(&ast::ClassAsciiKind::Digit) |
3117 | | ); |
3118 | | assert_eq!( |
3119 | | t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), |
3120 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) |
3121 | | ); |
3122 | | } |
3123 | | |
// Checks the `--` difference operator inside bracketed classes, once with
// Unicode classes and once in `(?-u)` byte mode.
3124 | | #[test] |
3125 | | fn class_bracketed_difference() { |
// Letters minus `[:ascii:]`; the expected right operand is the range
// `\0`..=`\x7F`.
3126 | | #[cfg(feature = "unicode-gencat")] |
3127 | | assert_eq!( |
3128 | | t(r"[\pL--[:ascii:]]"), |
3129 | | hir_difference( |
3130 | | hir_uclass_query(ClassQuery::Binary("letter")), |
3131 | | hir_uclass(&[('\0', '\x7F')]) |
3132 | | ) |
3133 | | ); |
3134 | | |
// ASCII letters minus the lowercase ones leaves `A`-`Z`.
3135 | | assert_eq!( |
3136 | | t(r"(?-u)[[:alpha:]--[:lower:]]"), |
3137 | | hir_bclass(&[(b'A', b'Z')]) |
3138 | | ); |
3139 | | } |
3140 | | |
// Checks the `~~` symmetric-difference operator inside bracketed classes:
// the result keeps what is covered by exactly one of the two operands
// (e.g. `[a-g~~c-j]` is `a-b` plus `h-j`).
3141 | | #[test] |
3142 | | fn class_bracketed_symmetric_difference() { |
// Script vs. script-extension Greek: the expected class is spelled out
// range by range below. Needs the `unicode-script` feature.
3143 | | #[cfg(feature = "unicode-script")] |
3144 | | assert_eq!( |
3145 | | t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), |
3146 | | // Class({ |
3147 | | // '·'..='·', |
3148 | | // '\u{300}'..='\u{301}', |
3149 | | // '\u{304}'..='\u{304}', |
3150 | | // '\u{306}'..='\u{306}', |
3151 | | // '\u{308}'..='\u{308}', |
3152 | | // '\u{313}'..='\u{313}', |
3153 | | // '\u{342}'..='\u{342}', |
3154 | | // '\u{345}'..='\u{345}', |
3155 | | // 'ʹ'..='ʹ', |
3156 | | // '\u{1dc0}'..='\u{1dc1}', |
3157 | | // '⁝'..='⁝', |
3158 | | // }) |
3159 | | hir_uclass(&[ |
3160 | | ('·', '·'), |
3161 | | ('\u{0300}', '\u{0301}'), |
3162 | | ('\u{0304}', '\u{0304}'), |
3163 | | ('\u{0306}', '\u{0306}'), |
3164 | | ('\u{0308}', '\u{0308}'), |
3165 | | ('\u{0313}', '\u{0313}'), |
3166 | | ('\u{0342}', '\u{0342}'), |
3167 | | ('\u{0345}', '\u{0345}'), |
3168 | | ('ʹ', 'ʹ'), |
3169 | | ('\u{1DC0}', '\u{1DC1}'), |
3170 | | ('⁝', '⁝'), |
3171 | | ]) |
3172 | | ); |
3173 | | assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')])); |
3174 | | |
// Same ranges in byte mode.
3175 | | assert_eq!( |
3176 | | t(r"(?-u)[a-g~~c-j]"), |
3177 | | hir_bclass(&[(b'a', b'b'), (b'h', b'j')]) |
3178 | | ); |
3179 | | } |
3180 | | |
// Checks verbose mode (`(?x)`): whitespace and `# comment` text are ignored
// even between the pieces of a single construct -- after an escape, inside
// the braces of `\x{..}` / `\p{..}`, between hex digits, and inside a
// counted repetition `{5,10}`. An escaped space (`\ `) stays a literal
// space. Several patterns below are multi-line raw strings; their line
// breaks are part of the pattern being tested.
3181 | | #[test] |
3182 | | fn ignore_whitespace() { |
// `\12` is expected to translate to a newline and the space before `3` is
// dropped.
3183 | | assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); |
3184 | | assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); |
3185 | | assert_eq!( |
3186 | | t(r"(?x)\x # comment |
3187 | | { # comment |
3188 | | 53 # comment |
3189 | | } #comment"), |
3190 | | hir_lit("S") |
3191 | | ); |
3192 | | |
// Brace-less hex escapes: whitespace may precede the digits or even split
// them.
3193 | | assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); |
3194 | | assert_eq!( |
3195 | | t(r"(?x)\x # comment |
3196 | | 53 # comment"), |
3197 | | hir_lit("S") |
3198 | | ); |
3199 | | assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); |
3200 | | |
3201 | | #[cfg(feature = "unicode-gencat")] |
3202 | | assert_eq!( |
3203 | | t(r"(?x)\p # comment |
3204 | | { # comment |
3205 | | Separator # comment |
3206 | | } # comment"), |
3207 | | hir_uclass_query(ClassQuery::Binary("separator")) |
3208 | | ); |
3209 | | |
// A counted repetition spread over several commented lines is still
// `a{5,10}`.
3210 | | assert_eq!( |
3211 | | t(r"(?x)a # comment |
3212 | | { # comment |
3213 | | 5 # comment |
3214 | | , # comment |
3215 | | 10 # comment |
3216 | | } # comment"), |
3217 | | hir_range(true, 5, Some(10), hir_lit("a")) |
3218 | | ); |
3219 | | |
// The escaped space survives; the trailing comment does not.
3220 | | assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); |
3221 | | } |
3222 | | |
// Checks the `is_utf8()` property reported for patterns translated with
// `props_bytes`. In Unicode mode `\xFF` and `[^a]` still count as UTF-8;
// with `(?-u)` the same constructs do not. Word-boundary assertions count
// as UTF-8 in both modes.
3223 | | #[test] |
3224 | | fn analysis_is_utf8() { |
3225 | | // Positive examples. |
3226 | | assert!(props_bytes(r"a").is_utf8()); |
3227 | | assert!(props_bytes(r"ab").is_utf8()); |
3228 | | assert!(props_bytes(r"(?-u)a").is_utf8()); |
3229 | | assert!(props_bytes(r"(?-u)ab").is_utf8()); |
3230 | | assert!(props_bytes(r"\xFF").is_utf8()); |
3231 | | assert!(props_bytes(r"\xFF\xFF").is_utf8()); |
3232 | | assert!(props_bytes(r"[^a]").is_utf8()); |
3233 | | assert!(props_bytes(r"[^a][^a]").is_utf8()); |
3234 | | assert!(props_bytes(r"\b").is_utf8()); |
3235 | | assert!(props_bytes(r"\B").is_utf8()); |
3236 | | assert!(props_bytes(r"(?-u)\b").is_utf8()); |
3237 | | assert!(props_bytes(r"(?-u)\B").is_utf8()); |
3238 | | |
3239 | | // Negative examples. |
3240 | | assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); |
3241 | | assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); |
3242 | | assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); |
3243 | | assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); |
3244 | | } |
3245 | | |
// Checks `explicit_captures_len()`: non-capturing and flag groups count as
// zero, while every capturing group counts once -- named, empty, repeated,
// nested or inside an alternation branch alike.
3246 | | #[test] |
3247 | | fn analysis_captures_len() { |
3248 | | assert_eq!(0, props(r"a").explicit_captures_len()); |
3249 | | assert_eq!(0, props(r"(?:a)").explicit_captures_len()); |
3250 | | assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len()); |
3251 | | assert_eq!(0, props(r"(?i-u)a").explicit_captures_len()); |
3252 | | assert_eq!(1, props(r"(a)").explicit_captures_len()); |
3253 | | assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len()); |
3254 | | assert_eq!(1, props(r"()").explicit_captures_len()); |
3255 | | assert_eq!(1, props(r"()a").explicit_captures_len()); |
3256 | | assert_eq!(1, props(r"(a)+").explicit_captures_len()); |
3257 | | assert_eq!(2, props(r"(a)(b)").explicit_captures_len()); |
3258 | | assert_eq!(2, props(r"(a)|(b)").explicit_captures_len()); |
3259 | | assert_eq!(2, props(r"((a))").explicit_captures_len()); |
// The group is still counted when its content is an empty intersection.
3260 | | assert_eq!(1, props(r"([a&&b])").explicit_captures_len()); |
3261 | | } |
3262 | | |
// Checks `static_explicit_captures_len()`. Judging by the cases below, the
// result is `Some(n)` when the same number `n` of capture groups takes part
// no matter how the pattern matches, and `None` when that number can vary
// (a group in only one alternation branch, or under `?`, `*`, `{0,}`).
3264 | | #[test] |
3265 | | fn analysis_static_captures_len() { |
// Shorthand: translate the pattern and read the property under test.
3266 | | let len = |pattern| props(pattern).static_explicit_captures_len(); |
3267 | | assert_eq!(Some(0), len(r"")); |
3268 | | assert_eq!(Some(0), len(r"foo|bar")); |
3269 | | assert_eq!(None, len(r"(foo)|bar")); |
3270 | | assert_eq!(None, len(r"foo|(bar)")); |
3271 | | assert_eq!(Some(1), len(r"(foo|bar)")); |
3272 | | assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); |
// Alternation branches with an equal number of groups give that number,
// not the total across branches.
3273 | | assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); |
3274 | | assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); |
3275 | | assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); |
3276 | | assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); |
3277 | | assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); |
3278 | | assert_eq!(None, len(r"(a)(b)(extra)?")); |
3279 | | assert_eq!(Some(1), len(r"(foo)|(bar)")); |
3280 | | assert_eq!(Some(2), len(r"(foo)(bar)")); |
3281 | | assert_eq!(Some(2), len(r"(foo)+(bar)")); |
3282 | | assert_eq!(None, len(r"(foo)*(bar)")); |
// Repetition operators applied to a group, including stacked operators.
3283 | | assert_eq!(Some(0), len(r"(foo)?{0}")); |
3284 | | assert_eq!(None, len(r"(foo)?{1}")); |
3285 | | assert_eq!(Some(1), len(r"(foo){1}")); |
3286 | | assert_eq!(Some(1), len(r"(foo){1,}")); |
3287 | | assert_eq!(Some(1), len(r"(foo){1,}?")); |
3288 | | assert_eq!(None, len(r"(foo){1,}??")); |
3289 | | assert_eq!(None, len(r"(foo){0,}")); |
3290 | | assert_eq!(Some(1), len(r"(foo)(?:bar)")); |
3291 | | assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); |
3292 | | assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)")); |
3293 | | assert_eq!( |
3294 | | Some(2), |
3295 | | len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) |
3296 | | ); |
3297 | | } |
3297 | | |
// For patterns made up only of look-around assertions (`\b`, `\B`, `^`, `$`,
// `\A`, `\z`, and combinations of them), checks that the look set is
// non-empty and that `minimum_len()` is `Some(0)`. The single negative
// example, `^a`, also has a non-empty look set but a minimum length of 1.
3298 | | #[test] |
3299 | | fn analysis_is_all_assertions() { |
3300 | | // Positive examples. |
3301 | | let p = props(r"\b"); |
3302 | | assert!(!p.look_set().is_empty()); |
3303 | | assert_eq!(p.minimum_len(), Some(0)); |
3304 | | |
3305 | | let p = props(r"\B"); |
3306 | | assert!(!p.look_set().is_empty()); |
3307 | | assert_eq!(p.minimum_len(), Some(0)); |
3308 | | |
3309 | | let p = props(r"^"); |
3310 | | assert!(!p.look_set().is_empty()); |
3311 | | assert_eq!(p.minimum_len(), Some(0)); |
3312 | | |
3313 | | let p = props(r"$"); |
3314 | | assert!(!p.look_set().is_empty()); |
3315 | | assert_eq!(p.minimum_len(), Some(0)); |
3316 | | |
3317 | | let p = props(r"\A"); |
3318 | | assert!(!p.look_set().is_empty()); |
3319 | | assert_eq!(p.minimum_len(), Some(0)); |
3320 | | |
3321 | | let p = props(r"\z"); |
3322 | | assert!(!p.look_set().is_empty()); |
3323 | | assert_eq!(p.minimum_len(), Some(0)); |
3324 | | |
// Assertions combined by concatenation, alternation, grouping and
// repetition.
3325 | | let p = props(r"$^\z\A\b\B"); |
3326 | | assert!(!p.look_set().is_empty()); |
3327 | | assert_eq!(p.minimum_len(), Some(0)); |
3328 | | |
3329 | | let p = props(r"$|^|\z|\A|\b|\B"); |
3330 | | assert!(!p.look_set().is_empty()); |
3331 | | assert_eq!(p.minimum_len(), Some(0)); |
3332 | | |
3333 | | let p = props(r"^$|$^"); |
3334 | | assert!(!p.look_set().is_empty()); |
3335 | | assert_eq!(p.minimum_len(), Some(0)); |
3336 | | |
3337 | | let p = props(r"((\b)+())*^"); |
3338 | | assert!(!p.look_set().is_empty()); |
3339 | | assert_eq!(p.minimum_len(), Some(0)); |
3340 | | |
3341 | | // Negative examples. |
3342 | | let p = props(r"^a"); |
3343 | | assert!(!p.look_set().is_empty()); |
3344 | | assert_eq!(p.minimum_len(), Some(1)); |
3345 | | } |
3346 | | |
// Checks `look_set_prefix_any()` on a pattern that starts with the
// alternation `\b|_`: although only one of the two leading branches is a
// word boundary, the set is expected to contain `Look::WordAscii` (the
// pattern runs under `(?-u)`).
3347 | | #[test] |
3348 | | fn analysis_look_set_prefix_any() { |
3349 | | let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"); |
3350 | | assert!(p.look_set_prefix_any().contains(Look::WordAscii)); |
3351 | | } |
3352 | | |
// Checks which patterns are reported as anchored: `is_start` tests that
// `look_set_prefix()` contains `Look::Start`, `is_end` tests that
// `look_set_suffix()` contains `Look::End`. Positive cases come in
// start/end pairs; the negative cases cover multi-line anchors, anchors
// that are not at the relevant edge, anchors present in only one
// alternation branch, and anchors under a repetition that may match zero
// times.
3353 | | #[test] |
3354 | | fn analysis_is_anchored() { |
3355 | | let is_start = |p| props(p).look_set_prefix().contains(Look::Start); |
3356 | | let is_end = |p| props(p).look_set_suffix().contains(Look::End); |
3357 | | |
3358 | | // Positive examples. |
3359 | | assert!(is_start(r"^")); |
3360 | | assert!(is_end(r"$")); |
3361 | | |
3362 | | assert!(is_start(r"^^")); |
3363 | | assert!(is_end(r"$$")); |
3364 | | |
3365 | | assert!(is_start(r"^$")); |
3366 | | assert!(is_end(r"^$")); |
3367 | | |
3368 | | assert!(is_start(r"^foo")); |
3369 | | assert!(is_end(r"foo$")); |
3370 | | |
3371 | | assert!(is_start(r"^foo|^bar")); |
3372 | | assert!(is_end(r"foo$|bar$")); |
3373 | | |
3374 | | assert!(is_start(r"^(foo|bar)")); |
3375 | | assert!(is_end(r"(foo|bar)$")); |
3376 | | |
3377 | | assert!(is_start(r"^+")); |
3378 | | assert!(is_end(r"$+")); |
3379 | | assert!(is_start(r"^++")); |
3380 | | assert!(is_end(r"$++")); |
3381 | | assert!(is_start(r"(^)+")); |
3382 | | assert!(is_end(r"($)+")); |
3383 | | |
// An anchor still counts when only zero-width assertions separate it from
// the edge, so `$^` is both start- and end-anchored.
3384 | | assert!(is_start(r"$^")); |
3385 | | assert!(is_end(r"$^")); |
3386 | | assert!(is_start(r"$^|^$")); |
3387 | | assert!(is_end(r"$^|^$")); |
3388 | | |
3389 | | assert!(is_start(r"\b^")); |
3390 | | assert!(is_end(r"$\b")); |
3391 | | assert!(is_start(r"^(?m:^)")); |
3392 | | assert!(is_end(r"(?m:$)$")); |
3393 | | assert!(is_start(r"(?m:^)^")); |
3394 | | assert!(is_end(r"$(?m:$)")); |
3395 | | |
3396 | | // Negative examples. |
3397 | | assert!(!is_start(r"(?m)^")); |
3398 | | assert!(!is_end(r"(?m)$")); |
3399 | | assert!(!is_start(r"(?m:^$)|$^")); |
3400 | | assert!(!is_end(r"(?m:^$)|$^")); |
3401 | | assert!(!is_start(r"$^|(?m:^$)")); |
3402 | | assert!(!is_end(r"$^|(?m:^$)")); |
3403 | | |
3404 | | assert!(!is_start(r"a^")); |
3405 | | assert!(!is_start(r"$a")); |
3406 | | |
3407 | | assert!(!is_end(r"a^")); |
3408 | | assert!(!is_end(r"$a")); |
3409 | | |
// Every alternation branch must carry the anchor.
3410 | | assert!(!is_start(r"^foo|bar")); |
3411 | | assert!(!is_end(r"foo|bar$")); |
3412 | | |
// A repetition that can match zero times drops the anchor.
3413 | | assert!(!is_start(r"^*")); |
3414 | | assert!(!is_end(r"$*")); |
3415 | | assert!(!is_start(r"^*+")); |
3416 | | assert!(!is_end(r"$*+")); |
3417 | | assert!(!is_start(r"^+*")); |
3418 | | assert!(!is_end(r"$+*")); |
3419 | | assert!(!is_start(r"(^)*")); |
3420 | | assert!(!is_end(r"($)*")); |
3421 | | } |
3422 | | |
// Like `analysis_is_anchored`, but looks at the whole `look_set()` instead
// of the prefix/suffix sets: `^` and `\A` put `Look::Start` into the set,
// `$` and `\z` put `Look::End` into it. Multi-line `(?m)^` / `(?m)$` do not,
// and neither anchor implies the other.
3423 | | #[test] |
3424 | | fn analysis_is_any_anchored() { |
3425 | | let is_start = |p| props(p).look_set().contains(Look::Start); |
3426 | | let is_end = |p| props(p).look_set().contains(Look::End); |
3427 | | |
3428 | | // Positive examples. |
3429 | | assert!(is_start(r"^")); |
3430 | | assert!(is_end(r"$")); |
3431 | | assert!(is_start(r"\A")); |
3432 | | assert!(is_end(r"\z")); |
3433 | | |
3434 | | // Negative examples. |
3435 | | assert!(!is_start(r"(?m)^")); |
3436 | | assert!(!is_end(r"(?m)$")); |
3437 | | assert!(!is_start(r"$")); |
3438 | | assert!(!is_end(r"^")); |
3439 | | } |
3440 | | |
// Checks which patterns can match the empty string, using
// `minimum_len() == Some(0)` (via `props_bytes`) as the criterion.
3441 | | #[test] |
3442 | | fn analysis_can_empty() { |
3443 | | // Positive examples. |
3444 | | let assert_empty = |
3445 | | |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); |
3446 | | assert_empty(r""); |
3447 | | assert_empty(r"()"); |
3448 | | assert_empty(r"()*"); |
3449 | | assert_empty(r"()+"); |
3450 | | assert_empty(r"()?"); |
3451 | | assert_empty(r"a*"); |
3452 | | assert_empty(r"a?"); |
3453 | | assert_empty(r"a{0}"); |
3454 | | assert_empty(r"a{0,}"); |
3455 | | assert_empty(r"a{0,1}"); |
3456 | | assert_empty(r"a{0,10}"); |
3457 | | #[cfg(feature = "unicode-gencat")] |
3458 | | assert_empty(r"\pL*"); |
3459 | | assert_empty(r"a*|b"); |
3460 | | assert_empty(r"b|a*"); |
3461 | | assert_empty(r"a|"); |
3462 | | assert_empty(r"|a"); |
3463 | | assert_empty(r"a||b"); |
3464 | | assert_empty(r"a*a?(abcd)*"); |
// Look-around assertions are zero-width.
3465 | | assert_empty(r"^"); |
3466 | | assert_empty(r"$"); |
3467 | | assert_empty(r"(?m)^"); |
3468 | | assert_empty(r"(?m)$"); |
3469 | | assert_empty(r"\A"); |
3470 | | assert_empty(r"\z"); |
3471 | | assert_empty(r"\B"); |
3472 | | assert_empty(r"(?-u)\B"); |
3473 | | assert_empty(r"\b"); |
3474 | | assert_empty(r"(?-u)\b"); |
3475 | | |
3476 | | // Negative examples. |
// Note the weaker check here: the minimum length only has to differ from
// `Some(0)`; the exact value is not pinned.
3477 | | let assert_non_empty = |
3478 | | |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); |
3479 | | assert_non_empty(r"a+"); |
3480 | | assert_non_empty(r"a{1}"); |
3481 | | assert_non_empty(r"a{1,}"); |
3482 | | assert_non_empty(r"a{1,2}"); |
3483 | | assert_non_empty(r"a{1,10}"); |
3484 | | assert_non_empty(r"b|a"); |
3485 | | assert_non_empty(r"a*a+(abcd)*"); |
// Classes that contain nothing must not be reported as matching the empty
// string either.
3486 | | #[cfg(feature = "unicode-gencat")] |
3487 | | assert_non_empty(r"\P{any}"); |
3488 | | assert_non_empty(r"[a--a]"); |
3489 | | assert_non_empty(r"[a&&b]"); |
3490 | | } |
3491 | | |
// Checks the `is_literal()` property: plain character sequences count as
// literals, also when wrapped in non-capturing groups, prefixed by a flag
// group, or written as a one-character class. The empty pattern,
// assertions, alternations, capture groups, repetitions and multi-character
// classes do not.
3492 | | #[test] |
3493 | | fn analysis_is_literal() { |
3494 | | // Positive examples. |
3495 | | assert!(props(r"a").is_literal()); |
3496 | | assert!(props(r"ab").is_literal()); |
3497 | | assert!(props(r"abc").is_literal()); |
3498 | | assert!(props(r"(?m)abc").is_literal()); |
3499 | | assert!(props(r"(?:a)").is_literal()); |
3500 | | assert!(props(r"foo(?:a)").is_literal()); |
3501 | | assert!(props(r"(?:a)foo").is_literal()); |
3502 | | assert!(props(r"[a]").is_literal()); |
3503 | | |
3504 | | // Negative examples. |
3505 | | assert!(!props(r"").is_literal()); |
3506 | | assert!(!props(r"^").is_literal()); |
3507 | | assert!(!props(r"a|b").is_literal()); |
3508 | | assert!(!props(r"(a)").is_literal()); |
3509 | | assert!(!props(r"a+").is_literal()); |
3510 | | assert!(!props(r"foo(a)").is_literal()); |
3511 | | assert!(!props(r"(a)foo").is_literal()); |
3512 | | assert!(!props(r"[ab]").is_literal()); |
3513 | | } |
3514 | | |
// Checks the `is_alternation_literal()` property: a single literal or an
// alternation whose branches are all literals qualifies; anything involving
// assertions, capture groups, repetitions or classes does not.
3515 | | #[test] |
3516 | | fn analysis_is_alternation_literal() { |
3517 | | // Positive examples. |
3518 | | assert!(props(r"a").is_alternation_literal()); |
3519 | | assert!(props(r"ab").is_alternation_literal()); |
3520 | | assert!(props(r"abc").is_alternation_literal()); |
3521 | | assert!(props(r"(?m)abc").is_alternation_literal()); |
3522 | | assert!(props(r"foo|bar").is_alternation_literal()); |
3523 | | assert!(props(r"foo|bar|baz").is_alternation_literal()); |
3524 | | assert!(props(r"[a]").is_alternation_literal()); |
3525 | | assert!(props(r"(?:ab)|cd").is_alternation_literal()); |
3526 | | assert!(props(r"ab|(?:cd)").is_alternation_literal()); |
3527 | | |
3528 | | // Negative examples. |
3529 | | assert!(!props(r"").is_alternation_literal()); |
3530 | | assert!(!props(r"^").is_alternation_literal()); |
3531 | | assert!(!props(r"(a)").is_alternation_literal()); |
3532 | | assert!(!props(r"a+").is_alternation_literal()); |
3533 | | assert!(!props(r"foo(a)").is_alternation_literal()); |
3534 | | assert!(!props(r"(a)foo").is_alternation_literal()); |
3535 | | assert!(!props(r"[ab]").is_alternation_literal()); |
3536 | | assert!(!props(r"[ab]|b").is_alternation_literal()); |
3537 | | assert!(!props(r"a|[ab]").is_alternation_literal()); |
3538 | | assert!(!props(r"(a)|b").is_alternation_literal()); |
3539 | | assert!(!props(r"a|(b)").is_alternation_literal()); |
// Alternations of single characters are NOT alternation literals, unlike
// `foo|bar` in the positive examples.
// NOTE(review): presumably the translator turns these into a character
// class, and the last case into something with a shared part factored
// out -- confirm against the alternation constructor.
3540 | | assert!(!props(r"a|b").is_alternation_literal()); |
3541 | | assert!(!props(r"a|b|c").is_alternation_literal()); |
3542 | | assert!(!props(r"[a]|b").is_alternation_literal()); |
3543 | | assert!(!props(r"a|[b]").is_alternation_literal()); |
3544 | | assert!(!props(r"(?:a)|b").is_alternation_literal()); |
3545 | | assert!(!props(r"a|(?:b)").is_alternation_literal()); |
3546 | | assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal()); |
3547 | | } |
3548 | | |
// This tests that the smart `Hir::repetition` constructor does some basic
// simplifications: a `{0}` repetition translates to the empty expression,
// a `{1}` repetition translates to its operand, and a counted repetition of
// the `\B` look-around translates to just the look-around.
#[test]
fn smart_repetition() {
    let cases = [
        (r"a{0}", Hir::empty()),
        (r"a{1}", hir_lit("a")),
        (r"\B{32111}", hir_look(hir::Look::WordUnicodeNegate)),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
3557 | | |
// This tests that the smart `Hir::concat` constructor simplifies the given
// expressions in the way we expect: empty concatenations translate to the
// empty expression, and adjacent literals (including those coming from
// nested non-capturing groups) are merged into a single literal.
#[test]
fn smart_concat() {
    // Expected translation for both patterns in which a `^` splits an
    // otherwise contiguous run of literals.
    let split_by_start = || {
        hir_cat(vec![
            hir_lit("foobar"),
            hir_look(hir::Look::Start),
            hir_lit("bazquux"),
        ])
    };

    let cases = [
        ("", Hir::empty()),
        ("(?:)", Hir::empty()),
        ("abc", hir_lit("abc")),
        ("(?:foo)(?:bar)", hir_lit("foobar")),
        ("quux(?:foo)(?:bar)baz", hir_lit("quuxfoobarbaz")),
        ("foo(?:bar^baz)quux", split_by_start()),
        ("foo(?:ba(?:r^b)az)quux", split_by_start()),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
3584 | | |
// This tests that the smart `Hir::alternation` constructor simplifies the
// given expressions in the way we expect: nested alternations are flattened,
// an alternation of single characters becomes a character class, and a
// prefix shared by every branch is lifted out of the alternation.
#[test]
fn smart_alternation() {
    // The `[A-Z]` class that appears in the prefix-lifting cases.
    let upper = || hir_uclass(&[('A', 'Z')]);

    let cases = [
        (
            "(?:foo)|(?:bar)",
            hir_alt(vec![hir_lit("foo"), hir_lit("bar")]),
        ),
        (
            "quux|(?:abc|def|xyz)|baz",
            hir_alt(vec![
                hir_lit("quux"),
                hir_lit("abc"),
                hir_lit("def"),
                hir_lit("xyz"),
                hir_lit("baz"),
            ]),
        ),
        (
            "quux|(?:abc|(?:def|mno)|xyz)|baz",
            hir_alt(vec![
                hir_lit("quux"),
                hir_lit("abc"),
                hir_lit("def"),
                hir_lit("mno"),
                hir_lit("xyz"),
                hir_lit("baz"),
            ]),
        ),
        (
            "a|b|c|d|e|f|x|y|z",
            hir_uclass(&[('a', 'f'), ('x', 'z')]),
        ),
        // The remaining cases test that we lift common prefixes out of an
        // alternation.
        (
            "[A-Z]foo|[A-Z]quux",
            hir_cat(vec![
                upper(),
                hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
            ]),
        ),
        (
            "[A-Z][A-Z]|[A-Z]quux",
            hir_cat(vec![
                upper(),
                hir_alt(vec![upper(), hir_lit("quux")]),
            ]),
        ),
        (
            "[A-Z][A-Z]|[A-Z][A-Z]quux",
            hir_cat(vec![
                upper(),
                upper(),
                hir_alt(vec![Hir::empty(), hir_lit("quux")]),
            ]),
        ),
        (
            "[A-Z]foo|[A-Z]foobar",
            hir_cat(vec![
                upper(),
                hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
            ]),
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
3649 | | |
// Regression test: a hand-built AST consisting of an alternation whose only
// branch is an empty concatenation must translate to the empty HIR.
#[test]
fn regression_alt_empty_concat() {
    use crate::ast::{self, Ast};

    let span = Span::splat(Position::new(0, 0, 0));
    let empty_concat = Ast::concat(ast::Concat { span, asts: vec![] });
    let ast = Ast::alternation(ast::Alternation {
        span,
        asts: vec![empty_concat],
    });

    let mut translator = Translator::new();
    let outcome = translator.translate("", &ast);
    assert_eq!(Ok(Hir::empty()), outcome);
}
3663 | | |
// Regression test: a hand-built AST consisting of a concatenation that wraps
// an alternation with no branches at all must translate to `Hir::fail()`.
#[test]
fn regression_empty_alt() {
    use crate::ast::{self, Ast};

    let span = Span::splat(Position::new(0, 0, 0));
    let empty_alternation =
        Ast::alternation(ast::Alternation { span, asts: vec![] });
    let ast = Ast::concat(ast::Concat {
        span,
        asts: vec![empty_alternation],
    });

    let mut translator = Translator::new();
    let outcome = translator.translate("", &ast);
    assert_eq!(Ok(Hir::fail()), outcome);
}
3680 | | |
// Regression test: a hand-built AST consisting of a concatenation that wraps
// an alternation with exactly one branch (a dot) must translate to just
// that branch.
#[test]
fn regression_singleton_alt() {
    use crate::ast::{self, Ast};
    use crate::hir::Dot;

    let span = Span::splat(Position::new(0, 0, 0));
    let single_branch = Ast::alternation(ast::Alternation {
        span,
        asts: vec![Ast::dot(span)],
    });
    let ast = Ast::concat(ast::Concat {
        span,
        asts: vec![single_branch],
    });

    let mut translator = Translator::new();
    let outcome = translator.translate("", &ast);
    assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), outcome);
}
3700 | | |
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
//
// Parses the fuzzer-discovered pattern with whitespace-insensitive mode
// enabled and octal syntax disabled, translates it with an explicitly
// configured translator, and checks the exact HIR that comes out.
#[test]
fn regression_fuzz_match() {
    let pat = "[(\u{6} \0-\u{afdf5}] \0 ";

    let mut parser_builder = ParserBuilder::new();
    parser_builder.octal(false).ignore_whitespace(true);
    let ast = parser_builder.build().parse(pat).unwrap();

    let mut translator_builder = TranslatorBuilder::new();
    translator_builder
        .utf8(true)
        .case_insensitive(false)
        .multi_line(false)
        .dot_matches_new_line(false)
        .swap_greed(true)
        .unicode(true);
    let hir = translator_builder.build().translate(pat, &ast).unwrap();

    let expected = Hir::concat(vec![
        hir_uclass(&[('\0', '\u{afdf5}')]),
        hir_lit("\0"),
    ]);
    assert_eq!(hir, expected);
}
3729 | | |
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
//
// Only compiled when the `unicode` feature is enabled. The translated HIR
// is irrelevant here; the call just has to complete without panicking.
#[cfg(feature = "unicode")]
#[test]
fn regression_fuzz_difference1() {
    let pattern = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
    // Shouldn't panic.
    drop(t(pattern));
}
3737 | | |
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
//
// The translated HIR is irrelevant here; the call on this fuzzer-discovered
// pattern just has to complete without panicking.
#[test]
fn regression_fuzz_char_decrement1() {
    let pattern = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
    // Shouldn't panic.
    drop(t(pattern));
}
3744 | | } |