/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.8.8/src/hir/translate.rs
Line | Count | Source |
1 | | /*! |
2 | | Defines a translator that converts an `Ast` to an `Hir`. |
3 | | */ |
4 | | |
5 | | use core::cell::{Cell, RefCell}; |
6 | | |
7 | | use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; |
8 | | |
9 | | use crate::{ |
10 | | ast::{self, Ast, Span, Visitor}, |
11 | | either::Either, |
12 | | hir::{self, Error, ErrorKind, Hir, HirKind}, |
13 | | unicode::{self, ClassQuery}, |
14 | | }; |
15 | | |
16 | | type Result<T> = core::result::Result<T, Error>; |
17 | | |
18 | | /// A builder for constructing an AST->HIR translator. |
19 | | #[derive(Clone, Debug)] |
20 | | pub struct TranslatorBuilder { |
21 | | utf8: bool, |
22 | | line_terminator: u8, |
23 | | flags: Flags, |
24 | | } |
25 | | |
26 | | impl Default for TranslatorBuilder { |
27 | 0 | fn default() -> TranslatorBuilder { |
28 | 0 | TranslatorBuilder::new() |
29 | 0 | } |
30 | | } |
31 | | |
32 | | impl TranslatorBuilder { |
33 | | /// Create a new translator builder with a default configuration. |
34 | 0 | pub fn new() -> TranslatorBuilder { |
35 | 0 | TranslatorBuilder { |
36 | 0 | utf8: true, |
37 | 0 | line_terminator: b'\n', |
38 | 0 | flags: Flags::default(), |
39 | 0 | } |
40 | 0 | } |
41 | | |
42 | | /// Build a translator using the current configuration. |
43 | 0 | pub fn build(&self) -> Translator { |
44 | 0 | Translator { |
45 | 0 | stack: RefCell::new(vec![]), |
46 | 0 | flags: Cell::new(self.flags), |
47 | 0 | utf8: self.utf8, |
48 | 0 | line_terminator: self.line_terminator, |
49 | 0 | } |
50 | 0 | } |
51 | | |
52 | | /// When disabled, translation will permit the construction of a regular |
53 | | /// expression that may match invalid UTF-8. |
54 | | /// |
55 | | /// When enabled (the default), the translator is guaranteed to produce an |
56 | | /// expression that, for non-empty matches, will only ever produce spans |
57 | | /// that are entirely valid UTF-8 (otherwise, the translator will return an |
58 | | /// error). |
59 | | /// |
60 | | /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even |
61 | | /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete |
62 | | /// syntax) will be allowed even though they can produce matches that split |
63 | | /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" |
64 | | /// matches, and it is expected that the regex engine itself must handle |
65 | | /// these cases if necessary (perhaps by suppressing any zero-width matches |
66 | | /// that split a codepoint). |
67 | 0 | pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { |
68 | 0 | self.utf8 = yes; |
69 | 0 | self |
70 | 0 | } |
71 | | |
72 | | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. |
73 | | /// |
74 | | /// Namely, instead of `.` (by default) matching everything except for `\n`, |
75 | | /// this will cause `.` to match everything except for the byte given. |
76 | | /// |
77 | | /// If `.` is used in a context where Unicode mode is enabled and this byte |
78 | | /// isn't ASCII, then an error will be returned. When Unicode mode is |
79 | | /// disabled, then any byte is permitted, but will return an error if UTF-8 |
80 | | /// mode is enabled and it is a non-ASCII byte. |
81 | | /// |
82 | | /// In short, any ASCII value for a line terminator is always okay. But a |
83 | | /// non-ASCII byte might result in an error depending on whether Unicode |
84 | | /// mode or UTF-8 mode are enabled. |
85 | | /// |
86 | | /// Note that if `R` mode is enabled then it always takes precedence and |
87 | | /// the line terminator will be treated as `\r` and `\n` simultaneously. |
88 | | /// |
89 | | /// Note also that this *doesn't* impact the look-around assertions |
90 | | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional |
91 | | /// configuration in the regex engine itself. |
92 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { |
93 | 0 | self.line_terminator = byte; |
94 | 0 | self |
95 | 0 | } |
96 | | |
97 | | /// Enable or disable the case insensitive flag (`i`) by default. |
98 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { |
99 | 0 | self.flags.case_insensitive = if yes { Some(true) } else { None }; |
100 | 0 | self |
101 | 0 | } |
102 | | |
103 | | /// Enable or disable the multi-line matching flag (`m`) by default. |
104 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { |
105 | 0 | self.flags.multi_line = if yes { Some(true) } else { None }; |
106 | 0 | self |
107 | 0 | } |
108 | | |
109 | | /// Enable or disable the "dot matches any character" flag (`s`) by |
110 | | /// default. |
111 | 0 | pub fn dot_matches_new_line( |
112 | 0 | &mut self, |
113 | 0 | yes: bool, |
114 | 0 | ) -> &mut TranslatorBuilder { |
115 | 0 | self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; |
116 | 0 | self |
117 | 0 | } |
118 | | |
119 | | /// Enable or disable the CRLF mode flag (`R`) by default. |
120 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { |
121 | 0 | self.flags.crlf = if yes { Some(true) } else { None }; |
122 | 0 | self |
123 | 0 | } |
124 | | |
125 | | /// Enable or disable the "swap greed" flag (`U`) by default. |
126 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { |
127 | 0 | self.flags.swap_greed = if yes { Some(true) } else { None }; |
128 | 0 | self |
129 | 0 | } |
130 | | |
131 | | /// Enable or disable the Unicode flag (`u`) by default. |
132 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { |
133 | 0 | self.flags.unicode = if yes { None } else { Some(false) }; |
134 | 0 | self |
135 | 0 | } |
136 | | } |
137 | | |
138 | | /// A translator maps abstract syntax to a high level intermediate |
139 | | /// representation. |
140 | | /// |
141 | | /// A translator may benefit from reuse. That is, a translator can translate |
142 | | /// many abstract syntax trees. |
143 | | /// |
144 | | /// A `Translator` can be configured in more detail via a |
145 | | /// [`TranslatorBuilder`]. |
146 | | #[derive(Clone, Debug)] |
147 | | pub struct Translator { |
148 | | /// Our call stack, but on the heap. |
149 | | stack: RefCell<Vec<HirFrame>>, |
150 | | /// The current flag settings. |
151 | | flags: Cell<Flags>, |
152 | | /// Whether HIR must match only valid UTF-8 (if false, arbitrary bytes may match). |
153 | | utf8: bool, |
154 | | /// The line terminator to use for `.`. |
155 | | line_terminator: u8, |
156 | | } |
157 | | |
158 | | impl Translator { |
159 | | /// Create a new translator using the default configuration. |
160 | 0 | pub fn new() -> Translator { |
161 | 0 | TranslatorBuilder::new().build() |
162 | 0 | } |
163 | | |
164 | | /// Translate the given abstract syntax tree (AST) into a high level |
165 | | /// intermediate representation (HIR). |
166 | | /// |
167 | | /// If there was a problem doing the translation, then an HIR-specific |
168 | | /// error is returned. |
169 | | /// |
170 | | /// The original pattern string used to produce the `Ast` *must* also be |
171 | | /// provided. The translator does not use the pattern string during any |
172 | | /// correct translation, but it is used for error reporting. |
173 | 0 | pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> { |
174 | 0 | ast::visit(ast, TranslatorI::new(self, pattern)) |
175 | 0 | } |
176 | | } |
177 | | |
178 | | /// An HirFrame is a single stack frame, represented explicitly, which is |
179 | | /// created for each item in the Ast that we traverse. |
180 | | /// |
181 | | /// Note that technically, this type doesn't represent our entire stack |
182 | | /// frame. In particular, the Ast visitor represents any state associated with |
183 | | /// traversing the Ast itself. |
184 | | #[derive(Clone, Debug)] |
185 | | enum HirFrame { |
186 | | /// An arbitrary HIR expression. These get pushed whenever we hit a base |
187 | | /// case in the Ast. They get popped after an inductive (i.e., recursive) |
188 | | /// step is complete. |
189 | | Expr(Hir), |
190 | | /// A literal that is being constructed, character by character, from the |
191 | | /// AST. We need this because the AST gives each individual character its |
192 | | /// own node. So as we see characters, we peek at the top-most HirFrame. |
193 | | /// If it's a literal, then we add to it. Otherwise, we push a new literal. |
194 | | /// When it comes time to pop it, we convert it to an Hir via Hir::literal. |
195 | | Literal(Vec<u8>), |
196 | | /// A Unicode character class. This frame is mutated as we descend into |
197 | | /// the Ast of a character class (which is itself its own mini recursive |
198 | | /// structure). |
199 | | ClassUnicode(hir::ClassUnicode), |
200 | | /// A byte-oriented character class. This frame is mutated as we descend |
201 | | /// into the Ast of a character class (which is itself its own mini |
202 | | /// recursive structure). |
203 | | /// |
204 | | /// Byte character classes are created when Unicode mode (`u`) is disabled. |
205 | | /// If `utf8` is enabled (the default), then a byte character class is only |
206 | | /// permitted to match ASCII text. |
207 | | ClassBytes(hir::ClassBytes), |
208 | | /// This is pushed whenever a repetition is observed. After visiting every |
209 | | /// sub-expression in the repetition, the translator's stack is expected to |
210 | | /// have this sentinel at the top. |
211 | | /// |
212 | | /// This sentinel only exists to stop other things (like flattening |
213 | | /// literals) from reaching across repetition operators. |
214 | | Repetition, |
215 | | /// This is pushed on to the stack upon first seeing any kind of capture, |
216 | | /// indicated by parentheses (including non-capturing groups). It is popped |
217 | | /// upon leaving a group. |
218 | | Group { |
219 | | /// The old active flags when this group was opened. |
220 | | /// |
221 | | /// If this group sets flags, then the new active flags are set to the |
222 | | /// result of merging the old flags with the flags introduced by this |
223 | | /// group. If the group doesn't set any flags, then this is simply |
224 | | /// equivalent to whatever flags were set when the group was opened. |
225 | | /// |
226 | | /// When this group is popped, the active flags should be restored to |
227 | | /// the flags set here. |
228 | | /// |
229 | | /// The "active" flags correspond to whatever flags are set in the |
230 | | /// Translator. |
231 | | old_flags: Flags, |
232 | | }, |
233 | | /// This is pushed whenever a concatenation is observed. After visiting |
234 | | /// every sub-expression in the concatenation, the translator's stack is |
235 | | /// popped until it sees a Concat frame. |
236 | | Concat, |
237 | | /// This is pushed whenever an alternation is observed. After visiting |
238 | | /// every sub-expression in the alternation, the translator's stack is |
239 | | /// popped until it sees an Alternation frame. |
240 | | Alternation, |
241 | | /// This is pushed immediately before each sub-expression in an |
242 | | /// alternation. This separates the branches of an alternation on the |
243 | | /// stack and prevents literal flattening from reaching across alternation |
244 | | /// branches. |
245 | | /// |
246 | | /// It is popped after each expression in a branch until an 'Alternation' |
247 | | /// frame is observed when doing a post visit on an alternation. |
248 | | AlternationBranch, |
249 | | } |
250 | | |
251 | | impl HirFrame { |
252 | | /// Assert that the current stack frame is an Hir expression and return it. |
253 | 0 | fn unwrap_expr(self) -> Hir { |
254 | 0 | match self { |
255 | 0 | HirFrame::Expr(expr) => expr, |
256 | 0 | HirFrame::Literal(lit) => Hir::literal(lit), |
257 | 0 | _ => panic!("tried to unwrap expr from HirFrame, got: {self:?}"), |
258 | | } |
259 | 0 | } |
260 | | |
261 | | /// Assert that the current stack frame is a Unicode class expression and |
262 | | /// return it. |
263 | 0 | fn unwrap_class_unicode(self) -> hir::ClassUnicode { |
264 | 0 | match self { |
265 | 0 | HirFrame::ClassUnicode(cls) => cls, |
266 | 0 | _ => panic!( |
267 | 0 | "tried to unwrap Unicode class \ |
268 | 0 | from HirFrame, got: {:?}", |
269 | | self |
270 | | ), |
271 | | } |
272 | 0 | } |
273 | | |
274 | | /// Assert that the current stack frame is a byte class expression and |
275 | | /// return it. |
276 | 0 | fn unwrap_class_bytes(self) -> hir::ClassBytes { |
277 | 0 | match self { |
278 | 0 | HirFrame::ClassBytes(cls) => cls, |
279 | 0 | _ => panic!( |
280 | 0 | "tried to unwrap byte class \ |
281 | 0 | from HirFrame, got: {:?}", |
282 | | self |
283 | | ), |
284 | | } |
285 | 0 | } |
286 | | |
287 | | /// Assert that the current stack frame is a repetition sentinel. If it |
288 | | /// isn't, then panic. |
289 | 0 | fn unwrap_repetition(self) { |
290 | 0 | match self { |
291 | 0 | HirFrame::Repetition => {} |
292 | | _ => { |
293 | 0 | panic!( |
294 | 0 | "tried to unwrap repetition from HirFrame, got: {self:?}" |
295 | | ) |
296 | | } |
297 | | } |
298 | 0 | } |
299 | | |
300 | | /// Assert that the current stack frame is a group indicator and return |
301 | | /// its corresponding flags (the flags that were active at the time the |
302 | | /// group was entered). |
303 | 0 | fn unwrap_group(self) -> Flags { |
304 | 0 | match self { |
305 | 0 | HirFrame::Group { old_flags } => old_flags, |
306 | | _ => { |
307 | 0 | panic!("tried to unwrap group from HirFrame, got: {self:?}") |
308 | | } |
309 | | } |
310 | 0 | } |
311 | | |
312 | | /// Assert that the current stack frame is an alternation pipe sentinel. If |
313 | | /// it isn't, then panic. |
314 | 0 | fn unwrap_alternation_pipe(self) { |
315 | 0 | match self { |
316 | 0 | HirFrame::AlternationBranch => {} |
317 | | _ => { |
318 | 0 | panic!("tried to unwrap alt pipe from HirFrame, got: {self:?}") |
319 | | } |
320 | | } |
321 | 0 | } |
322 | | } |
323 | | |
324 | | impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { |
325 | | type Output = Hir; |
326 | | type Err = Error; |
327 | | |
328 | 0 | fn finish(self) -> Result<Hir> { |
329 | | // At this point, we should have exactly one frame on the stack. |
330 | 0 | assert_eq!(self.trans().stack.borrow().len(), 1); |
331 | 0 | Ok(self.pop().unwrap().unwrap_expr()) |
332 | 0 | } |
333 | | |
334 | 0 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { |
335 | 0 | match *ast { |
336 | | Ast::ClassBracketed(_) => { |
337 | 0 | if self.flags().unicode() { |
338 | 0 | let cls = hir::ClassUnicode::empty(); |
339 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
340 | 0 | } else { |
341 | 0 | let cls = hir::ClassBytes::empty(); |
342 | 0 | self.push(HirFrame::ClassBytes(cls)); |
343 | 0 | } |
344 | | } |
345 | 0 | Ast::Repetition(_) => self.push(HirFrame::Repetition), |
346 | 0 | Ast::Group(ref x) => { |
347 | 0 | let old_flags = x |
348 | 0 | .flags() |
349 | 0 | .map(|ast| self.set_flags(ast)) |
350 | 0 | .unwrap_or_else(|| self.flags()); |
351 | 0 | self.push(HirFrame::Group { old_flags }); |
352 | | } |
353 | 0 | Ast::Concat(_) => { |
354 | 0 | self.push(HirFrame::Concat); |
355 | 0 | } |
356 | 0 | Ast::Alternation(ref x) => { |
357 | 0 | self.push(HirFrame::Alternation); |
358 | 0 | if !x.asts.is_empty() { |
359 | 0 | self.push(HirFrame::AlternationBranch); |
360 | 0 | } |
361 | | } |
362 | 0 | _ => {} |
363 | | } |
364 | 0 | Ok(()) |
365 | 0 | } |
366 | | |
367 | 0 | fn visit_post(&mut self, ast: &Ast) -> Result<()> { |
368 | 0 | match *ast { |
369 | 0 | Ast::Empty(_) => { |
370 | 0 | self.push(HirFrame::Expr(Hir::empty())); |
371 | 0 | } |
372 | 0 | Ast::Flags(ref x) => { |
373 | 0 | self.set_flags(&x.flags); |
374 | 0 | // Flags in the AST are generally considered directives and |
375 | 0 | // not actual sub-expressions. However, they can be used in |
376 | 0 | // the concrete syntax like `((?i))`, and we need some kind of |
377 | 0 | // indication of an expression there, and Empty is the correct |
378 | 0 | // choice. |
379 | 0 | // |
380 | 0 | // There can also be things like `(?i)+`, but we rule those out |
381 | 0 | // in the parser. In the future, we might allow them for |
382 | 0 | // consistency's sake. |
383 | 0 | self.push(HirFrame::Expr(Hir::empty())); |
384 | 0 | } |
385 | 0 | Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { |
386 | 0 | Either::Right(byte) => self.push_byte(byte), |
387 | 0 | Either::Left(ch) => match self.case_fold_char(x.span, ch)? { |
388 | 0 | None => self.push_char(ch), |
389 | 0 | Some(expr) => self.push(HirFrame::Expr(expr)), |
390 | | }, |
391 | | }, |
392 | 0 | Ast::Dot(ref span) => { |
393 | 0 | self.push(HirFrame::Expr(self.hir_dot(**span)?)); |
394 | | } |
395 | 0 | Ast::Assertion(ref x) => { |
396 | 0 | self.push(HirFrame::Expr(self.hir_assertion(x)?)); |
397 | | } |
398 | 0 | Ast::ClassPerl(ref x) => { |
399 | 0 | if self.flags().unicode() { |
400 | 0 | let cls = self.hir_perl_unicode_class(x)?; |
401 | 0 | let hcls = hir::Class::Unicode(cls); |
402 | 0 | self.push(HirFrame::Expr(Hir::class(hcls))); |
403 | | } else { |
404 | 0 | let cls = self.hir_perl_byte_class(x)?; |
405 | 0 | let hcls = hir::Class::Bytes(cls); |
406 | 0 | self.push(HirFrame::Expr(Hir::class(hcls))); |
407 | | } |
408 | | } |
409 | 0 | Ast::ClassUnicode(ref x) => { |
410 | 0 | let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); |
411 | 0 | self.push(HirFrame::Expr(Hir::class(cls))); |
412 | | } |
413 | 0 | Ast::ClassBracketed(ref ast) => { |
414 | 0 | if self.flags().unicode() { |
415 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
416 | 0 | self.unicode_fold_and_negate( |
417 | 0 | &ast.span, |
418 | 0 | ast.negated, |
419 | 0 | &mut cls, |
420 | 0 | )?; |
421 | 0 | let expr = Hir::class(hir::Class::Unicode(cls)); |
422 | 0 | self.push(HirFrame::Expr(expr)); |
423 | | } else { |
424 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
425 | 0 | self.bytes_fold_and_negate( |
426 | 0 | &ast.span, |
427 | 0 | ast.negated, |
428 | 0 | &mut cls, |
429 | 0 | )?; |
430 | 0 | let expr = Hir::class(hir::Class::Bytes(cls)); |
431 | 0 | self.push(HirFrame::Expr(expr)); |
432 | | } |
433 | | } |
434 | 0 | Ast::Repetition(ref x) => { |
435 | 0 | let expr = self.pop().unwrap().unwrap_expr(); |
436 | 0 | self.pop().unwrap().unwrap_repetition(); |
437 | 0 | self.push(HirFrame::Expr(self.hir_repetition(x, expr))); |
438 | 0 | } |
439 | 0 | Ast::Group(ref x) => { |
440 | 0 | let expr = self.pop().unwrap().unwrap_expr(); |
441 | 0 | let old_flags = self.pop().unwrap().unwrap_group(); |
442 | 0 | self.trans().flags.set(old_flags); |
443 | 0 | self.push(HirFrame::Expr(self.hir_capture(x, expr))); |
444 | 0 | } |
445 | | Ast::Concat(_) => { |
446 | 0 | let mut exprs = vec![]; |
447 | 0 | while let Some(expr) = self.pop_concat_expr() { |
448 | 0 | if !matches!(*expr.kind(), HirKind::Empty) { |
449 | 0 | exprs.push(expr); |
450 | 0 | } |
451 | | } |
452 | 0 | exprs.reverse(); |
453 | 0 | self.push(HirFrame::Expr(Hir::concat(exprs))); |
454 | | } |
455 | | Ast::Alternation(_) => { |
456 | 0 | let mut exprs = vec![]; |
457 | 0 | while let Some(expr) = self.pop_alt_expr() { |
458 | 0 | self.pop().unwrap().unwrap_alternation_pipe(); |
459 | 0 | exprs.push(expr); |
460 | 0 | } |
461 | 0 | exprs.reverse(); |
462 | 0 | self.push(HirFrame::Expr(Hir::alternation(exprs))); |
463 | | } |
464 | | } |
465 | 0 | Ok(()) |
466 | 0 | } |
467 | | |
468 | 0 | fn visit_alternation_in(&mut self) -> Result<()> { |
469 | 0 | self.push(HirFrame::AlternationBranch); |
470 | 0 | Ok(()) |
471 | 0 | } |
472 | | |
473 | 0 | fn visit_class_set_item_pre( |
474 | 0 | &mut self, |
475 | 0 | ast: &ast::ClassSetItem, |
476 | 0 | ) -> Result<()> { |
477 | 0 | match *ast { |
478 | | ast::ClassSetItem::Bracketed(_) => { |
479 | 0 | if self.flags().unicode() { |
480 | 0 | let cls = hir::ClassUnicode::empty(); |
481 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
482 | 0 | } else { |
483 | 0 | let cls = hir::ClassBytes::empty(); |
484 | 0 | self.push(HirFrame::ClassBytes(cls)); |
485 | 0 | } |
486 | | } |
487 | | // We needn't handle the Union case here since the visitor will |
488 | | // do it for us. |
489 | 0 | _ => {} |
490 | | } |
491 | 0 | Ok(()) |
492 | 0 | } |
493 | | |
494 | 0 | fn visit_class_set_item_post( |
495 | 0 | &mut self, |
496 | 0 | ast: &ast::ClassSetItem, |
497 | 0 | ) -> Result<()> { |
498 | 0 | match *ast { |
499 | 0 | ast::ClassSetItem::Empty(_) => {} |
500 | 0 | ast::ClassSetItem::Literal(ref x) => { |
501 | 0 | if self.flags().unicode() { |
502 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
503 | 0 | cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); |
504 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
505 | 0 | } else { |
506 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
507 | 0 | let byte = self.class_literal_byte(x)?; |
508 | 0 | cls.push(hir::ClassBytesRange::new(byte, byte)); |
509 | 0 | self.push(HirFrame::ClassBytes(cls)); |
510 | | } |
511 | | } |
512 | 0 | ast::ClassSetItem::Range(ref x) => { |
513 | 0 | if self.flags().unicode() { |
514 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
515 | 0 | cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); |
516 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
517 | 0 | } else { |
518 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
519 | 0 | let start = self.class_literal_byte(&x.start)?; |
520 | 0 | let end = self.class_literal_byte(&x.end)?; |
521 | 0 | cls.push(hir::ClassBytesRange::new(start, end)); |
522 | 0 | self.push(HirFrame::ClassBytes(cls)); |
523 | | } |
524 | | } |
525 | 0 | ast::ClassSetItem::Ascii(ref x) => { |
526 | 0 | if self.flags().unicode() { |
527 | 0 | let xcls = self.hir_ascii_unicode_class(x)?; |
528 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
529 | 0 | cls.union(&xcls); |
530 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
531 | | } else { |
532 | 0 | let xcls = self.hir_ascii_byte_class(x)?; |
533 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
534 | 0 | cls.union(&xcls); |
535 | 0 | self.push(HirFrame::ClassBytes(cls)); |
536 | | } |
537 | | } |
538 | 0 | ast::ClassSetItem::Unicode(ref x) => { |
539 | 0 | let xcls = self.hir_unicode_class(x)?; |
540 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
541 | 0 | cls.union(&xcls); |
542 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
543 | | } |
544 | 0 | ast::ClassSetItem::Perl(ref x) => { |
545 | 0 | if self.flags().unicode() { |
546 | 0 | let xcls = self.hir_perl_unicode_class(x)?; |
547 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
548 | 0 | cls.union(&xcls); |
549 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
550 | | } else { |
551 | 0 | let xcls = self.hir_perl_byte_class(x)?; |
552 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
553 | 0 | cls.union(&xcls); |
554 | 0 | self.push(HirFrame::ClassBytes(cls)); |
555 | | } |
556 | | } |
557 | 0 | ast::ClassSetItem::Bracketed(ref ast) => { |
558 | 0 | if self.flags().unicode() { |
559 | 0 | let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); |
560 | 0 | self.unicode_fold_and_negate( |
561 | 0 | &ast.span, |
562 | 0 | ast.negated, |
563 | 0 | &mut cls1, |
564 | 0 | )?; |
565 | | |
566 | 0 | let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); |
567 | 0 | cls2.union(&cls1); |
568 | 0 | self.push(HirFrame::ClassUnicode(cls2)); |
569 | | } else { |
570 | 0 | let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); |
571 | 0 | self.bytes_fold_and_negate( |
572 | 0 | &ast.span, |
573 | 0 | ast.negated, |
574 | 0 | &mut cls1, |
575 | 0 | )?; |
576 | | |
577 | 0 | let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); |
578 | 0 | cls2.union(&cls1); |
579 | 0 | self.push(HirFrame::ClassBytes(cls2)); |
580 | | } |
581 | | } |
582 | | // This is handled automatically by the visitor. |
583 | 0 | ast::ClassSetItem::Union(_) => {} |
584 | | } |
585 | 0 | Ok(()) |
586 | 0 | } |
587 | | |
588 | 0 | fn visit_class_set_binary_op_pre( |
589 | 0 | &mut self, |
590 | 0 | _op: &ast::ClassSetBinaryOp, |
591 | 0 | ) -> Result<()> { |
592 | 0 | if self.flags().unicode() { |
593 | 0 | let cls = hir::ClassUnicode::empty(); |
594 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
595 | 0 | } else { |
596 | 0 | let cls = hir::ClassBytes::empty(); |
597 | 0 | self.push(HirFrame::ClassBytes(cls)); |
598 | 0 | } |
599 | 0 | Ok(()) |
600 | 0 | } |
601 | | |
602 | 0 | fn visit_class_set_binary_op_in( |
603 | 0 | &mut self, |
604 | 0 | _op: &ast::ClassSetBinaryOp, |
605 | 0 | ) -> Result<()> { |
606 | 0 | if self.flags().unicode() { |
607 | 0 | let cls = hir::ClassUnicode::empty(); |
608 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
609 | 0 | } else { |
610 | 0 | let cls = hir::ClassBytes::empty(); |
611 | 0 | self.push(HirFrame::ClassBytes(cls)); |
612 | 0 | } |
613 | 0 | Ok(()) |
614 | 0 | } |
615 | | |
616 | 0 | fn visit_class_set_binary_op_post( |
617 | 0 | &mut self, |
618 | 0 | op: &ast::ClassSetBinaryOp, |
619 | 0 | ) -> Result<()> { |
620 | | use crate::ast::ClassSetBinaryOpKind::*; |
621 | | |
622 | 0 | if self.flags().unicode() { |
623 | 0 | let mut rhs = self.pop().unwrap().unwrap_class_unicode(); |
624 | 0 | let mut lhs = self.pop().unwrap().unwrap_class_unicode(); |
625 | 0 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
626 | 0 | if self.flags().case_insensitive() { |
627 | 0 | rhs.try_case_fold_simple().map_err(|_| { |
628 | 0 | self.error( |
629 | 0 | op.rhs.span().clone(), |
630 | 0 | ErrorKind::UnicodeCaseUnavailable, |
631 | | ) |
632 | 0 | })?; |
633 | 0 | lhs.try_case_fold_simple().map_err(|_| { |
634 | 0 | self.error( |
635 | 0 | op.lhs.span().clone(), |
636 | 0 | ErrorKind::UnicodeCaseUnavailable, |
637 | | ) |
638 | 0 | })?; |
639 | 0 | } |
640 | 0 | match op.kind { |
641 | 0 | Intersection => lhs.intersect(&rhs), |
642 | 0 | Difference => lhs.difference(&rhs), |
643 | 0 | SymmetricDifference => lhs.symmetric_difference(&rhs), |
644 | | } |
645 | 0 | cls.union(&lhs); |
646 | 0 | self.push(HirFrame::ClassUnicode(cls)); |
647 | | } else { |
648 | 0 | let mut rhs = self.pop().unwrap().unwrap_class_bytes(); |
649 | 0 | let mut lhs = self.pop().unwrap().unwrap_class_bytes(); |
650 | 0 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
651 | 0 | if self.flags().case_insensitive() { |
652 | 0 | rhs.case_fold_simple(); |
653 | 0 | lhs.case_fold_simple(); |
654 | 0 | } |
655 | 0 | match op.kind { |
656 | 0 | Intersection => lhs.intersect(&rhs), |
657 | 0 | Difference => lhs.difference(&rhs), |
658 | 0 | SymmetricDifference => lhs.symmetric_difference(&rhs), |
659 | | } |
660 | 0 | cls.union(&lhs); |
661 | 0 | self.push(HirFrame::ClassBytes(cls)); |
662 | | } |
663 | 0 | Ok(()) |
664 | 0 | } |
665 | | } |
666 | | |
667 | | /// The internal implementation of a translator. |
668 | | /// |
669 | | /// This type is responsible for carrying around the original pattern string, |
670 | | /// which is not tied to the internal state of a translator. |
671 | | /// |
672 | | /// A TranslatorI exists for the time it takes to translate a single Ast. |
673 | | #[derive(Clone, Debug)] |
674 | | struct TranslatorI<'t, 'p> { |
675 | | trans: &'t Translator, |
676 | | pattern: &'p str, |
677 | | } |
678 | | |
679 | | impl<'t, 'p> TranslatorI<'t, 'p> { |
680 | | /// Build a new internal translator. |
681 | 0 | fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { |
682 | 0 | TranslatorI { trans, pattern } |
683 | 0 | } |
684 | | |
685 | | /// Return a reference to the underlying translator. |
686 | 0 | fn trans(&self) -> &Translator { |
687 | 0 | &self.trans |
688 | 0 | } |
689 | | |
690 | | /// Push the given frame on to the call stack. |
691 | 0 | fn push(&self, frame: HirFrame) { |
692 | 0 | self.trans().stack.borrow_mut().push(frame); |
693 | 0 | } |
694 | | |
695 | | /// Push the given literal char on to the call stack. |
696 | | /// |
697 | | /// If the top-most element of the stack is a literal, then the char |
698 | | /// is appended to the end of that literal. Otherwise, a new literal |
699 | | /// containing just the given char is pushed to the top of the stack. |
700 | 0 | fn push_char(&self, ch: char) { |
701 | 0 | let mut buf = [0; 4]; |
702 | 0 | let bytes = ch.encode_utf8(&mut buf).as_bytes(); |
703 | 0 | let mut stack = self.trans().stack.borrow_mut(); |
704 | 0 | if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { |
705 | 0 | literal.extend_from_slice(bytes); |
706 | 0 | } else { |
707 | 0 | stack.push(HirFrame::Literal(bytes.to_vec())); |
708 | 0 | } |
709 | 0 | } |
710 | | |
711 | | /// Push the given literal byte on to the call stack. |
712 | | /// |
713 | | /// If the top-most element of the stack is a literal, then the byte |
714 | | /// is appended to the end of that literal. Otherwise, a new literal |
715 | | /// containing just the given byte is pushed to the top of the stack. |
716 | 0 | fn push_byte(&self, byte: u8) { |
717 | 0 | let mut stack = self.trans().stack.borrow_mut(); |
718 | 0 | if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { |
719 | 0 | literal.push(byte); |
720 | 0 | } else { |
721 | 0 | stack.push(HirFrame::Literal(vec![byte])); |
722 | 0 | } |
723 | 0 | } |
724 | | |
725 | | /// Pop the top of the call stack. If the call stack is empty, return None. |
726 | 0 | fn pop(&self) -> Option<HirFrame> { |
727 | 0 | self.trans().stack.borrow_mut().pop() |
728 | 0 | } |
729 | | |
730 | | /// Pop an HIR expression from the top of the stack for a concatenation. |
731 | | /// |
732 | | /// This returns None if the stack is empty or when a concat frame is seen. |
733 | | /// Otherwise, it panics if it could not find an HIR expression. |
734 | 0 | fn pop_concat_expr(&self) -> Option<Hir> { |
735 | 0 | let frame = self.pop()?; |
736 | 0 | match frame { |
737 | 0 | HirFrame::Concat => None, |
738 | 0 | HirFrame::Expr(expr) => Some(expr), |
739 | 0 | HirFrame::Literal(lit) => Some(Hir::literal(lit)), |
740 | | HirFrame::ClassUnicode(_) => { |
741 | 0 | unreachable!("expected expr or concat, got Unicode class") |
742 | | } |
743 | | HirFrame::ClassBytes(_) => { |
744 | 0 | unreachable!("expected expr or concat, got byte class") |
745 | | } |
746 | | HirFrame::Repetition => { |
747 | 0 | unreachable!("expected expr or concat, got repetition") |
748 | | } |
749 | | HirFrame::Group { .. } => { |
750 | 0 | unreachable!("expected expr or concat, got group") |
751 | | } |
752 | | HirFrame::Alternation => { |
753 | 0 | unreachable!("expected expr or concat, got alt marker") |
754 | | } |
755 | | HirFrame::AlternationBranch => { |
756 | 0 | unreachable!("expected expr or concat, got alt branch marker") |
757 | | } |
758 | | } |
759 | 0 | } |
760 | | |
761 | | /// Pop an HIR expression from the top of the stack for an alternation. |
762 | | /// |
763 | | /// This returns None if the stack is empty or when an alternation frame is |
764 | | /// seen. Otherwise, it panics if it could not find an HIR expression. |
765 | 0 | fn pop_alt_expr(&self) -> Option<Hir> { |
766 | 0 | let frame = self.pop()?; |
767 | 0 | match frame { |
768 | 0 | HirFrame::Alternation => None, |
769 | 0 | HirFrame::Expr(expr) => Some(expr), |
770 | 0 | HirFrame::Literal(lit) => Some(Hir::literal(lit)), |
771 | | HirFrame::ClassUnicode(_) => { |
772 | 0 | unreachable!("expected expr or alt, got Unicode class") |
773 | | } |
774 | | HirFrame::ClassBytes(_) => { |
775 | 0 | unreachable!("expected expr or alt, got byte class") |
776 | | } |
777 | | HirFrame::Repetition => { |
778 | 0 | unreachable!("expected expr or alt, got repetition") |
779 | | } |
780 | | HirFrame::Group { .. } => { |
781 | 0 | unreachable!("expected expr or alt, got group") |
782 | | } |
783 | | HirFrame::Concat => { |
784 | 0 | unreachable!("expected expr or alt, got concat marker") |
785 | | } |
786 | | HirFrame::AlternationBranch => { |
787 | 0 | unreachable!("expected expr or alt, got alt branch marker") |
788 | | } |
789 | | } |
790 | 0 | } |
791 | | |
792 | | /// Create a new error with the given span and error type. |
793 | 0 | fn error(&self, span: Span, kind: ErrorKind) -> Error { |
794 | 0 | Error { kind, pattern: self.pattern.to_string(), span } |
795 | 0 | } |
796 | | |
797 | | /// Return a copy of the active flags. |
798 | 0 | fn flags(&self) -> Flags { |
799 | 0 | self.trans().flags.get() |
800 | 0 | } |
801 | | |
802 | | /// Set the flags of this translator from the flags set in the given AST. |
803 | | /// Then, return the old flags. |
804 | 0 | fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { |
805 | 0 | let old_flags = self.flags(); |
806 | 0 | let mut new_flags = Flags::from_ast(ast_flags); |
807 | 0 | new_flags.merge(&old_flags); |
808 | 0 | self.trans().flags.set(new_flags); |
809 | 0 | old_flags |
810 | 0 | } |
811 | | |
812 | | /// Convert an Ast literal to its scalar representation. |
813 | | /// |
814 | | /// When Unicode mode is enabled, then this always succeeds and returns a |
815 | | /// `char` (Unicode scalar value). |
816 | | /// |
817 | | /// When Unicode mode is disabled, then a `char` will still be returned |
818 | | /// whenever possible. A byte is returned only when invalid UTF-8 is |
819 | | /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte |
820 | | /// will result in an error when invalid UTF-8 is not allowed. |
821 | 0 | fn ast_literal_to_scalar( |
822 | 0 | &self, |
823 | 0 | lit: &ast::Literal, |
824 | 0 | ) -> Result<Either<char, u8>> { |
825 | 0 | if self.flags().unicode() { |
826 | 0 | return Ok(Either::Left(lit.c)); |
827 | 0 | } |
828 | 0 | let byte = match lit.byte() { |
829 | 0 | None => return Ok(Either::Left(lit.c)), |
830 | 0 | Some(byte) => byte, |
831 | | }; |
832 | 0 | if byte <= 0x7F { |
833 | 0 | return Ok(Either::Left(char::try_from(byte).unwrap())); |
834 | 0 | } |
835 | 0 | if self.trans().utf8 { |
836 | 0 | return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); |
837 | 0 | } |
838 | 0 | Ok(Either::Right(byte)) |
839 | 0 | } |
840 | | |
841 | 0 | fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> { |
842 | 0 | if !self.flags().case_insensitive() { |
843 | 0 | return Ok(None); |
844 | 0 | } |
845 | 0 | if self.flags().unicode() { |
846 | | // If case folding won't do anything, then don't bother trying. |
847 | 0 | let map = unicode::SimpleCaseFolder::new() |
848 | 0 | .map(|f| f.overlaps(c, c)) |
849 | 0 | .map_err(|_| { |
850 | 0 | self.error(span, ErrorKind::UnicodeCaseUnavailable) |
851 | 0 | })?; |
852 | 0 | if !map { |
853 | 0 | return Ok(None); |
854 | 0 | } |
855 | 0 | let mut cls = |
856 | 0 | hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( |
857 | 0 | c, c, |
858 | | )]); |
859 | 0 | cls.try_case_fold_simple().map_err(|_| { |
860 | 0 | self.error(span, ErrorKind::UnicodeCaseUnavailable) |
861 | 0 | })?; |
862 | 0 | Ok(Some(Hir::class(hir::Class::Unicode(cls)))) |
863 | | } else { |
864 | 0 | if !c.is_ascii() { |
865 | 0 | return Ok(None); |
866 | 0 | } |
867 | | // If case folding won't do anything, then don't bother trying. |
868 | 0 | match c { |
869 | 0 | 'A'..='Z' | 'a'..='z' => {} |
870 | 0 | _ => return Ok(None), |
871 | | } |
872 | 0 | let mut cls = |
873 | 0 | hir::ClassBytes::new(vec![hir::ClassBytesRange::new( |
874 | | // OK because 'c.len_utf8() == 1' which in turn implies |
875 | | // that 'c' is ASCII. |
876 | 0 | u8::try_from(c).unwrap(), |
877 | 0 | u8::try_from(c).unwrap(), |
878 | | )]); |
879 | 0 | cls.case_fold_simple(); |
880 | 0 | Ok(Some(Hir::class(hir::Class::Bytes(cls)))) |
881 | | } |
882 | 0 | } |
883 | | |
884 | 0 | fn hir_dot(&self, span: Span) -> Result<Hir> { |
885 | 0 | let (utf8, lineterm, flags) = |
886 | 0 | (self.trans().utf8, self.trans().line_terminator, self.flags()); |
887 | 0 | if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { |
888 | 0 | return Err(self.error(span, ErrorKind::InvalidUtf8)); |
889 | 0 | } |
890 | 0 | let dot = if flags.dot_matches_new_line() { |
891 | 0 | if flags.unicode() { |
892 | 0 | hir::Dot::AnyChar |
893 | | } else { |
894 | 0 | hir::Dot::AnyByte |
895 | | } |
896 | | } else { |
897 | 0 | if flags.unicode() { |
898 | 0 | if flags.crlf() { |
899 | 0 | hir::Dot::AnyCharExceptCRLF |
900 | | } else { |
901 | 0 | if !lineterm.is_ascii() { |
902 | 0 | return Err( |
903 | 0 | self.error(span, ErrorKind::InvalidLineTerminator) |
904 | 0 | ); |
905 | 0 | } |
906 | 0 | hir::Dot::AnyCharExcept(char::from(lineterm)) |
907 | | } |
908 | | } else { |
909 | 0 | if flags.crlf() { |
910 | 0 | hir::Dot::AnyByteExceptCRLF |
911 | | } else { |
912 | 0 | hir::Dot::AnyByteExcept(lineterm) |
913 | | } |
914 | | } |
915 | | }; |
916 | 0 | Ok(Hir::dot(dot)) |
917 | 0 | } |
918 | | |
919 | 0 | fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { |
920 | 0 | let unicode = self.flags().unicode(); |
921 | 0 | let multi_line = self.flags().multi_line(); |
922 | 0 | let crlf = self.flags().crlf(); |
923 | 0 | Ok(match asst.kind { |
924 | 0 | ast::AssertionKind::StartLine => Hir::look(if multi_line { |
925 | 0 | if crlf { |
926 | 0 | hir::Look::StartCRLF |
927 | | } else { |
928 | 0 | hir::Look::StartLF |
929 | | } |
930 | | } else { |
931 | 0 | hir::Look::Start |
932 | | }), |
933 | 0 | ast::AssertionKind::EndLine => Hir::look(if multi_line { |
934 | 0 | if crlf { |
935 | 0 | hir::Look::EndCRLF |
936 | | } else { |
937 | 0 | hir::Look::EndLF |
938 | | } |
939 | | } else { |
940 | 0 | hir::Look::End |
941 | | }), |
942 | 0 | ast::AssertionKind::StartText => Hir::look(hir::Look::Start), |
943 | 0 | ast::AssertionKind::EndText => Hir::look(hir::Look::End), |
944 | 0 | ast::AssertionKind::WordBoundary => Hir::look(if unicode { |
945 | 0 | hir::Look::WordUnicode |
946 | | } else { |
947 | 0 | hir::Look::WordAscii |
948 | | }), |
949 | 0 | ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { |
950 | 0 | hir::Look::WordUnicodeNegate |
951 | | } else { |
952 | 0 | hir::Look::WordAsciiNegate |
953 | | }), |
954 | | ast::AssertionKind::WordBoundaryStart |
955 | | | ast::AssertionKind::WordBoundaryStartAngle => { |
956 | 0 | Hir::look(if unicode { |
957 | 0 | hir::Look::WordStartUnicode |
958 | | } else { |
959 | 0 | hir::Look::WordStartAscii |
960 | | }) |
961 | | } |
962 | | ast::AssertionKind::WordBoundaryEnd |
963 | | | ast::AssertionKind::WordBoundaryEndAngle => { |
964 | 0 | Hir::look(if unicode { |
965 | 0 | hir::Look::WordEndUnicode |
966 | | } else { |
967 | 0 | hir::Look::WordEndAscii |
968 | | }) |
969 | | } |
970 | | ast::AssertionKind::WordBoundaryStartHalf => { |
971 | 0 | Hir::look(if unicode { |
972 | 0 | hir::Look::WordStartHalfUnicode |
973 | | } else { |
974 | 0 | hir::Look::WordStartHalfAscii |
975 | | }) |
976 | | } |
977 | 0 | ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { |
978 | 0 | hir::Look::WordEndHalfUnicode |
979 | | } else { |
980 | 0 | hir::Look::WordEndHalfAscii |
981 | | }), |
982 | | }) |
983 | 0 | } |
984 | | |
985 | 0 | fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { |
986 | 0 | let (index, name) = match group.kind { |
987 | 0 | ast::GroupKind::CaptureIndex(index) => (index, None), |
988 | 0 | ast::GroupKind::CaptureName { ref name, .. } => { |
989 | 0 | (name.index, Some(name.name.clone().into_boxed_str())) |
990 | | } |
991 | | // The HIR doesn't need to use non-capturing groups, since the way |
992 | | // in which the data type is defined handles this automatically. |
993 | 0 | ast::GroupKind::NonCapturing(_) => return expr, |
994 | | }; |
995 | 0 | Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) |
996 | 0 | } |
997 | | |
998 | 0 | fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { |
999 | 0 | let (min, max) = match rep.op.kind { |
1000 | 0 | ast::RepetitionKind::ZeroOrOne => (0, Some(1)), |
1001 | 0 | ast::RepetitionKind::ZeroOrMore => (0, None), |
1002 | 0 | ast::RepetitionKind::OneOrMore => (1, None), |
1003 | 0 | ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { |
1004 | 0 | (m, Some(m)) |
1005 | | } |
1006 | 0 | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { |
1007 | 0 | (m, None) |
1008 | | } |
1009 | | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( |
1010 | 0 | m, |
1011 | 0 | n, |
1012 | 0 | )) => (m, Some(n)), |
1013 | | }; |
1014 | 0 | let greedy = |
1015 | 0 | if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; |
1016 | 0 | Hir::repetition(hir::Repetition { |
1017 | 0 | min, |
1018 | 0 | max, |
1019 | 0 | greedy, |
1020 | 0 | sub: Box::new(expr), |
1021 | 0 | }) |
1022 | 0 | } |
1023 | | |
1024 | 0 | fn hir_unicode_class( |
1025 | 0 | &self, |
1026 | 0 | ast_class: &ast::ClassUnicode, |
1027 | 0 | ) -> Result<hir::ClassUnicode> { |
1028 | | use crate::ast::ClassUnicodeKind::*; |
1029 | | |
1030 | 0 | if !self.flags().unicode() { |
1031 | 0 | return Err( |
1032 | 0 | self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) |
1033 | 0 | ); |
1034 | 0 | } |
1035 | 0 | let query = match ast_class.kind { |
1036 | 0 | OneLetter(name) => ClassQuery::OneLetter(name), |
1037 | 0 | Named(ref name) => ClassQuery::Binary(name), |
1038 | 0 | NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { |
1039 | 0 | property_name: name, |
1040 | 0 | property_value: value, |
1041 | 0 | }, |
1042 | | }; |
1043 | 0 | let mut result = self.convert_unicode_class_error( |
1044 | 0 | &ast_class.span, |
1045 | 0 | unicode::class(query), |
1046 | | ); |
1047 | 0 | if let Ok(ref mut class) = result { |
1048 | 0 | self.unicode_fold_and_negate( |
1049 | 0 | &ast_class.span, |
1050 | 0 | ast_class.negated, |
1051 | 0 | class, |
1052 | 0 | )?; |
1053 | 0 | } |
1054 | 0 | result |
1055 | 0 | } |
1056 | | |
1057 | 0 | fn hir_ascii_unicode_class( |
1058 | 0 | &self, |
1059 | 0 | ast: &ast::ClassAscii, |
1060 | 0 | ) -> Result<hir::ClassUnicode> { |
1061 | 0 | let mut cls = hir::ClassUnicode::new( |
1062 | 0 | ascii_class_as_chars(&ast.kind) |
1063 | 0 | .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), |
1064 | | ); |
1065 | 0 | self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; |
1066 | 0 | Ok(cls) |
1067 | 0 | } |
1068 | | |
1069 | 0 | fn hir_ascii_byte_class( |
1070 | 0 | &self, |
1071 | 0 | ast: &ast::ClassAscii, |
1072 | 0 | ) -> Result<hir::ClassBytes> { |
1073 | 0 | let mut cls = hir::ClassBytes::new( |
1074 | 0 | ascii_class(&ast.kind) |
1075 | 0 | .map(|(s, e)| hir::ClassBytesRange::new(s, e)), |
1076 | | ); |
1077 | 0 | self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; |
1078 | 0 | Ok(cls) |
1079 | 0 | } |
1080 | | |
1081 | 0 | fn hir_perl_unicode_class( |
1082 | 0 | &self, |
1083 | 0 | ast_class: &ast::ClassPerl, |
1084 | 0 | ) -> Result<hir::ClassUnicode> { |
1085 | | use crate::ast::ClassPerlKind::*; |
1086 | | |
1087 | 0 | assert!(self.flags().unicode()); |
1088 | 0 | let result = match ast_class.kind { |
1089 | 0 | Digit => unicode::perl_digit(), |
1090 | 0 | Space => unicode::perl_space(), |
1091 | 0 | Word => unicode::perl_word(), |
1092 | | }; |
1093 | 0 | let mut class = |
1094 | 0 | self.convert_unicode_class_error(&ast_class.span, result)?; |
1095 | | // We needn't apply case folding here because the Perl Unicode classes |
1096 | | // are already closed under Unicode simple case folding. |
1097 | 0 | if ast_class.negated { |
1098 | 0 | class.negate(); |
1099 | 0 | } |
1100 | 0 | Ok(class) |
1101 | 0 | } |
1102 | | |
1103 | 0 | fn hir_perl_byte_class( |
1104 | 0 | &self, |
1105 | 0 | ast_class: &ast::ClassPerl, |
1106 | 0 | ) -> Result<hir::ClassBytes> { |
1107 | | use crate::ast::ClassPerlKind::*; |
1108 | | |
1109 | 0 | assert!(!self.flags().unicode()); |
1110 | 0 | let mut class = match ast_class.kind { |
1111 | 0 | Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), |
1112 | 0 | Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), |
1113 | 0 | Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), |
1114 | | }; |
1115 | | // We needn't apply case folding here because the Perl ASCII classes |
1116 | | // are already closed (under ASCII case folding). |
1117 | 0 | if ast_class.negated { |
1118 | 0 | class.negate(); |
1119 | 0 | } |
1120 | | // Negating a Perl byte class is likely to cause it to match invalid |
1121 | | // UTF-8. That's only OK if the translator is configured to allow such |
1122 | | // things. |
1123 | 0 | if self.trans().utf8 && !class.is_ascii() { |
1124 | 0 | return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); |
1125 | 0 | } |
1126 | 0 | Ok(class) |
1127 | 0 | } |
1128 | | |
1129 | | /// Converts the given Unicode specific error to an HIR translation error. |
1130 | | /// |
1131 | | /// The span given should approximate the position at which an error would |
1132 | | /// occur. |
1133 | 0 | fn convert_unicode_class_error( |
1134 | 0 | &self, |
1135 | 0 | span: &Span, |
1136 | 0 | result: core::result::Result<hir::ClassUnicode, unicode::Error>, |
1137 | 0 | ) -> Result<hir::ClassUnicode> { |
1138 | 0 | result.map_err(|err| { |
1139 | 0 | let sp = span.clone(); |
1140 | 0 | match err { |
1141 | | unicode::Error::PropertyNotFound => { |
1142 | 0 | self.error(sp, ErrorKind::UnicodePropertyNotFound) |
1143 | | } |
1144 | | unicode::Error::PropertyValueNotFound => { |
1145 | 0 | self.error(sp, ErrorKind::UnicodePropertyValueNotFound) |
1146 | | } |
1147 | | unicode::Error::PerlClassNotFound => { |
1148 | 0 | self.error(sp, ErrorKind::UnicodePerlClassNotFound) |
1149 | | } |
1150 | | } |
1151 | 0 | }) |
1152 | 0 | } |
1153 | | |
1154 | 0 | fn unicode_fold_and_negate( |
1155 | 0 | &self, |
1156 | 0 | span: &Span, |
1157 | 0 | negated: bool, |
1158 | 0 | class: &mut hir::ClassUnicode, |
1159 | 0 | ) -> Result<()> { |
1160 | | // Note that we must apply case folding before negation! |
1161 | | // Consider `(?i)[^x]`. If we applied negation first, then |
1162 | | // the result would be the character class that matched any |
1163 | | // Unicode scalar value. |
1164 | 0 | if self.flags().case_insensitive() { |
1165 | 0 | class.try_case_fold_simple().map_err(|_| { |
1166 | 0 | self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) |
1167 | 0 | })?; |
1168 | 0 | } |
1169 | 0 | if negated { |
1170 | 0 | class.negate(); |
1171 | 0 | } |
1172 | 0 | Ok(()) |
1173 | 0 | } |
1174 | | |
1175 | 0 | fn bytes_fold_and_negate( |
1176 | 0 | &self, |
1177 | 0 | span: &Span, |
1178 | 0 | negated: bool, |
1179 | 0 | class: &mut hir::ClassBytes, |
1180 | 0 | ) -> Result<()> { |
1181 | | // Note that we must apply case folding before negation! |
1182 | | // Consider `(?i)[^x]`. If we applied negation first, then |
1183 | | // the result would be the character class that matched any |
1184 | | // Unicode scalar value. |
1185 | 0 | if self.flags().case_insensitive() { |
1186 | 0 | class.case_fold_simple(); |
1187 | 0 | } |
1188 | 0 | if negated { |
1189 | 0 | class.negate(); |
1190 | 0 | } |
1191 | 0 | if self.trans().utf8 && !class.is_ascii() { |
1192 | 0 | return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); |
1193 | 0 | } |
1194 | 0 | Ok(()) |
1195 | 0 | } |
1196 | | |
1197 | | /// Return a scalar byte value suitable for use as a literal in a byte |
1198 | | /// character class. |
1199 | 0 | fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { |
1200 | 0 | match self.ast_literal_to_scalar(ast)? { |
1201 | 0 | Either::Right(byte) => Ok(byte), |
1202 | 0 | Either::Left(ch) => { |
1203 | 0 | if ch.is_ascii() { |
1204 | 0 | Ok(u8::try_from(ch).unwrap()) |
1205 | | } else { |
1206 | | // We can't feasibly support Unicode in |
1207 | | // byte oriented classes. Byte classes don't |
1208 | | // do Unicode case folding. |
1209 | 0 | Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) |
1210 | | } |
1211 | | } |
1212 | | } |
1213 | 0 | } |
1214 | | } |
1215 | | |
/// The set of regex flags a translator tracks while walking a pattern.
///
/// Every flag is tri-state: `None` means the flag was never mentioned,
/// `Some(false)` means it was explicitly disabled and `Some(true)` means it
/// was explicitly enabled.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// State of the case-insensitive flag.
    case_insensitive: Option<bool>,
    /// State of the multi-line flag.
    multi_line: Option<bool>,
    /// State of the dot-matches-new-line flag.
    dot_matches_new_line: Option<bool>,
    /// State of the swap-greed flag.
    swap_greed: Option<bool>,
    /// State of the Unicode flag.
    unicode: Option<bool>,
    /// State of the CRLF flag.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1232 | | |
1233 | | impl Flags { |
1234 | 0 | fn from_ast(ast: &ast::Flags) -> Flags { |
1235 | 0 | let mut flags = Flags::default(); |
1236 | 0 | let mut enable = true; |
1237 | 0 | for item in &ast.items { |
1238 | 0 | match item.kind { |
1239 | 0 | ast::FlagsItemKind::Negation => { |
1240 | 0 | enable = false; |
1241 | 0 | } |
1242 | 0 | ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { |
1243 | 0 | flags.case_insensitive = Some(enable); |
1244 | 0 | } |
1245 | 0 | ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { |
1246 | 0 | flags.multi_line = Some(enable); |
1247 | 0 | } |
1248 | 0 | ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { |
1249 | 0 | flags.dot_matches_new_line = Some(enable); |
1250 | 0 | } |
1251 | 0 | ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { |
1252 | 0 | flags.swap_greed = Some(enable); |
1253 | 0 | } |
1254 | 0 | ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { |
1255 | 0 | flags.unicode = Some(enable); |
1256 | 0 | } |
1257 | 0 | ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { |
1258 | 0 | flags.crlf = Some(enable); |
1259 | 0 | } |
1260 | 0 | ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} |
1261 | | } |
1262 | | } |
1263 | 0 | flags |
1264 | 0 | } |
1265 | | |
1266 | 0 | fn merge(&mut self, previous: &Flags) { |
1267 | 0 | if self.case_insensitive.is_none() { |
1268 | 0 | self.case_insensitive = previous.case_insensitive; |
1269 | 0 | } |
1270 | 0 | if self.multi_line.is_none() { |
1271 | 0 | self.multi_line = previous.multi_line; |
1272 | 0 | } |
1273 | 0 | if self.dot_matches_new_line.is_none() { |
1274 | 0 | self.dot_matches_new_line = previous.dot_matches_new_line; |
1275 | 0 | } |
1276 | 0 | if self.swap_greed.is_none() { |
1277 | 0 | self.swap_greed = previous.swap_greed; |
1278 | 0 | } |
1279 | 0 | if self.unicode.is_none() { |
1280 | 0 | self.unicode = previous.unicode; |
1281 | 0 | } |
1282 | 0 | if self.crlf.is_none() { |
1283 | 0 | self.crlf = previous.crlf; |
1284 | 0 | } |
1285 | 0 | } |
1286 | | |
1287 | 0 | fn case_insensitive(&self) -> bool { |
1288 | 0 | self.case_insensitive.unwrap_or(false) |
1289 | 0 | } |
1290 | | |
1291 | 0 | fn multi_line(&self) -> bool { |
1292 | 0 | self.multi_line.unwrap_or(false) |
1293 | 0 | } |
1294 | | |
1295 | 0 | fn dot_matches_new_line(&self) -> bool { |
1296 | 0 | self.dot_matches_new_line.unwrap_or(false) |
1297 | 0 | } |
1298 | | |
1299 | 0 | fn swap_greed(&self) -> bool { |
1300 | 0 | self.swap_greed.unwrap_or(false) |
1301 | 0 | } |
1302 | | |
1303 | 0 | fn unicode(&self) -> bool { |
1304 | 0 | self.unicode.unwrap_or(true) |
1305 | 0 | } |
1306 | | |
1307 | 0 | fn crlf(&self) -> bool { |
1308 | 0 | self.crlf.unwrap_or(false) |
1309 | 0 | } |
1310 | | } |
1311 | | |
1312 | 0 | fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { |
1313 | 0 | let ranges: Vec<_> = ascii_class(kind) |
1314 | 0 | .map(|(s, e)| hir::ClassBytesRange::new(s, e)) |
1315 | 0 | .collect(); |
1316 | 0 | hir::ClassBytes::new(ranges) |
1317 | 0 | } |
1318 | | |
1319 | 0 | fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> { |
1320 | | use crate::ast::ClassAsciiKind::*; |
1321 | | |
1322 | 0 | let slice: &'static [(u8, u8)] = match *kind { |
1323 | 0 | Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], |
1324 | 0 | Alpha => &[(b'A', b'Z'), (b'a', b'z')], |
1325 | 0 | Ascii => &[(b'\x00', b'\x7F')], |
1326 | 0 | Blank => &[(b'\t', b'\t'), (b' ', b' ')], |
1327 | 0 | Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], |
1328 | 0 | Digit => &[(b'0', b'9')], |
1329 | 0 | Graph => &[(b'!', b'~')], |
1330 | 0 | Lower => &[(b'a', b'z')], |
1331 | 0 | Print => &[(b' ', b'~')], |
1332 | 0 | Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], |
1333 | 0 | Space => &[ |
1334 | 0 | (b'\t', b'\t'), |
1335 | 0 | (b'\n', b'\n'), |
1336 | 0 | (b'\x0B', b'\x0B'), |
1337 | 0 | (b'\x0C', b'\x0C'), |
1338 | 0 | (b'\r', b'\r'), |
1339 | 0 | (b' ', b' '), |
1340 | 0 | ], |
1341 | 0 | Upper => &[(b'A', b'Z')], |
1342 | 0 | Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], |
1343 | 0 | Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], |
1344 | | }; |
1345 | 0 | slice.iter().copied() |
1346 | 0 | } |
1347 | | |
1348 | 0 | fn ascii_class_as_chars( |
1349 | 0 | kind: &ast::ClassAsciiKind, |
1350 | 0 | ) -> impl Iterator<Item = (char, char)> { |
1351 | 0 | ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) |
1352 | 0 | } |
1353 | | |
1354 | | #[cfg(test)] |
1355 | | mod tests { |
1356 | | use crate::{ |
1357 | | ast::{parse::ParserBuilder, Position}, |
1358 | | hir::{Look, Properties}, |
1359 | | }; |
1360 | | |
1361 | | use super::*; |
1362 | | |
1363 | | // We create these errors to compare with real hir::Errors in the tests. |
1364 | | // We define equality between TestError and hir::Error to disregard the |
1365 | | // pattern string in hir::Error, which is annoying to provide in tests. |
1366 | | #[derive(Clone, Debug)] |
1367 | | struct TestError { |
1368 | | span: Span, |
1369 | | kind: hir::ErrorKind, |
1370 | | } |
1371 | | |
1372 | | impl PartialEq<hir::Error> for TestError { |
1373 | | fn eq(&self, other: &hir::Error) -> bool { |
1374 | | self.span == other.span && self.kind == other.kind |
1375 | | } |
1376 | | } |
1377 | | |
1378 | | impl PartialEq<TestError> for hir::Error { |
1379 | | fn eq(&self, other: &TestError) -> bool { |
1380 | | self.span == other.span && self.kind == other.kind |
1381 | | } |
1382 | | } |
1383 | | |
1384 | | fn parse(pattern: &str) -> Ast { |
1385 | | ParserBuilder::new().octal(true).build().parse(pattern).unwrap() |
1386 | | } |
1387 | | |
1388 | | fn t(pattern: &str) -> Hir { |
1389 | | TranslatorBuilder::new() |
1390 | | .utf8(true) |
1391 | | .build() |
1392 | | .translate(pattern, &parse(pattern)) |
1393 | | .unwrap() |
1394 | | } |
1395 | | |
1396 | | fn t_err(pattern: &str) -> hir::Error { |
1397 | | TranslatorBuilder::new() |
1398 | | .utf8(true) |
1399 | | .build() |
1400 | | .translate(pattern, &parse(pattern)) |
1401 | | .unwrap_err() |
1402 | | } |
1403 | | |
1404 | | fn t_bytes(pattern: &str) -> Hir { |
1405 | | TranslatorBuilder::new() |
1406 | | .utf8(false) |
1407 | | .build() |
1408 | | .translate(pattern, &parse(pattern)) |
1409 | | .unwrap() |
1410 | | } |
1411 | | |
1412 | | fn props(pattern: &str) -> Properties { |
1413 | | t(pattern).properties().clone() |
1414 | | } |
1415 | | |
1416 | | fn props_bytes(pattern: &str) -> Properties { |
1417 | | t_bytes(pattern).properties().clone() |
1418 | | } |
1419 | | |
1420 | | fn hir_lit(s: &str) -> Hir { |
1421 | | hir_blit(s.as_bytes()) |
1422 | | } |
1423 | | |
1424 | | fn hir_blit(s: &[u8]) -> Hir { |
1425 | | Hir::literal(s) |
1426 | | } |
1427 | | |
1428 | | fn hir_capture(index: u32, expr: Hir) -> Hir { |
1429 | | Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) |
1430 | | } |
1431 | | |
1432 | | fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { |
1433 | | Hir::capture(hir::Capture { |
1434 | | index, |
1435 | | name: Some(name.into()), |
1436 | | sub: Box::new(expr), |
1437 | | }) |
1438 | | } |
1439 | | |
1440 | | fn hir_quest(greedy: bool, expr: Hir) -> Hir { |
1441 | | Hir::repetition(hir::Repetition { |
1442 | | min: 0, |
1443 | | max: Some(1), |
1444 | | greedy, |
1445 | | sub: Box::new(expr), |
1446 | | }) |
1447 | | } |
1448 | | |
1449 | | fn hir_star(greedy: bool, expr: Hir) -> Hir { |
1450 | | Hir::repetition(hir::Repetition { |
1451 | | min: 0, |
1452 | | max: None, |
1453 | | greedy, |
1454 | | sub: Box::new(expr), |
1455 | | }) |
1456 | | } |
1457 | | |
1458 | | fn hir_plus(greedy: bool, expr: Hir) -> Hir { |
1459 | | Hir::repetition(hir::Repetition { |
1460 | | min: 1, |
1461 | | max: None, |
1462 | | greedy, |
1463 | | sub: Box::new(expr), |
1464 | | }) |
1465 | | } |
1466 | | |
1467 | | fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir { |
1468 | | Hir::repetition(hir::Repetition { |
1469 | | min, |
1470 | | max, |
1471 | | greedy, |
1472 | | sub: Box::new(expr), |
1473 | | }) |
1474 | | } |
1475 | | |
1476 | | fn hir_alt(alts: Vec<Hir>) -> Hir { |
1477 | | Hir::alternation(alts) |
1478 | | } |
1479 | | |
1480 | | fn hir_cat(exprs: Vec<Hir>) -> Hir { |
1481 | | Hir::concat(exprs) |
1482 | | } |
1483 | | |
1484 | | #[allow(dead_code)] |
1485 | | fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { |
1486 | | Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) |
1487 | | } |
1488 | | |
1489 | | #[allow(dead_code)] |
1490 | | fn hir_uclass_perl_word() -> Hir { |
1491 | | Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) |
1492 | | } |
1493 | | |
1494 | | fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { |
1495 | | Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( |
1496 | | ascii_class_as_chars(kind) |
1497 | | .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), |
1498 | | ))) |
1499 | | } |
1500 | | |
1501 | | fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { |
1502 | | Hir::class(hir::Class::Bytes(hir::ClassBytes::new( |
1503 | | ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), |
1504 | | ))) |
1505 | | } |
1506 | | |
1507 | | fn hir_uclass(ranges: &[(char, char)]) -> Hir { |
1508 | | Hir::class(uclass(ranges)) |
1509 | | } |
1510 | | |
1511 | | fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { |
1512 | | Hir::class(bclass(ranges)) |
1513 | | } |
1514 | | |
1515 | | fn hir_case_fold(expr: Hir) -> Hir { |
1516 | | match expr.into_kind() { |
1517 | | HirKind::Class(mut cls) => { |
1518 | | cls.case_fold_simple(); |
1519 | | Hir::class(cls) |
1520 | | } |
1521 | | _ => panic!("cannot case fold non-class Hir expr"), |
1522 | | } |
1523 | | } |
1524 | | |
1525 | | fn hir_negate(expr: Hir) -> Hir { |
1526 | | match expr.into_kind() { |
1527 | | HirKind::Class(mut cls) => { |
1528 | | cls.negate(); |
1529 | | Hir::class(cls) |
1530 | | } |
1531 | | _ => panic!("cannot negate non-class Hir expr"), |
1532 | | } |
1533 | | } |
1534 | | |
1535 | | fn uclass(ranges: &[(char, char)]) -> hir::Class { |
1536 | | let ranges: Vec<hir::ClassUnicodeRange> = ranges |
1537 | | .iter() |
1538 | | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
1539 | | .collect(); |
1540 | | hir::Class::Unicode(hir::ClassUnicode::new(ranges)) |
1541 | | } |
1542 | | |
1543 | | fn bclass(ranges: &[(u8, u8)]) -> hir::Class { |
1544 | | let ranges: Vec<hir::ClassBytesRange> = ranges |
1545 | | .iter() |
1546 | | .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) |
1547 | | .collect(); |
1548 | | hir::Class::Bytes(hir::ClassBytes::new(ranges)) |
1549 | | } |
1550 | | |
1551 | | #[cfg(feature = "unicode-case")] |
1552 | | fn class_case_fold(mut cls: hir::Class) -> Hir { |
1553 | | cls.case_fold_simple(); |
1554 | | Hir::class(cls) |
1555 | | } |
1556 | | |
1557 | | fn class_negate(mut cls: hir::Class) -> Hir { |
1558 | | cls.negate(); |
1559 | | Hir::class(cls) |
1560 | | } |
1561 | | |
1562 | | #[allow(dead_code)] |
1563 | | fn hir_union(expr1: Hir, expr2: Hir) -> Hir { |
1564 | | use crate::hir::Class::{Bytes, Unicode}; |
1565 | | |
1566 | | match (expr1.into_kind(), expr2.into_kind()) { |
1567 | | (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { |
1568 | | c1.union(&c2); |
1569 | | Hir::class(hir::Class::Unicode(c1)) |
1570 | | } |
1571 | | (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { |
1572 | | c1.union(&c2); |
1573 | | Hir::class(hir::Class::Bytes(c1)) |
1574 | | } |
1575 | | _ => panic!("cannot union non-class Hir exprs"), |
1576 | | } |
1577 | | } |
1578 | | |
1579 | | #[allow(dead_code)] |
1580 | | fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { |
1581 | | use crate::hir::Class::{Bytes, Unicode}; |
1582 | | |
1583 | | match (expr1.into_kind(), expr2.into_kind()) { |
1584 | | (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { |
1585 | | c1.difference(&c2); |
1586 | | Hir::class(hir::Class::Unicode(c1)) |
1587 | | } |
1588 | | (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { |
1589 | | c1.difference(&c2); |
1590 | | Hir::class(hir::Class::Bytes(c1)) |
1591 | | } |
1592 | | _ => panic!("cannot difference non-class Hir exprs"), |
1593 | | } |
1594 | | } |
1595 | | |
1596 | | fn hir_look(look: hir::Look) -> Hir { |
1597 | | Hir::look(look) |
1598 | | } |
1599 | | |
1600 | | #[test] |
1601 | | fn empty() { |
1602 | | assert_eq!(t(""), Hir::empty()); |
1603 | | assert_eq!(t("(?i)"), Hir::empty()); |
1604 | | assert_eq!(t("()"), hir_capture(1, Hir::empty())); |
1605 | | assert_eq!(t("(?:)"), Hir::empty()); |
1606 | | assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty())); |
1607 | | assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); |
1608 | | assert_eq!( |
1609 | | t("()|()"), |
1610 | | hir_alt(vec![ |
1611 | | hir_capture(1, Hir::empty()), |
1612 | | hir_capture(2, Hir::empty()), |
1613 | | ]) |
1614 | | ); |
1615 | | assert_eq!( |
1616 | | t("(|b)"), |
1617 | | hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) |
1618 | | ); |
1619 | | assert_eq!( |
1620 | | t("(a|)"), |
1621 | | hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) |
1622 | | ); |
1623 | | assert_eq!( |
1624 | | t("(a||c)"), |
1625 | | hir_capture( |
1626 | | 1, |
1627 | | hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) |
1628 | | ) |
1629 | | ); |
1630 | | assert_eq!( |
1631 | | t("(||)"), |
1632 | | hir_capture( |
1633 | | 1, |
1634 | | hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) |
1635 | | ) |
1636 | | ); |
1637 | | } |
1638 | | |
1639 | | #[test] |
1640 | | fn literal() { |
1641 | | assert_eq!(t("a"), hir_lit("a")); |
1642 | | assert_eq!(t("(?-u)a"), hir_lit("a")); |
1643 | | assert_eq!(t("☃"), hir_lit("☃")); |
1644 | | assert_eq!(t("abcd"), hir_lit("abcd")); |
1645 | | |
1646 | | assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); |
1647 | | assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); |
1648 | | assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); |
1649 | | assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); |
1650 | | |
1651 | | assert_eq!(t("(?-u)☃"), hir_lit("☃")); |
1652 | | assert_eq!( |
1653 | | t_err(r"(?-u)\xFF"), |
1654 | | TestError { |
1655 | | kind: hir::ErrorKind::InvalidUtf8, |
1656 | | span: Span::new( |
1657 | | Position::new(5, 1, 6), |
1658 | | Position::new(9, 1, 10) |
1659 | | ), |
1660 | | } |
1661 | | ); |
1662 | | } |
1663 | | |
1664 | | #[test] |
1665 | | fn literal_case_insensitive() { |
1666 | | #[cfg(feature = "unicode-case")] |
1667 | | assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); |
1668 | | #[cfg(feature = "unicode-case")] |
1669 | | assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); |
1670 | | #[cfg(feature = "unicode-case")] |
1671 | | assert_eq!( |
1672 | | t("a(?i)a(?-i)a"), |
1673 | | hir_cat(vec![ |
1674 | | hir_lit("a"), |
1675 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1676 | | hir_lit("a"), |
1677 | | ]) |
1678 | | ); |
1679 | | #[cfg(feature = "unicode-case")] |
1680 | | assert_eq!( |
1681 | | t("(?i)ab@c"), |
1682 | | hir_cat(vec![ |
1683 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1684 | | hir_uclass(&[('B', 'B'), ('b', 'b')]), |
1685 | | hir_lit("@"), |
1686 | | hir_uclass(&[('C', 'C'), ('c', 'c')]), |
1687 | | ]) |
1688 | | ); |
1689 | | #[cfg(feature = "unicode-case")] |
1690 | | assert_eq!( |
1691 | | t("(?i)β"), |
1692 | | hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) |
1693 | | ); |
1694 | | |
1695 | | assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); |
1696 | | #[cfg(feature = "unicode-case")] |
1697 | | assert_eq!( |
1698 | | t("(?-u)a(?i)a(?-i)a"), |
1699 | | hir_cat(vec![ |
1700 | | hir_lit("a"), |
1701 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), |
1702 | | hir_lit("a"), |
1703 | | ]) |
1704 | | ); |
1705 | | assert_eq!( |
1706 | | t("(?i-u)ab@c"), |
1707 | | hir_cat(vec![ |
1708 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), |
1709 | | hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), |
1710 | | hir_lit("@"), |
1711 | | hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), |
1712 | | ]) |
1713 | | ); |
1714 | | |
1715 | | assert_eq!( |
1716 | | t_bytes("(?i-u)a"), |
1717 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) |
1718 | | ); |
1719 | | assert_eq!( |
1720 | | t_bytes("(?i-u)\x61"), |
1721 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) |
1722 | | ); |
1723 | | assert_eq!( |
1724 | | t_bytes(r"(?i-u)\x61"), |
1725 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) |
1726 | | ); |
1727 | | assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); |
1728 | | |
1729 | | assert_eq!(t("(?i-u)β"), hir_lit("β"),); |
1730 | | } |
1731 | | |
1732 | | #[test] |
1733 | | fn dot() { |
1734 | | assert_eq!( |
1735 | | t("."), |
1736 | | hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) |
1737 | | ); |
1738 | | assert_eq!( |
1739 | | t("(?R)."), |
1740 | | hir_uclass(&[ |
1741 | | ('\0', '\t'), |
1742 | | ('\x0B', '\x0C'), |
1743 | | ('\x0E', '\u{10FFFF}'), |
1744 | | ]) |
1745 | | ); |
1746 | | assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); |
1747 | | assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); |
1748 | | assert_eq!( |
1749 | | t_bytes("(?-u)."), |
1750 | | hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) |
1751 | | ); |
1752 | | assert_eq!( |
1753 | | t_bytes("(?R-u)."), |
1754 | | hir_bclass(&[ |
1755 | | (b'\0', b'\t'), |
1756 | | (b'\x0B', b'\x0C'), |
1757 | | (b'\x0E', b'\xFF'), |
1758 | | ]) |
1759 | | ); |
1760 | | assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); |
1761 | | assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); |
1762 | | |
1763 | | // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. |
1764 | | assert_eq!( |
1765 | | t_err("(?-u)."), |
1766 | | TestError { |
1767 | | kind: hir::ErrorKind::InvalidUtf8, |
1768 | | span: Span::new( |
1769 | | Position::new(5, 1, 6), |
1770 | | Position::new(6, 1, 7) |
1771 | | ), |
1772 | | } |
1773 | | ); |
1774 | | assert_eq!( |
1775 | | t_err("(?R-u)."), |
1776 | | TestError { |
1777 | | kind: hir::ErrorKind::InvalidUtf8, |
1778 | | span: Span::new( |
1779 | | Position::new(6, 1, 7), |
1780 | | Position::new(7, 1, 8) |
1781 | | ), |
1782 | | } |
1783 | | ); |
1784 | | assert_eq!( |
1785 | | t_err("(?s-u)."), |
1786 | | TestError { |
1787 | | kind: hir::ErrorKind::InvalidUtf8, |
1788 | | span: Span::new( |
1789 | | Position::new(6, 1, 7), |
1790 | | Position::new(7, 1, 8) |
1791 | | ), |
1792 | | } |
1793 | | ); |
1794 | | assert_eq!( |
1795 | | t_err("(?Rs-u)."), |
1796 | | TestError { |
1797 | | kind: hir::ErrorKind::InvalidUtf8, |
1798 | | span: Span::new( |
1799 | | Position::new(7, 1, 8), |
1800 | | Position::new(8, 1, 9) |
1801 | | ), |
1802 | | } |
1803 | | ); |
1804 | | } |
1805 | | |
1806 | | #[test] |
1807 | | fn assertions() { |
1808 | | assert_eq!(t("^"), hir_look(hir::Look::Start)); |
1809 | | assert_eq!(t("$"), hir_look(hir::Look::End)); |
1810 | | assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); |
1811 | | assert_eq!(t(r"\z"), hir_look(hir::Look::End)); |
1812 | | assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); |
1813 | | assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); |
1814 | | assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); |
1815 | | assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); |
1816 | | |
1817 | | assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); |
1818 | | assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); |
1819 | | assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); |
1820 | | assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); |
1821 | | } |
1822 | | |
1823 | | #[test] |
1824 | | fn group() { |
1825 | | assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); |
1826 | | assert_eq!( |
1827 | | t("(a)(b)"), |
1828 | | hir_cat(vec![ |
1829 | | hir_capture(1, hir_lit("a")), |
1830 | | hir_capture(2, hir_lit("b")), |
1831 | | ]) |
1832 | | ); |
1833 | | assert_eq!( |
1834 | | t("(a)|(b)"), |
1835 | | hir_alt(vec![ |
1836 | | hir_capture(1, hir_lit("a")), |
1837 | | hir_capture(2, hir_lit("b")), |
1838 | | ]) |
1839 | | ); |
1840 | | assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty())); |
1841 | | assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a"))); |
1842 | | assert_eq!( |
1843 | | t("(?P<foo>a)(?P<bar>b)"), |
1844 | | hir_cat(vec![ |
1845 | | hir_capture_name(1, "foo", hir_lit("a")), |
1846 | | hir_capture_name(2, "bar", hir_lit("b")), |
1847 | | ]) |
1848 | | ); |
1849 | | assert_eq!(t("(?:)"), Hir::empty()); |
1850 | | assert_eq!(t("(?:a)"), hir_lit("a")); |
1851 | | assert_eq!( |
1852 | | t("(?:a)(b)"), |
1853 | | hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) |
1854 | | ); |
1855 | | assert_eq!( |
1856 | | t("(a)(?:b)(c)"), |
1857 | | hir_cat(vec![ |
1858 | | hir_capture(1, hir_lit("a")), |
1859 | | hir_lit("b"), |
1860 | | hir_capture(2, hir_lit("c")), |
1861 | | ]) |
1862 | | ); |
1863 | | assert_eq!( |
1864 | | t("(a)(?P<foo>b)(c)"), |
1865 | | hir_cat(vec![ |
1866 | | hir_capture(1, hir_lit("a")), |
1867 | | hir_capture_name(2, "foo", hir_lit("b")), |
1868 | | hir_capture(3, hir_lit("c")), |
1869 | | ]) |
1870 | | ); |
1871 | | assert_eq!(t("()"), hir_capture(1, Hir::empty())); |
1872 | | assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); |
1873 | | assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); |
1874 | | assert_eq!( |
1875 | | t("(((?x)))"), |
1876 | | hir_capture(1, hir_capture(2, Hir::empty())) |
1877 | | ); |
1878 | | } |
1879 | | |
1880 | | #[test] |
1881 | | fn line_anchors() { |
1882 | | assert_eq!(t("^"), hir_look(hir::Look::Start)); |
1883 | | assert_eq!(t("$"), hir_look(hir::Look::End)); |
1884 | | assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); |
1885 | | assert_eq!(t(r"\z"), hir_look(hir::Look::End)); |
1886 | | |
1887 | | assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); |
1888 | | assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); |
1889 | | assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); |
1890 | | assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); |
1891 | | |
1892 | | assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); |
1893 | | assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); |
1894 | | assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); |
1895 | | assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); |
1896 | | |
1897 | | assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); |
1898 | | assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); |
1899 | | assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); |
1900 | | assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); |
1901 | | } |
1902 | | |
1903 | | #[test] |
1904 | | fn flags() { |
1905 | | #[cfg(feature = "unicode-case")] |
1906 | | assert_eq!( |
1907 | | t("(?i:a)a"), |
1908 | | hir_cat( |
1909 | | vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] |
1910 | | ) |
1911 | | ); |
1912 | | assert_eq!( |
1913 | | t("(?i-u:a)β"), |
1914 | | hir_cat(vec![ |
1915 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), |
1916 | | hir_lit("β"), |
1917 | | ]) |
1918 | | ); |
1919 | | assert_eq!( |
1920 | | t("(?:(?i-u)a)b"), |
1921 | | hir_cat(vec![ |
1922 | | hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), |
1923 | | hir_lit("b"), |
1924 | | ]) |
1925 | | ); |
1926 | | assert_eq!( |
1927 | | t("((?i-u)a)b"), |
1928 | | hir_cat(vec![ |
1929 | | hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), |
1930 | | hir_lit("b"), |
1931 | | ]) |
1932 | | ); |
1933 | | #[cfg(feature = "unicode-case")] |
1934 | | assert_eq!( |
1935 | | t("(?i)(?-i:a)a"), |
1936 | | hir_cat( |
1937 | | vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] |
1938 | | ) |
1939 | | ); |
1940 | | #[cfg(feature = "unicode-case")] |
1941 | | assert_eq!( |
1942 | | t("(?im)a^"), |
1943 | | hir_cat(vec![ |
1944 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1945 | | hir_look(hir::Look::StartLF), |
1946 | | ]) |
1947 | | ); |
1948 | | #[cfg(feature = "unicode-case")] |
1949 | | assert_eq!( |
1950 | | t("(?im)a^(?i-m)a^"), |
1951 | | hir_cat(vec![ |
1952 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1953 | | hir_look(hir::Look::StartLF), |
1954 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1955 | | hir_look(hir::Look::Start), |
1956 | | ]) |
1957 | | ); |
1958 | | assert_eq!( |
1959 | | t("(?U)a*a*?(?-U)a*a*?"), |
1960 | | hir_cat(vec![ |
1961 | | hir_star(false, hir_lit("a")), |
1962 | | hir_star(true, hir_lit("a")), |
1963 | | hir_star(true, hir_lit("a")), |
1964 | | hir_star(false, hir_lit("a")), |
1965 | | ]) |
1966 | | ); |
1967 | | #[cfg(feature = "unicode-case")] |
1968 | | assert_eq!( |
1969 | | t("(?:a(?i)a)a"), |
1970 | | hir_cat(vec![ |
1971 | | hir_cat(vec![ |
1972 | | hir_lit("a"), |
1973 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1974 | | ]), |
1975 | | hir_lit("a"), |
1976 | | ]) |
1977 | | ); |
1978 | | #[cfg(feature = "unicode-case")] |
1979 | | assert_eq!( |
1980 | | t("(?i)(?:a(?-i)a)a"), |
1981 | | hir_cat(vec![ |
1982 | | hir_cat(vec![ |
1983 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1984 | | hir_lit("a"), |
1985 | | ]), |
1986 | | hir_uclass(&[('A', 'A'), ('a', 'a')]), |
1987 | | ]) |
1988 | | ); |
1989 | | } |
1990 | | |
1991 | | #[test] |
1992 | | fn escape() { |
1993 | | assert_eq!( |
1994 | | t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), |
1995 | | hir_lit(r"\.+*?()|[]{}^$#") |
1996 | | ); |
1997 | | } |
1998 | | |
1999 | | #[test] |
2000 | | fn repetition() { |
2001 | | assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); |
2002 | | assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); |
2003 | | assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); |
2004 | | assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); |
2005 | | assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); |
2006 | | assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); |
2007 | | |
2008 | | assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),)); |
2009 | | assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); |
2010 | | assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); |
2011 | | assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); |
2012 | | assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); |
2013 | | assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); |
2014 | | |
2015 | | assert_eq!( |
2016 | | t("ab?"), |
2017 | | hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) |
2018 | | ); |
2019 | | assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab")))); |
2020 | | assert_eq!( |
2021 | | t("a|b?"), |
2022 | | hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) |
2023 | | ); |
2024 | | } |
2025 | | |
2026 | | #[test] |
2027 | | fn cat_alt() { |
2028 | | let a = || hir_look(hir::Look::Start); |
2029 | | let b = || hir_look(hir::Look::End); |
2030 | | let c = || hir_look(hir::Look::WordUnicode); |
2031 | | let d = || hir_look(hir::Look::WordUnicodeNegate); |
2032 | | |
2033 | | assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()]))); |
2034 | | assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); |
2035 | | assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); |
2036 | | assert_eq!( |
2037 | | t(r"^$|$\b|\b\B"), |
2038 | | hir_alt(vec![ |
2039 | | hir_cat(vec![a(), b()]), |
2040 | | hir_cat(vec![b(), c()]), |
2041 | | hir_cat(vec![c(), d()]), |
2042 | | ]) |
2043 | | ); |
2044 | | assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()]))); |
2045 | | assert_eq!( |
2046 | | t(r"(^|$|\b)"), |
2047 | | hir_capture(1, hir_alt(vec![a(), b(), c()])) |
2048 | | ); |
2049 | | assert_eq!( |
2050 | | t(r"(^$|$\b|\b\B)"), |
2051 | | hir_capture( |
2052 | | 1, |
2053 | | hir_alt(vec![ |
2054 | | hir_cat(vec![a(), b()]), |
2055 | | hir_cat(vec![b(), c()]), |
2056 | | hir_cat(vec![c(), d()]), |
2057 | | ]) |
2058 | | ) |
2059 | | ); |
2060 | | assert_eq!( |
2061 | | t(r"(^$|($\b|(\b\B)))"), |
2062 | | hir_capture( |
2063 | | 1, |
2064 | | hir_alt(vec![ |
2065 | | hir_cat(vec![a(), b()]), |
2066 | | hir_capture( |
2067 | | 2, |
2068 | | hir_alt(vec![ |
2069 | | hir_cat(vec![b(), c()]), |
2070 | | hir_capture(3, hir_cat(vec![c(), d()])), |
2071 | | ]) |
2072 | | ), |
2073 | | ]) |
2074 | | ) |
2075 | | ); |
2076 | | } |
2077 | | |
2078 | | // Tests the HIR transformation of things like '[a-z]|[A-Z]' into |
2079 | | // '[A-Za-z]'. In other words, an alternation of just classes is always |
2080 | | // equivalent to a single class corresponding to the union of the branches |
2081 | | // in that class. (Unless some branches match invalid UTF-8 and others |
2082 | | // match non-ASCII Unicode.) |
2083 | | #[test] |
2084 | | fn cat_class_flattened() { |
2085 | | assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); |
2086 | | // Combining all of the letter properties should give us the one giant |
2087 | | // letter property. |
2088 | | #[cfg(feature = "unicode-gencat")] |
2089 | | assert_eq!( |
2090 | | t(r"(?x) |
2091 | | \p{Lowercase_Letter} |
2092 | | |\p{Uppercase_Letter} |
2093 | | |\p{Titlecase_Letter} |
2094 | | |\p{Modifier_Letter} |
2095 | | |\p{Other_Letter} |
2096 | | "), |
2097 | | hir_uclass_query(ClassQuery::Binary("letter")) |
2098 | | ); |
2099 | | // Byte classes that can truly match invalid UTF-8 cannot be combined |
2100 | | // with Unicode classes. |
2101 | | assert_eq!( |
2102 | | t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), |
2103 | | hir_alt(vec![ |
2104 | | hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), |
2105 | | hir_bclass(&[(b'\x90', b'\xFF')]), |
2106 | | hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), |
2107 | | ]) |
2108 | | ); |
2109 | | // Byte classes on their own can be combined, even if some are ASCII |
2110 | | // and others are invalid UTF-8. |
2111 | | assert_eq!( |
2112 | | t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), |
2113 | | hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), |
2114 | | ); |
2115 | | } |
2116 | | |
2117 | | #[test] |
2118 | | fn class_ascii() { |
2119 | | assert_eq!( |
2120 | | t("[[:alnum:]]"), |
2121 | | hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) |
2122 | | ); |
2123 | | assert_eq!( |
2124 | | t("[[:alpha:]]"), |
2125 | | hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) |
2126 | | ); |
2127 | | assert_eq!( |
2128 | | t("[[:ascii:]]"), |
2129 | | hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) |
2130 | | ); |
2131 | | assert_eq!( |
2132 | | t("[[:blank:]]"), |
2133 | | hir_ascii_uclass(&ast::ClassAsciiKind::Blank) |
2134 | | ); |
2135 | | assert_eq!( |
2136 | | t("[[:cntrl:]]"), |
2137 | | hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) |
2138 | | ); |
2139 | | assert_eq!( |
2140 | | t("[[:digit:]]"), |
2141 | | hir_ascii_uclass(&ast::ClassAsciiKind::Digit) |
2142 | | ); |
2143 | | assert_eq!( |
2144 | | t("[[:graph:]]"), |
2145 | | hir_ascii_uclass(&ast::ClassAsciiKind::Graph) |
2146 | | ); |
2147 | | assert_eq!( |
2148 | | t("[[:lower:]]"), |
2149 | | hir_ascii_uclass(&ast::ClassAsciiKind::Lower) |
2150 | | ); |
2151 | | assert_eq!( |
2152 | | t("[[:print:]]"), |
2153 | | hir_ascii_uclass(&ast::ClassAsciiKind::Print) |
2154 | | ); |
2155 | | assert_eq!( |
2156 | | t("[[:punct:]]"), |
2157 | | hir_ascii_uclass(&ast::ClassAsciiKind::Punct) |
2158 | | ); |
2159 | | assert_eq!( |
2160 | | t("[[:space:]]"), |
2161 | | hir_ascii_uclass(&ast::ClassAsciiKind::Space) |
2162 | | ); |
2163 | | assert_eq!( |
2164 | | t("[[:upper:]]"), |
2165 | | hir_ascii_uclass(&ast::ClassAsciiKind::Upper) |
2166 | | ); |
2167 | | assert_eq!( |
2168 | | t("[[:word:]]"), |
2169 | | hir_ascii_uclass(&ast::ClassAsciiKind::Word) |
2170 | | ); |
2171 | | assert_eq!( |
2172 | | t("[[:xdigit:]]"), |
2173 | | hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) |
2174 | | ); |
2175 | | |
2176 | | assert_eq!( |
2177 | | t("[[:^lower:]]"), |
2178 | | hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) |
2179 | | ); |
2180 | | #[cfg(feature = "unicode-case")] |
2181 | | assert_eq!( |
2182 | | t("(?i)[[:lower:]]"), |
2183 | | hir_uclass(&[ |
2184 | | ('A', 'Z'), |
2185 | | ('a', 'z'), |
2186 | | ('\u{17F}', '\u{17F}'), |
2187 | | ('\u{212A}', '\u{212A}'), |
2188 | | ]) |
2189 | | ); |
2190 | | |
2191 | | assert_eq!( |
2192 | | t("(?-u)[[:lower:]]"), |
2193 | | hir_ascii_bclass(&ast::ClassAsciiKind::Lower) |
2194 | | ); |
2195 | | assert_eq!( |
2196 | | t("(?i-u)[[:lower:]]"), |
2197 | | hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) |
2198 | | ); |
2199 | | |
2200 | | assert_eq!( |
2201 | | t_err("(?-u)[[:^lower:]]"), |
2202 | | TestError { |
2203 | | kind: hir::ErrorKind::InvalidUtf8, |
2204 | | span: Span::new( |
2205 | | Position::new(6, 1, 7), |
2206 | | Position::new(16, 1, 17) |
2207 | | ), |
2208 | | } |
2209 | | ); |
2210 | | assert_eq!( |
2211 | | t_err("(?i-u)[[:^lower:]]"), |
2212 | | TestError { |
2213 | | kind: hir::ErrorKind::InvalidUtf8, |
2214 | | span: Span::new( |
2215 | | Position::new(7, 1, 8), |
2216 | | Position::new(17, 1, 18) |
2217 | | ), |
2218 | | } |
2219 | | ); |
2220 | | } |
2221 | | |
2222 | | #[test] |
2223 | | fn class_ascii_multiple() { |
2224 | | // See: https://github.com/rust-lang/regex/issues/680 |
2225 | | assert_eq!( |
2226 | | t("[[:alnum:][:^ascii:]]"), |
2227 | | hir_union( |
2228 | | hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), |
2229 | | hir_uclass(&[('\u{80}', '\u{10FFFF}')]), |
2230 | | ), |
2231 | | ); |
2232 | | assert_eq!( |
2233 | | t_bytes("(?-u)[[:alnum:][:^ascii:]]"), |
2234 | | hir_union( |
2235 | | hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), |
2236 | | hir_bclass(&[(0x80, 0xFF)]), |
2237 | | ), |
2238 | | ); |
2239 | | } |
2240 | | |
2241 | | #[test] |
2242 | | #[cfg(feature = "unicode-perl")] |
2243 | | fn class_perl_unicode() { |
2244 | | // Unicode |
2245 | | assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); |
2246 | | assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); |
2247 | | assert_eq!(t(r"\w"), hir_uclass_perl_word()); |
2248 | | #[cfg(feature = "unicode-case")] |
2249 | | assert_eq!( |
2250 | | t(r"(?i)\d"), |
2251 | | hir_uclass_query(ClassQuery::Binary("digit")) |
2252 | | ); |
2253 | | #[cfg(feature = "unicode-case")] |
2254 | | assert_eq!( |
2255 | | t(r"(?i)\s"), |
2256 | | hir_uclass_query(ClassQuery::Binary("space")) |
2257 | | ); |
2258 | | #[cfg(feature = "unicode-case")] |
2259 | | assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); |
2260 | | |
2261 | | // Unicode, negated |
2262 | | assert_eq!( |
2263 | | t(r"\D"), |
2264 | | hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) |
2265 | | ); |
2266 | | assert_eq!( |
2267 | | t(r"\S"), |
2268 | | hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) |
2269 | | ); |
2270 | | assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); |
2271 | | #[cfg(feature = "unicode-case")] |
2272 | | assert_eq!( |
2273 | | t(r"(?i)\D"), |
2274 | | hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) |
2275 | | ); |
2276 | | #[cfg(feature = "unicode-case")] |
2277 | | assert_eq!( |
2278 | | t(r"(?i)\S"), |
2279 | | hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) |
2280 | | ); |
2281 | | #[cfg(feature = "unicode-case")] |
2282 | | assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); |
2283 | | } |
2284 | | |
2285 | | #[test] |
2286 | | fn class_perl_ascii() { |
2287 | | // ASCII only |
2288 | | assert_eq!( |
2289 | | t(r"(?-u)\d"), |
2290 | | hir_ascii_bclass(&ast::ClassAsciiKind::Digit) |
2291 | | ); |
2292 | | assert_eq!( |
2293 | | t(r"(?-u)\s"), |
2294 | | hir_ascii_bclass(&ast::ClassAsciiKind::Space) |
2295 | | ); |
2296 | | assert_eq!( |
2297 | | t(r"(?-u)\w"), |
2298 | | hir_ascii_bclass(&ast::ClassAsciiKind::Word) |
2299 | | ); |
2300 | | assert_eq!( |
2301 | | t(r"(?i-u)\d"), |
2302 | | hir_ascii_bclass(&ast::ClassAsciiKind::Digit) |
2303 | | ); |
2304 | | assert_eq!( |
2305 | | t(r"(?i-u)\s"), |
2306 | | hir_ascii_bclass(&ast::ClassAsciiKind::Space) |
2307 | | ); |
2308 | | assert_eq!( |
2309 | | t(r"(?i-u)\w"), |
2310 | | hir_ascii_bclass(&ast::ClassAsciiKind::Word) |
2311 | | ); |
2312 | | |
2313 | | // ASCII only, negated |
2314 | | assert_eq!( |
2315 | | t_bytes(r"(?-u)\D"), |
2316 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) |
2317 | | ); |
2318 | | assert_eq!( |
2319 | | t_bytes(r"(?-u)\S"), |
2320 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) |
2321 | | ); |
2322 | | assert_eq!( |
2323 | | t_bytes(r"(?-u)\W"), |
2324 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) |
2325 | | ); |
2326 | | assert_eq!( |
2327 | | t_bytes(r"(?i-u)\D"), |
2328 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) |
2329 | | ); |
2330 | | assert_eq!( |
2331 | | t_bytes(r"(?i-u)\S"), |
2332 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) |
2333 | | ); |
2334 | | assert_eq!( |
2335 | | t_bytes(r"(?i-u)\W"), |
2336 | | hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) |
2337 | | ); |
2338 | | |
2339 | | // ASCII only, negated, with UTF-8 mode enabled. |
2340 | | // In this case, negating any Perl class results in an error because |
2341 | | // all such classes can match invalid UTF-8. |
2342 | | assert_eq!( |
2343 | | t_err(r"(?-u)\D"), |
2344 | | TestError { |
2345 | | kind: hir::ErrorKind::InvalidUtf8, |
2346 | | span: Span::new( |
2347 | | Position::new(5, 1, 6), |
2348 | | Position::new(7, 1, 8), |
2349 | | ), |
2350 | | }, |
2351 | | ); |
2352 | | assert_eq!( |
2353 | | t_err(r"(?-u)\S"), |
2354 | | TestError { |
2355 | | kind: hir::ErrorKind::InvalidUtf8, |
2356 | | span: Span::new( |
2357 | | Position::new(5, 1, 6), |
2358 | | Position::new(7, 1, 8), |
2359 | | ), |
2360 | | }, |
2361 | | ); |
2362 | | assert_eq!( |
2363 | | t_err(r"(?-u)\W"), |
2364 | | TestError { |
2365 | | kind: hir::ErrorKind::InvalidUtf8, |
2366 | | span: Span::new( |
2367 | | Position::new(5, 1, 6), |
2368 | | Position::new(7, 1, 8), |
2369 | | ), |
2370 | | }, |
2371 | | ); |
2372 | | assert_eq!( |
2373 | | t_err(r"(?i-u)\D"), |
2374 | | TestError { |
2375 | | kind: hir::ErrorKind::InvalidUtf8, |
2376 | | span: Span::new( |
2377 | | Position::new(6, 1, 7), |
2378 | | Position::new(8, 1, 9), |
2379 | | ), |
2380 | | }, |
2381 | | ); |
2382 | | assert_eq!( |
2383 | | t_err(r"(?i-u)\S"), |
2384 | | TestError { |
2385 | | kind: hir::ErrorKind::InvalidUtf8, |
2386 | | span: Span::new( |
2387 | | Position::new(6, 1, 7), |
2388 | | Position::new(8, 1, 9), |
2389 | | ), |
2390 | | }, |
2391 | | ); |
2392 | | assert_eq!( |
2393 | | t_err(r"(?i-u)\W"), |
2394 | | TestError { |
2395 | | kind: hir::ErrorKind::InvalidUtf8, |
2396 | | span: Span::new( |
2397 | | Position::new(6, 1, 7), |
2398 | | Position::new(8, 1, 9), |
2399 | | ), |
2400 | | }, |
2401 | | ); |
2402 | | } |
2403 | | |
2404 | | #[test] |
2405 | | #[cfg(not(feature = "unicode-perl"))] |
2406 | | fn class_perl_word_disabled() { |
2407 | | assert_eq!( |
2408 | | t_err(r"\w"), |
2409 | | TestError { |
2410 | | kind: hir::ErrorKind::UnicodePerlClassNotFound, |
2411 | | span: Span::new( |
2412 | | Position::new(0, 1, 1), |
2413 | | Position::new(2, 1, 3) |
2414 | | ), |
2415 | | } |
2416 | | ); |
2417 | | } |
2418 | | |
2419 | | #[test] |
2420 | | #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] |
2421 | | fn class_perl_space_disabled() { |
2422 | | assert_eq!( |
2423 | | t_err(r"\s"), |
2424 | | TestError { |
2425 | | kind: hir::ErrorKind::UnicodePerlClassNotFound, |
2426 | | span: Span::new( |
2427 | | Position::new(0, 1, 1), |
2428 | | Position::new(2, 1, 3) |
2429 | | ), |
2430 | | } |
2431 | | ); |
2432 | | } |
2433 | | |
2434 | | #[test] |
2435 | | #[cfg(all( |
2436 | | not(feature = "unicode-perl"), |
2437 | | not(feature = "unicode-gencat") |
2438 | | ))] |
2439 | | fn class_perl_digit_disabled() { |
2440 | | assert_eq!( |
2441 | | t_err(r"\d"), |
2442 | | TestError { |
2443 | | kind: hir::ErrorKind::UnicodePerlClassNotFound, |
2444 | | span: Span::new( |
2445 | | Position::new(0, 1, 1), |
2446 | | Position::new(2, 1, 3) |
2447 | | ), |
2448 | | } |
2449 | | ); |
2450 | | } |
2451 | | |
2452 | | #[test] |
2453 | | #[cfg(feature = "unicode-gencat")] |
2454 | | fn class_unicode_gencat() { |
2455 | | assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); |
2456 | | assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); |
2457 | | assert_eq!( |
2458 | | t(r"\p{Separator}"), |
2459 | | hir_uclass_query(ClassQuery::Binary("Z")) |
2460 | | ); |
2461 | | assert_eq!( |
2462 | | t(r"\p{se PaRa ToR}"), |
2463 | | hir_uclass_query(ClassQuery::Binary("Z")) |
2464 | | ); |
2465 | | assert_eq!( |
2466 | | t(r"\p{gc:Separator}"), |
2467 | | hir_uclass_query(ClassQuery::Binary("Z")) |
2468 | | ); |
2469 | | assert_eq!( |
2470 | | t(r"\p{gc=Separator}"), |
2471 | | hir_uclass_query(ClassQuery::Binary("Z")) |
2472 | | ); |
2473 | | assert_eq!( |
2474 | | t(r"\p{Other}"), |
2475 | | hir_uclass_query(ClassQuery::Binary("Other")) |
2476 | | ); |
2477 | | assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); |
2478 | | |
2479 | | assert_eq!( |
2480 | | t(r"\PZ"), |
2481 | | hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) |
2482 | | ); |
2483 | | assert_eq!( |
2484 | | t(r"\P{separator}"), |
2485 | | hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) |
2486 | | ); |
2487 | | assert_eq!( |
2488 | | t(r"\P{gc!=separator}"), |
2489 | | hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) |
2490 | | ); |
2491 | | |
2492 | | assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); |
2493 | | assert_eq!( |
2494 | | t(r"\p{assigned}"), |
2495 | | hir_uclass_query(ClassQuery::Binary("Assigned")) |
2496 | | ); |
2497 | | assert_eq!( |
2498 | | t(r"\p{ascii}"), |
2499 | | hir_uclass_query(ClassQuery::Binary("ASCII")) |
2500 | | ); |
2501 | | assert_eq!( |
2502 | | t(r"\p{gc:any}"), |
2503 | | hir_uclass_query(ClassQuery::Binary("Any")) |
2504 | | ); |
2505 | | assert_eq!( |
2506 | | t(r"\p{gc:assigned}"), |
2507 | | hir_uclass_query(ClassQuery::Binary("Assigned")) |
2508 | | ); |
2509 | | assert_eq!( |
2510 | | t(r"\p{gc:ascii}"), |
2511 | | hir_uclass_query(ClassQuery::Binary("ASCII")) |
2512 | | ); |
2513 | | |
2514 | | assert_eq!( |
2515 | | t_err(r"(?-u)\pZ"), |
2516 | | TestError { |
2517 | | kind: hir::ErrorKind::UnicodeNotAllowed, |
2518 | | span: Span::new( |
2519 | | Position::new(5, 1, 6), |
2520 | | Position::new(8, 1, 9) |
2521 | | ), |
2522 | | } |
2523 | | ); |
2524 | | assert_eq!( |
2525 | | t_err(r"(?-u)\p{Separator}"), |
2526 | | TestError { |
2527 | | kind: hir::ErrorKind::UnicodeNotAllowed, |
2528 | | span: Span::new( |
2529 | | Position::new(5, 1, 6), |
2530 | | Position::new(18, 1, 19) |
2531 | | ), |
2532 | | } |
2533 | | ); |
2534 | | assert_eq!( |
2535 | | t_err(r"\pE"), |
2536 | | TestError { |
2537 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2538 | | span: Span::new( |
2539 | | Position::new(0, 1, 1), |
2540 | | Position::new(3, 1, 4) |
2541 | | ), |
2542 | | } |
2543 | | ); |
2544 | | assert_eq!( |
2545 | | t_err(r"\p{Foo}"), |
2546 | | TestError { |
2547 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2548 | | span: Span::new( |
2549 | | Position::new(0, 1, 1), |
2550 | | Position::new(7, 1, 8) |
2551 | | ), |
2552 | | } |
2553 | | ); |
2554 | | assert_eq!( |
2555 | | t_err(r"\p{gc:Foo}"), |
2556 | | TestError { |
2557 | | kind: hir::ErrorKind::UnicodePropertyValueNotFound, |
2558 | | span: Span::new( |
2559 | | Position::new(0, 1, 1), |
2560 | | Position::new(10, 1, 11) |
2561 | | ), |
2562 | | } |
2563 | | ); |
2564 | | } |
2565 | | |
2566 | | #[test] |
2567 | | #[cfg(not(feature = "unicode-gencat"))] |
2568 | | fn class_unicode_gencat_disabled() { |
2569 | | assert_eq!( |
2570 | | t_err(r"\p{Separator}"), |
2571 | | TestError { |
2572 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2573 | | span: Span::new( |
2574 | | Position::new(0, 1, 1), |
2575 | | Position::new(13, 1, 14) |
2576 | | ), |
2577 | | } |
2578 | | ); |
2579 | | |
2580 | | assert_eq!( |
2581 | | t_err(r"\p{Any}"), |
2582 | | TestError { |
2583 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2584 | | span: Span::new( |
2585 | | Position::new(0, 1, 1), |
2586 | | Position::new(7, 1, 8) |
2587 | | ), |
2588 | | } |
2589 | | ); |
2590 | | } |
2591 | | |
2592 | | #[test] |
2593 | | #[cfg(feature = "unicode-script")] |
2594 | | fn class_unicode_script() { |
2595 | | assert_eq!( |
2596 | | t(r"\p{Greek}"), |
2597 | | hir_uclass_query(ClassQuery::Binary("Greek")) |
2598 | | ); |
2599 | | #[cfg(feature = "unicode-case")] |
2600 | | assert_eq!( |
2601 | | t(r"(?i)\p{Greek}"), |
2602 | | hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) |
2603 | | ); |
2604 | | #[cfg(feature = "unicode-case")] |
2605 | | assert_eq!( |
2606 | | t(r"(?i)\P{Greek}"), |
2607 | | hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( |
2608 | | "Greek" |
2609 | | )))) |
2610 | | ); |
2611 | | |
2612 | | assert_eq!( |
2613 | | t_err(r"\p{sc:Foo}"), |
2614 | | TestError { |
2615 | | kind: hir::ErrorKind::UnicodePropertyValueNotFound, |
2616 | | span: Span::new( |
2617 | | Position::new(0, 1, 1), |
2618 | | Position::new(10, 1, 11) |
2619 | | ), |
2620 | | } |
2621 | | ); |
2622 | | assert_eq!( |
2623 | | t_err(r"\p{scx:Foo}"), |
2624 | | TestError { |
2625 | | kind: hir::ErrorKind::UnicodePropertyValueNotFound, |
2626 | | span: Span::new( |
2627 | | Position::new(0, 1, 1), |
2628 | | Position::new(11, 1, 12) |
2629 | | ), |
2630 | | } |
2631 | | ); |
2632 | | } |
2633 | | |
2634 | | #[test] |
2635 | | #[cfg(not(feature = "unicode-script"))] |
2636 | | fn class_unicode_script_disabled() { |
2637 | | assert_eq!( |
2638 | | t_err(r"\p{Greek}"), |
2639 | | TestError { |
2640 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2641 | | span: Span::new( |
2642 | | Position::new(0, 1, 1), |
2643 | | Position::new(9, 1, 10) |
2644 | | ), |
2645 | | } |
2646 | | ); |
2647 | | |
2648 | | assert_eq!( |
2649 | | t_err(r"\p{scx:Greek}"), |
2650 | | TestError { |
2651 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2652 | | span: Span::new( |
2653 | | Position::new(0, 1, 1), |
2654 | | Position::new(13, 1, 14) |
2655 | | ), |
2656 | | } |
2657 | | ); |
2658 | | } |
2659 | | |
2660 | | #[test] |
2661 | | #[cfg(feature = "unicode-age")] |
2662 | | fn class_unicode_age() { |
2663 | | assert_eq!( |
2664 | | t_err(r"\p{age:Foo}"), |
2665 | | TestError { |
2666 | | kind: hir::ErrorKind::UnicodePropertyValueNotFound, |
2667 | | span: Span::new( |
2668 | | Position::new(0, 1, 1), |
2669 | | Position::new(11, 1, 12) |
2670 | | ), |
2671 | | } |
2672 | | ); |
2673 | | } |
2674 | | |
2675 | | #[test] |
2676 | | #[cfg(feature = "unicode-gencat")] |
2677 | | fn class_unicode_any_empty() { |
2678 | | assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); |
2679 | | } |
2680 | | |
2681 | | #[test] |
2682 | | #[cfg(not(feature = "unicode-age"))] |
2683 | | fn class_unicode_age_disabled() { |
2684 | | assert_eq!( |
2685 | | t_err(r"\p{age:3.0}"), |
2686 | | TestError { |
2687 | | kind: hir::ErrorKind::UnicodePropertyNotFound, |
2688 | | span: Span::new( |
2689 | | Position::new(0, 1, 1), |
2690 | | Position::new(11, 1, 12) |
2691 | | ), |
2692 | | } |
2693 | | ); |
2694 | | } |
2695 | | |
// Covers translation of bracketed character classes: plain items and ranges,
// escapes, Perl and Unicode property items, negation, case insensitivity,
// byte classes under `(?-u)`, and a handful of punctuation edge cases.
//
// Many assertions are gated on Cargo features because their expected values
// are built from Unicode tables that are only present with those features.
#[test]
fn class_bracketed() {
    // A class holding exactly one character is expected to equal the
    // corresponding literal rather than a class.
    assert_eq!(t("[a]"), hir_lit("a"));
    assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
    assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
    // Overlapping (`a-f` + `d-h`) and adjacent (`a-f` + `g-m`) ranges are
    // expected to come out as one merged range.
    assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
    assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
    assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
    // The regex escape `\n` (raw string) and an actual newline character
    // (non-raw string) must give the same class.
    assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
    assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
    // Perl classes and Unicode property items used as class members.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\p{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    // Double negation: a negated class containing a negated item must equal
    // the plain, un-negated item.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\PZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\P{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    // Double negation combined with `(?i)`. The digit expectation is the
    // plain query, while the Greek expectation goes through `hir_case_fold`.
    #[cfg(all(
        feature = "unicode-case",
        any(feature = "unicode-perl", feature = "unicode-gencat")
    ))]
    assert_eq!(
        t(r"(?i)[^\D]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\P{greek}]"),
        hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
    );

    // With Unicode mode disabled the result is a byte class. `\xFF` is only
    // translated through `t_bytes`.
    assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
    assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
    assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));

    // Case-insensitive classes. In Unicode mode `k` and `β` each pick up a
    // third, non-ASCII member; in byte mode `k` only gains `K`.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[k]"),
        hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[β]"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );
    assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

    // Negated classes.
    assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
    assert_eq!(
        t_bytes("(?-u)[^a]"),
        class_negate(bclass(&[(b'a', b'a')]))
    );
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(
        t(r"[^\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\pZ]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\p{separator}]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    // Negation together with `(?i)`: the expected value is built by case
    // folding first and negating second.
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\p{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[\P{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );

    // Test some weird cases.
    assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));

    // `&`, `~` and `-` double as set-operator characters, so each is checked
    // bare, escaped, escaped twice, and as either endpoint of a range.
    assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
    assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));

    assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
    assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));

    assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
    assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));

    // Through `t_err`, a negated byte class is rejected with `InvalidUtf8`.
    // The span (offsets 5 through 9) covers `[^a]` but not the `(?-u)` flag
    // group in front of it.
    assert_eq!(
        t_err("(?-u)[^a]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(9, 1, 10)
            ),
        }
    );
    // Negating `\s` together with `\S` must leave an empty class, in both
    // Unicode and byte mode.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
}
2835 | | |
// Covers implicit unions inside a bracketed class, i.e. several items written
// side by side. Expected values for property items are assembled with
// `hir_union` from individual `hir_uclass_query` results.
//
// NOTE(review): the expected unions name the operands in a different order
// than the patterns do (e.g. "greek" before "separator" for `[\pZ\p{Greek}]`);
// presumably operand order does not affect the resulting class -- confirm
// against `hir_union`.
#[test]
fn class_bracketed_union() {
    assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    // Literal items on both sides of a property item.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[a\pZb]"),
        hir_union(
            hir_uclass(&[('a', 'b')]),
            hir_uclass_query(ClassQuery::Binary("separator"))
        )
    );
    // Two property items.
    #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
    assert_eq!(
        t(r"[\pZ\p{Greek}]"),
        hir_union(
            hir_uclass_query(ClassQuery::Binary("greek")),
            hir_uclass_query(ClassQuery::Binary("separator"))
        )
    );
    // Three property items, one of them a `name:value` query.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[\p{age:3.0}\pZ\p{Greek}]"),
        hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        )
    );
    // The same kind of union spread over nested bracketed classes.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
        hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("cyrillic")),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            )
        )
    );

    // With `(?i)`: the expected value is the case fold of the whole union.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-case",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
        hir_case_fold(hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        ))
    );
    // Negated: the expected value is the negation of the whole union.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        ))
    );
    // Negated and case-insensitive: the expected value folds first and
    // negates second.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-case",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(hir_case_fold(hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        )))
    );
}
2952 | | |
// Covers a negated class nested inside another class, with and without an
// outer negation and with and without `(?i)`.
#[test]
fn class_bracketed_nested() {
    // The two expectations that recur throughout the test.
    let everything_but_c = || class_negate(uclass(&[('c', 'c')]));
    let only_c = || hir_uclass(&[('c', 'c')]);

    // Items that `[^c]` already contains add nothing to the union.
    for pattern in [r"[a[^c]]", r"[a-b[^c]]"] {
        assert_eq!(t(pattern), everything_but_c(), "pattern: {:?}", pattern);
    }
    // Once `c` itself is part of the union, nothing is excluded any more.
    assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));

    // Negating the outer class leaves just `c`.
    for pattern in [r"[^a[^c]]", r"[^a-b[^c]]"] {
        assert_eq!(t(pattern), only_c(), "pattern: {:?}", pattern);
    }

    #[cfg(feature = "unicode-case")]
    {
        let everything_but_either_c =
            || hir_negate(class_case_fold(uclass(&[('c', 'c')])));
        for pattern in [r"(?i)[a[^c]]", r"(?i)[a-b[^c]]"] {
            assert_eq!(
                t(pattern),
                everything_but_either_c(),
                "pattern: {:?}",
                pattern
            );
        }

        let either_c = || hir_uclass(&[('C', 'C'), ('c', 'c')]);
        for pattern in [r"(?i)[^a[^c]]", r"(?i)[^a-b[^c]]"] {
            assert_eq!(t(pattern), either_c(), "pattern: {:?}", pattern);
        }
    }

    // The outer negation of a union that covers everything is empty.
    assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]));
}
2985 | | |
// Covers the `&&` intersection operator inside bracketed classes, in Unicode
// mode, byte mode, and both case-insensitive variants, followed by a few
// syntactic corner cases and a precedence check.
#[test]
fn class_bracketed_intersect() {
    // Unicode mode: every pattern must reduce to the single listed range.
    let unicode_cases: &[(&str, (char, char))] = &[
        ("[abc&&b-c]", ('b', 'c')),
        ("[abc&&[b-c]]", ('b', 'c')),
        ("[[abc]&&[b-c]]", ('b', 'c')),
        ("[a-z&&b-y&&c-x]", ('c', 'x')),
        ("[c-da-b&&a-d]", ('a', 'd')),
        ("[a-d&&c-da-b]", ('a', 'd')),
        (r"[a-z&&a-c]", ('a', 'c')),
        (r"[[a-z&&a-c]]", ('a', 'c')),
    ];
    for &(pattern, range) in unicode_cases {
        assert_eq!(t(pattern), hir_uclass(&[range]), "pattern: {:?}", pattern);
    }
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

    // Byte mode.
    let byte_cases: &[(&str, (u8, u8))] = &[
        ("(?-u)[abc&&b-c]", (b'b', b'c')),
        ("(?-u)[abc&&[b-c]]", (b'b', b'c')),
        ("(?-u)[[abc]&&[b-c]]", (b'b', b'c')),
        ("(?-u)[a-z&&b-y&&c-x]", (b'c', b'x')),
        ("(?-u)[c-da-b&&a-d]", (b'a', b'd')),
        ("(?-u)[a-d&&c-da-b]", (b'a', b'd')),
    ];
    for &(pattern, range) in byte_cases {
        assert_eq!(t(pattern), hir_bclass(&[range]), "pattern: {:?}", pattern);
    }

    // Case-insensitive Unicode mode: the expectation is the case fold of the
    // intersected range.
    #[cfg(feature = "unicode-case")]
    {
        let folded_unicode_cases: &[(&str, (char, char))] = &[
            ("(?i)[abc&&b-c]", ('b', 'c')),
            ("(?i)[abc&&[b-c]]", ('b', 'c')),
            ("(?i)[[abc]&&[b-c]]", ('b', 'c')),
            ("(?i)[a-z&&b-y&&c-x]", ('c', 'x')),
            ("(?i)[c-da-b&&a-d]", ('a', 'd')),
            ("(?i)[a-d&&c-da-b]", ('a', 'd')),
        ];
        for &(pattern, range) in folded_unicode_cases {
            assert_eq!(
                t(pattern),
                hir_case_fold(hir_uclass(&[range])),
                "pattern: {:?}",
                pattern
            );
        }
    }

    // Case-insensitive byte mode.
    let folded_byte_cases: &[(&str, (u8, u8))] = &[
        ("(?i-u)[abc&&b-c]", (b'b', b'c')),
        ("(?i-u)[abc&&[b-c]]", (b'b', b'c')),
        ("(?i-u)[[abc]&&[b-c]]", (b'b', b'c')),
        ("(?i-u)[a-z&&b-y&&c-x]", (b'c', b'x')),
        ("(?i-u)[c-da-b&&a-d]", (b'a', b'd')),
        ("(?i-u)[a-d&&c-da-b]", (b'a', b'd')),
    ];
    for &(pattern, range) in folded_byte_cases {
        assert_eq!(
            t(pattern),
            hir_case_fold(hir_bclass(&[range])),
            "pattern: {:?}",
            pattern
        );
    }

    // Punctuation around `&&`, in order:
    // - In `[a^]`, `^` does not need to be escaped, so it makes sense that
    //   `^` is also allowed to be unescaped after `&&`.
    // - `]` needs to be escaped after `&&` since it's not at start of class.
    // - `-` and `&` next to the operator are taken literally.
    let single_char_cases: &[(&str, char)] = &[
        (r"[\^&&^]", '^'),
        (r"[]&&\]]", ']'),
        (r"[-&&-]", '-'),
        (r"[\&&&&]", '&'),
        (r"[\&&&\&]", '&'),
    ];
    for &(pattern, ch) in single_char_cases {
        assert_eq!(
            t(pattern),
            hir_uclass(&[(ch, ch)]),
            "pattern: {:?}",
            pattern
        );
    }

    // Test precedence.
    let a_to_w_without_c_to_g = hir_uclass(&[('a', 'b'), ('h', 'w')]);
    assert_eq!(t(r"[a-w&&[^c-g]z]"), a_to_w_without_c_to_g);
}
3075 | | |
// Covers intersections combined with negation, where the negation sits on
// the outer class, on a nested class, or on both. Unicode mode comes first,
// then the byte-mode counterparts via `t_bytes`.
#[test]
fn class_bracketed_intersect_negate() {
    // Expected building blocks shared by several assertions below.
    #[cfg(feature = "unicode-perl")]
    let unicode_digit = || hir_uclass_query(ClassQuery::Binary("digit"));
    let ascii_digit = || hir_ascii_bclass(&ast::ClassAsciiKind::Digit);

    #[cfg(feature = "unicode-perl")]
    assert_eq!(t(r"[^\w&&\d]"), hir_negate(unicode_digit()));
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
    #[cfg(feature = "unicode-perl")]
    {
        assert_eq!(t(r"[^[\w&&\d]]"), hir_negate(unicode_digit()));
        // Negating twice gives back the plain intersection.
        assert_eq!(t(r"[^[^\w&&\d]]"), unicode_digit());
        assert_eq!(
            t(r"[[[^\w]&&[^\d]]]"),
            hir_negate(hir_uclass_perl_word())
        );

        assert_eq!(t_bytes(r"(?-u)[^\w&&\d]"), hir_negate(ascii_digit()));
    }

    let a_to_c_bytes = hir_bclass(&[(b'a', b'c')]);
    assert_eq!(t_bytes(r"(?-u)[^[a-z&&a-c]]"), hir_negate(a_to_c_bytes));
    assert_eq!(t_bytes(r"(?-u)[^[\w&&\d]]"), hir_negate(ascii_digit()));
    assert_eq!(t_bytes(r"(?-u)[^[^\w&&\d]]"), ascii_digit());
    assert_eq!(
        t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
    );
}
3119 | | |
// Covers the `--` difference operator inside bracketed classes, once in
// Unicode mode and once in byte mode.
#[test]
fn class_bracketed_difference() {
    #[cfg(feature = "unicode-gencat")]
    {
        let translated = t(r"[\pL--[:ascii:]]");
        let letters = hir_uclass_query(ClassQuery::Binary("letter"));
        let ascii = hir_uclass(&[('\0', '\x7F')]);
        assert_eq!(translated, hir_difference(letters, ascii));
    }

    // `[:alpha:]` minus `[:lower:]` must leave exactly `A-Z`.
    let translated = t(r"(?-u)[[:alpha:]--[:lower:]]");
    let upper_case = hir_bclass(&[(b'A', b'Z')]);
    assert_eq!(translated, upper_case);
}
3136 | | |
// Covers the `~~` symmetric-difference operator inside bracketed classes.
#[test]
fn class_bracketed_symmetric_difference() {
    #[cfg(feature = "unicode-script")]
    {
        // The ranges expected for `sc:Greek ~~ scx:Greek`. The three
        // non-escaped entries are U+00B7, U+0374 and U+205D.
        let expected_ranges: &[(char, char)] = &[
            ('·', '·'),
            ('\u{0300}', '\u{0301}'),
            ('\u{0304}', '\u{0304}'),
            ('\u{0306}', '\u{0306}'),
            ('\u{0308}', '\u{0308}'),
            ('\u{0313}', '\u{0313}'),
            ('\u{0342}', '\u{0342}'),
            ('\u{0345}', '\u{0345}'),
            ('ʹ', 'ʹ'),
            ('\u{1DC0}', '\u{1DC1}'),
            ('⁝', '⁝'),
        ];
        let translated = t(r"[\p{sc:Greek}~~\p{scx:Greek}]");
        assert_eq!(translated, hir_uclass(expected_ranges));
    }

    // `a-g ~~ c-j` keeps what lies in exactly one operand: `a-b` and `h-j`.
    let translated = t(r"[a-g~~c-j]");
    assert_eq!(translated, hir_uclass(&[('a', 'b'), ('h', 'j')]));

    let translated = t(r"(?-u)[a-g~~c-j]");
    assert_eq!(translated, hir_bclass(&[(b'a', b'b'), (b'h', b'j')]));
}
3176 | | |
// Covers translation under the `(?x)` flag: each pattern below contains
// spaces, line breaks and/or `# ...` comments, and must translate to the same
// HIR as the compact form named in the expectation.
#[test]
fn ignore_whitespace() {
    // The expectation "\n3" shows `\12` being read as a newline here, with
    // the space before `3` dropped.
    assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
    // Braced hex escape with blanks, then with line breaks and comments,
    // between its parts.
    assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
{ # comment
53 # comment
} #comment"),
        hir_lit("S")
    );

    // Two-digit hex escape with blanks/comments before or between the digits.
    assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
53 # comment"),
        hir_lit("S")
    );
    assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

    // Unicode property item split across lines.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)\p # comment
{ # comment
Separator # comment
} # comment"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );

    // Counted repetition `{5,10}` split across lines.
    assert_eq!(
        t(r"(?x)a # comment
{ # comment
5 # comment
, # comment
10 # comment
} # comment"),
        hir_range(true, 5, Some(10), hir_lit("a"))
    );

    // A backslash-escaped space is kept as a literal space, while the
    // unescaped space and the comment after it are dropped.
    assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
}
3218 | | |
// Checks the `is_utf8` property on patterns translated with `props_bytes`.
#[test]
fn analysis_is_utf8() {
    // Positive examples.
    let utf8_patterns = [
        r"a",
        r"ab",
        r"(?-u)a",
        r"(?-u)ab",
        r"\xFF",
        r"\xFF\xFF",
        r"[^a]",
        r"[^a][^a]",
        r"\b",
        r"\B",
        r"(?-u)\b",
        r"(?-u)\B",
    ];
    for pattern in utf8_patterns {
        assert!(props_bytes(pattern).is_utf8(), "pattern: {:?}", pattern);
    }

    // Negative examples: the same escapes and negated classes, but with
    // Unicode mode switched off.
    let non_utf8_patterns = [
        r"(?-u)\xFF",
        r"(?-u)\xFF\xFF",
        r"(?-u)[^a]",
        r"(?-u)[^a][^a]",
    ];
    for pattern in non_utf8_patterns {
        assert!(!props_bytes(pattern).is_utf8(), "pattern: {:?}", pattern);
    }
}
3241 | | |
// Checks `explicit_captures_len` against a table of
// (expected count, pattern) pairs.
#[test]
fn analysis_captures_len() {
    let cases: &[(usize, &str)] = &[
        (0, r"a"),
        (0, r"(?:a)"),
        (0, r"(?i-u:a)"),
        (0, r"(?i-u)a"),
        (1, r"(a)"),
        (1, r"(?P<foo>a)"),
        (1, r"()"),
        (1, r"()a"),
        (1, r"(a)+"),
        (2, r"(a)(b)"),
        (2, r"(a)|(b)"),
        (2, r"((a))"),
        (1, r"([a&&b])"),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(
            expected,
            props(pattern).explicit_captures_len(),
            "pattern: {:?}",
            pattern
        );
    }
}
3258 | | |
// Checks `static_explicit_captures_len` against a table of
// (expected value, pattern) pairs. `None` is expected wherever the cases
// below have a capture count that differs between alternatives or depends on
// an optional repetition.
#[test]
fn analysis_static_captures_len() {
    let cases: &[(Option<usize>, &str)] = &[
        (Some(0), r""),
        (Some(0), r"foo|bar"),
        (None, r"(foo)|bar"),
        (None, r"foo|(bar)"),
        (Some(1), r"(foo|bar)"),
        (Some(1), r"(a|b|c|d|e|f)"),
        (Some(1), r"(a)|(b)|(c)|(d)|(e)|(f)"),
        (Some(2), r"(a)(b)|(c)(d)|(e)(f)"),
        (Some(6), r"(a)(b)(c)(d)(e)(f)"),
        (Some(3), r"(a)(b)(extra)|(a)(b)()"),
        (Some(3), r"(a)(b)((?:extra)?)"),
        (None, r"(a)(b)(extra)?"),
        (Some(1), r"(foo)|(bar)"),
        (Some(2), r"(foo)(bar)"),
        (Some(2), r"(foo)+(bar)"),
        (None, r"(foo)*(bar)"),
        (Some(0), r"(foo)?{0}"),
        (None, r"(foo)?{1}"),
        (Some(1), r"(foo){1}"),
        (Some(1), r"(foo){1,}"),
        (Some(1), r"(foo){1,}?"),
        (None, r"(foo){1,}??"),
        (None, r"(foo){0,}"),
        (Some(1), r"(foo)(?:bar)"),
        (Some(2), r"(foo(?:bar)+)(?:baz(boo))"),
        (Some(2), r"(?P<bar>foo)(?:bar)(bal|loon)"),
        (
            Some(2),
            r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#,
        ),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(
            expected,
            props(pattern).static_explicit_captures_len(),
            "pattern: {:?}",
            pattern
        );
    }
}
3293 | | |
// Checks that patterns made up only of look-around assertions report a
// non-empty look set together with a minimum length of zero.
#[test]
fn analysis_is_all_assertions() {
    // Positive examples.
    let assertion_only = [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ];
    for pattern in assertion_only {
        let analysis = props(pattern);
        assert!(!analysis.look_set().is_empty(), "pattern: {:?}", pattern);
        assert_eq!(
            analysis.minimum_len(),
            Some(0),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples: an assertion is still present, but the literal
    // raises the minimum length to one.
    let analysis = props(r"^a");
    assert!(!analysis.look_set().is_empty());
    assert_eq!(analysis.minimum_len(), Some(1));
}
3342 | | |
// For a pattern that starts with the alternation `\b|_`, the
// `look_set_prefix_any` set must still contain `Look::WordAscii`.
#[test]
fn analysis_look_set_prefix_any() {
    let pattern = r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))";
    let prefix_looks = props(pattern).look_set_prefix_any();
    assert!(prefix_looks.contains(Look::WordAscii));
}
3348 | | |
// Checks the anchoring information exposed by `look_set_prefix` and
// `look_set_suffix`. A pattern is treated as start-anchored when its prefix
// look set contains `Look::Start`, and as end-anchored when its suffix look
// set contains `Look::End`.
//
// Positive examples come in start/end pairs so that both directions are
// exercised for every pattern shape.
#[test]
fn analysis_is_anchored() {
    let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
    let is_end = |p| props(p).look_set_suffix().contains(Look::End);

    // Positive examples.
    assert!(is_start(r"^"));
    assert!(is_end(r"$"));

    assert!(is_start(r"^^"));
    assert!(is_end(r"$$"));

    assert!(is_start(r"^$"));
    assert!(is_end(r"^$"));

    assert!(is_start(r"^foo"));
    assert!(is_end(r"foo$"));

    // Every alternative must be anchored.
    assert!(is_start(r"^foo|^bar"));
    assert!(is_end(r"foo$|bar$"));

    assert!(is_start(r"^(foo|bar)"));
    assert!(is_end(r"(foo|bar)$"));

    // Repetitions that must match at least once keep the anchor.
    assert!(is_start(r"^+"));
    assert!(is_end(r"$+"));
    assert!(is_start(r"^++"));
    assert!(is_end(r"$++"));
    assert!(is_start(r"(^)+"));
    assert!(is_end(r"($)+"));

    // `$^` must be anchored on both sides. The end check follows from the
    // `is_end(r"$^|^$")` assertion below, which could not hold if the `$^`
    // alternative were not end-anchored.
    assert!(is_start(r"$^"));
    assert!(is_end(r"$^"));
    assert!(is_start(r"$^|^$"));
    assert!(is_end(r"$^|^$"));

    // Other zero-width assertions next to the anchor do not hide it, and a
    // multi-line anchor next to a plain one does not weaken it.
    assert!(is_start(r"\b^"));
    assert!(is_end(r"$\b"));
    assert!(is_start(r"^(?m:^)"));
    assert!(is_end(r"(?m:$)$"));
    assert!(is_start(r"(?m:^)^"));
    assert!(is_end(r"$(?m:$)"));

    // Negative examples.
    // Multi-line anchors alone do not count.
    assert!(!is_start(r"(?m)^"));
    assert!(!is_end(r"(?m)$"));
    assert!(!is_start(r"(?m:^$)|$^"));
    assert!(!is_end(r"(?m:^$)|$^"));
    assert!(!is_start(r"$^|(?m:^$)"));
    assert!(!is_end(r"$^|(?m:^$)"));

    // A literal between the anchor and the relevant edge of the pattern.
    assert!(!is_start(r"a^"));
    assert!(!is_start(r"$a"));

    assert!(!is_end(r"a^"));
    assert!(!is_end(r"$a"));

    // Only one of the alternatives is anchored.
    assert!(!is_start(r"^foo|bar"));
    assert!(!is_end(r"foo|bar$"));

    // Repetitions that may match zero times lose the anchor.
    assert!(!is_start(r"^*"));
    assert!(!is_end(r"$*"));
    assert!(!is_start(r"^*+"));
    assert!(!is_end(r"$*+"));
    assert!(!is_start(r"^+*"));
    assert!(!is_end(r"$+*"));
    assert!(!is_start(r"(^)*"));
    assert!(!is_end(r"($)*"));
}
3418 | | |
// Checks whether `Look::Start` / `Look::End` show up in the overall
// `look_set` of a pattern.
#[test]
fn analysis_is_any_anchored() {
    let has_look =
        |pattern: &str, look: Look| props(pattern).look_set().contains(look);

    // Positive examples.
    let present =
        [(r"^", Look::Start), (r"$", Look::End), (r"\A", Look::Start), (r"\z", Look::End)];
    for (pattern, look) in present {
        assert!(has_look(pattern, look), "pattern: {:?}", pattern);
    }

    // Negative examples: multi-line anchors, and the opposite anchor.
    let absent = [
        (r"(?m)^", Look::Start),
        (r"(?m)$", Look::End),
        (r"$", Look::Start),
        (r"^", Look::End),
    ];
    for (pattern, look) in absent {
        assert!(!has_look(pattern, look), "pattern: {:?}", pattern);
    }
}
3436 | | |
// Checks which patterns can match the empty string, expressed as
// `minimum_len() == Some(0)` on patterns translated with `props_bytes`.
#[test]
fn analysis_can_empty() {
    let min_len = |pattern: &str| props_bytes(pattern).minimum_len();

    // Positive examples.
    let empty_before_property = [
        r"", r"()", r"()*", r"()+", r"()?", r"a*", r"a?", r"a{0}", r"a{0,}",
        r"a{0,1}", r"a{0,10}",
    ];
    for pattern in empty_before_property {
        assert_eq!(Some(0), min_len(pattern), "pattern: {:?}", pattern);
    }
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(Some(0), min_len(r"\pL*"));
    let empty_after_property = [
        r"a*|b",
        r"b|a*",
        r"a|",
        r"|a",
        r"a||b",
        r"a*a?(abcd)*",
        r"^",
        r"$",
        r"(?m)^",
        r"(?m)$",
        r"\A",
        r"\z",
        r"\B",
        r"(?-u)\B",
        r"\b",
        r"(?-u)\b",
    ];
    for pattern in empty_after_property {
        assert_eq!(Some(0), min_len(pattern), "pattern: {:?}", pattern);
    }

    // Negative examples.
    let non_empty_before_property = [
        r"a+",
        r"a{1}",
        r"a{1,}",
        r"a{1,2}",
        r"a{1,10}",
        r"b|a",
        r"a*a+(abcd)*",
    ];
    for pattern in non_empty_before_property {
        assert_ne!(Some(0), min_len(pattern), "pattern: {:?}", pattern);
    }
    #[cfg(feature = "unicode-gencat")]
    assert_ne!(Some(0), min_len(r"\P{any}"));
    // Classes written so that no character remains in them.
    for pattern in [r"[a--a]", r"[a&&b]"] {
        assert_ne!(Some(0), min_len(pattern), "pattern: {:?}", pattern);
    }
}
3487 | | |
// Checks the `is_literal` property.
#[test]
fn analysis_is_literal() {
    // Positive examples: plain literals, optionally wrapped in non-capturing
    // groups, plus a one-character class.
    let literal_patterns = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"(?:a)",
        r"foo(?:a)",
        r"(?:a)foo",
        r"[a]",
    ];
    for pattern in literal_patterns {
        assert!(props(pattern).is_literal(), "pattern: {:?}", pattern);
    }

    // Negative examples: the empty pattern, assertions, alternations,
    // capture groups, repetitions and multi-character classes.
    let non_literal_patterns = [
        r"",
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
    ];
    for pattern in non_literal_patterns {
        assert!(!props(pattern).is_literal(), "pattern: {:?}", pattern);
    }
}
3510 | | |
// Checks the `is_alternation_literal` property.
#[test]
fn analysis_is_alternation_literal() {
    // Positive examples.
    let accepted = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"foo|bar",
        r"foo|bar|baz",
        r"[a]",
        r"(?:ab)|cd",
        r"ab|(?:cd)",
    ];
    for pattern in accepted {
        assert!(
            props(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples. Alternations of single characters such as `a|b`
    // are listed here too.
    let rejected = [
        r"",
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
        r"[ab]|b",
        r"a|[ab]",
        r"(a)|b",
        r"a|(b)",
        r"a|b",
        r"a|b|c",
        r"[a]|b",
        r"a|[b]",
        r"(?:a)|b",
        r"a|(?:b)",
        r"(?:z|xx)@|xx",
    ];
    for pattern in rejected {
        assert!(
            !props(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }
}
3544 | | |
// This tests that the smart Hir::repetition constructor does some basic
// simplifications: `{0}` collapses to the empty expression, `{1}` to the
// repeated expression itself, and a counted repetition of a look-around
// assertion to the bare assertion.
#[test]
fn smart_repetition() {
    let translated = t(r"a{0}");
    assert_eq!(translated, Hir::empty());

    let translated = t(r"a{1}");
    assert_eq!(translated, hir_lit("a"));

    let translated = t(r"\B{32111}");
    let bare_assertion = hir_look(hir::Look::WordUnicodeNegate);
    assert_eq!(translated, bare_assertion);
}
3553 | | |
3554 | | // This tests that the smart Hir::concat constructor simplifies the given
3555 | | // exprs in a way we expect: empty pieces vanish and adjacent literals merge.
3556 | | #[test]
3557 | | fn smart_concat() {
3558 | | assert_eq!(t(""), Hir::empty()); // empty pattern: expected to be the empty HIR
3559 | | assert_eq!(t("(?:)"), Hir::empty()); // empty non-capturing group: also the empty HIR
3560 | | assert_eq!(t("abc"), hir_lit("abc")); // expected to equal a single hir_lit
3561 | | assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); // literals in adjacent groups are expected to merge
3562 | | assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); // merging also applies to literals outside the groups
3563 | | assert_eq!( // literals are expected to merge across the group boundary, but not across the `^` assertion
3564 | | t("foo(?:bar^baz)quux"),
3565 | | hir_cat(vec![
3566 | | hir_lit("foobar"),
3567 | | hir_look(hir::Look::Start),
3568 | | hir_lit("bazquux"),
3569 | | ])
3570 | | );
3571 | | assert_eq!( // same expected result when the group is nested one level deeper
3572 | | t("foo(?:ba(?:r^b)az)quux"),
3573 | | hir_cat(vec![
3574 | | hir_lit("foobar"),
3575 | | hir_look(hir::Look::Start),
3576 | | hir_lit("bazquux"),
3577 | | ])
3578 | | );
3579 | | }
3580 | | |
3581 | | // This tests that the smart Hir::alternation constructor simplifies the
3582 | | // given exprs in a way we expect.
3583 | | #[test]
3584 | | fn smart_alternation() {
3585 | | assert_eq!( // non-capturing groups are expected to disappear, leaving a two-way alternation
3586 | | t("(?:foo)|(?:bar)"),
3587 | | hir_alt(vec![hir_lit("foo"), hir_lit("bar")])
3588 | | );
3589 | | assert_eq!( // a nested alternation is expected to be flattened into its parent
3590 | | t("quux|(?:abc|def|xyz)|baz"),
3591 | | hir_alt(vec![
3592 | | hir_lit("quux"),
3593 | | hir_lit("abc"),
3594 | | hir_lit("def"),
3595 | | hir_lit("xyz"),
3596 | | hir_lit("baz"),
3597 | | ])
3598 | | );
3599 | | assert_eq!( // flattening is expected to apply through two levels of nesting
3600 | | t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
3601 | | hir_alt(vec![
3602 | | hir_lit("quux"),
3603 | | hir_lit("abc"),
3604 | | hir_lit("def"),
3605 | | hir_lit("mno"),
3606 | | hir_lit("xyz"),
3607 | | hir_lit("baz"),
3608 | | ])
3609 | | );
3610 | | assert_eq!( // single-character branches are expected to become one class with ranges a-f and x-z
3611 | | t("a|b|c|d|e|f|x|y|z"),
3612 | | hir_uclass(&[('a', 'f'), ('x', 'z')]),
3613 | | );
3614 | | // Tests that we lift common prefixes out of an alternation.
3615 | | assert_eq!( // the shared leading class is expected to be factored out in front of the alternation
3616 | | t("[A-Z]foo|[A-Z]quux"),
3617 | | hir_cat(vec![
3618 | | hir_uclass(&[('A', 'Z')]),
3619 | | hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
3620 | | ]),
3621 | | );
3622 | | assert_eq!( // only the first class is shared; the second class stays inside the alternation
3623 | | t("[A-Z][A-Z]|[A-Z]quux"),
3624 | | hir_cat(vec![
3625 | | hir_uclass(&[('A', 'Z')]),
3626 | | hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]),
3627 | | ]),
3628 | | );
3629 | | assert_eq!( // both classes are shared, so the first branch is expected to be left empty
3630 | | t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
3631 | | hir_cat(vec![
3632 | | hir_uclass(&[('A', 'Z')]),
3633 | | hir_uclass(&[('A', 'Z')]),
3634 | | hir_alt(vec![Hir::empty(), hir_lit("quux")]),
3635 | | ]),
3636 | | );
3637 | | assert_eq!( // only the class is expected to be lifted; the common "foo" text stays in both branches
3638 | | t("[A-Z]foo|[A-Z]foobar"),
3639 | | hir_cat(vec![
3640 | | hir_uclass(&[('A', 'Z')]),
3641 | | hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
3642 | | ]),
3643 | | );
3644 | | }
3645 | | |
3646 | | #[test]
3647 | | fn regression_alt_empty_concat() { // an alternation whose only branch is an empty concat must translate to the empty HIR
3648 | | use crate::ast::{self, Ast};
3649 | |
3650 | | let span = Span::splat(Position::new(0, 0, 0)); // placeholder span reused for every hand-built node
3651 | | let ast = Ast::alternation(ast::Alternation { // AST is constructed directly instead of being parsed from a pattern
3652 | | span,
3653 | | asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], // single branch: a concat with no children
3654 | | });
3655 | |
3656 | | let mut t = Translator::new();
3657 | | assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); // translated with an empty pattern string
3658 | | }
3659 | | |
3660 | | #[test]
3661 | | fn regression_empty_alt() { // a concat wrapping an alternation with zero branches must translate to Hir::fail()
3662 | | use crate::ast::{self, Ast};
3663 | |
3664 | | let span = Span::splat(Position::new(0, 0, 0)); // placeholder span reused for every hand-built node
3665 | | let ast = Ast::concat(ast::Concat { // AST is constructed directly instead of being parsed from a pattern
3666 | | span,
3667 | | asts: vec![Ast::alternation(ast::Alternation {
3668 | | span,
3669 | | asts: vec![], // no branches at all
3670 | | })],
3671 | | });
3672 | |
3673 | | let mut t = Translator::new();
3674 | | assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); // translated with an empty pattern string
3675 | | }
3676 | | |
3677 | | #[test]
3678 | | fn regression_singleton_alt() { // a concat wrapping a one-branch alternation must translate to just that branch
3679 | | use crate::{
3680 | | ast::{self, Ast},
3681 | | hir::Dot,
3682 | | };
3683 | |
3684 | | let span = Span::splat(Position::new(0, 0, 0)); // placeholder span reused for every hand-built node
3685 | | let ast = Ast::concat(ast::Concat { // AST is constructed directly instead of being parsed from a pattern
3686 | | span,
3687 | | asts: vec![Ast::alternation(ast::Alternation {
3688 | | span,
3689 | | asts: vec![Ast::dot(span)], // the only branch is a `.`
3690 | | })],
3691 | | });
3692 | |
3693 | | let mut t = Translator::new();
3694 | | assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); // expected: the dot alone, with no alternation or concat wrapper
3695 | | }
3696 | | |
3697 | | // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
3698 | | #[test]
3699 | | fn regression_fuzz_match() {
3700 | | let pat = "[(\u{6} \0-\u{afdf5}] \0 "; // not a raw string: the escapes are real control characters in the pattern
3701 | | let ast = ParserBuilder::new() // parser is configured explicitly: octal off, ignore_whitespace on
3702 | | .octal(false)
3703 | | .ignore_whitespace(true)
3704 | | .build()
3705 | | .parse(pat)
3706 | | .unwrap();
3707 | | let hir = TranslatorBuilder::new() // translator is configured explicitly as well; only swap_greed and unicode are switched on besides utf8
3708 | | .utf8(true)
3709 | | .case_insensitive(false)
3710 | | .multi_line(false)
3711 | | .dot_matches_new_line(false)
3712 | | .swap_greed(true)
3713 | | .unicode(true)
3714 | | .build()
3715 | | .translate(pat, &ast)
3716 | | .unwrap();
3717 | | assert_eq!( // expected: one class covering \0-\u{afdf5} followed by a NUL literal
3718 | | hir,
3719 | | Hir::concat(vec![
3720 | | hir_uclass(&[('\0', '\u{afdf5}')]),
3721 | | hir_lit("\0"),
3722 | | ])
3723 | | );
3724 | | }
3725 | | |
3726 | | // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
3727 | | #[cfg(feature = "unicode")] // only compiled when the `unicode` feature is enabled
3728 | | #[test]
3729 | | fn regression_fuzz_difference1() {
3730 | | let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; // fuzzer-derived pattern, kept verbatim
3731 | | let _ = t(pat); // shouldn't panic; the return value is deliberately ignored
3732 | | }
3733 | | |
3734 | | // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
3735 | | #[test]
3736 | | fn regression_fuzz_char_decrement1() {
3737 | | let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 "; // fuzzer-derived pattern, kept verbatim; not a raw string, so the escapes are real control characters
3738 | | let _ = t(pat); // shouldn't panic; the return value is deliberately ignored
3739 | | }
3740 | | } |