/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.8.6/src/hir/print.rs
Line | Count | Source |
1 | | /*! |
2 | | This module provides a regular expression printer for `Hir`. |
3 | | */ |
4 | | |
5 | | use core::fmt; |
6 | | |
7 | | use crate::{ |
8 | | hir::{ |
9 | | self, |
10 | | visitor::{self, Visitor}, |
11 | | Hir, HirKind, |
12 | | }, |
13 | | is_meta_character, |
14 | | }; |
15 | | |
16 | | /// A builder for constructing a printer. |
17 | | /// |
18 | | /// Note that since a printer doesn't have any configuration knobs, this type |
19 | | /// remains unexported. |
20 | | #[derive(Clone, Debug)] |
21 | | struct PrinterBuilder { |
22 | | _priv: (), |
23 | | } |
24 | | |
25 | | impl Default for PrinterBuilder { |
26 | 0 | fn default() -> PrinterBuilder { |
27 | 0 | PrinterBuilder::new() |
28 | 0 | } |
29 | | } |
30 | | |
31 | | impl PrinterBuilder { |
32 | 0 | fn new() -> PrinterBuilder { |
33 | 0 | PrinterBuilder { _priv: () } |
34 | 0 | } |
35 | | |
36 | 0 | fn build(&self) -> Printer { |
37 | 0 | Printer { _priv: () } |
38 | 0 | } |
39 | | } |
40 | | |
41 | | /// A printer for a regular expression's high-level intermediate |
42 | | /// representation. |
43 | | /// |
44 | | /// A printer converts a high-level intermediate representation (HIR) to a |
45 | | /// regular expression pattern string. This particular printer uses constant |
46 | | /// stack space and heap space proportional to the size of the HIR. |
47 | | /// |
48 | | /// Since this printer is only using the HIR, the pattern it prints will likely |
49 | | /// not resemble the original pattern at all. For example, a pattern like |
50 | | /// `\pL` will have its entire class written out. |
51 | | /// |
52 | | /// The purpose of this printer is to provide a means to mutate an HIR and then |
53 | | /// build a regular expression from the result of that mutation. (A regex |
54 | | /// library could provide a constructor from this HIR explicitly, but that |
55 | | /// creates an unnecessary public coupling between the regex library and this |
56 | | /// specific HIR representation.) |
57 | | #[derive(Debug)] |
58 | | pub struct Printer { |
59 | | _priv: (), |
60 | | } |
61 | | |
62 | | impl Printer { |
63 | | /// Create a new printer. |
64 | 0 | pub fn new() -> Printer { |
65 | 0 | PrinterBuilder::new().build() |
66 | 0 | } |
67 | | |
68 | | /// Print the given `Hir` to the given writer. The writer must implement |
69 | | /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used |
70 | | /// here are a `fmt::Formatter` (which is available in `fmt::Display` |
71 | | /// implementations) or a `&mut String`. |
72 | 0 | pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { |
73 | 0 | visitor::visit(hir, Writer { wtr }) |
74 | 0 | } |
75 | | } |
76 | | |
77 | | #[derive(Debug)] |
78 | | struct Writer<W> { |
79 | | wtr: W, |
80 | | } |
81 | | |
82 | | impl<W: fmt::Write> Visitor for Writer<W> { |
83 | | type Output = (); |
84 | | type Err = fmt::Error; |
85 | | |
86 | 0 | fn finish(self) -> fmt::Result { |
87 | 0 | Ok(()) |
88 | 0 | } |
89 | | |
90 | 0 | fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { |
91 | 0 | match *hir.kind() { |
92 | | HirKind::Empty => { |
93 | | // Technically an empty sub-expression could be "printed" by |
94 | | // just ignoring it, but in practice, you could have a |
95 | | // repetition operator attached to an empty expression, and you |
96 | | // really need something in the concrete syntax to make that |
97 | | // work as you'd expect. |
98 | 0 | self.wtr.write_str(r"(?:)")?; |
99 | | } |
100 | | // Repetition operators are strictly suffix oriented. |
101 | 0 | HirKind::Repetition(_) => {} |
102 | 0 | HirKind::Literal(hir::Literal(ref bytes)) => { |
103 | | // See the comment on the 'Concat' and 'Alternation' case below |
104 | | // for why we put parens here. Literals are, conceptually, |
105 | | // a special case of concatenation where each element is a |
106 | | // character. The HIR flattens this into a Box<[u8]>, but we |
107 | | // still need to treat it like a concatenation for correct |
108 | | // printing. As a special case, we don't write parens if there |
109 | | // is only one character. One character means there is no |
110 | | // concat so we don't need parens. Adding parens would still be |
111 | | // correct, but we drop them here because it tends to create |
112 | | // rather noisy regexes even in simple cases. |
113 | 0 | let result = core::str::from_utf8(bytes); |
114 | 0 | let len = result.map_or(bytes.len(), |s| s.chars().count()); |
115 | 0 | if len > 1 { |
116 | 0 | self.wtr.write_str(r"(?:")?; |
117 | 0 | } |
118 | 0 | match result { |
119 | 0 | Ok(string) => { |
120 | 0 | for c in string.chars() { |
121 | 0 | self.write_literal_char(c)?; |
122 | | } |
123 | | } |
124 | | Err(_) => { |
125 | 0 | for &b in bytes.iter() { |
126 | 0 | self.write_literal_byte(b)?; |
127 | | } |
128 | | } |
129 | | } |
130 | 0 | if len > 1 { |
131 | 0 | self.wtr.write_str(r")")?; |
132 | 0 | } |
133 | | } |
134 | 0 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
135 | 0 | if cls.ranges().is_empty() { |
136 | 0 | return self.wtr.write_str("[a&&b]"); |
137 | 0 | } |
138 | 0 | self.wtr.write_str("[")?; |
139 | 0 | for range in cls.iter() { |
140 | 0 | if range.start() == range.end() { |
141 | 0 | self.write_literal_char(range.start())?; |
142 | 0 | } else if u32::from(range.start()) + 1 |
143 | 0 | == u32::from(range.end()) |
144 | | { |
145 | 0 | self.write_literal_char(range.start())?; |
146 | 0 | self.write_literal_char(range.end())?; |
147 | | } else { |
148 | 0 | self.write_literal_char(range.start())?; |
149 | 0 | self.wtr.write_str("-")?; |
150 | 0 | self.write_literal_char(range.end())?; |
151 | | } |
152 | | } |
153 | 0 | self.wtr.write_str("]")?; |
154 | | } |
155 | 0 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
156 | 0 | if cls.ranges().is_empty() { |
157 | 0 | return self.wtr.write_str("[a&&b]"); |
158 | 0 | } |
159 | 0 | self.wtr.write_str("(?-u:[")?; |
160 | 0 | for range in cls.iter() { |
161 | 0 | if range.start() == range.end() { |
162 | 0 | self.write_literal_class_byte(range.start())?; |
163 | 0 | } else if range.start() + 1 == range.end() { |
164 | 0 | self.write_literal_class_byte(range.start())?; |
165 | 0 | self.write_literal_class_byte(range.end())?; |
166 | | } else { |
167 | 0 | self.write_literal_class_byte(range.start())?; |
168 | 0 | self.wtr.write_str("-")?; |
169 | 0 | self.write_literal_class_byte(range.end())?; |
170 | | } |
171 | | } |
172 | 0 | self.wtr.write_str("])")?; |
173 | | } |
174 | 0 | HirKind::Look(ref look) => match *look { |
175 | | hir::Look::Start => { |
176 | 0 | self.wtr.write_str(r"\A")?; |
177 | | } |
178 | | hir::Look::End => { |
179 | 0 | self.wtr.write_str(r"\z")?; |
180 | | } |
181 | | hir::Look::StartLF => { |
182 | 0 | self.wtr.write_str("(?m:^)")?; |
183 | | } |
184 | | hir::Look::EndLF => { |
185 | 0 | self.wtr.write_str("(?m:$)")?; |
186 | | } |
187 | | hir::Look::StartCRLF => { |
188 | 0 | self.wtr.write_str("(?mR:^)")?; |
189 | | } |
190 | | hir::Look::EndCRLF => { |
191 | 0 | self.wtr.write_str("(?mR:$)")?; |
192 | | } |
193 | | hir::Look::WordAscii => { |
194 | 0 | self.wtr.write_str(r"(?-u:\b)")?; |
195 | | } |
196 | | hir::Look::WordAsciiNegate => { |
197 | 0 | self.wtr.write_str(r"(?-u:\B)")?; |
198 | | } |
199 | | hir::Look::WordUnicode => { |
200 | 0 | self.wtr.write_str(r"\b")?; |
201 | | } |
202 | | hir::Look::WordUnicodeNegate => { |
203 | 0 | self.wtr.write_str(r"\B")?; |
204 | | } |
205 | | hir::Look::WordStartAscii => { |
206 | 0 | self.wtr.write_str(r"(?-u:\b{start})")?; |
207 | | } |
208 | | hir::Look::WordEndAscii => { |
209 | 0 | self.wtr.write_str(r"(?-u:\b{end})")?; |
210 | | } |
211 | | hir::Look::WordStartUnicode => { |
212 | 0 | self.wtr.write_str(r"\b{start}")?; |
213 | | } |
214 | | hir::Look::WordEndUnicode => { |
215 | 0 | self.wtr.write_str(r"\b{end}")?; |
216 | | } |
217 | | hir::Look::WordStartHalfAscii => { |
218 | 0 | self.wtr.write_str(r"(?-u:\b{start-half})")?; |
219 | | } |
220 | | hir::Look::WordEndHalfAscii => { |
221 | 0 | self.wtr.write_str(r"(?-u:\b{end-half})")?; |
222 | | } |
223 | | hir::Look::WordStartHalfUnicode => { |
224 | 0 | self.wtr.write_str(r"\b{start-half}")?; |
225 | | } |
226 | | hir::Look::WordEndHalfUnicode => { |
227 | 0 | self.wtr.write_str(r"\b{end-half}")?; |
228 | | } |
229 | | }, |
230 | 0 | HirKind::Capture(hir::Capture { ref name, .. }) => { |
231 | 0 | self.wtr.write_str("(")?; |
232 | 0 | if let Some(ref name) = *name { |
233 | 0 | write!(self.wtr, "?P<{}>", name)?; |
234 | 0 | } |
235 | | } |
236 | | // Why do this? Wrapping concats and alts in non-capturing groups |
237 | | // is not *always* necessary, but is sometimes necessary. For |
238 | | // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' |
239 | | // and not 'ab|c'. The former is clearly the intended meaning, but |
240 | | // the latter is actually 'alt(concat(a, b), c)'. |
241 | | // |
242 | | // It would be possible to only group these things in cases where |
243 | | // it's strictly necessary, but it requires knowing the parent |
244 | | // expression. And since this technique is simpler and always |
245 | | // correct, we take this route. More to the point, it is a non-goal |
246 | | // of an HIR printer to show a nice easy-to-read regex. Indeed, |
247 | | // its construction forbids it from doing so. Therefore, inserting |
248 | | // extra groups where they aren't necessary is perfectly okay. |
249 | | HirKind::Concat(_) | HirKind::Alternation(_) => { |
250 | 0 | self.wtr.write_str(r"(?:")?; |
251 | | } |
252 | | } |
253 | 0 | Ok(()) |
254 | 0 | } |
255 | | |
256 | 0 | fn visit_post(&mut self, hir: &Hir) -> fmt::Result { |
257 | 0 | match *hir.kind() { |
258 | | // Handled during visit_pre |
259 | | HirKind::Empty |
260 | | | HirKind::Literal(_) |
261 | | | HirKind::Class(_) |
262 | 0 | | HirKind::Look(_) => {} |
263 | 0 | HirKind::Repetition(ref x) => { |
264 | 0 | match (x.min, x.max) { |
265 | | (0, Some(1)) => { |
266 | 0 | self.wtr.write_str("?")?; |
267 | | } |
268 | | (0, None) => { |
269 | 0 | self.wtr.write_str("*")?; |
270 | | } |
271 | | (1, None) => { |
272 | 0 | self.wtr.write_str("+")?; |
273 | | } |
274 | | (1, Some(1)) => { |
275 | | // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. |
276 | 0 | return Ok(()); |
277 | | } |
278 | 0 | (m, None) => { |
279 | 0 | write!(self.wtr, "{{{},}}", m)?; |
280 | | } |
281 | 0 | (m, Some(n)) if m == n => { |
282 | 0 | write!(self.wtr, "{{{}}}", m)?; |
283 | | // a{m} and a{m}? are always exactly equivalent. |
284 | 0 | return Ok(()); |
285 | | } |
286 | 0 | (m, Some(n)) => { |
287 | 0 | write!(self.wtr, "{{{},{}}}", m, n)?; |
288 | | } |
289 | | } |
290 | 0 | if !x.greedy { |
291 | 0 | self.wtr.write_str("?")?; |
292 | 0 | } |
293 | | } |
294 | | HirKind::Capture(_) |
295 | | | HirKind::Concat(_) |
296 | | | HirKind::Alternation(_) => { |
297 | 0 | self.wtr.write_str(r")")?; |
298 | | } |
299 | | } |
300 | 0 | Ok(()) |
301 | 0 | } |
302 | | |
303 | 0 | fn visit_alternation_in(&mut self) -> fmt::Result { |
304 | 0 | self.wtr.write_str("|") |
305 | 0 | } |
306 | | } |
307 | | |
308 | | impl<W: fmt::Write> Writer<W> { |
309 | 0 | fn write_literal_char(&mut self, c: char) -> fmt::Result { |
310 | 0 | if is_meta_character(c) { |
311 | 0 | self.wtr.write_str("\\")?; |
312 | 0 | } |
313 | 0 | self.wtr.write_char(c) |
314 | 0 | } |
315 | | |
316 | 0 | fn write_literal_byte(&mut self, b: u8) -> fmt::Result { |
317 | 0 | if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { |
318 | 0 | self.write_literal_char(char::try_from(b).unwrap()) |
319 | | } else { |
320 | 0 | write!(self.wtr, "(?-u:\\x{:02X})", b) |
321 | | } |
322 | 0 | } |
323 | | |
324 | 0 | fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { |
325 | 0 | if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { |
326 | 0 | self.write_literal_char(char::try_from(b).unwrap()) |
327 | | } else { |
328 | 0 | write!(self.wtr, "\\x{:02X}", b) |
329 | | } |
330 | 0 | } |
331 | | } |
332 | | |
333 | | #[cfg(test)] |
334 | | mod tests { |
335 | | use alloc::{ |
336 | | boxed::Box, |
337 | | string::{String, ToString}, |
338 | | }; |
339 | | |
340 | | use crate::ParserBuilder; |
341 | | |
342 | | use super::*; |
343 | | |
344 | | fn roundtrip(given: &str, expected: &str) { |
345 | | roundtrip_with(|b| b, given, expected); |
346 | | } |
347 | | |
348 | | fn roundtrip_bytes(given: &str, expected: &str) { |
349 | | roundtrip_with(|b| b.utf8(false), given, expected); |
350 | | } |
351 | | |
352 | | fn roundtrip_with<F>(mut f: F, given: &str, expected: &str) |
353 | | where |
354 | | F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, |
355 | | { |
356 | | let mut builder = ParserBuilder::new(); |
357 | | f(&mut builder); |
358 | | let hir = builder.build().parse(given).unwrap(); |
359 | | |
360 | | let mut printer = Printer::new(); |
361 | | let mut dst = String::new(); |
362 | | printer.print(&hir, &mut dst).unwrap(); |
363 | | |
364 | | // Check that the result is actually valid. |
365 | | builder.build().parse(&dst).unwrap(); |
366 | | |
367 | | assert_eq!(expected, dst); |
368 | | } |
369 | | |
370 | | #[test] |
371 | | fn print_literal() { |
372 | | roundtrip("a", "a"); |
373 | | roundtrip(r"\xff", "\u{FF}"); |
374 | | roundtrip_bytes(r"\xff", "\u{FF}"); |
375 | | roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)"); |
376 | | roundtrip("☃", "☃"); |
377 | | } |
378 | | |
379 | | #[test] |
380 | | fn print_class() { |
381 | | roundtrip(r"[a]", r"a"); |
382 | | roundtrip(r"[ab]", r"[ab]"); |
383 | | roundtrip(r"[a-z]", r"[a-z]"); |
384 | | roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); |
385 | | roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); |
386 | | roundtrip(r"[-]", r"\-"); |
387 | | roundtrip(r"[☃-⛄]", r"[☃-⛄]"); |
388 | | |
389 | | roundtrip(r"(?-u)[a]", r"a"); |
390 | | roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); |
391 | | roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); |
392 | | roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); |
393 | | |
394 | | // The following test that the printer escapes meta characters |
395 | | // in character classes. |
396 | | roundtrip(r"[\[]", r"\["); |
397 | | roundtrip(r"[Z-_]", r"[Z-_]"); |
398 | | roundtrip(r"[Z-_--Z]", r"[\[-_]"); |
399 | | |
400 | | // The following test that the printer escapes meta characters |
401 | | // in byte oriented character classes. |
402 | | roundtrip_bytes(r"(?-u)[\[]", r"\["); |
403 | | roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); |
404 | | roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); |
405 | | |
406 | | // This tests that an empty character class is correctly roundtripped. |
407 | | #[cfg(feature = "unicode-gencat")] |
408 | | roundtrip(r"\P{any}", r"[a&&b]"); |
409 | | roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); |
410 | | } |
411 | | |
412 | | #[test] |
413 | | fn print_anchor() { |
414 | | roundtrip(r"^", r"\A"); |
415 | | roundtrip(r"$", r"\z"); |
416 | | roundtrip(r"(?m)^", r"(?m:^)"); |
417 | | roundtrip(r"(?m)$", r"(?m:$)"); |
418 | | } |
419 | | |
420 | | #[test] |
421 | | fn print_word_boundary() { |
422 | | roundtrip(r"\b", r"\b"); |
423 | | roundtrip(r"\B", r"\B"); |
424 | | roundtrip(r"(?-u)\b", r"(?-u:\b)"); |
425 | | roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)"); |
426 | | } |
427 | | |
428 | | #[test] |
429 | | fn print_repetition() { |
430 | | roundtrip("a?", "a?"); |
431 | | roundtrip("a??", "a??"); |
432 | | roundtrip("(?U)a?", "a??"); |
433 | | |
434 | | roundtrip("a*", "a*"); |
435 | | roundtrip("a*?", "a*?"); |
436 | | roundtrip("(?U)a*", "a*?"); |
437 | | |
438 | | roundtrip("a+", "a+"); |
439 | | roundtrip("a+?", "a+?"); |
440 | | roundtrip("(?U)a+", "a+?"); |
441 | | |
442 | | roundtrip("a{1}", "a"); |
443 | | roundtrip("a{2}", "a{2}"); |
444 | | roundtrip("a{1,}", "a+"); |
445 | | roundtrip("a{1,5}", "a{1,5}"); |
446 | | roundtrip("a{1}?", "a"); |
447 | | roundtrip("a{2}?", "a{2}"); |
448 | | roundtrip("a{1,}?", "a+?"); |
449 | | roundtrip("a{1,5}?", "a{1,5}?"); |
450 | | roundtrip("(?U)a{1}", "a"); |
451 | | roundtrip("(?U)a{2}", "a{2}"); |
452 | | roundtrip("(?U)a{1,}", "a+?"); |
453 | | roundtrip("(?U)a{1,5}", "a{1,5}?"); |
454 | | |
455 | | // Test that various zero-length repetitions always translate to an |
456 | | // empty regex. This is more a property of HIR's smart constructors |
457 | | // than the printer though. |
458 | | roundtrip("a{0}", "(?:)"); |
459 | | roundtrip("(?:ab){0}", "(?:)"); |
460 | | #[cfg(feature = "unicode-gencat")] |
461 | | { |
462 | | roundtrip(r"\p{any}{0}", "(?:)"); |
463 | | roundtrip(r"\P{any}{0}", "(?:)"); |
464 | | } |
465 | | } |
466 | | |
467 | | #[test] |
468 | | fn print_group() { |
469 | | roundtrip("()", "((?:))"); |
470 | | roundtrip("(?P<foo>)", "(?P<foo>(?:))"); |
471 | | roundtrip("(?:)", "(?:)"); |
472 | | |
473 | | roundtrip("(a)", "(a)"); |
474 | | roundtrip("(?P<foo>a)", "(?P<foo>a)"); |
475 | | roundtrip("(?:a)", "a"); |
476 | | |
477 | | roundtrip("((((a))))", "((((a))))"); |
478 | | } |
479 | | |
480 | | #[test] |
481 | | fn print_alternation() { |
482 | | roundtrip("|", "(?:(?:)|(?:))"); |
483 | | roundtrip("||", "(?:(?:)|(?:)|(?:))"); |
484 | | |
485 | | roundtrip("a|b", "[ab]"); |
486 | | roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); |
487 | | roundtrip("a|b|c", "[a-c]"); |
488 | | roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); |
489 | | roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); |
490 | | } |
491 | | |
492 | | // This is a regression test that stresses a peculiarity of how the HIR |
493 | | // is both constructed and printed. Namely, it is legal for a repetition |
494 | | // to directly contain a concatenation. This particular construct isn't |
495 | | // really possible to build from the concrete syntax directly, since you'd |
496 | | // be forced to put the concatenation into (at least) a non-capturing |
497 | | // group. Concurrently, the printer doesn't consider this case and just |
498 | | // kind of naively prints the child expression and tacks on the repetition |
499 | | // operator. |
500 | | // |
501 | | // As a result, if you attached '+' to a 'concat(a, b)', the printer gives |
502 | | // you 'ab+', but clearly it really should be '(?:ab)+'. |
503 | | // |
504 | | // This bug isn't easy to surface because most ways of building an HIR |
505 | | // come directly from the concrete syntax, and as mentioned above, it just |
506 | | // isn't possible to build this kind of HIR from the concrete syntax. |
507 | | // Nevertheless, this is definitely a bug. |
508 | | // |
509 | | // See: https://github.com/rust-lang/regex/issues/731 |
510 | | #[test] |
511 | | fn regression_repetition_concat() { |
512 | | let expr = Hir::concat(alloc::vec![ |
513 | | Hir::literal("x".as_bytes()), |
514 | | Hir::repetition(hir::Repetition { |
515 | | min: 1, |
516 | | max: None, |
517 | | greedy: true, |
518 | | sub: Box::new(Hir::literal("ab".as_bytes())), |
519 | | }), |
520 | | Hir::literal("y".as_bytes()), |
521 | | ]); |
522 | | assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); |
523 | | |
524 | | let expr = Hir::concat(alloc::vec![ |
525 | | Hir::look(hir::Look::Start), |
526 | | Hir::repetition(hir::Repetition { |
527 | | min: 1, |
528 | | max: None, |
529 | | greedy: true, |
530 | | sub: Box::new(Hir::concat(alloc::vec![ |
531 | | Hir::look(hir::Look::Start), |
532 | | Hir::look(hir::Look::End), |
533 | | ])), |
534 | | }), |
535 | | Hir::look(hir::Look::End), |
536 | | ]); |
537 | | assert_eq!(r"(?:\A\A\z\z)", expr.to_string()); |
538 | | } |
539 | | |
540 | | // Just like regression_repetition_concat, but with the repetition using |
541 | | // an alternation as a child expression instead. |
542 | | // |
543 | | // See: https://github.com/rust-lang/regex/issues/731 |
544 | | #[test] |
545 | | fn regression_repetition_alternation() { |
546 | | let expr = Hir::concat(alloc::vec![ |
547 | | Hir::literal("ab".as_bytes()), |
548 | | Hir::repetition(hir::Repetition { |
549 | | min: 1, |
550 | | max: None, |
551 | | greedy: true, |
552 | | sub: Box::new(Hir::alternation(alloc::vec![ |
553 | | Hir::literal("cd".as_bytes()), |
554 | | Hir::literal("ef".as_bytes()), |
555 | | ])), |
556 | | }), |
557 | | Hir::literal("gh".as_bytes()), |
558 | | ]); |
559 | | assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); |
560 | | |
561 | | let expr = Hir::concat(alloc::vec![ |
562 | | Hir::look(hir::Look::Start), |
563 | | Hir::repetition(hir::Repetition { |
564 | | min: 1, |
565 | | max: None, |
566 | | greedy: true, |
567 | | sub: Box::new(Hir::alternation(alloc::vec![ |
568 | | Hir::look(hir::Look::Start), |
569 | | Hir::look(hir::Look::End), |
570 | | ])), |
571 | | }), |
572 | | Hir::look(hir::Look::End), |
573 | | ]); |
574 | | assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string()); |
575 | | } |
576 | | |
577 | | // This regression test is very similar in flavor to |
578 | | // regression_repetition_concat in that the root of the issue lies in a |
579 | | // peculiarity of how the HIR is represented and how the printer writes it |
580 | | // out. Like the other regression, this one is also rooted in the fact that |
581 | | // you can't produce the peculiar HIR from the concrete syntax. Namely, you |
582 | | // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally |
583 | | // be in (at least) a non-capturing group. Why? Because the '|' has very |
584 | | // low precedence (lower than concatenation), and so something like 'ab|c' |
585 | | // is actually 'alt(ab, c)'. |
586 | | // |
587 | | // See: https://github.com/rust-lang/regex/issues/516 |
588 | | #[test] |
589 | | fn regression_alternation_concat() { |
590 | | let expr = Hir::concat(alloc::vec![ |
591 | | Hir::literal("ab".as_bytes()), |
592 | | Hir::alternation(alloc::vec![ |
593 | | Hir::literal("mn".as_bytes()), |
594 | | Hir::literal("xy".as_bytes()), |
595 | | ]), |
596 | | ]); |
597 | | assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); |
598 | | |
599 | | let expr = Hir::concat(alloc::vec![ |
600 | | Hir::look(hir::Look::Start), |
601 | | Hir::alternation(alloc::vec![ |
602 | | Hir::look(hir::Look::Start), |
603 | | Hir::look(hir::Look::End), |
604 | | ]), |
605 | | ]); |
606 | | assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); |
607 | | } |
608 | | } |