/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.8.8/src/hir/print.rs
Line  | Count  | Source  | 
1  |  | /*!  | 
2  |  | This module provides a regular expression printer for `Hir`.  | 
3  |  | */  | 
4  |  |  | 
5  |  | use core::fmt;  | 
6  |  |  | 
7  |  | use crate::{ | 
8  |  |     hir::{ | 
9  |  |         self,  | 
10  |  |         visitor::{self, Visitor}, | 
11  |  |         Hir, HirKind,  | 
12  |  |     },  | 
13  |  |     is_meta_character,  | 
14  |  | };  | 
15  |  |  | 
/// Builder used to construct a [`Printer`].
///
/// The printer currently has no configuration options, so this builder is
/// deliberately kept private to the module.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Zero-sized placeholder field; the builder carries no state.
    _priv: (),
}
24  |  |  | 
25  |  | impl Default for PrinterBuilder { | 
26  | 0  |     fn default() -> PrinterBuilder { | 
27  | 0  |         PrinterBuilder::new()  | 
28  | 0  |     }  | 
29  |  | }  | 
30  |  |  | 
31  |  | impl PrinterBuilder { | 
32  | 0  |     fn new() -> PrinterBuilder { | 
33  | 0  |         PrinterBuilder { _priv: () } | 
34  | 0  |     }  | 
35  |  |  | 
36  | 0  |     fn build(&self) -> Printer { | 
37  | 0  |         Printer { _priv: () } | 
38  | 0  |     }  | 
39  |  | }  | 
40  |  |  | 
/// Prints a high-level intermediate representation (HIR) of a regular
/// expression as a pattern string.
///
/// This printer uses constant stack space, and heap space proportional to the
/// size of the HIR.
///
/// The printer only sees the HIR, so its output will usually look nothing
/// like the pattern the HIR was originally built from. For instance, a
/// pattern such as `\pL` is printed with its entire class written out.
///
/// The intended use is to mutate an HIR and then build a regular expression
/// from the mutated result. (A regex library could instead offer a
/// constructor that accepts this HIR directly, but doing so would needlessly
/// couple the library's public API to this specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Zero-sized placeholder field; the printer carries no state.
    _priv: (),
}
61  |  |  | 
62  |  | impl Printer { | 
63  |  |     /// Create a new printer.  | 
64  | 0  |     pub fn new() -> Printer { | 
65  | 0  |         PrinterBuilder::new().build()  | 
66  | 0  |     }  | 
67  |  |  | 
68  |  |     /// Print the given `Ast` to the given writer. The writer must implement  | 
69  |  |     /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used  | 
70  |  |     /// here are a `fmt::Formatter` (which is available in `fmt::Display`  | 
71  |  |     /// implementations) or a `&mut String`.  | 
72  | 0  |     pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { | 
73  | 0  |         visitor::visit(hir, Writer { wtr }) | 
74  | 0  |     }  | 
75  |  | }  | 
76  |  |  | 
/// The HIR visitor that does the actual printing: it emits concrete syntax
/// into the wrapped writer as the HIR is traversed.
#[derive(Debug)]
struct Writer<W> {
    /// Destination for the printed pattern.
    wtr: W,
}
81  |  |  | 
82  |  | impl<W: fmt::Write> Visitor for Writer<W> { | 
83  |  |     type Output = ();  | 
84  |  |     type Err = fmt::Error;  | 
85  |  |  | 
86  | 0  |     fn finish(self) -> fmt::Result { | 
87  | 0  |         Ok(())  | 
88  | 0  |     }  | 
89  |  |  | 
90  | 0  |     fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { | 
91  | 0  |         match *hir.kind() { | 
92  |  |             HirKind::Empty => { | 
93  |  |                 // Technically an empty sub-expression could be "printed" by  | 
94  |  |                 // just ignoring it, but in practice, you could have a  | 
95  |  |                 // repetition operator attached to an empty expression, and you  | 
96  |  |                 // really need something in the concrete syntax to make that  | 
97  |  |                 // work as you'd expect.  | 
98  | 0  |                 self.wtr.write_str(r"(?:)")?;  | 
99  |  |             }  | 
100  |  |             // Repetition operators are strictly suffix oriented.  | 
101  | 0  |             HirKind::Repetition(_) => {} | 
102  | 0  |             HirKind::Literal(hir::Literal(ref bytes)) => { | 
103  |  |                 // See the comment on the 'Concat' and 'Alternation' case below  | 
104  |  |                 // for why we put parens here. Literals are, conceptually,  | 
105  |  |                 // a special case of concatenation where each element is a  | 
106  |  |                 // character. The HIR flattens this into a Box<[u8]>, but we  | 
107  |  |                 // still need to treat it like a concatenation for correct  | 
108  |  |                 // printing. As a special case, we don't write parens if there  | 
109  |  |                 // is only one character. One character means there is no  | 
110  |  |                 // concat so we don't need parens. Adding parens would still be  | 
111  |  |                 // correct, but we drop them here because it tends to create  | 
112  |  |                 // rather noisy regexes even in simple cases.  | 
113  | 0  |                 let result = core::str::from_utf8(bytes);  | 
114  | 0  |                 let len = result.map_or(bytes.len(), |s| s.chars().count());  | 
115  | 0  |                 if len > 1 { | 
116  | 0  |                     self.wtr.write_str(r"(?:")?;  | 
117  | 0  |                 }  | 
118  | 0  |                 match result { | 
119  | 0  |                     Ok(string) => { | 
120  | 0  |                         for c in string.chars() { | 
121  | 0  |                             self.write_literal_char(c)?;  | 
122  |  |                         }  | 
123  |  |                     }  | 
124  |  |                     Err(_) => { | 
125  | 0  |                         for &b in bytes.iter() { | 
126  | 0  |                             self.write_literal_byte(b)?;  | 
127  |  |                         }  | 
128  |  |                     }  | 
129  |  |                 }  | 
130  | 0  |                 if len > 1 { | 
131  | 0  |                     self.wtr.write_str(r")")?;  | 
132  | 0  |                 }  | 
133  |  |             }  | 
134  | 0  |             HirKind::Class(hir::Class::Unicode(ref cls)) => { | 
135  | 0  |                 if cls.ranges().is_empty() { | 
136  | 0  |                     return self.wtr.write_str("[a&&b]"); | 
137  | 0  |                 }  | 
138  | 0  |                 self.wtr.write_str("[")?; | 
139  | 0  |                 for range in cls.iter() { | 
140  | 0  |                     if range.start() == range.end() { | 
141  | 0  |                         self.write_literal_char(range.start())?;  | 
142  | 0  |                     } else if u32::from(range.start()) + 1  | 
143  | 0  |                         == u32::from(range.end())  | 
144  |  |                     { | 
145  | 0  |                         self.write_literal_char(range.start())?;  | 
146  | 0  |                         self.write_literal_char(range.end())?;  | 
147  |  |                     } else { | 
148  | 0  |                         self.write_literal_char(range.start())?;  | 
149  | 0  |                         self.wtr.write_str("-")?; | 
150  | 0  |                         self.write_literal_char(range.end())?;  | 
151  |  |                     }  | 
152  |  |                 }  | 
153  | 0  |                 self.wtr.write_str("]")?; | 
154  |  |             }  | 
155  | 0  |             HirKind::Class(hir::Class::Bytes(ref cls)) => { | 
156  | 0  |                 if cls.ranges().is_empty() { | 
157  | 0  |                     return self.wtr.write_str("[a&&b]"); | 
158  | 0  |                 }  | 
159  | 0  |                 self.wtr.write_str("(?-u:[")?; | 
160  | 0  |                 for range in cls.iter() { | 
161  | 0  |                     if range.start() == range.end() { | 
162  | 0  |                         self.write_literal_class_byte(range.start())?;  | 
163  | 0  |                     } else if range.start() + 1 == range.end() { | 
164  | 0  |                         self.write_literal_class_byte(range.start())?;  | 
165  | 0  |                         self.write_literal_class_byte(range.end())?;  | 
166  |  |                     } else { | 
167  | 0  |                         self.write_literal_class_byte(range.start())?;  | 
168  | 0  |                         self.wtr.write_str("-")?; | 
169  | 0  |                         self.write_literal_class_byte(range.end())?;  | 
170  |  |                     }  | 
171  |  |                 }  | 
172  | 0  |                 self.wtr.write_str("])")?; | 
173  |  |             }  | 
174  | 0  |             HirKind::Look(ref look) => match *look { | 
175  |  |                 hir::Look::Start => { | 
176  | 0  |                     self.wtr.write_str(r"\A")?;  | 
177  |  |                 }  | 
178  |  |                 hir::Look::End => { | 
179  | 0  |                     self.wtr.write_str(r"\z")?;  | 
180  |  |                 }  | 
181  |  |                 hir::Look::StartLF => { | 
182  | 0  |                     self.wtr.write_str("(?m:^)")?; | 
183  |  |                 }  | 
184  |  |                 hir::Look::EndLF => { | 
185  | 0  |                     self.wtr.write_str("(?m:$)")?; | 
186  |  |                 }  | 
187  |  |                 hir::Look::StartCRLF => { | 
188  | 0  |                     self.wtr.write_str("(?mR:^)")?; | 
189  |  |                 }  | 
190  |  |                 hir::Look::EndCRLF => { | 
191  | 0  |                     self.wtr.write_str("(?mR:$)")?; | 
192  |  |                 }  | 
193  |  |                 hir::Look::WordAscii => { | 
194  | 0  |                     self.wtr.write_str(r"(?-u:\b)")?;  | 
195  |  |                 }  | 
196  |  |                 hir::Look::WordAsciiNegate => { | 
197  | 0  |                     self.wtr.write_str(r"(?-u:\B)")?;  | 
198  |  |                 }  | 
199  |  |                 hir::Look::WordUnicode => { | 
200  | 0  |                     self.wtr.write_str(r"\b")?;  | 
201  |  |                 }  | 
202  |  |                 hir::Look::WordUnicodeNegate => { | 
203  | 0  |                     self.wtr.write_str(r"\B")?;  | 
204  |  |                 }  | 
205  |  |                 hir::Look::WordStartAscii => { | 
206  | 0  |                     self.wtr.write_str(r"(?-u:\b{start})")?; | 
207  |  |                 }  | 
208  |  |                 hir::Look::WordEndAscii => { | 
209  | 0  |                     self.wtr.write_str(r"(?-u:\b{end})")?; | 
210  |  |                 }  | 
211  |  |                 hir::Look::WordStartUnicode => { | 
212  | 0  |                     self.wtr.write_str(r"\b{start}")?; | 
213  |  |                 }  | 
214  |  |                 hir::Look::WordEndUnicode => { | 
215  | 0  |                     self.wtr.write_str(r"\b{end}")?; | 
216  |  |                 }  | 
217  |  |                 hir::Look::WordStartHalfAscii => { | 
218  | 0  |                     self.wtr.write_str(r"(?-u:\b{start-half})")?; | 
219  |  |                 }  | 
220  |  |                 hir::Look::WordEndHalfAscii => { | 
221  | 0  |                     self.wtr.write_str(r"(?-u:\b{end-half})")?; | 
222  |  |                 }  | 
223  |  |                 hir::Look::WordStartHalfUnicode => { | 
224  | 0  |                     self.wtr.write_str(r"\b{start-half}")?; | 
225  |  |                 }  | 
226  |  |                 hir::Look::WordEndHalfUnicode => { | 
227  | 0  |                     self.wtr.write_str(r"\b{end-half}")?; | 
228  |  |                 }  | 
229  |  |             },  | 
230  | 0  |             HirKind::Capture(hir::Capture { ref name, .. }) => { | 
231  | 0  |                 self.wtr.write_str("(")?; | 
232  | 0  |                 if let Some(ref name) = *name { | 
233  | 0  |                     write!(self.wtr, "?P<{name}>")?; | 
234  | 0  |                 }  | 
235  |  |             }  | 
236  |  |             // Why do this? Wrapping concats and alts in non-capturing groups  | 
237  |  |             // is not *always* necessary, but is sometimes necessary. For  | 
238  |  |             // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'  | 
239  |  |             // and not 'ab|c'. The former is clearly the intended meaning, but  | 
240  |  |             // the latter is actually 'alt(concat(a, b), c)'.  | 
241  |  |             //  | 
242  |  |             // It would be possible to only group these things in cases where  | 
243  |  |             // it's strictly necessary, but it requires knowing the parent  | 
244  |  |             // expression. And since this technique is simpler and always  | 
245  |  |             // correct, we take this route. More to the point, it is a non-goal  | 
246  |  |             // of an HIR printer to show a nice easy-to-read regex. Indeed,  | 
247  |  |             // its construction forbids it from doing so. Therefore, inserting  | 
248  |  |             // extra groups where they aren't necessary is perfectly okay.  | 
249  |  |             HirKind::Concat(_) | HirKind::Alternation(_) => { | 
250  | 0  |                 self.wtr.write_str(r"(?:")?;  | 
251  |  |             }  | 
252  |  |         }  | 
253  | 0  |         Ok(())  | 
254  | 0  |     }  | 
255  |  |  | 
256  | 0  |     fn visit_post(&mut self, hir: &Hir) -> fmt::Result { | 
257  | 0  |         match *hir.kind() { | 
258  |  |             // Handled during visit_pre  | 
259  |  |             HirKind::Empty  | 
260  |  |             | HirKind::Literal(_)  | 
261  |  |             | HirKind::Class(_)  | 
262  | 0  |             | HirKind::Look(_) => {} | 
263  | 0  |             HirKind::Repetition(ref x) => { | 
264  | 0  |                 match (x.min, x.max) { | 
265  |  |                     (0, Some(1)) => { | 
266  | 0  |                         self.wtr.write_str("?")?; | 
267  |  |                     }  | 
268  |  |                     (0, None) => { | 
269  | 0  |                         self.wtr.write_str("*")?; | 
270  |  |                     }  | 
271  |  |                     (1, None) => { | 
272  | 0  |                         self.wtr.write_str("+")?; | 
273  |  |                     }  | 
274  |  |                     (1, Some(1)) => { | 
275  |  |                         // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. | 
276  | 0  |                         return Ok(());  | 
277  |  |                     }  | 
278  | 0  |                     (m, None) => { | 
279  | 0  |                         write!(self.wtr, "{{{m},}}")?; | 
280  |  |                     }  | 
281  | 0  |                     (m, Some(n)) if m == n => { | 
282  | 0  |                         write!(self.wtr, "{{{m}}}")?; | 
283  |  |                         // a{m} and a{m}? are always exactly equivalent. | 
284  | 0  |                         return Ok(());  | 
285  |  |                     }  | 
286  | 0  |                     (m, Some(n)) => { | 
287  | 0  |                         write!(self.wtr, "{{{m},{n}}}")?; | 
288  |  |                     }  | 
289  |  |                 }  | 
290  | 0  |                 if !x.greedy { | 
291  | 0  |                     self.wtr.write_str("?")?; | 
292  | 0  |                 }  | 
293  |  |             }  | 
294  |  |             HirKind::Capture(_)  | 
295  |  |             | HirKind::Concat(_)  | 
296  |  |             | HirKind::Alternation(_) => { | 
297  | 0  |                 self.wtr.write_str(r")")?;  | 
298  |  |             }  | 
299  |  |         }  | 
300  | 0  |         Ok(())  | 
301  | 0  |     }  | 
302  |  |  | 
303  | 0  |     fn visit_alternation_in(&mut self) -> fmt::Result { | 
304  | 0  |         self.wtr.write_str("|") | 
305  | 0  |     }  | 
306  |  | }  | 
307  |  |  | 
308  |  | impl<W: fmt::Write> Writer<W> { | 
309  | 0  |     fn write_literal_char(&mut self, c: char) -> fmt::Result { | 
310  | 0  |         if is_meta_character(c) { | 
311  | 0  |             self.wtr.write_str("\\")?; | 
312  | 0  |         }  | 
313  | 0  |         self.wtr.write_char(c)  | 
314  | 0  |     }  | 
315  |  |  | 
316  | 0  |     fn write_literal_byte(&mut self, b: u8) -> fmt::Result { | 
317  | 0  |         if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { | 
318  | 0  |             self.write_literal_char(char::try_from(b).unwrap())  | 
319  |  |         } else { | 
320  | 0  |             write!(self.wtr, "(?-u:\\x{b:02X})") | 
321  |  |         }  | 
322  | 0  |     }  | 
323  |  |  | 
324  | 0  |     fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { | 
325  | 0  |         if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { | 
326  | 0  |             self.write_literal_char(char::try_from(b).unwrap())  | 
327  |  |         } else { | 
328  | 0  |             write!(self.wtr, "\\x{b:02X}") | 
329  |  |         }  | 
330  | 0  |     }  | 
331  |  | }  | 
332  |  |  | 
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    /// Parses `given` with the default parser configuration, prints the
    /// resulting HIR and checks that the output equals `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|builder| builder, given, expected);
    }

    /// Like `roundtrip`, but with the parser's UTF-8 mode turned off.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|builder| builder.utf8(false), given, expected);
    }

    /// Shared driver: configures a parser through `f`, parses `given`,
    /// prints the HIR, confirms that the printed pattern parses again under
    /// the same configuration, and finally compares it with `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let parsed = builder.build().parse(given).unwrap();

        let mut printed = String::new();
        Printer::new().print(&parsed, &mut printed).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&printed).unwrap();

        assert_eq!(expected, printed);
    }

    /// Runs `check` on every `(given, expected)` pair, in order.
    fn check_all(check: fn(&str, &str), cases: &[(&str, &str)]) {
        for &(given, expected) in cases {
            check(given, expected);
        }
    }

    /// Builds a literal HIR from the bytes of `s`.
    fn lit(s: &str) -> Hir {
        Hir::literal(s.as_bytes())
    }

    /// Builds a greedy one-or-more repetition of `sub`.
    fn one_or_more(sub: Hir) -> Hir {
        Hir::repetition(hir::Repetition {
            min: 1,
            max: None,
            greedy: true,
            sub: Box::new(sub),
        })
    }

    #[test]
    fn print_literal() {
        check_all(roundtrip, &[("a", "a"), (r"\xff", "\u{FF}")]);
        check_all(
            roundtrip_bytes,
            &[(r"\xff", "\u{FF}"), (r"(?-u)\xff", r"(?-u:\xFF)")],
        );
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        check_all(
            roundtrip,
            &[
                (r"[a]", r"a"),
                (r"[ab]", r"[ab]"),
                (r"[a-z]", r"[a-z]"),
                (r"[a-z--b-c--x-y]", r"[ad-wz]"),
                (r"[^\x01-\u{10FFFF}]", "\u{0}"),
                (r"[-]", r"\-"),
                (r"[☃-⛄]", r"[☃-⛄]"),
                (r"(?-u)[a]", r"a"),
                (r"(?-u)[ab]", r"(?-u:[ab])"),
                (r"(?-u)[a-z]", r"(?-u:[a-z])"),
            ],
        );
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // Meta characters inside a character class must be escaped.
        check_all(
            roundtrip,
            &[
                (r"[\[]", r"\["),
                (r"[Z-_]", r"[Z-_]"),
                (r"[Z-_--Z]", r"[\[-_]"),
            ],
        );

        // The same holds for byte oriented character classes.
        check_all(
            roundtrip_bytes,
            &[
                (r"(?-u)[\[]", r"\["),
                (r"(?-u)[Z-_]", r"(?-u:[Z-_])"),
                (r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"),
            ],
        );

        // An empty character class must roundtrip correctly as well.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        check_all(
            roundtrip,
            &[
                (r"^", r"\A"),
                (r"$", r"\z"),
                (r"(?m)^", r"(?m:^)"),
                (r"(?m)$", r"(?m:$)"),
            ],
        );
    }

    #[test]
    fn print_word_boundary() {
        check_all(
            roundtrip,
            &[(r"\b", r"\b"), (r"\B", r"\B"), (r"(?-u)\b", r"(?-u:\b)")],
        );
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        check_all(
            roundtrip,
            &[
                ("a?", "a?"),
                ("a??", "a??"),
                ("(?U)a?", "a??"),
                ("a*", "a*"),
                ("a*?", "a*?"),
                ("(?U)a*", "a*?"),
                ("a+", "a+"),
                ("a+?", "a+?"),
                ("(?U)a+", "a+?"),
                ("a{1}", "a"),
                ("a{2}", "a{2}"),
                ("a{1,}", "a+"),
                ("a{1,5}", "a{1,5}"),
                ("a{1}?", "a"),
                ("a{2}?", "a{2}"),
                ("a{1,}?", "a+?"),
                ("a{1,5}?", "a{1,5}?"),
                ("(?U)a{1}", "a"),
                ("(?U)a{2}", "a{2}"),
                ("(?U)a{1,}", "a+?"),
                ("(?U)a{1,5}", "a{1,5}?"),
            ],
        );

        // Zero-length repetitions of any kind always translate to an empty
        // regex. This says more about HIR's smart constructors than about
        // the printer.
        check_all(roundtrip, &[("a{0}", "(?:)"), ("(?:ab){0}", "(?:)")]);
        #[cfg(feature = "unicode-gencat")]
        check_all(
            roundtrip,
            &[(r"\p{any}{0}", "(?:)"), (r"\P{any}{0}", "(?:)")],
        );
    }

    #[test]
    fn print_group() {
        check_all(
            roundtrip,
            &[
                ("()", "((?:))"),
                ("(?P<foo>)", "(?P<foo>(?:))"),
                ("(?:)", "(?:)"),
                ("(a)", "(a)"),
                ("(?P<foo>a)", "(?P<foo>a)"),
                ("(?:a)", "a"),
                ("((((a))))", "((((a))))"),
            ],
        );
    }

    #[test]
    fn print_alternation() {
        check_all(
            roundtrip,
            &[
                ("|", "(?:(?:)|(?:))"),
                ("||", "(?:(?:)|(?:)|(?:))"),
                ("a|b", "[ab]"),
                ("ab|cd", "(?:(?:ab)|(?:cd))"),
                ("a|b|c", "[a-c]"),
                ("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"),
                ("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"),
            ],
        );
    }

    // Regression test for a quirk in how the HIR is built and printed: a
    // repetition is allowed to contain a concatenation directly. The
    // concrete syntax cannot express that, because there the concatenation
    // would have to sit inside (at least) a non-capturing group. The printer
    // used to ignore this case and simply print the child expression
    // followed by the repetition operator.
    //
    // So attaching '+' to 'concat(a, b)' printed 'ab+' when it clearly
    // should have been '(?:ab)+'.
    //
    // The bug was hard to hit because most HIRs are built from the concrete
    // syntax, which, as noted, cannot produce this shape. It was a bug all
    // the same.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr =
            Hir::concat(alloc::vec![lit("x"), one_or_more(lit("ab")), lit("y")]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            one_or_more(Hir::concat(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ])),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }

    // Same as regression_repetition_concat, except that the child of the
    // repetition is an alternation.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            lit("ab"),
            one_or_more(Hir::alternation(alloc::vec![lit("cd"), lit("ef")])),
            lit("gh"),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            one_or_more(Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ])),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }

    // Closely related to regression_repetition_concat: the cause is again a
    // quirk in how the HIR is represented and printed, and again the
    // offending HIR cannot be produced from the concrete syntax. A
    // 'concat(a, alt(b, c))' never arises there, because the 'alt' would
    // normally sit inside (at least) a non-capturing group. The reason is
    // that '|' has very low precedence (lower than concatenation), so
    // 'ab|c' actually means 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            lit("ab"),
            Hir::alternation(alloc::vec![lit("mn"), lit("xy")]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}