/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-syntax-0.6.29/src/hir/print.rs
Line | Count | Source |
1 | | /*! |
2 | | This module provides a regular expression printer for `Hir`. |
3 | | */ |
4 | | |
5 | | use std::fmt; |
6 | | |
7 | | use crate::hir::visitor::{self, Visitor}; |
8 | | use crate::hir::{self, Hir, HirKind}; |
9 | | use crate::is_meta_character; |
10 | | |
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Unit-typed private field; the builder currently stores no settings.
    _priv: (),
}
19 | | |
20 | | impl Default for PrinterBuilder { |
21 | 0 | fn default() -> PrinterBuilder { |
22 | 0 | PrinterBuilder::new() |
23 | 0 | } |
24 | | } |
25 | | |
26 | | impl PrinterBuilder { |
27 | 0 | fn new() -> PrinterBuilder { |
28 | 0 | PrinterBuilder { _priv: () } |
29 | 0 | } |
30 | | |
31 | 0 | fn build(&self) -> Printer { |
32 | 0 | Printer { _priv: () } |
33 | 0 | } |
34 | | } |
35 | | |
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Unit-typed private field: the printer holds no state, and keeping a
    // private field means values can only be created through `Printer::new`.
    _priv: (),
}
56 | | |
57 | | impl Printer { |
58 | | /// Create a new printer. |
59 | 0 | pub fn new() -> Printer { |
60 | 0 | PrinterBuilder::new().build() |
61 | 0 | } |
62 | | |
63 | | /// Print the given `Ast` to the given writer. The writer must implement |
64 | | /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used |
65 | | /// here are a `fmt::Formatter` (which is available in `fmt::Display` |
66 | | /// implementations) or a `&mut String`. |
67 | 0 | pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { |
68 | 0 | visitor::visit(hir, Writer { wtr }) |
69 | 0 | } |
70 | | } |
71 | | |
/// The HIR visitor used by `Printer::print`; it owns the destination writer
/// for the duration of one traversal.
#[derive(Debug)]
struct Writer<W> {
    // Destination for the printed pattern text.
    wtr: W,
}
76 | | |
77 | | impl<W: fmt::Write> Visitor for Writer<W> { |
78 | | type Output = (); |
79 | | type Err = fmt::Error; |
80 | | |
81 | 0 | fn finish(self) -> fmt::Result { |
82 | 0 | Ok(()) |
83 | 0 | } |
84 | | |
85 | 0 | fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { |
86 | 0 | match *hir.kind() { |
87 | | HirKind::Empty |
88 | | | HirKind::Repetition(_) |
89 | | | HirKind::Concat(_) |
90 | 0 | | HirKind::Alternation(_) => {} |
91 | 0 | HirKind::Literal(hir::Literal::Unicode(c)) => { |
92 | 0 | self.write_literal_char(c)?; |
93 | | } |
94 | 0 | HirKind::Literal(hir::Literal::Byte(b)) => { |
95 | 0 | self.write_literal_byte(b)?; |
96 | | } |
97 | 0 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
98 | 0 | self.wtr.write_str("[")?; |
99 | 0 | for range in cls.iter() { |
100 | 0 | if range.start() == range.end() { |
101 | 0 | self.write_literal_char(range.start())?; |
102 | | } else { |
103 | 0 | self.write_literal_char(range.start())?; |
104 | 0 | self.wtr.write_str("-")?; |
105 | 0 | self.write_literal_char(range.end())?; |
106 | | } |
107 | | } |
108 | 0 | self.wtr.write_str("]")?; |
109 | | } |
110 | 0 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
111 | 0 | self.wtr.write_str("(?-u:[")?; |
112 | 0 | for range in cls.iter() { |
113 | 0 | if range.start() == range.end() { |
114 | 0 | self.write_literal_class_byte(range.start())?; |
115 | | } else { |
116 | 0 | self.write_literal_class_byte(range.start())?; |
117 | 0 | self.wtr.write_str("-")?; |
118 | 0 | self.write_literal_class_byte(range.end())?; |
119 | | } |
120 | | } |
121 | 0 | self.wtr.write_str("])")?; |
122 | | } |
123 | | HirKind::Anchor(hir::Anchor::StartLine) => { |
124 | 0 | self.wtr.write_str("(?m:^)")?; |
125 | | } |
126 | | HirKind::Anchor(hir::Anchor::EndLine) => { |
127 | 0 | self.wtr.write_str("(?m:$)")?; |
128 | | } |
129 | | HirKind::Anchor(hir::Anchor::StartText) => { |
130 | 0 | self.wtr.write_str(r"\A")?; |
131 | | } |
132 | | HirKind::Anchor(hir::Anchor::EndText) => { |
133 | 0 | self.wtr.write_str(r"\z")?; |
134 | | } |
135 | | HirKind::WordBoundary(hir::WordBoundary::Unicode) => { |
136 | 0 | self.wtr.write_str(r"\b")?; |
137 | | } |
138 | | HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { |
139 | 0 | self.wtr.write_str(r"\B")?; |
140 | | } |
141 | | HirKind::WordBoundary(hir::WordBoundary::Ascii) => { |
142 | 0 | self.wtr.write_str(r"(?-u:\b)")?; |
143 | | } |
144 | | HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { |
145 | 0 | self.wtr.write_str(r"(?-u:\B)")?; |
146 | | } |
147 | 0 | HirKind::Group(ref x) => match x.kind { |
148 | | hir::GroupKind::CaptureIndex(_) => { |
149 | 0 | self.wtr.write_str("(")?; |
150 | | } |
151 | 0 | hir::GroupKind::CaptureName { ref name, .. } => { |
152 | 0 | write!(self.wtr, "(?P<{}>", name)?; |
153 | | } |
154 | | hir::GroupKind::NonCapturing => { |
155 | 0 | self.wtr.write_str("(?:")?; |
156 | | } |
157 | | }, |
158 | | } |
159 | 0 | Ok(()) |
160 | 0 | } |
161 | | |
162 | 0 | fn visit_post(&mut self, hir: &Hir) -> fmt::Result { |
163 | 0 | match *hir.kind() { |
164 | | // Handled during visit_pre |
165 | | HirKind::Empty |
166 | | | HirKind::Literal(_) |
167 | | | HirKind::Class(_) |
168 | | | HirKind::Anchor(_) |
169 | | | HirKind::WordBoundary(_) |
170 | | | HirKind::Concat(_) |
171 | 0 | | HirKind::Alternation(_) => {} |
172 | 0 | HirKind::Repetition(ref x) => { |
173 | 0 | match x.kind { |
174 | | hir::RepetitionKind::ZeroOrOne => { |
175 | 0 | self.wtr.write_str("?")?; |
176 | | } |
177 | | hir::RepetitionKind::ZeroOrMore => { |
178 | 0 | self.wtr.write_str("*")?; |
179 | | } |
180 | | hir::RepetitionKind::OneOrMore => { |
181 | 0 | self.wtr.write_str("+")?; |
182 | | } |
183 | 0 | hir::RepetitionKind::Range(ref x) => match *x { |
184 | 0 | hir::RepetitionRange::Exactly(m) => { |
185 | 0 | write!(self.wtr, "{{{}}}", m)?; |
186 | | } |
187 | 0 | hir::RepetitionRange::AtLeast(m) => { |
188 | 0 | write!(self.wtr, "{{{},}}", m)?; |
189 | | } |
190 | 0 | hir::RepetitionRange::Bounded(m, n) => { |
191 | 0 | write!(self.wtr, "{{{},{}}}", m, n)?; |
192 | | } |
193 | | }, |
194 | | } |
195 | 0 | if !x.greedy { |
196 | 0 | self.wtr.write_str("?")?; |
197 | 0 | } |
198 | | } |
199 | | HirKind::Group(_) => { |
200 | 0 | self.wtr.write_str(")")?; |
201 | | } |
202 | | } |
203 | 0 | Ok(()) |
204 | 0 | } |
205 | | |
206 | 0 | fn visit_alternation_in(&mut self) -> fmt::Result { |
207 | 0 | self.wtr.write_str("|") |
208 | 0 | } |
209 | | } |
210 | | |
211 | | impl<W: fmt::Write> Writer<W> { |
212 | 0 | fn write_literal_char(&mut self, c: char) -> fmt::Result { |
213 | 0 | if is_meta_character(c) { |
214 | 0 | self.wtr.write_str("\\")?; |
215 | 0 | } |
216 | 0 | self.wtr.write_char(c) |
217 | 0 | } |
218 | | |
219 | 0 | fn write_literal_byte(&mut self, b: u8) -> fmt::Result { |
220 | 0 | let c = b as char; |
221 | 0 | if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { |
222 | 0 | self.write_literal_char(c) |
223 | | } else { |
224 | 0 | write!(self.wtr, "(?-u:\\x{:02X})", b) |
225 | | } |
226 | 0 | } |
227 | | |
228 | 0 | fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { |
229 | 0 | let c = b as char; |
230 | 0 | if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { |
231 | 0 | self.write_literal_char(c) |
232 | | } else { |
233 | 0 | write!(self.wtr, "\\x{:02X}", b) |
234 | | } |
235 | 0 | } |
236 | | } |
237 | | |
// Round-trip tests: each case parses a pattern into an HIR, prints the HIR,
// and compares the printed text against an expected pattern string.
#[cfg(test)]
mod tests {
    use super::Printer;
    use crate::ParserBuilder;

    /// Round-trip `given` through a parser with default settings.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Round-trip `given` through a parser configured with
    /// `allow_invalid_utf8(true)`.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Parse `given` with a parser configured by `f`, print the resulting
    /// HIR, check that the printed pattern parses again with the same
    /// configuration, and assert that the printed pattern equals `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        // With Unicode mode on, `\xff` is printed as the codepoint U+00FF.
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        // With Unicode mode off, it is printed as an escaped byte.
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"[a]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
        roundtrip(r"[-]", r"[\-]");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"[\[]");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
    }

    #[test]
    fn print_anchor() {
        // Without the multi-line flag, `^` and `$` print as `\A` and `\z`.
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        // With it, they print inside an explicit `(?m:...)` group.
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        // In each group, the pattern written with the `U` flag is expected
        // to print with an explicit trailing `?`.
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a{1}");
        roundtrip("a{1,}", "a{1,}");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a{1}?");
        roundtrip("a{1,}?", "a{1,}?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a{1}?");
        roundtrip("(?U)a{1,}", "a{1,}?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");
    }

    #[test]
    fn print_group() {
        // Empty groups of each kind.
        roundtrip("()", "()");
        roundtrip("(?P<foo>)", "(?P<foo>)");
        roundtrip("(?:)", "(?:)");

        // The same kinds of group with a single literal inside.
        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "(?:a)");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        // Alternations whose branches are all empty.
        roundtrip("|", "|");
        roundtrip("||", "||");

        roundtrip("a|b", "a|b");
        roundtrip("a|b|c", "a|b|c");
        roundtrip("foo|bar|quux", "foo|bar|quux");
    }
}