/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.4.13/src/util/syntax.rs
Line | Count | Source |
1 | | /*! |
2 | | Utilities for dealing with the syntax of a regular expression. |
3 | | |
4 | | This module currently only exposes a [`Config`] type that |
5 | | itself represents a wrapper around the configuration for a |
6 | | [`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of |
7 | | this wrapper is to make configuring syntax options very similar to how other |
8 | | configuration is done throughout this crate. Namely, instead of duplicating |
9 | | syntax options across every builder (of which there are many), we instead |
10 | | create small config objects like this one that can be passed around and |
11 | | composed. |
12 | | */ |
13 | | |
14 | | use alloc::{vec, vec::Vec}; |
15 | | |
16 | | use regex_syntax::{ |
17 | | ast, |
18 | | hir::{self, Hir}, |
19 | | Error, ParserBuilder, |
20 | | }; |
21 | | |
22 | | /// A convenience routine for parsing a pattern into an HIR value with the |
23 | | /// default configuration. |
24 | | /// |
25 | | /// # Example |
26 | | /// |
27 | | /// This shows how to parse a pattern into an HIR value: |
28 | | /// |
29 | | /// ``` |
30 | | /// use regex_automata::util::syntax; |
31 | | /// |
32 | | /// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?; |
33 | | /// assert_eq!(Some(1), hir.properties().static_explicit_captures_len()); |
34 | | /// |
35 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
36 | | /// ``` |
37 | 0 | pub fn parse(pattern: &str) -> Result<Hir, Error> { |
38 | 0 | parse_with(pattern, &Config::default()) |
39 | 0 | } |
40 | | |
41 | | /// A convenience routine for parsing many patterns into HIR value with the |
42 | | /// default configuration. |
43 | | /// |
44 | | /// # Example |
45 | | /// |
46 | | /// This shows how to parse many patterns into an corresponding HIR values: |
47 | | /// |
48 | | /// ``` |
49 | | /// use { |
50 | | /// regex_automata::util::syntax, |
51 | | /// regex_syntax::hir::Properties, |
52 | | /// }; |
53 | | /// |
54 | | /// let hirs = syntax::parse_many(&[ |
55 | | /// r"([a-z]+)|([0-9]+)", |
56 | | /// r"foo(A-Z]+)bar", |
57 | | /// ])?; |
58 | | /// let props = Properties::union(hirs.iter().map(|h| h.properties())); |
59 | | /// assert_eq!(Some(1), props.static_explicit_captures_len()); |
60 | | /// |
61 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
62 | | /// ``` |
63 | 0 | pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> { |
64 | 0 | parse_many_with(patterns, &Config::default()) |
65 | 0 | } |
66 | | |
67 | | /// A convenience routine for parsing a pattern into an HIR value using a |
68 | | /// `Config`. |
69 | | /// |
70 | | /// # Example |
71 | | /// |
72 | | /// This shows how to parse a pattern into an HIR value with a non-default |
73 | | /// configuration: |
74 | | /// |
75 | | /// ``` |
76 | | /// use regex_automata::util::syntax; |
77 | | /// |
78 | | /// let hir = syntax::parse_with( |
79 | | /// r"^[a-z]+$", |
80 | | /// &syntax::Config::new().multi_line(true).crlf(true), |
81 | | /// )?; |
82 | | /// assert!(hir.properties().look_set().contains_anchor_crlf()); |
83 | | /// |
84 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
85 | | /// ``` |
86 | 0 | pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> { |
87 | 0 | let mut builder = ParserBuilder::new(); |
88 | 0 | config.apply(&mut builder); |
89 | 0 | builder.build().parse(pattern) |
90 | 0 | } |
91 | | |
92 | | /// A convenience routine for parsing many patterns into HIR values using a |
93 | | /// `Config`. |
94 | | /// |
95 | | /// # Example |
96 | | /// |
97 | | /// This shows how to parse many patterns into an corresponding HIR values |
98 | | /// with a non-default configuration: |
99 | | /// |
100 | | /// ``` |
101 | | /// use { |
102 | | /// regex_automata::util::syntax, |
103 | | /// regex_syntax::hir::Properties, |
104 | | /// }; |
105 | | /// |
106 | | /// let patterns = &[ |
107 | | /// r"([a-z]+)|([0-9]+)", |
108 | | /// r"\W", |
109 | | /// r"foo(A-Z]+)bar", |
110 | | /// ]; |
111 | | /// let config = syntax::Config::new().unicode(false).utf8(false); |
112 | | /// let hirs = syntax::parse_many_with(patterns, &config)?; |
113 | | /// let props = Properties::union(hirs.iter().map(|h| h.properties())); |
114 | | /// assert!(!props.is_utf8()); |
115 | | /// |
116 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
117 | | /// ``` |
118 | 0 | pub fn parse_many_with<P: AsRef<str>>( |
119 | 0 | patterns: &[P], |
120 | 0 | config: &Config, |
121 | 0 | ) -> Result<Vec<Hir>, Error> { |
122 | 0 | let mut builder = ParserBuilder::new(); |
123 | 0 | config.apply(&mut builder); |
124 | 0 | let mut hirs = vec![]; |
125 | 0 | for p in patterns.iter() { |
126 | 0 | hirs.push(builder.build().parse(p.as_ref())?); |
127 | | } |
128 | 0 | Ok(hirs) |
129 | 0 | } |
130 | | |
/// A common set of configuration options that apply to the syntax of a regex.
///
/// This represents a group of configuration options that specifically apply
/// to how the concrete syntax of a regular expression is interpreted. In
/// particular, they are generally forwarded to the
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
/// in the
/// [`regex-syntax`](https://docs.rs/regex-syntax)
/// crate when building a regex from its concrete syntax directly.
///
/// These options are defined as a group since they apply to every regex engine
/// in this crate. Instead of re-defining them on every engine's builder, they
/// are instead provided here as one cohesive unit.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    // Set via `Config::case_insensitive`; `Config::new` starts it at `false`.
    case_insensitive: bool,
    // Set via `Config::multi_line`; `Config::new` starts it at `false`.
    multi_line: bool,
    // Set via `Config::dot_matches_new_line`; `Config::new` starts it at
    // `false`.
    dot_matches_new_line: bool,
    // Set via `Config::crlf`; `Config::new` starts it at `false`.
    crlf: bool,
    // Set via `Config::line_terminator`; `Config::new` starts it at `b'\n'`.
    line_terminator: u8,
    // Set via `Config::swap_greed`; `Config::new` starts it at `false`.
    swap_greed: bool,
    // Set via `Config::ignore_whitespace`; `Config::new` starts it at
    // `false`. Forwarded to the AST parser, not the HIR translator.
    ignore_whitespace: bool,
    // Set via `Config::unicode`; `Config::new` starts it at `true`.
    unicode: bool,
    // Set via `Config::utf8`; `Config::new` starts it at `true`.
    utf8: bool,
    // Set via `Config::nest_limit`; `Config::new` starts it at `250`.
    // Forwarded to the AST parser, not the HIR translator.
    nest_limit: u32,
    // Set via `Config::octal`; `Config::new` starts it at `false`.
    // Forwarded to the AST parser, not the HIR translator.
    octal: bool,
}
158 | | |
159 | | impl Config { |
160 | | /// Return a new default syntax configuration. |
161 | 0 | pub fn new() -> Config { |
162 | | // These defaults match the ones used in regex-syntax. |
163 | 0 | Config { |
164 | 0 | case_insensitive: false, |
165 | 0 | multi_line: false, |
166 | 0 | dot_matches_new_line: false, |
167 | 0 | crlf: false, |
168 | 0 | line_terminator: b'\n', |
169 | 0 | swap_greed: false, |
170 | 0 | ignore_whitespace: false, |
171 | 0 | unicode: true, |
172 | 0 | utf8: true, |
173 | 0 | nest_limit: 250, |
174 | 0 | octal: false, |
175 | 0 | } |
176 | 0 | } |
177 | | |
178 | | /// Enable or disable the case insensitive flag by default. |
179 | | /// |
180 | | /// When Unicode mode is enabled, case insensitivity is Unicode-aware. |
181 | | /// Specifically, it will apply the "simple" case folding rules as |
182 | | /// specified by Unicode. |
183 | | /// |
184 | | /// By default this is disabled. It may alternatively be selectively |
185 | | /// enabled in the regular expression itself via the `i` flag. |
186 | 0 | pub fn case_insensitive(mut self, yes: bool) -> Config { |
187 | 0 | self.case_insensitive = yes; |
188 | 0 | self |
189 | 0 | } |
190 | | |
191 | | /// Enable or disable the multi-line matching flag by default. |
192 | | /// |
193 | | /// When this is enabled, the `^` and `$` look-around assertions will |
194 | | /// match immediately after and immediately before a new line character, |
195 | | /// respectively. Note that the `\A` and `\z` look-around assertions are |
196 | | /// unaffected by this setting and always correspond to matching at the |
197 | | /// beginning and end of the input. |
198 | | /// |
199 | | /// By default this is disabled. It may alternatively be selectively |
200 | | /// enabled in the regular expression itself via the `m` flag. |
201 | 0 | pub fn multi_line(mut self, yes: bool) -> Config { |
202 | 0 | self.multi_line = yes; |
203 | 0 | self |
204 | 0 | } |
205 | | |
206 | | /// Enable or disable the "dot matches any character" flag by default. |
207 | | /// |
208 | | /// When this is enabled, `.` will match any character. When it's disabled, |
209 | | /// then `.` will match any character except for a new line character. |
210 | | /// |
211 | | /// Note that `.` is impacted by whether the "unicode" setting is enabled |
212 | | /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 |
213 | | /// encoding of any Unicode scalar value (sans a new line, depending on |
214 | | /// whether this "dot matches new line" option is enabled). When Unicode |
215 | | /// mode is disabled, `.` will match any byte instead. Because of this, |
216 | | /// when Unicode mode is disabled, `.` can only be used when the "allow |
217 | | /// invalid UTF-8" option is enabled, since `.` could otherwise match |
218 | | /// invalid UTF-8. |
219 | | /// |
220 | | /// By default this is disabled. It may alternatively be selectively |
221 | | /// enabled in the regular expression itself via the `s` flag. |
222 | 0 | pub fn dot_matches_new_line(mut self, yes: bool) -> Config { |
223 | 0 | self.dot_matches_new_line = yes; |
224 | 0 | self |
225 | 0 | } |
226 | | |
227 | | /// Enable or disable the "CRLF mode" flag by default. |
228 | | /// |
229 | | /// By default this is disabled. It may alternatively be selectively |
230 | | /// enabled in the regular expression itself via the `R` flag. |
231 | | /// |
232 | | /// When CRLF mode is enabled, the following happens: |
233 | | /// |
234 | | /// * Unless `dot_matches_new_line` is enabled, `.` will match any character |
235 | | /// except for `\r` and `\n`. |
236 | | /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, |
237 | | /// `\r` and `\n` as line terminators. And in particular, neither will |
238 | | /// match between a `\r` and a `\n`. |
239 | 0 | pub fn crlf(mut self, yes: bool) -> Config { |
240 | 0 | self.crlf = yes; |
241 | 0 | self |
242 | 0 | } |
243 | | |
244 | | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. |
245 | | /// |
246 | | /// Namely, instead of `.` (by default) matching everything except for `\n`, |
247 | | /// this will cause `.` to match everything except for the byte given. |
248 | | /// |
249 | | /// If `.` is used in a context where Unicode mode is enabled and this byte |
250 | | /// isn't ASCII, then an error will be returned. When Unicode mode is |
251 | | /// disabled, then any byte is permitted, but will return an error if UTF-8 |
252 | | /// mode is enabled and it is a non-ASCII byte. |
253 | | /// |
254 | | /// In short, any ASCII value for a line terminator is always okay. But a |
255 | | /// non-ASCII byte might result in an error depending on whether Unicode |
256 | | /// mode or UTF-8 mode are enabled. |
257 | | /// |
258 | | /// Note that if `R` mode is enabled then it always takes precedence and |
259 | | /// the line terminator will be treated as `\r` and `\n` simultaneously. |
260 | | /// |
261 | | /// Note also that this *doesn't* impact the look-around assertions |
262 | | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional |
263 | | /// configuration in the regex engine itself. |
264 | 0 | pub fn line_terminator(mut self, byte: u8) -> Config { |
265 | 0 | self.line_terminator = byte; |
266 | 0 | self |
267 | 0 | } |
268 | | |
269 | | /// Enable or disable the "swap greed" flag by default. |
270 | | /// |
271 | | /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` |
272 | | /// will become greedy. |
273 | | /// |
274 | | /// By default this is disabled. It may alternatively be selectively |
275 | | /// enabled in the regular expression itself via the `U` flag. |
276 | 0 | pub fn swap_greed(mut self, yes: bool) -> Config { |
277 | 0 | self.swap_greed = yes; |
278 | 0 | self |
279 | 0 | } |
280 | | |
281 | | /// Enable verbose mode in the regular expression. |
282 | | /// |
283 | | /// When enabled, verbose mode permits insignificant whitespace in many |
284 | | /// places in the regular expression, as well as comments. Comments are |
285 | | /// started using `#` and continue until the end of the line. |
286 | | /// |
287 | | /// By default, this is disabled. It may be selectively enabled in the |
288 | | /// regular expression by using the `x` flag regardless of this setting. |
289 | 0 | pub fn ignore_whitespace(mut self, yes: bool) -> Config { |
290 | 0 | self.ignore_whitespace = yes; |
291 | 0 | self |
292 | 0 | } |
293 | | |
294 | | /// Enable or disable the Unicode flag (`u`) by default. |
295 | | /// |
296 | | /// By default this is **enabled**. It may alternatively be selectively |
297 | | /// disabled in the regular expression itself via the `u` flag. |
298 | | /// |
299 | | /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by |
300 | | /// default), a regular expression will fail to parse if Unicode mode is |
301 | | /// disabled and a sub-expression could possibly match invalid UTF-8. |
302 | | /// |
303 | | /// **WARNING**: Unicode mode can greatly increase the size of the compiled |
304 | | /// DFA, which can noticeably impact both memory usage and compilation |
305 | | /// time. This is especially noticeable if your regex contains character |
306 | | /// classes like `\w` that are impacted by whether Unicode is enabled or |
307 | | /// not. If Unicode is not necessary, you are encouraged to disable it. |
308 | 0 | pub fn unicode(mut self, yes: bool) -> Config { |
309 | 0 | self.unicode = yes; |
310 | 0 | self |
311 | 0 | } |
312 | | |
313 | | /// When disabled, the builder will permit the construction of a regular |
314 | | /// expression that may match invalid UTF-8. |
315 | | /// |
316 | | /// For example, when [`Config::unicode`] is disabled, then |
317 | | /// expressions like `[^a]` may match invalid UTF-8 since they can match |
318 | | /// any single byte that is not `a`. By default, these sub-expressions |
319 | | /// are disallowed to avoid returning offsets that split a UTF-8 |
320 | | /// encoded codepoint. However, in cases where matching at arbitrary |
321 | | /// locations is desired, this option can be disabled to permit all such |
322 | | /// sub-expressions. |
323 | | /// |
324 | | /// When enabled (the default), the builder is guaranteed to produce a |
325 | | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
326 | | /// will return an error). |
327 | 0 | pub fn utf8(mut self, yes: bool) -> Config { |
328 | 0 | self.utf8 = yes; |
329 | 0 | self |
330 | 0 | } |
331 | | |
332 | | /// Set the nesting limit used for the regular expression parser. |
333 | | /// |
334 | | /// The nesting limit controls how deep the abstract syntax tree is allowed |
335 | | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
336 | | /// groups), then an error is returned by the parser. |
337 | | /// |
338 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
339 | | /// overflow when building a finite automaton from a regular expression's |
340 | | /// abstract syntax tree. In particular, construction currently uses |
341 | | /// recursion. In the future, the implementation may stop using recursion |
342 | | /// and this option will no longer be necessary. |
343 | | /// |
344 | | /// This limit is not checked until the entire AST is parsed. Therefore, |
345 | | /// if callers want to put a limit on the amount of heap space used, then |
346 | | /// they should impose a limit on the length, in bytes, of the concrete |
347 | | /// pattern string. In particular, this is viable since the parser will |
348 | | /// limit itself to heap space proportional to the length of the pattern |
349 | | /// string. |
350 | | /// |
351 | | /// Note that a nest limit of `0` will return a nest limit error for most |
352 | | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
353 | | /// not `ab`, since `ab` requires a concatenation AST item, which results |
354 | | /// in a nest depth of `1`. In general, a nest limit is not something that |
355 | | /// manifests in an obvious way in the concrete syntax, therefore, it |
356 | | /// should not be used in a granular way. |
357 | 0 | pub fn nest_limit(mut self, limit: u32) -> Config { |
358 | 0 | self.nest_limit = limit; |
359 | 0 | self |
360 | 0 | } |
361 | | |
362 | | /// Whether to support octal syntax or not. |
363 | | /// |
364 | | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
365 | | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
366 | | /// `\141` are all equivalent regular expressions, where the last example |
367 | | /// shows octal syntax. |
368 | | /// |
369 | | /// While supporting octal syntax isn't in and of itself a problem, it does |
370 | | /// make good error messages harder. That is, in PCRE based regex engines, |
371 | | /// syntax like `\1` invokes a backreference, which is explicitly |
372 | | /// unsupported in Rust's regex engine. However, many users expect it to |
373 | | /// be supported. Therefore, when octal support is disabled, the error |
374 | | /// message will explicitly mention that backreferences aren't supported. |
375 | | /// |
376 | | /// Octal syntax is disabled by default. |
377 | 0 | pub fn octal(mut self, yes: bool) -> Config { |
378 | 0 | self.octal = yes; |
379 | 0 | self |
380 | 0 | } |
381 | | |
382 | | /// Returns whether "unicode" mode is enabled. |
383 | 0 | pub fn get_unicode(&self) -> bool { |
384 | 0 | self.unicode |
385 | 0 | } |
386 | | |
387 | | /// Returns whether "case insensitive" mode is enabled. |
388 | 0 | pub fn get_case_insensitive(&self) -> bool { |
389 | 0 | self.case_insensitive |
390 | 0 | } |
391 | | |
392 | | /// Returns whether "multi line" mode is enabled. |
393 | 0 | pub fn get_multi_line(&self) -> bool { |
394 | 0 | self.multi_line |
395 | 0 | } |
396 | | |
397 | | /// Returns whether "dot matches new line" mode is enabled. |
398 | 0 | pub fn get_dot_matches_new_line(&self) -> bool { |
399 | 0 | self.dot_matches_new_line |
400 | 0 | } |
401 | | |
402 | | /// Returns whether "CRLF" mode is enabled. |
403 | 0 | pub fn get_crlf(&self) -> bool { |
404 | 0 | self.crlf |
405 | 0 | } |
406 | | |
407 | | /// Returns the line terminator in this syntax configuration. |
408 | 0 | pub fn get_line_terminator(&self) -> u8 { |
409 | 0 | self.line_terminator |
410 | 0 | } |
411 | | |
412 | | /// Returns whether "swap greed" mode is enabled. |
413 | 0 | pub fn get_swap_greed(&self) -> bool { |
414 | 0 | self.swap_greed |
415 | 0 | } |
416 | | |
417 | | /// Returns whether "ignore whitespace" mode is enabled. |
418 | 0 | pub fn get_ignore_whitespace(&self) -> bool { |
419 | 0 | self.ignore_whitespace |
420 | 0 | } |
421 | | |
422 | | /// Returns whether UTF-8 mode is enabled. |
423 | 0 | pub fn get_utf8(&self) -> bool { |
424 | 0 | self.utf8 |
425 | 0 | } |
426 | | |
427 | | /// Returns the "nest limit" setting. |
428 | 0 | pub fn get_nest_limit(&self) -> u32 { |
429 | 0 | self.nest_limit |
430 | 0 | } |
431 | | |
432 | | /// Returns whether "octal" mode is enabled. |
433 | 0 | pub fn get_octal(&self) -> bool { |
434 | 0 | self.octal |
435 | 0 | } |
436 | | |
437 | | /// Applies this configuration to the given parser. |
438 | 0 | pub(crate) fn apply(&self, builder: &mut ParserBuilder) { |
439 | 0 | builder |
440 | 0 | .unicode(self.unicode) |
441 | 0 | .case_insensitive(self.case_insensitive) |
442 | 0 | .multi_line(self.multi_line) |
443 | 0 | .dot_matches_new_line(self.dot_matches_new_line) |
444 | 0 | .crlf(self.crlf) |
445 | 0 | .line_terminator(self.line_terminator) |
446 | 0 | .swap_greed(self.swap_greed) |
447 | 0 | .ignore_whitespace(self.ignore_whitespace) |
448 | 0 | .utf8(self.utf8) |
449 | 0 | .nest_limit(self.nest_limit) |
450 | 0 | .octal(self.octal); |
451 | 0 | } |
452 | | |
453 | | /// Applies this configuration to the given AST parser. |
454 | 0 | pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { |
455 | 0 | builder |
456 | 0 | .ignore_whitespace(self.ignore_whitespace) |
457 | 0 | .nest_limit(self.nest_limit) |
458 | 0 | .octal(self.octal); |
459 | 0 | } |
460 | | |
461 | | /// Applies this configuration to the given AST-to-HIR translator. |
462 | 0 | pub(crate) fn apply_hir( |
463 | 0 | &self, |
464 | 0 | builder: &mut hir::translate::TranslatorBuilder, |
465 | 0 | ) { |
466 | 0 | builder |
467 | 0 | .unicode(self.unicode) |
468 | 0 | .case_insensitive(self.case_insensitive) |
469 | 0 | .multi_line(self.multi_line) |
470 | 0 | .crlf(self.crlf) |
471 | 0 | .dot_matches_new_line(self.dot_matches_new_line) |
472 | 0 | .line_terminator(self.line_terminator) |
473 | 0 | .swap_greed(self.swap_greed) |
474 | 0 | .utf8(self.utf8); |
475 | 0 | } |
476 | | } |
477 | | |
478 | | impl Default for Config { |
479 | 0 | fn default() -> Config { |
480 | 0 | Config::new() |
481 | 0 | } |
482 | | } |