/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.6/src/parser.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use crate::{ast, hir, Error}; |
2 | | |
3 | | /// A convenience routine for parsing a regex using default options. |
4 | | /// |
5 | | /// This is equivalent to `Parser::new().parse(pattern)`. |
6 | | /// |
7 | | /// If you need to set non-default options, then use a [`ParserBuilder`]. |
8 | | /// |
9 | | /// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically |
10 | | /// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator |
11 | | /// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then |
12 | | /// you should use a [`ast::parse::Parser`]. |
13 | 0 | pub fn parse(pattern: &str) -> Result<hir::Hir, Error> { |
14 | 0 | Parser::new().parse(pattern) |
15 | 0 | } |
16 | | |
17 | | /// A builder for a regular expression parser. |
18 | | /// |
19 | | /// This builder permits modifying configuration options for the parser. |
20 | | /// |
21 | | /// This type combines the builder options for both the [AST |
22 | | /// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR |
23 | | /// `TranslatorBuilder`](hir::translate::TranslatorBuilder). |
24 | | #[derive(Clone, Debug, Default)] |
25 | | pub struct ParserBuilder { |
26 | | ast: ast::parse::ParserBuilder, |
27 | | hir: hir::translate::TranslatorBuilder, |
28 | | } |
29 | | |
30 | | impl ParserBuilder { |
31 | | /// Create a new parser builder with a default configuration. |
32 | 0 | pub fn new() -> ParserBuilder { |
33 | 0 | ParserBuilder::default() |
34 | 0 | } |
35 | | |
36 | | /// Build a parser from this configuration with the given pattern. |
37 | 0 | pub fn build(&self) -> Parser { |
38 | 0 | Parser { ast: self.ast.build(), hir: self.hir.build() } |
39 | 0 | } |
40 | | |
41 | | /// Set the nesting limit for this parser. |
42 | | /// |
43 | | /// The nesting limit controls how deep the abstract syntax tree is allowed |
44 | | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
45 | | /// groups), then an error is returned by the parser. |
46 | | /// |
47 | | /// The purpose of this limit is to act as a heuristic to prevent stack |
48 | | /// overflow for consumers that do structural induction on an `Ast` using |
49 | | /// explicit recursion. While this crate never does this (instead using |
50 | | /// constant stack space and moving the call stack to the heap), other |
51 | | /// crates may. |
52 | | /// |
53 | | /// This limit is not checked until the entire Ast is parsed. Therefore, |
54 | | /// if callers want to put a limit on the amount of heap space used, then |
55 | | /// they should impose a limit on the length, in bytes, of the concrete |
56 | | /// pattern string. In particular, this is viable since this parser |
57 | | /// implementation will limit itself to heap space proportional to the |
58 | | /// length of the pattern string. |
59 | | /// |
60 | | /// Note that a nest limit of `0` will return a nest limit error for most |
61 | | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
62 | | /// not `ab`, since `ab` requires a concatenation, which results in a nest |
63 | | /// depth of `1`. In general, a nest limit is not something that manifests |
64 | | /// in an obvious way in the concrete syntax, therefore, it should not be |
65 | | /// used in a granular way. |
66 | 0 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { |
67 | 0 | self.ast.nest_limit(limit); |
68 | 0 | self |
69 | 0 | } |
70 | | |
71 | | /// Whether to support octal syntax or not. |
72 | | /// |
73 | | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
74 | | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
75 | | /// `\141` are all equivalent regular expressions, where the last example |
76 | | /// shows octal syntax. |
77 | | /// |
78 | | /// While supporting octal syntax isn't in and of itself a problem, it does |
79 | | /// make good error messages harder. That is, in PCRE based regex engines, |
80 | | /// syntax like `\0` invokes a backreference, which is explicitly |
81 | | /// unsupported in Rust's regex engine. However, many users expect it to |
82 | | /// be supported. Therefore, when octal support is disabled, the error |
83 | | /// message will explicitly mention that backreferences aren't supported. |
84 | | /// |
85 | | /// Octal syntax is disabled by default. |
86 | 0 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { |
87 | 0 | self.ast.octal(yes); |
88 | 0 | self |
89 | 0 | } |
90 | | |
91 | | /// When disabled, translation will permit the construction of a regular |
92 | | /// expression that may match invalid UTF-8. |
93 | | /// |
94 | | /// When enabled (the default), the translator is guaranteed to produce an |
95 | | /// expression that, for non-empty matches, will only ever produce spans |
96 | | /// that are entirely valid UTF-8 (otherwise, the translator will return an |
97 | | /// error). |
98 | | /// |
99 | | /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even |
100 | | /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete |
101 | | /// syntax) will be allowed even though they can produce matches that split |
102 | | /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" |
103 | | /// matches, and it is expected that the regex engine itself must handle |
104 | | /// these cases if necessary (perhaps by suppressing any zero-width matches |
105 | | /// that split a codepoint). |
106 | 0 | pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder { |
107 | 0 | self.hir.utf8(yes); |
108 | 0 | self |
109 | 0 | } |
110 | | |
111 | | /// Enable verbose mode in the regular expression. |
112 | | /// |
113 | | /// When enabled, verbose mode permits insignificant whitespace in many |
114 | | /// places in the regular expression, as well as comments. Comments are |
115 | | /// started using `#` and continue until the end of the line. |
116 | | /// |
117 | | /// By default, this is disabled. It may be selectively enabled in the |
118 | | /// regular expression by using the `x` flag regardless of this setting. |
119 | 0 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { |
120 | 0 | self.ast.ignore_whitespace(yes); |
121 | 0 | self |
122 | 0 | } |
123 | | |
124 | | /// Enable or disable the case insensitive flag by default. |
125 | | /// |
126 | | /// By default this is disabled. It may alternatively be selectively |
127 | | /// enabled in the regular expression itself via the `i` flag. |
128 | 0 | pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { |
129 | 0 | self.hir.case_insensitive(yes); |
130 | 0 | self |
131 | 0 | } |
132 | | |
133 | | /// Enable or disable the multi-line matching flag by default. |
134 | | /// |
135 | | /// By default this is disabled. It may alternatively be selectively |
136 | | /// enabled in the regular expression itself via the `m` flag. |
137 | 0 | pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { |
138 | 0 | self.hir.multi_line(yes); |
139 | 0 | self |
140 | 0 | } |
141 | | |
142 | | /// Enable or disable the "dot matches any character" flag by default. |
143 | | /// |
144 | | /// By default this is disabled. It may alternatively be selectively |
145 | | /// enabled in the regular expression itself via the `s` flag. |
146 | 0 | pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { |
147 | 0 | self.hir.dot_matches_new_line(yes); |
148 | 0 | self |
149 | 0 | } |
150 | | |
151 | | /// Enable or disable the CRLF mode flag by default. |
152 | | /// |
153 | | /// By default this is disabled. It may alternatively be selectively |
154 | | /// enabled in the regular expression itself via the `R` flag. |
155 | | /// |
156 | | /// When CRLF mode is enabled, the following happens: |
157 | | /// |
158 | | /// * Unless `dot_matches_new_line` is enabled, `.` will match any character |
159 | | /// except for `\r` and `\n`. |
160 | | /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, |
161 | | /// `\r` and `\n` as line terminators. And in particular, neither will |
162 | | /// match between a `\r` and a `\n`. |
163 | 0 | pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { |
164 | 0 | self.hir.crlf(yes); |
165 | 0 | self |
166 | 0 | } |
167 | | |
168 | | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. |
169 | | /// |
170 | | /// Namely, instead of `.` (by default) matching everything except for `\n`, |
171 | | /// this will cause `.` to match everything except for the byte given. |
172 | | /// |
173 | | /// If `.` is used in a context where Unicode mode is enabled and this byte |
174 | | /// isn't ASCII, then an error will be returned. When Unicode mode is |
175 | | /// disabled, then any byte is permitted, but will return an error if UTF-8 |
176 | | /// mode is enabled and it is a non-ASCII byte. |
177 | | /// |
178 | | /// In short, any ASCII value for a line terminator is always okay. But a |
179 | | /// non-ASCII byte might result in an error depending on whether Unicode |
180 | | /// mode or UTF-8 mode are enabled. |
181 | | /// |
182 | | /// Note that if `R` mode is enabled then it always takes precedence and |
183 | | /// the line terminator will be treated as `\r` and `\n` simultaneously. |
184 | | /// |
185 | | /// Note also that this *doesn't* impact the look-around assertions |
186 | | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional |
187 | | /// configuration in the regex engine itself. |
188 | 0 | pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder { |
189 | 0 | self.hir.line_terminator(byte); |
190 | 0 | self |
191 | 0 | } |
192 | | |
193 | | /// Enable or disable the "swap greed" flag by default. |
194 | | /// |
195 | | /// By default this is disabled. It may alternatively be selectively |
196 | | /// enabled in the regular expression itself via the `U` flag. |
197 | 0 | pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { |
198 | 0 | self.hir.swap_greed(yes); |
199 | 0 | self |
200 | 0 | } |
201 | | |
202 | | /// Enable or disable the Unicode flag (`u`) by default. |
203 | | /// |
204 | | /// By default this is **enabled**. It may alternatively be selectively |
205 | | /// disabled in the regular expression itself via the `u` flag. |
206 | | /// |
207 | | /// Note that unless `utf8` is disabled (it's enabled by default), a |
208 | | /// regular expression will fail to parse if Unicode mode is disabled and a |
209 | | /// sub-expression could possibly match invalid UTF-8. |
210 | 0 | pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { |
211 | 0 | self.hir.unicode(yes); |
212 | 0 | self |
213 | 0 | } |
214 | | } |
215 | | |
216 | | /// A convenience parser for regular expressions. |
217 | | /// |
218 | | /// This parser takes as input a regular expression pattern string (the |
219 | | /// "concrete syntax") and returns a high-level intermediate representation |
220 | | /// (the HIR) suitable for most types of analysis. In particular, this parser |
221 | | /// hides the intermediate state of producing an AST (the "abstract syntax"). |
222 | | /// The AST is itself far more complex than the HIR, so this parser serves as a |
223 | | /// convenience for never having to deal with it at all. |
224 | | /// |
225 | | /// If callers have more fine grained use cases that need an AST, then please |
226 | | /// see the [`ast::parse`] module. |
227 | | /// |
228 | | /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. |
229 | | #[derive(Clone, Debug)] |
230 | | pub struct Parser { |
231 | | ast: ast::parse::Parser, |
232 | | hir: hir::translate::Translator, |
233 | | } |
234 | | |
235 | | impl Parser { |
236 | | /// Create a new parser with a default configuration. |
237 | | /// |
238 | | /// The parser can be run with `parse` method. The parse method returns |
239 | | /// a high level intermediate representation of the given regular |
240 | | /// expression. |
241 | | /// |
242 | | /// To set configuration options on the parser, use [`ParserBuilder`]. |
243 | 0 | pub fn new() -> Parser { |
244 | 0 | ParserBuilder::new().build() |
245 | 0 | } |
246 | | |
247 | | /// Parse the regular expression into a high level intermediate |
248 | | /// representation. |
249 | 0 | pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> { |
250 | 0 | let ast = self.ast.parse(pattern)?; |
251 | 0 | let hir = self.hir.translate(pattern, &ast)?; |
252 | 0 | Ok(hir) |
253 | 0 | } |
254 | | } |