Coverage Report

Created: 2025-11-16 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/util/syntax.rs
Line
Count
Source
1
/*!
2
Utilities for dealing with the syntax of a regular expression.
3
4
This module primarily exposes a [`Config`] type that
5
itself represents a wrapper around the configuration for a
6
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of
7
this wrapper is to make configuring syntax options very similar to how other
8
configuration is done throughout this crate. Namely, instead of duplicating
9
syntax options across every builder (of which there are many), we instead
10
create small config objects like this one that can be passed around and
11
composed.
12
*/
13
14
use alloc::{vec, vec::Vec};
15
16
use regex_syntax::{
17
    ast,
18
    hir::{self, Hir},
19
    Error, ParserBuilder,
20
};
21
22
/// A convenience routine for parsing a pattern into an HIR value with the
23
/// default configuration.
24
///
25
/// # Example
26
///
27
/// This shows how to parse a pattern into an HIR value:
28
///
29
/// ```
30
/// use regex_automata::util::syntax;
31
///
32
/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
33
/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
34
///
35
/// # Ok::<(), Box<dyn std::error::Error>>(())
36
/// ```
37
0
pub fn parse(pattern: &str) -> Result<Hir, Error> {
38
0
    parse_with(pattern, &Config::default())
39
0
}
40
41
/// A convenience routine for parsing many patterns into HIR values with the
42
/// default configuration.
43
///
44
/// # Example
45
///
46
/// This shows how to parse many patterns into corresponding HIR values:
47
///
48
/// ```
49
/// use {
50
///     regex_automata::util::syntax,
51
///     regex_syntax::hir::Properties,
52
/// };
53
///
54
/// let hirs = syntax::parse_many(&[
55
///     r"([a-z]+)|([0-9]+)",
56
///     r"foo(A-Z]+)bar",
57
/// ])?;
58
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
59
/// assert_eq!(Some(1), props.static_explicit_captures_len());
60
///
61
/// # Ok::<(), Box<dyn std::error::Error>>(())
62
/// ```
63
pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
64
    parse_many_with(patterns, &Config::default())
65
}
66
67
/// A convenience routine for parsing a pattern into an HIR value using a
68
/// `Config`.
69
///
70
/// # Example
71
///
72
/// This shows how to parse a pattern into an HIR value with a non-default
73
/// configuration:
74
///
75
/// ```
76
/// use regex_automata::util::syntax;
77
///
78
/// let hir = syntax::parse_with(
79
///     r"^[a-z]+$",
80
///     &syntax::Config::new().multi_line(true).crlf(true),
81
/// )?;
82
/// assert!(hir.properties().look_set().contains_anchor_crlf());
83
///
84
/// # Ok::<(), Box<dyn std::error::Error>>(())
85
/// ```
86
0
pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
87
0
    let mut builder = ParserBuilder::new();
88
0
    config.apply(&mut builder);
89
0
    builder.build().parse(pattern)
90
0
}
91
92
/// A convenience routine for parsing many patterns into HIR values using a
93
/// `Config`.
94
///
95
/// # Example
96
///
97
/// This shows how to parse many patterns into corresponding HIR values
98
/// with a non-default configuration:
99
///
100
/// ```
101
/// use {
102
///     regex_automata::util::syntax,
103
///     regex_syntax::hir::Properties,
104
/// };
105
///
106
/// let patterns = &[
107
///     r"([a-z]+)|([0-9]+)",
108
///     r"\W",
109
///     r"foo(A-Z]+)bar",
110
/// ];
111
/// let config = syntax::Config::new().unicode(false).utf8(false);
112
/// let hirs = syntax::parse_many_with(patterns, &config)?;
113
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
114
/// assert!(!props.is_utf8());
115
///
116
/// # Ok::<(), Box<dyn std::error::Error>>(())
117
/// ```
118
pub fn parse_many_with<P: AsRef<str>>(
119
    patterns: &[P],
120
    config: &Config,
121
) -> Result<Vec<Hir>, Error> {
122
    let mut builder = ParserBuilder::new();
123
    config.apply(&mut builder);
124
    let mut hirs = vec![];
125
    for p in patterns.iter() {
126
        hirs.push(builder.build().parse(p.as_ref())?);
127
    }
128
    Ok(hirs)
129
}
130
131
/// A common set of configuration options that apply to the syntax of a regex.
132
///
133
/// This represents a group of configuration options that specifically apply
134
/// to how the concrete syntax of a regular expression is interpreted. In
135
/// particular, they are generally forwarded to the
136
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
137
/// in the
138
/// [`regex-syntax`](https://docs.rs/regex-syntax)
139
/// crate when building a regex from its concrete syntax directly.
140
///
141
/// These options are defined as a group since they apply to every regex engine
142
/// in this crate. Instead of re-defining them on every engine's builder, they
143
/// are instead provided here as one cohesive unit.
144
#[derive(Clone, Copy, Debug)]
145
pub struct Config {
146
    case_insensitive: bool,
147
    multi_line: bool,
148
    dot_matches_new_line: bool,
149
    crlf: bool,
150
    line_terminator: u8,
151
    swap_greed: bool,
152
    ignore_whitespace: bool,
153
    unicode: bool,
154
    utf8: bool,
155
    nest_limit: u32,
156
    octal: bool,
157
}
158
159
impl Config {
160
    /// Return a new default syntax configuration.
161
89.0k
    pub fn new() -> Config {
162
        // These defaults match the ones used in regex-syntax.
163
89.0k
        Config {
164
89.0k
            case_insensitive: false,
165
89.0k
            multi_line: false,
166
89.0k
            dot_matches_new_line: false,
167
89.0k
            crlf: false,
168
89.0k
            line_terminator: b'\n',
169
89.0k
            swap_greed: false,
170
89.0k
            ignore_whitespace: false,
171
89.0k
            unicode: true,
172
89.0k
            utf8: true,
173
89.0k
            nest_limit: 250,
174
89.0k
            octal: false,
175
89.0k
        }
176
89.0k
    }
177
178
    /// Enable or disable the case insensitive flag by default.
179
    ///
180
    /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
181
    /// Specifically, it will apply the "simple" case folding rules as
182
    /// specified by Unicode.
183
    ///
184
    /// By default this is disabled. It may alternatively be selectively
185
    /// enabled in the regular expression itself via the `i` flag.
186
22.5k
    pub fn case_insensitive(mut self, yes: bool) -> Config {
187
22.5k
        self.case_insensitive = yes;
188
22.5k
        self
189
22.5k
    }
190
191
    /// Enable or disable the multi-line matching flag by default.
192
    ///
193
    /// When this is enabled, the `^` and `$` look-around assertions will
194
    /// match immediately after and immediately before a new line character,
195
    /// respectively. Note that the `\A` and `\z` look-around assertions are
196
    /// unaffected by this setting and always correspond to matching at the
197
    /// beginning and end of the input.
198
    ///
199
    /// By default this is disabled. It may alternatively be selectively
200
    /// enabled in the regular expression itself via the `m` flag.
201
22.5k
    pub fn multi_line(mut self, yes: bool) -> Config {
202
22.5k
        self.multi_line = yes;
203
22.5k
        self
204
22.5k
    }
205
206
    /// Enable or disable the "dot matches any character" flag by default.
207
    ///
208
    /// When this is enabled, `.` will match any character. When it's disabled,
209
    /// then `.` will match any character except for a new line character.
210
    ///
211
    /// Note that `.` is impacted by whether the "unicode" setting is enabled
212
    /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
213
    /// encoding of any Unicode scalar value (sans a new line, depending on
214
    /// whether this "dot matches new line" option is enabled). When Unicode
215
    /// mode is disabled, `.` will match any byte instead. Because of this,
216
    /// when Unicode mode is disabled, `.` can only be used when the "allow
217
    /// invalid UTF-8" option is enabled, since `.` could otherwise match
218
    /// invalid UTF-8.
219
    ///
220
    /// By default this is disabled. It may alternatively be selectively
221
    /// enabled in the regular expression itself via the `s` flag.
222
22.5k
    pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
223
22.5k
        self.dot_matches_new_line = yes;
224
22.5k
        self
225
22.5k
    }
226
227
    /// Enable or disable the "CRLF mode" flag by default.
228
    ///
229
    /// By default this is disabled. It may alternatively be selectively
230
    /// enabled in the regular expression itself via the `R` flag.
231
    ///
232
    /// When CRLF mode is enabled, the following happens:
233
    ///
234
    /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
235
    /// except for `\r` and `\n`.
236
    /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
237
    /// `\r` and `\n` as line terminators. And in particular, neither will
238
    /// match between a `\r` and a `\n`.
239
0
    pub fn crlf(mut self, yes: bool) -> Config {
240
0
        self.crlf = yes;
241
0
        self
242
0
    }
243
244
    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
245
    ///
246
    /// Namely, instead of `.` (by default) matching everything except for `\n`,
247
    /// this will cause `.` to match everything except for the byte given.
248
    ///
249
    /// If `.` is used in a context where Unicode mode is enabled and this byte
250
    /// isn't ASCII, then an error will be returned. When Unicode mode is
251
    /// disabled, then any byte is permitted, but will return an error if UTF-8
252
    /// mode is enabled and it is a non-ASCII byte.
253
    ///
254
    /// In short, any ASCII value for a line terminator is always okay. But a
255
    /// non-ASCII byte might result in an error depending on whether Unicode
256
    /// mode or UTF-8 mode are enabled.
257
    ///
258
    /// Note that if `R` mode is enabled then it always takes precedence and
259
    /// the line terminator will be treated as `\r` and `\n` simultaneously.
260
    ///
261
    /// Note also that this *doesn't* impact the look-around assertions
262
    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
263
    /// configuration in the regex engine itself.
264
0
    pub fn line_terminator(mut self, byte: u8) -> Config {
265
0
        self.line_terminator = byte;
266
0
        self
267
0
    }
268
269
    /// Enable or disable the "swap greed" flag by default.
270
    ///
271
    /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
272
    /// will become greedy.
273
    ///
274
    /// By default this is disabled. It may alternatively be selectively
275
    /// enabled in the regular expression itself via the `U` flag.
276
22.5k
    pub fn swap_greed(mut self, yes: bool) -> Config {
277
22.5k
        self.swap_greed = yes;
278
22.5k
        self
279
22.5k
    }
280
281
    /// Enable verbose mode in the regular expression.
282
    ///
283
    /// When enabled, verbose mode permits insignificant whitespace in many
284
    /// places in the regular expression, as well as comments. Comments are
285
    /// started using `#` and continue until the end of the line.
286
    ///
287
    /// By default, this is disabled. It may be selectively enabled in the
288
    /// regular expression by using the `x` flag regardless of this setting.
289
22.5k
    pub fn ignore_whitespace(mut self, yes: bool) -> Config {
290
22.5k
        self.ignore_whitespace = yes;
291
22.5k
        self
292
22.5k
    }
293
294
    /// Enable or disable the Unicode flag (`u`) by default.
295
    ///
296
    /// By default this is **enabled**. It may alternatively be selectively
297
    /// disabled in the regular expression itself via the `u` flag.
298
    ///
299
    /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
300
    /// default), a regular expression will fail to parse if Unicode mode is
301
    /// disabled and a sub-expression could possibly match invalid UTF-8.
302
    ///
303
    /// **WARNING**: Unicode mode can greatly increase the size of the compiled
304
    /// DFA, which can noticeably impact both memory usage and compilation
305
    /// time. This is especially noticeable if your regex contains character
306
    /// classes like `\w` that are impacted by whether Unicode is enabled or
307
    /// not. If Unicode is not necessary, you are encouraged to disable it.
308
22.5k
    pub fn unicode(mut self, yes: bool) -> Config {
309
22.5k
        self.unicode = yes;
310
22.5k
        self
311
22.5k
    }
312
313
    /// When disabled, the builder will permit the construction of a regular
314
    /// expression that may match invalid UTF-8.
315
    ///
316
    /// For example, when [`Config::unicode`] is disabled, then
317
    /// expressions like `[^a]` may match invalid UTF-8 since they can match
318
    /// any single byte that is not `a`. By default, these sub-expressions
319
    /// are disallowed to avoid returning offsets that split a UTF-8
320
    /// encoded codepoint. However, in cases where matching at arbitrary
321
    /// locations is desired, this option can be disabled to permit all such
322
    /// sub-expressions.
323
    ///
324
    /// When enabled (the default), the builder is guaranteed to produce a
325
    /// regex that will only ever match valid UTF-8 (otherwise, the builder
326
    /// will return an error).
327
89.0k
    pub fn utf8(mut self, yes: bool) -> Config {
328
89.0k
        self.utf8 = yes;
329
89.0k
        self
330
89.0k
    }
331
332
    /// Set the nesting limit used for the regular expression parser.
333
    ///
334
    /// The nesting limit controls how deep the abstract syntax tree is allowed
335
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
336
    /// groups), then an error is returned by the parser.
337
    ///
338
    /// The purpose of this limit is to act as a heuristic to prevent stack
339
    /// overflow when building a finite automaton from a regular expression's
340
    /// abstract syntax tree. In particular, construction currently uses
341
    /// recursion. In the future, the implementation may stop using recursion
342
    /// and this option will no longer be necessary.
343
    ///
344
    /// This limit is not checked until the entire AST is parsed. Therefore,
345
    /// if callers want to put a limit on the amount of heap space used, then
346
    /// they should impose a limit on the length, in bytes, of the concrete
347
    /// pattern string. In particular, this is viable since the parser will
348
    /// limit itself to heap space proportional to the length of the pattern
349
    /// string.
350
    ///
351
    /// Note that a nest limit of `0` will return a nest limit error for most
352
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
353
    /// not `ab`, since `ab` requires a concatenation AST item, which results
354
    /// in a nest depth of `1`. In general, a nest limit is not something that
355
    /// manifests in an obvious way in the concrete syntax, therefore, it
356
    /// should not be used in a granular way.
357
0
    pub fn nest_limit(mut self, limit: u32) -> Config {
358
0
        self.nest_limit = limit;
359
0
        self
360
0
    }
361
362
    /// Whether to support octal syntax or not.
363
    ///
364
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
365
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
366
    /// `\141` are all equivalent regular expressions, where the last example
367
    /// shows octal syntax.
368
    ///
369
    /// While supporting octal syntax isn't in and of itself a problem, it does
370
    /// make good error messages harder. That is, in PCRE based regex engines,
371
    /// syntax like `\1` invokes a backreference, which is explicitly
372
    /// unsupported in Rust's regex engine. However, many users expect it to
373
    /// be supported. Therefore, when octal support is disabled, the error
374
    /// message will explicitly mention that backreferences aren't supported.
375
    ///
376
    /// Octal syntax is disabled by default.
377
22.5k
    pub fn octal(mut self, yes: bool) -> Config {
378
22.5k
        self.octal = yes;
379
22.5k
        self
380
22.5k
    }
381
382
    /// Returns whether "unicode" mode is enabled.
383
0
    pub fn get_unicode(&self) -> bool {
384
0
        self.unicode
385
0
    }
386
387
    /// Returns whether "case insensitive" mode is enabled.
388
0
    pub fn get_case_insensitive(&self) -> bool {
389
0
        self.case_insensitive
390
0
    }
391
392
    /// Returns whether "multi line" mode is enabled.
393
0
    pub fn get_multi_line(&self) -> bool {
394
0
        self.multi_line
395
0
    }
396
397
    /// Returns whether "dot matches new line" mode is enabled.
398
0
    pub fn get_dot_matches_new_line(&self) -> bool {
399
0
        self.dot_matches_new_line
400
0
    }
401
402
    /// Returns whether "CRLF" mode is enabled.
403
0
    pub fn get_crlf(&self) -> bool {
404
0
        self.crlf
405
0
    }
406
407
    /// Returns the line terminator in this syntax configuration.
408
0
    pub fn get_line_terminator(&self) -> u8 {
409
0
        self.line_terminator
410
0
    }
411
412
    /// Returns whether "swap greed" mode is enabled.
413
0
    pub fn get_swap_greed(&self) -> bool {
414
0
        self.swap_greed
415
0
    }
416
417
    /// Returns whether "ignore whitespace" mode is enabled.
418
0
    pub fn get_ignore_whitespace(&self) -> bool {
419
0
        self.ignore_whitespace
420
0
    }
421
422
    /// Returns whether UTF-8 mode is enabled.
423
0
    pub fn get_utf8(&self) -> bool {
424
0
        self.utf8
425
0
    }
426
427
    /// Returns the "nest limit" setting.
428
0
    pub fn get_nest_limit(&self) -> u32 {
429
0
        self.nest_limit
430
0
    }
431
432
    /// Returns whether "octal" mode is enabled.
433
0
    pub fn get_octal(&self) -> bool {
434
0
        self.octal
435
0
    }
436
437
    /// Applies this configuration to the given parser.
438
0
    pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
439
0
        builder
440
0
            .unicode(self.unicode)
441
0
            .case_insensitive(self.case_insensitive)
442
0
            .multi_line(self.multi_line)
443
0
            .dot_matches_new_line(self.dot_matches_new_line)
444
0
            .crlf(self.crlf)
445
0
            .line_terminator(self.line_terminator)
446
0
            .swap_greed(self.swap_greed)
447
0
            .ignore_whitespace(self.ignore_whitespace)
448
0
            .utf8(self.utf8)
449
0
            .nest_limit(self.nest_limit)
450
0
            .octal(self.octal);
451
0
    }
452
453
    /// Applies this configuration to the given AST parser.
454
89.0k
    pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
455
89.0k
        builder
456
89.0k
            .ignore_whitespace(self.ignore_whitespace)
457
89.0k
            .nest_limit(self.nest_limit)
458
89.0k
            .octal(self.octal);
459
89.0k
    }
460
461
    /// Applies this configuration to the given AST-to-HIR translator.
462
89.0k
    pub(crate) fn apply_hir(
463
89.0k
        &self,
464
89.0k
        builder: &mut hir::translate::TranslatorBuilder,
465
89.0k
    ) {
466
89.0k
        builder
467
89.0k
            .unicode(self.unicode)
468
89.0k
            .case_insensitive(self.case_insensitive)
469
89.0k
            .multi_line(self.multi_line)
470
89.0k
            .crlf(self.crlf)
471
89.0k
            .dot_matches_new_line(self.dot_matches_new_line)
472
89.0k
            .line_terminator(self.line_terminator)
473
89.0k
            .swap_greed(self.swap_greed)
474
89.0k
            .utf8(self.utf8);
475
89.0k
    }
476
}
477
478
impl Default for Config {
479
89.0k
    fn default() -> Config {
480
89.0k
        Config::new()
481
89.0k
    }
482
}