/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.6/src/parser.rs

Source (jump to first uncovered line)
use crate::{ast, hir, Error};

/// Parse a regex pattern into an [`Hir`](hir::Hir) using default options.
///
/// Calling this function is the same as calling
/// `Parser::new().parse(pattern)`.
///
/// The pattern is first parsed into an [`Ast`](ast::Ast), which is then
/// handed to the translator to produce the returned `Hir`. The intermediate
/// `Ast` is not exposed; callers that need it should use an
/// [`ast::parse::Parser`] directly.
///
/// To parse with non-default options, configure a [`ParserBuilder`] instead.
pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
    let mut parser = Parser::new();
    parser.parse(pattern)
}

/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
///
/// This type combines the builder options for both the [AST
/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR
/// `TranslatorBuilder`](hir::translate::TranslatorBuilder).
#[derive(Clone, Debug, Default)]
pub struct ParserBuilder {
    /// Configuration for the AST parser. The `nest_limit`, `octal` and
    /// `ignore_whitespace` setters on this builder are forwarded here.
    ast: ast::parse::ParserBuilder,
    /// Configuration for the AST-to-HIR translator. The `utf8`,
    /// `case_insensitive`, `multi_line`, `dot_matches_new_line`, `crlf`,
    /// `line_terminator`, `swap_greed` and `unicode` setters on this builder
    /// are forwarded here.
    hir: hir::translate::TranslatorBuilder,
}

impl ParserBuilder {
    /// Return a builder whose options are all set to their defaults.
    pub fn new() -> Self {
        Self::default()
    }

    /// Construct a [`Parser`] from the options currently set on this builder.
    ///
    /// The builder is only borrowed, so it can be reused to build additional
    /// parsers afterwards.
    pub fn build(&self) -> Parser {
        let ast = self.ast.build();
        let hir = self.hir.build();
        Parser { ast, hir }
    }

    /// Set the maximum nesting depth permitted for the abstract syntax tree.
    ///
    /// When the AST is nested more deeply than `limit` (for example, because
    /// of too many nested groups), the parser returns an error.
    ///
    /// This limit exists as a heuristic for guarding against stack overflow
    /// in consumers that walk an `Ast` with explicit recursion. This crate
    /// itself never does so (it uses constant stack space and moves the call
    /// stack to the heap), but other crates may.
    ///
    /// The limit is only checked once the entire `Ast` has been parsed, so
    /// it does not bound heap usage. Callers that want to bound heap usage
    /// should instead limit the length, in bytes, of the pattern string. That
    /// approach is viable because this parser implementation limits itself to
    /// heap space proportional to the length of the pattern string.
    ///
    /// A limit of `0` produces a nest limit error for most patterns, but not
    /// all of them: `a` is permitted, while `ab` is not, because `ab` needs a
    /// concatenation and therefore has a nest depth of `1`. More generally,
    /// nesting depth does not map onto the concrete syntax in an obvious way,
    /// so this option should not be used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
        self.ast.nest_limit(limit);
        self
    }

    /// Toggle support for octal syntax.
    ///
    /// Octal syntax is a little-known way of writing Unicode codepoints in a
    /// regular expression. The patterns `a`, `\x61`, `\u0061` and `\141` are
    /// all equivalent, with the last one using octal syntax.
    ///
    /// Octal support is not a problem in and of itself, but it makes good
    /// error messages harder to produce. In PCRE based regex engines, syntax
    /// like `\0` invokes a backreference, which is explicitly unsupported in
    /// Rust's regex engine even though many users expect it to work. For that
    /// reason, when octal support is disabled, the error message explicitly
    /// mentions that backreferences aren't supported.
    ///
    /// This option is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut Self {
        self.ast.octal(yes);
        self
    }

    /// Toggle the guarantee that translated expressions only match valid
    /// UTF-8.
    ///
    /// When enabled (the default), the translator guarantees that the
    /// resulting expression will, for non-empty matches, only ever produce
    /// spans that are entirely valid UTF-8; otherwise the translator returns
    /// an error. When disabled, translation permits expressions that may
    /// match invalid UTF-8.
    ///
    /// Perhaps surprisingly, an empty regex, or even a negated ASCII word
    /// boundary (written `(?-u:\B)` in the concrete syntax), is still allowed
    /// when this is enabled, even though such expressions can produce matches
    /// that split a UTF-8 encoded codepoint. This exception applies only to
    /// zero-width ("empty") matches, and the regex engine itself is expected
    /// to handle these cases if necessary (perhaps by suppressing any
    /// zero-width matches that split a codepoint).
    pub fn utf8(&mut self, yes: bool) -> &mut Self {
        self.hir.utf8(yes);
        self
    }

    /// Toggle verbose mode for the regular expression.
    ///
    /// In verbose mode, insignificant whitespace is permitted in many places
    /// in the pattern, and so are comments. A comment begins with `#` and
    /// runs until the end of the line.
    ///
    /// This is disabled by default. Regardless of this setting, it can be
    /// selectively enabled inside the pattern with the `x` flag.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
        self.ast.ignore_whitespace(yes);
        self
    }

    /// Set the default state of the case insensitive flag.
    ///
    /// This is disabled by default. It can alternatively be selectively
    /// enabled inside the pattern with the `i` flag.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
        self.hir.case_insensitive(yes);
        self
    }

    /// Set the default state of the multi-line matching flag.
    ///
    /// This is disabled by default. It can alternatively be selectively
    /// enabled inside the pattern with the `m` flag.
    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
        self.hir.multi_line(yes);
        self
    }

    /// Set the default state of the "dot matches any character" flag.
    ///
    /// This is disabled by default. It can alternatively be selectively
    /// enabled inside the pattern with the `s` flag.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
        self.hir.dot_matches_new_line(yes);
        self
    }

    /// Set the default state of the CRLF mode flag.
    ///
    /// This is disabled by default. It can alternatively be selectively
    /// enabled inside the pattern with the `R` flag.
    ///
    /// CRLF mode has the following effects:
    ///
    /// * `.` matches any character except for `\r` and `\n`, unless
    ///   `dot_matches_new_line` is enabled.
    /// * With `multi_line` mode enabled, `^` and `$` treat `\r\n`, `\r` and
    ///   `\n` as line terminators. In particular, neither will match between
    ///   a `\r` and a `\n`.
    pub fn crlf(&mut self, yes: bool) -> &mut Self {
        self.hir.crlf(yes);
        self
    }

    /// Set the line terminator used by `(?u-s:.)` and `(?-us:.)`.
    ///
    /// By default, `.` matches everything except for `\n`. With this option
    /// set, `.` instead matches everything except for the given byte.
    ///
    /// An error is returned if `.` is used where Unicode mode is enabled and
    /// the byte isn't ASCII. With Unicode mode disabled, any byte is
    /// permitted, but a non-ASCII byte still results in an error if UTF-8
    /// mode is enabled.
    ///
    /// Put briefly: an ASCII line terminator is always okay, while a
    /// non-ASCII byte might result in an error depending on whether Unicode
    /// mode or UTF-8 mode are enabled.
    ///
    /// If `R` mode is enabled, it always takes precedence, and the line
    /// terminator is treated as `\r` and `\n` simultaneously.
    ///
    /// This option *doesn't* impact the look-around assertions `(?m:^)` and
    /// `(?m:$)`. That's usually controlled by additional configuration in
    /// the regex engine itself.
    pub fn line_terminator(&mut self, byte: u8) -> &mut Self {
        self.hir.line_terminator(byte);
        self
    }

    /// Set the default state of the "swap greed" flag.
    ///
    /// This is disabled by default. It can alternatively be selectively
    /// enabled inside the pattern with the `U` flag.
    pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
        self.hir.swap_greed(yes);
        self
    }

    /// Set the default state of the Unicode flag (`u`).
    ///
    /// This is **enabled** by default. It can alternatively be selectively
    /// disabled inside the pattern with the `u` flag.
    ///
    /// Unless `utf8` is disabled (it's enabled by default), a regular
    /// expression will fail to parse when Unicode mode is disabled and a
    /// sub-expression could possibly match invalid UTF-8.
    pub fn unicode(&mut self, yes: bool) -> &mut Self {
        self.hir.unicode(yes);
        self
    }
}

/// A convenience parser for regular expressions.
///
/// This parser takes as input a regular expression pattern string (the
/// "concrete syntax") and returns a high-level intermediate representation
/// (the HIR) suitable for most types of analysis. In particular, this parser
/// hides the intermediate state of producing an AST (the "abstract syntax").
/// The AST is itself far more complex than the HIR, so this parser serves as a
/// convenience for never having to deal with it at all.
///
/// If callers have more fine grained use cases that need an AST, then please
/// see the [`ast::parse`] module.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
#[derive(Clone, Debug)]
pub struct Parser {
    /// First stage: parses the pattern string into an `Ast`.
    ast: ast::parse::Parser,
    /// Second stage: translates the `Ast` (together with the original
    /// pattern string) into an `Hir`.
    hir: hir::translate::Translator,
}

impl Parser {
    /// Create a new parser with a default configuration.
    ///
    /// The parser can be run with `parse` method. The parse method returns
    /// a high level intermediate representation of the given regular
    /// expression.
    ///
    /// To set configuration options on the parser, use [`ParserBuilder`].
    pub fn new() -> Parser {
        ParserBuilder::new().build()
    }

    /// Parse the regular expression into a high level intermediate
    /// representation.
    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
        let ast = self.ast.parse(pattern)?;
        let hir = self.hir.translate(pattern, &ast)?;
        Ok(hir)
    }
}

Coverage Report

Created: 2025-08-25 06:27

Line	Count	Source (jump to first uncovered line)
1		use crate::{ast, hir, Error};
2
3		/// A convenience routine for parsing a regex using default options.
4		///
5		/// This is equivalent to `Parser::new().parse(pattern)`.
6		///
7		/// If you need to set non-default options, then use a [`ParserBuilder`].
8		///
9		/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically
10		/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator
11		/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then
12		/// you should use a [`ast::parse::Parser`].
13	0	pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
14	0	Parser::new().parse(pattern)
15	0	}
16
17		/// A builder for a regular expression parser.
18		///
19		/// This builder permits modifying configuration options for the parser.
20		///
21		/// This type combines the builder options for both the [AST
22		/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR
23		/// `TranslatorBuilder`](hir::translate::TranslatorBuilder).
24		#[derive(Clone, Debug, Default)]
25		pub struct ParserBuilder {
26		ast: ast::parse::ParserBuilder,
27		hir: hir::translate::TranslatorBuilder,
28		}
29
30		impl ParserBuilder {
31		/// Create a new parser builder with a default configuration.
32	0	pub fn new() -> ParserBuilder {
33	0	ParserBuilder::default()
34	0	}
35
36		/// Build a parser from this configuration with the given pattern.
37	0	pub fn build(&self) -> Parser {
38	0	Parser { ast: self.ast.build(), hir: self.hir.build() }
39	0	}
40
41		/// Set the nesting limit for this parser.
42		///
43		/// The nesting limit controls how deep the abstract syntax tree is allowed
44		/// to be. If the AST exceeds the given limit (e.g., with too many nested
45		/// groups), then an error is returned by the parser.
46		///
47		/// The purpose of this limit is to act as a heuristic to prevent stack
48		/// overflow for consumers that do structural induction on an `Ast` using
49		/// explicit recursion. While this crate never does this (instead using
50		/// constant stack space and moving the call stack to the heap), other
51		/// crates may.
52		///
53		/// This limit is not checked until the entire Ast is parsed. Therefore,
54		/// if callers want to put a limit on the amount of heap space used, then
55		/// they should impose a limit on the length, in bytes, of the concrete
56		/// pattern string. In particular, this is viable since this parser
57		/// implementation will limit itself to heap space proportional to the
58		/// length of the pattern string.
59		///
60		/// Note that a nest limit of `0` will return a nest limit error for most
61		/// patterns but not all. For example, a nest limit of `0` permits `a` but
62		/// not `ab`, since `ab` requires a concatenation, which results in a nest
63		/// depth of `1`. In general, a nest limit is not something that manifests
64		/// in an obvious way in the concrete syntax, therefore, it should not be
65		/// used in a granular way.
66	0	pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
67	0	self.ast.nest_limit(limit);
68	0	self
69	0	}
70
71		/// Whether to support octal syntax or not.
72		///
73		/// Octal syntax is a little-known way of uttering Unicode codepoints in
74		/// a regular expression. For example, `a`, `\x61`, `\u0061` and
75		/// `\141` are all equivalent regular expressions, where the last example
76		/// shows octal syntax.
77		///
78		/// While supporting octal syntax isn't in and of itself a problem, it does
79		/// make good error messages harder. That is, in PCRE based regex engines,
80		/// syntax like `\0` invokes a backreference, which is explicitly
81		/// unsupported in Rust's regex engine. However, many users expect it to
82		/// be supported. Therefore, when octal support is disabled, the error
83		/// message will explicitly mention that backreferences aren't supported.
84		///
85		/// Octal syntax is disabled by default.
86	0	pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
87	0	self.ast.octal(yes);
88	0	self
89	0	}
90
91		/// When disabled, translation will permit the construction of a regular
92		/// expression that may match invalid UTF-8.
93		///
94		/// When enabled (the default), the translator is guaranteed to produce an
95		/// expression that, for non-empty matches, will only ever produce spans
96		/// that are entirely valid UTF-8 (otherwise, the translator will return an
97		/// error).
98		///
99		/// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
100		/// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
101		/// syntax) will be allowed even though they can produce matches that split
102		/// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
103		/// matches, and it is expected that the regex engine itself must handle
104		/// these cases if necessary (perhaps by suppressing any zero-width matches
105		/// that split a codepoint).
106	0	pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
107	0	self.hir.utf8(yes);
108	0	self
109	0	}
110
111		/// Enable verbose mode in the regular expression.
112		///
113		/// When enabled, verbose mode permits insignificant whitespace in many
114		/// places in the regular expression, as well as comments. Comments are
115		/// started using `#` and continue until the end of the line.
116		///
117		/// By default, this is disabled. It may be selectively enabled in the
118		/// regular expression by using the `x` flag regardless of this setting.
119	0	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
120	0	self.ast.ignore_whitespace(yes);
121	0	self
122	0	}
123
124		/// Enable or disable the case insensitive flag by default.
125		///
126		/// By default this is disabled. It may alternatively be selectively
127		/// enabled in the regular expression itself via the `i` flag.
128	0	pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
129	0	self.hir.case_insensitive(yes);
130	0	self
131	0	}
132
133		/// Enable or disable the multi-line matching flag by default.
134		///
135		/// By default this is disabled. It may alternatively be selectively
136		/// enabled in the regular expression itself via the `m` flag.
137	0	pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
138	0	self.hir.multi_line(yes);
139	0	self
140	0	}
141
142		/// Enable or disable the "dot matches any character" flag by default.
143		///
144		/// By default this is disabled. It may alternatively be selectively
145		/// enabled in the regular expression itself via the `s` flag.
146	0	pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
147	0	self.hir.dot_matches_new_line(yes);
148	0	self
149	0	}
150
151		/// Enable or disable the CRLF mode flag by default.
152		///
153		/// By default this is disabled. It may alternatively be selectively
154		/// enabled in the regular expression itself via the `R` flag.
155		///
156		/// When CRLF mode is enabled, the following happens:
157		///
158		/// * Unless `dot_matches_new_line` is enabled, `.` will match any character
159		/// except for `\r` and `\n`.
160		/// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
161		/// `\r` and `\n` as line terminators. And in particular, neither will
162		/// match between a `\r` and a `\n`.
163	0	pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder {
164	0	self.hir.crlf(yes);
165	0	self
166	0	}
167
168		/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
169		///
170		/// Namely, instead of `.` (by default) matching everything except for `\n`,
171		/// this will cause `.` to match everything except for the byte given.
172		///
173		/// If `.` is used in a context where Unicode mode is enabled and this byte
174		/// isn't ASCII, then an error will be returned. When Unicode mode is
175		/// disabled, then any byte is permitted, but will return an error if UTF-8
176		/// mode is enabled and it is a non-ASCII byte.
177		///
178		/// In short, any ASCII value for a line terminator is always okay. But a
179		/// non-ASCII byte might result in an error depending on whether Unicode
180		/// mode or UTF-8 mode are enabled.
181		///
182		/// Note that if `R` mode is enabled then it always takes precedence and
183		/// the line terminator will be treated as `\r` and `\n` simultaneously.
184		///
185		/// Note also that this doesn't impact the look-around assertions
186		/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
187		/// configuration in the regex engine itself.
188	0	pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder {
189	0	self.hir.line_terminator(byte);
190	0	self
191	0	}
192
193		/// Enable or disable the "swap greed" flag by default.
194		///
195		/// By default this is disabled. It may alternatively be selectively
196		/// enabled in the regular expression itself via the `U` flag.
197	0	pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
198	0	self.hir.swap_greed(yes);
199	0	self
200	0	}
201
202		/// Enable or disable the Unicode flag (`u`) by default.
203		///
204		/// By default this is enabled. It may alternatively be selectively
205		/// disabled in the regular expression itself via the `u` flag.
206		///
207		/// Note that unless `utf8` is disabled (it's enabled by default), a
208		/// regular expression will fail to parse if Unicode mode is disabled and a
209		/// sub-expression could possibly match invalid UTF-8.
210	0	pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
211	0	self.hir.unicode(yes);
212	0	self
213	0	}
214		}
215
216		/// A convenience parser for regular expressions.
217		///
218		/// This parser takes as input a regular expression pattern string (the
219		/// "concrete syntax") and returns a high-level intermediate representation
220		/// (the HIR) suitable for most types of analysis. In particular, this parser
221		/// hides the intermediate state of producing an AST (the "abstract syntax").
222		/// The AST is itself far more complex than the HIR, so this parser serves as a
223		/// convenience for never having to deal with it at all.
224		///
225		/// If callers have more fine grained use cases that need an AST, then please
226		/// see the [`ast::parse`] module.
227		///
228		/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
229		#[derive(Clone, Debug)]
230		pub struct Parser {
231		ast: ast::parse::Parser,
232		hir: hir::translate::Translator,
233		}
234
235		impl Parser {
236		/// Create a new parser with a default configuration.
237		///
238		/// The parser can be run with `parse` method. The parse method returns
239		/// a high level intermediate representation of the given regular
240		/// expression.
241		///
242		/// To set configuration options on the parser, use [`ParserBuilder`].
243	0	pub fn new() -> Parser {
244	0	ParserBuilder::new().build()
245	0	}
246
247		/// Parse the regular expression into a high level intermediate
248		/// representation.
249	0	pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
250	0	let ast = self.ast.parse(pattern)?;
251	0	let hir = self.hir.translate(pattern, &ast)?;
252	0	Ok(hir)
253	0	}
254		}