/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.5/src/hir/mod.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Defines a high-level intermediate (HIR) representation for regular expressions. |
3 | | |
4 | | The HIR is represented by the [`Hir`] type, and it principally constructed via |
5 | | [translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users |
6 | | may use the smart constructors defined on `Hir` to build their own by hand. The |
7 | | smart constructors simultaneously simplify and "optimize" the HIR, and are also |
8 | | the same routines used by translation. |
9 | | |
10 | | Most regex engines only have an HIR like this, and usually construct it |
11 | | directly from the concrete syntax. This crate however first parses the |
12 | | concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, |
13 | | as mentioned above. It's done this way to facilitate better error reporting, |
14 | | and to have a structured representation of a regex that faithfully represents |
15 | | its concrete syntax. Namely, while an `Hir` value can be converted back to an |
16 | | equivalent regex pattern string, it is unlikely to look like the original due |
17 | | to its simplified structure. |
18 | | */ |
19 | | |
20 | | use core::{char, cmp}; |
21 | | |
22 | | use alloc::{ |
23 | | boxed::Box, |
24 | | format, |
25 | | string::{String, ToString}, |
26 | | vec, |
27 | | vec::Vec, |
28 | | }; |
29 | | |
30 | | use crate::{ |
31 | | ast::Span, |
32 | | hir::interval::{Interval, IntervalSet, IntervalSetIter}, |
33 | | unicode, |
34 | | }; |
35 | | |
36 | | pub use crate::{ |
37 | | hir::visitor::{visit, Visitor}, |
38 | | unicode::CaseFoldError, |
39 | | }; |
40 | | |
41 | | mod interval; |
42 | | pub mod literal; |
43 | | pub mod print; |
44 | | pub mod translate; |
45 | | mod visitor; |
46 | | |
47 | | /// An error that can occur while translating an `Ast` to a `Hir`. |
48 | | #[derive(Clone, Debug, Eq, PartialEq)] |
49 | | pub struct Error { |
50 | | /// The kind of error. |
51 | | kind: ErrorKind, |
52 | | /// The original pattern that the translator's Ast was parsed from. Every |
53 | | /// span in an error is a valid range into this string. |
54 | | pattern: String, |
55 | | /// The span of this error, derived from the Ast given to the translator. |
56 | | span: Span, |
57 | | } |
58 | | |
59 | | impl Error { |
60 | | /// Return the type of this error. |
61 | 1.40k | pub fn kind(&self) -> &ErrorKind { |
62 | 1.40k | &self.kind |
63 | 1.40k | } |
64 | | |
65 | | /// The original pattern string in which this error occurred. |
66 | | /// |
67 | | /// Every span reported by this error is reported in terms of this string. |
68 | 1.40k | pub fn pattern(&self) -> &str { |
69 | 1.40k | &self.pattern |
70 | 1.40k | } |
71 | | |
72 | | /// Return the span at which this error occurred. |
73 | 1.40k | pub fn span(&self) -> &Span { |
74 | 1.40k | &self.span |
75 | 1.40k | } |
76 | | } |
77 | | |
78 | | /// The type of an error that occurred while building an `Hir`. |
79 | | /// |
80 | | /// This error type is marked as `non_exhaustive`. This means that adding a |
81 | | /// new variant is not considered a breaking change. |
82 | | #[non_exhaustive] |
83 | | #[derive(Clone, Debug, Eq, PartialEq)] |
84 | | pub enum ErrorKind { |
85 | | /// This error occurs when a Unicode feature is used when Unicode |
86 | | /// support is disabled. For example `(?-u:\pL)` would trigger this error. |
87 | | UnicodeNotAllowed, |
88 | | /// This error occurs when translating a pattern that could match a byte |
89 | | /// sequence that isn't UTF-8 and `utf8` was enabled. |
90 | | InvalidUtf8, |
91 | | /// This error occurs when one uses a non-ASCII byte for a line terminator, |
92 | | /// but where Unicode mode is enabled and UTF-8 mode is disabled. |
93 | | InvalidLineTerminator, |
94 | | /// This occurs when an unrecognized Unicode property name could not |
95 | | /// be found. |
96 | | UnicodePropertyNotFound, |
97 | | /// This occurs when an unrecognized Unicode property value could not |
98 | | /// be found. |
99 | | UnicodePropertyValueNotFound, |
100 | | /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or |
101 | | /// `\d`) could not be found. This can occur when the `unicode-perl` |
102 | | /// crate feature is not enabled. |
103 | | UnicodePerlClassNotFound, |
104 | | /// This occurs when the Unicode simple case mapping tables are not |
105 | | /// available, and the regular expression required Unicode aware case |
106 | | /// insensitivity. |
107 | | UnicodeCaseUnavailable, |
108 | | } |
109 | | |
110 | | #[cfg(feature = "std")] |
111 | | impl std::error::Error for Error {} |
112 | | |
113 | | impl core::fmt::Display for Error { |
114 | 1.40k | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
115 | 1.40k | crate::error::Formatter::from(self).fmt(f) |
116 | 1.40k | } |
117 | | } |
118 | | |
119 | | impl core::fmt::Display for ErrorKind { |
120 | 1.40k | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
121 | | use self::ErrorKind::*; |
122 | | |
123 | 1.40k | let msg = match *self { |
124 | 191 | UnicodeNotAllowed => "Unicode not allowed here", |
125 | 604 | InvalidUtf8 => "pattern can match invalid UTF-8", |
126 | 0 | InvalidLineTerminator => "invalid line terminator, must be ASCII", |
127 | 596 | UnicodePropertyNotFound => "Unicode property not found", |
128 | 17 | UnicodePropertyValueNotFound => "Unicode property value not found", |
129 | | UnicodePerlClassNotFound => { |
130 | 0 | "Unicode-aware Perl class not found \ |
131 | 0 | (make sure the unicode-perl feature is enabled)" |
132 | | } |
133 | | UnicodeCaseUnavailable => { |
134 | 0 | "Unicode-aware case insensitivity matching is not available \ |
135 | 0 | (make sure the unicode-case feature is enabled)" |
136 | | } |
137 | | }; |
138 | 1.40k | f.write_str(msg) |
139 | 1.40k | } |
140 | | } |
141 | | |
142 | | /// A high-level intermediate representation (HIR) for a regular expression. |
143 | | /// |
144 | | /// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. |
145 | | /// An `HirKind` indicates what kind of regular expression it is (a literal, |
146 | | /// a repetition, a look-around assertion, etc.), where as a `Properties` |
147 | | /// describes various facts about the regular expression. For example, whether |
148 | | /// it matches UTF-8 or if it matches the empty string. |
149 | | /// |
150 | | /// The HIR of a regular expression represents an intermediate step between |
151 | | /// its abstract syntax (a structured description of the concrete syntax) and |
152 | | /// an actual regex matcher. The purpose of HIR is to make regular expressions |
153 | | /// easier to analyze. In particular, the AST is much more complex than the |
154 | | /// HIR. For example, while an AST supports arbitrarily nested character |
155 | | /// classes, the HIR will flatten all nested classes into a single set. The HIR |
156 | | /// will also "compile away" every flag present in the concrete syntax. For |
157 | | /// example, users of HIR expressions never need to worry about case folding; |
158 | | /// it is handled automatically by the translator (e.g., by translating |
159 | | /// `(?i:A)` to `[aA]`). |
160 | | /// |
161 | | /// The specific type of an HIR expression can be accessed via its `kind` |
162 | | /// or `into_kind` methods. This extra level of indirection exists for two |
163 | | /// reasons: |
164 | | /// |
165 | | /// 1. Construction of an HIR expression *must* use the constructor methods on |
166 | | /// this `Hir` type instead of building the `HirKind` values directly. This |
167 | | /// permits construction to enforce invariants like "concatenations always |
168 | | /// consist of two or more sub-expressions." |
169 | | /// 2. Every HIR expression contains attributes that are defined inductively, |
170 | | /// and can be computed cheaply during the construction process. For example, |
171 | | /// one such attribute is whether the expression must match at the beginning of |
172 | | /// the haystack. |
173 | | /// |
174 | | /// In particular, if you have an `HirKind` value, then there is intentionally |
175 | | /// no way to build an `Hir` value from it. You instead need to do case |
176 | | /// analysis on the `HirKind` value and build the `Hir` value using its smart |
177 | | /// constructors. |
178 | | /// |
179 | | /// # UTF-8 |
180 | | /// |
181 | | /// If the HIR was produced by a translator with |
182 | | /// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled, |
183 | | /// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty |
184 | | /// matches. |
185 | | /// |
186 | | /// For empty matches, those can occur at any position. It is the |
187 | | /// responsibility of the regex engine to determine whether empty matches are |
188 | | /// permitted between the code units of a single codepoint. |
189 | | /// |
190 | | /// # Stack space |
191 | | /// |
192 | | /// This type defines its own destructor that uses constant stack space and |
193 | | /// heap space proportional to the size of the HIR. |
194 | | /// |
195 | | /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular |
196 | | /// expression pattern string, and uses constant stack space and heap space |
197 | | /// proportional to the size of the `Hir`. The regex it prints is guaranteed to |
198 | | /// be _semantically_ equivalent to the original concrete syntax, but it may |
199 | | /// look very different. (And potentially not practically readable by a human.) |
200 | | /// |
201 | | /// An `Hir`'s `fmt::Debug` implementation currently does not use constant |
202 | | /// stack space. The implementation will also suppress some details (such as |
203 | | /// the `Properties` inlined into every `Hir` value to make it less noisy). |
204 | | #[derive(Clone, Eq, PartialEq)] |
205 | | pub struct Hir { |
206 | | /// The underlying HIR kind. |
207 | | kind: HirKind, |
208 | | /// Analysis info about this HIR, computed during construction. |
209 | | props: Properties, |
210 | | } |
211 | | |
212 | | /// Methods for accessing the underlying `HirKind` and `Properties`. |
213 | | impl Hir { |
214 | | /// Returns a reference to the underlying HIR kind. |
215 | 236M | pub fn kind(&self) -> &HirKind { |
216 | 236M | &self.kind |
217 | 236M | } |
218 | | |
219 | | /// Consumes ownership of this HIR expression and returns its underlying |
220 | | /// `HirKind`. |
221 | 53.7k | pub fn into_kind(mut self) -> HirKind { |
222 | 53.7k | core::mem::replace(&mut self.kind, HirKind::Empty) |
223 | 53.7k | } |
224 | | |
225 | | /// Returns the properties computed for this `Hir`. |
226 | 31.0M | pub fn properties(&self) -> &Properties { |
227 | 31.0M | &self.props |
228 | 31.0M | } |
229 | | |
230 | | /// Splits this HIR into its constituent parts. |
231 | | /// |
232 | | /// This is useful because `let Hir { kind, props } = hir;` does not work |
233 | | /// because of `Hir`'s custom `Drop` implementation. |
234 | 18.4M | fn into_parts(mut self) -> (HirKind, Properties) { |
235 | 18.4M | ( |
236 | 18.4M | core::mem::replace(&mut self.kind, HirKind::Empty), |
237 | 18.4M | core::mem::replace(&mut self.props, Properties::empty()), |
238 | 18.4M | ) |
239 | 18.4M | } |
240 | | } |
241 | | |
242 | | /// Smart constructors for HIR values. |
243 | | /// |
244 | | /// These constructors are called "smart" because they do inductive work or |
245 | | /// simplifications. For example, calling `Hir::repetition` with a repetition |
246 | | /// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind |
247 | | /// since it is equivalent to an empty regex. Another example is calling |
248 | | /// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll |
249 | | /// just get back the original `expr` since it's precisely equivalent. |
250 | | /// |
251 | | /// Smart constructors enable maintaining invariants about the HIR data type |
252 | | /// while also simulanteously keeping the representation as simple as possible. |
253 | | impl Hir { |
254 | | /// Returns an empty HIR expression. |
255 | | /// |
256 | | /// An empty HIR expression always matches, including the empty string. |
257 | | #[inline] |
258 | 2.71M | pub fn empty() -> Hir { |
259 | 2.71M | let props = Properties::empty(); |
260 | 2.71M | Hir { kind: HirKind::Empty, props } |
261 | 2.71M | } <regex_syntax::hir::Hir>::empty Line | Count | Source | 258 | 161k | pub fn empty() -> Hir { | 259 | 161k | let props = Properties::empty(); | 260 | 161k | Hir { kind: HirKind::Empty, props } | 261 | 161k | } |
<regex_syntax::hir::Hir>::empty Line | Count | Source | 258 | 2.55M | pub fn empty() -> Hir { | 259 | 2.55M | let props = Properties::empty(); | 260 | 2.55M | Hir { kind: HirKind::Empty, props } | 261 | 2.55M | } |
|
262 | | |
263 | | /// Returns an HIR expression that can never match anything. That is, |
264 | | /// the size of the set of strings in the language described by the HIR |
265 | | /// returned is `0`. |
266 | | /// |
267 | | /// This is distinct from [`Hir::empty`] in that the empty string matches |
268 | | /// the HIR returned by `Hir::empty`. That is, the set of strings in the |
269 | | /// language describe described by `Hir::empty` is non-empty. |
270 | | /// |
271 | | /// Note that currently, the HIR returned uses an empty character class to |
272 | | /// indicate that nothing can match. An equivalent expression that cannot |
273 | | /// match is an empty alternation, but all such "fail" expressions are |
274 | | /// normalized (via smart constructors) to empty character classes. This is |
275 | | /// because empty character classes can be spelled in the concrete syntax |
276 | | /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but |
277 | | /// empty alternations cannot. |
278 | | #[inline] |
279 | 10.9k | pub fn fail() -> Hir { |
280 | 10.9k | let class = Class::Bytes(ClassBytes::empty()); |
281 | 10.9k | let props = Properties::class(&class); |
282 | 10.9k | // We can't just call Hir::class here because it defers to Hir::fail |
283 | 10.9k | // in order to canonicalize the Hir value used to represent "cannot |
284 | 10.9k | // match." |
285 | 10.9k | Hir { kind: HirKind::Class(class), props } |
286 | 10.9k | } <regex_syntax::hir::Hir>::fail Line | Count | Source | 279 | 2.67k | pub fn fail() -> Hir { | 280 | 2.67k | let class = Class::Bytes(ClassBytes::empty()); | 281 | 2.67k | let props = Properties::class(&class); | 282 | 2.67k | // We can't just call Hir::class here because it defers to Hir::fail | 283 | 2.67k | // in order to canonicalize the Hir value used to represent "cannot | 284 | 2.67k | // match." | 285 | 2.67k | Hir { kind: HirKind::Class(class), props } | 286 | 2.67k | } |
<regex_syntax::hir::Hir>::fail Line | Count | Source | 279 | 8.30k | pub fn fail() -> Hir { | 280 | 8.30k | let class = Class::Bytes(ClassBytes::empty()); | 281 | 8.30k | let props = Properties::class(&class); | 282 | 8.30k | // We can't just call Hir::class here because it defers to Hir::fail | 283 | 8.30k | // in order to canonicalize the Hir value used to represent "cannot | 284 | 8.30k | // match." | 285 | 8.30k | Hir { kind: HirKind::Class(class), props } | 286 | 8.30k | } |
|
287 | | |
288 | | /// Creates a literal HIR expression. |
289 | | /// |
290 | | /// This accepts anything that can be converted into a `Box<[u8]>`. |
291 | | /// |
292 | | /// Note that there is no mechanism for storing a `char` or a `Box<str>` |
293 | | /// in an HIR. Everything is "just bytes." Whether a `Literal` (or |
294 | | /// any HIR node) matches valid UTF-8 exclusively can be queried via |
295 | | /// [`Properties::is_utf8`]. |
296 | | /// |
297 | | /// # Example |
298 | | /// |
299 | | /// This example shows that concatenations of `Literal` HIR values will |
300 | | /// automatically get flattened and combined together. So for example, even |
301 | | /// if you concat multiple `Literal` values that are themselves not valid |
302 | | /// UTF-8, they might add up to valid UTF-8. This also demonstrates just |
303 | | /// how "smart" Hir's smart constructors are. |
304 | | /// |
305 | | /// ``` |
306 | | /// use regex_syntax::hir::{Hir, HirKind, Literal}; |
307 | | /// |
308 | | /// let literals = vec![ |
309 | | /// Hir::literal([0xE2]), |
310 | | /// Hir::literal([0x98]), |
311 | | /// Hir::literal([0x83]), |
312 | | /// ]; |
313 | | /// // Each literal, on its own, is invalid UTF-8. |
314 | | /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); |
315 | | /// |
316 | | /// let concat = Hir::concat(literals); |
317 | | /// // But the concatenation is valid UTF-8! |
318 | | /// assert!(concat.properties().is_utf8()); |
319 | | /// |
320 | | /// // And also notice that the literals have been concatenated into a |
321 | | /// // single `Literal`, to the point where there is no explicit `Concat`! |
322 | | /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); |
323 | | /// assert_eq!(&expected, concat.kind()); |
324 | | /// ``` |
325 | | /// |
326 | | /// # Example: building a literal from a `char` |
327 | | /// |
328 | | /// This example shows how to build a single `Hir` literal from a `char` |
329 | | /// value. Since a [`Literal`] is just bytes, we just need to UTF-8 |
330 | | /// encode a `char` value: |
331 | | /// |
332 | | /// ``` |
333 | | /// use regex_syntax::hir::{Hir, HirKind, Literal}; |
334 | | /// |
335 | | /// let ch = '☃'; |
336 | | /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); |
337 | | /// |
338 | | /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); |
339 | | /// assert_eq!(&expected, got.kind()); |
340 | | /// ``` |
341 | | #[inline] |
342 | 12.5M | pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir { |
343 | 12.5M | let bytes = lit.into(); |
344 | 12.5M | if bytes.is_empty() { |
345 | 0 | return Hir::empty(); |
346 | 12.5M | } |
347 | 12.5M | |
348 | 12.5M | let lit = Literal(bytes); |
349 | 12.5M | let props = Properties::literal(&lit); |
350 | 12.5M | Hir { kind: HirKind::Literal(lit), props } |
351 | 12.5M | } <regex_syntax::hir::Hir>::literal::<alloc::boxed::Box<[u8]>> Line | Count | Source | 342 | 527k | pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir { | 343 | 527k | let bytes = lit.into(); | 344 | 527k | if bytes.is_empty() { | 345 | 0 | return Hir::empty(); | 346 | 527k | } | 347 | 527k | | 348 | 527k | let lit = Literal(bytes); | 349 | 527k | let props = Properties::literal(&lit); | 350 | 527k | Hir { kind: HirKind::Literal(lit), props } | 351 | 527k | } |
<regex_syntax::hir::Hir>::literal::<alloc::vec::Vec<u8>> Line | Count | Source | 342 | 11.9M | pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir { | 343 | 11.9M | let bytes = lit.into(); | 344 | 11.9M | if bytes.is_empty() { | 345 | 0 | return Hir::empty(); | 346 | 11.9M | } | 347 | 11.9M | | 348 | 11.9M | let lit = Literal(bytes); | 349 | 11.9M | let props = Properties::literal(&lit); | 350 | 11.9M | Hir { kind: HirKind::Literal(lit), props } | 351 | 11.9M | } |
|
352 | | |
353 | | /// Creates a class HIR expression. The class may either be defined over |
354 | | /// ranges of Unicode codepoints or ranges of raw byte values. |
355 | | /// |
356 | | /// Note that an empty class is permitted. An empty class is equivalent to |
357 | | /// `Hir::fail()`. |
358 | | #[inline] |
359 | 3.80M | pub fn class(class: Class) -> Hir { |
360 | 3.80M | if class.is_empty() { |
361 | 10.9k | return Hir::fail(); |
362 | 3.78M | } else if let Some(bytes) = class.literal() { |
363 | 13.4k | return Hir::literal(bytes); |
364 | 3.77M | } |
365 | 3.77M | let props = Properties::class(&class); |
366 | 3.77M | Hir { kind: HirKind::Class(class), props } |
367 | 3.80M | } <regex_syntax::hir::Hir>::class Line | Count | Source | 359 | 823k | pub fn class(class: Class) -> Hir { | 360 | 823k | if class.is_empty() { | 361 | 2.67k | return Hir::fail(); | 362 | 820k | } else if let Some(bytes) = class.literal() { | 363 | 0 | return Hir::literal(bytes); | 364 | 820k | } | 365 | 820k | let props = Properties::class(&class); | 366 | 820k | Hir { kind: HirKind::Class(class), props } | 367 | 823k | } |
<regex_syntax::hir::Hir>::class Line | Count | Source | 359 | 2.97M | pub fn class(class: Class) -> Hir { | 360 | 2.97M | if class.is_empty() { | 361 | 8.30k | return Hir::fail(); | 362 | 2.96M | } else if let Some(bytes) = class.literal() { | 363 | 13.4k | return Hir::literal(bytes); | 364 | 2.95M | } | 365 | 2.95M | let props = Properties::class(&class); | 366 | 2.95M | Hir { kind: HirKind::Class(class), props } | 367 | 2.97M | } |
|
368 | | |
369 | | /// Creates a look-around assertion HIR expression. |
370 | | #[inline] |
371 | 335k | pub fn look(look: Look) -> Hir { |
372 | 335k | let props = Properties::look(look); |
373 | 335k | Hir { kind: HirKind::Look(look), props } |
374 | 335k | } <regex_syntax::hir::Hir>::look Line | Count | Source | 371 | 12.4k | pub fn look(look: Look) -> Hir { | 372 | 12.4k | let props = Properties::look(look); | 373 | 12.4k | Hir { kind: HirKind::Look(look), props } | 374 | 12.4k | } |
<regex_syntax::hir::Hir>::look Line | Count | Source | 371 | 322k | pub fn look(look: Look) -> Hir { | 372 | 322k | let props = Properties::look(look); | 373 | 322k | Hir { kind: HirKind::Look(look), props } | 374 | 322k | } |
|
375 | | |
376 | | /// Creates a repetition HIR expression. |
377 | | #[inline] |
378 | 702k | pub fn repetition(mut rep: Repetition) -> Hir { |
379 | 702k | // If the sub-expression of a repetition can only match the empty |
380 | 702k | // string, then we force its maximum to be at most 1. |
381 | 702k | if rep.sub.properties().maximum_len() == Some(0) { |
382 | 29.9k | rep.min = cmp::min(rep.min, 1); |
383 | 29.9k | rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); |
384 | 672k | } |
385 | | // The regex 'a{0}' is always equivalent to the empty regex. This is |
386 | | // true even when 'a' is an expression that never matches anything |
387 | | // (like '\P{any}'). |
388 | | // |
389 | | // Additionally, the regex 'a{1}' is always equivalent to 'a'. |
390 | 702k | if rep.min == 0 && rep.max == Some(0) { |
391 | 4.08k | return Hir::empty(); |
392 | 698k | } else if rep.min == 1 && rep.max == Some(1) { |
393 | 3.97k | return *rep.sub; |
394 | 694k | } |
395 | 694k | let props = Properties::repetition(&rep); |
396 | 694k | Hir { kind: HirKind::Repetition(rep), props } |
397 | 702k | } <regex_syntax::hir::Hir>::repetition Line | Count | Source | 378 | 249k | pub fn repetition(mut rep: Repetition) -> Hir { | 379 | 249k | // If the sub-expression of a repetition can only match the empty | 380 | 249k | // string, then we force its maximum to be at most 1. | 381 | 249k | if rep.sub.properties().maximum_len() == Some(0) { | 382 | 2.17k | rep.min = cmp::min(rep.min, 1); | 383 | 2.17k | rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); | 384 | 247k | } | 385 | | // The regex 'a{0}' is always equivalent to the empty regex. This is | 386 | | // true even when 'a' is an expression that never matches anything | 387 | | // (like '\P{any}'). | 388 | | // | 389 | | // Additionally, the regex 'a{1}' is always equivalent to 'a'. | 390 | 249k | if rep.min == 0 && rep.max == Some(0) { | 391 | 0 | return Hir::empty(); | 392 | 249k | } else if rep.min == 1 && rep.max == Some(1) { | 393 | 0 | return *rep.sub; | 394 | 249k | } | 395 | 249k | let props = Properties::repetition(&rep); | 396 | 249k | Hir { kind: HirKind::Repetition(rep), props } | 397 | 249k | } |
<regex_syntax::hir::Hir>::repetition Line | Count | Source | 378 | 453k | pub fn repetition(mut rep: Repetition) -> Hir { | 379 | 453k | // If the sub-expression of a repetition can only match the empty | 380 | 453k | // string, then we force its maximum to be at most 1. | 381 | 453k | if rep.sub.properties().maximum_len() == Some(0) { | 382 | 27.7k | rep.min = cmp::min(rep.min, 1); | 383 | 27.7k | rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); | 384 | 425k | } | 385 | | // The regex 'a{0}' is always equivalent to the empty regex. This is | 386 | | // true even when 'a' is an expression that never matches anything | 387 | | // (like '\P{any}'). | 388 | | // | 389 | | // Additionally, the regex 'a{1}' is always equivalent to 'a'. | 390 | 453k | if rep.min == 0 && rep.max == Some(0) { | 391 | 4.08k | return Hir::empty(); | 392 | 448k | } else if rep.min == 1 && rep.max == Some(1) { | 393 | 3.97k | return *rep.sub; | 394 | 444k | } | 395 | 444k | let props = Properties::repetition(&rep); | 396 | 444k | Hir { kind: HirKind::Repetition(rep), props } | 397 | 453k | } |
|
398 | | |
399 | | /// Creates a capture HIR expression. |
400 | | /// |
401 | | /// Note that there is no explicit HIR value for a non-capturing group. |
402 | | /// Since a non-capturing group only exists to override precedence in the |
403 | | /// concrete syntax and since an HIR already does its own grouping based on |
404 | | /// what is parsed, there is no need to explicitly represent non-capturing |
405 | | /// groups in the HIR. |
406 | | #[inline] |
407 | 99.7k | pub fn capture(capture: Capture) -> Hir { |
408 | 99.7k | let props = Properties::capture(&capture); |
409 | 99.7k | Hir { kind: HirKind::Capture(capture), props } |
410 | 99.7k | } |
411 | | |
412 | | /// Returns the concatenation of the given expressions. |
413 | | /// |
414 | | /// This attempts to flatten and simplify the concatenation as appropriate. |
415 | | /// |
416 | | /// # Example |
417 | | /// |
418 | | /// This shows a simple example of basic flattening of both concatenations |
419 | | /// and literals. |
420 | | /// |
421 | | /// ``` |
422 | | /// use regex_syntax::hir::Hir; |
423 | | /// |
424 | | /// let hir = Hir::concat(vec![ |
425 | | /// Hir::concat(vec![ |
426 | | /// Hir::literal([b'a']), |
427 | | /// Hir::literal([b'b']), |
428 | | /// Hir::literal([b'c']), |
429 | | /// ]), |
430 | | /// Hir::concat(vec![ |
431 | | /// Hir::literal([b'x']), |
432 | | /// Hir::literal([b'y']), |
433 | | /// Hir::literal([b'z']), |
434 | | /// ]), |
435 | | /// ]); |
436 | | /// let expected = Hir::literal("abcxyz".as_bytes()); |
437 | | /// assert_eq!(expected, hir); |
438 | | /// ``` |
439 | 1.38M | pub fn concat(subs: Vec<Hir>) -> Hir { |
440 | 1.38M | // We rebuild the concatenation by simplifying it. Would be nice to do |
441 | 1.38M | // it in place, but that seems a little tricky? |
442 | 1.38M | let mut new = vec![]; |
443 | 1.38M | // This gobbles up any adjacent literals in a concatenation and smushes |
444 | 1.38M | // them together. Basically, when we see a literal, we add its bytes |
445 | 1.38M | // to 'prior_lit', and whenever we see anything else, we first take |
446 | 1.38M | // any bytes in 'prior_lit' and add it to the 'new' concatenation. |
447 | 1.38M | let mut prior_lit: Option<Vec<u8>> = None; |
448 | 8.42M | for sub in subs { |
449 | 7.04M | let (kind, props) = sub.into_parts(); |
450 | 7.04M | match kind { |
451 | 1.80M | HirKind::Literal(Literal(bytes)) => { |
452 | 1.80M | if let Some(ref mut prior_bytes) = prior_lit { |
453 | 22.8k | prior_bytes.extend_from_slice(&bytes); |
454 | 1.78M | } else { |
455 | 1.78M | prior_lit = Some(bytes.to_vec()); |
456 | 1.78M | } |
457 | | } |
458 | | // We also flatten concats that are direct children of another |
459 | | // concat. We only need to do this one level deep since |
460 | | // Hir::concat is the only way to build concatenations, and so |
461 | | // flattening happens inductively. |
462 | 2.57k | HirKind::Concat(subs2) => { |
463 | 39.2k | for sub2 in subs2 { |
464 | 36.7k | let (kind2, props2) = sub2.into_parts(); |
465 | 36.7k | match kind2 { |
466 | 14.1k | HirKind::Literal(Literal(bytes)) => { |
467 | 14.1k | if let Some(ref mut prior_bytes) = prior_lit { |
468 | 896 | prior_bytes.extend_from_slice(&bytes); |
469 | 13.2k | } else { |
470 | 13.2k | prior_lit = Some(bytes.to_vec()); |
471 | 13.2k | } |
472 | | } |
473 | 22.5k | kind2 => { |
474 | 22.5k | if let Some(prior_bytes) = prior_lit.take() { |
475 | 13.6k | new.push(Hir::literal(prior_bytes)); |
476 | 13.6k | } |
477 | 22.5k | new.push(Hir { kind: kind2, props: props2 }); |
478 | | } |
479 | | } |
480 | | } |
481 | | } |
482 | | // We can just skip empty HIRs. |
483 | 1.33k | HirKind::Empty => {} |
484 | 5.22M | kind => { |
485 | 5.22M | if let Some(prior_bytes) = prior_lit.take() { |
486 | 697k | new.push(Hir::literal(prior_bytes)); |
487 | 4.53M | } |
488 | 5.22M | new.push(Hir { kind, props }); |
489 | | } |
490 | | } |
491 | | } |
492 | 1.38M | if let Some(prior_bytes) = prior_lit.take() { |
493 | 1.08M | new.push(Hir::literal(prior_bytes)); |
494 | 1.08M | } |
495 | 1.38M | if new.is_empty() { |
496 | 673 | return Hir::empty(); |
497 | 1.38M | } else if new.len() == 1 { |
498 | 998k | return new.pop().unwrap(); |
499 | 382k | } |
500 | 382k | let props = Properties::concat(&new); |
501 | 382k | Hir { kind: HirKind::Concat(new), props } |
502 | 1.38M | } |
503 | | |
504 | | /// Returns the alternation of the given expressions. |
505 | | /// |
506 | | /// This flattens and simplifies the alternation as appropriate. This may |
507 | | /// include factoring out common prefixes or even rewriting the alternation |
508 | | /// as a character class. |
509 | | /// |
510 | | /// Note that an empty alternation is equivalent to `Hir::fail()`. (It |
511 | | /// is not possible for one to write an empty alternation, or even an |
512 | | /// alternation with a single sub-expression, in the concrete syntax of a |
513 | | /// regex.) |
514 | | /// |
515 | | /// # Example |
516 | | /// |
517 | | /// This is a simple example showing how an alternation might get |
518 | | /// simplified. |
519 | | /// |
520 | | /// ``` |
521 | | /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; |
522 | | /// |
523 | | /// let hir = Hir::alternation(vec![ |
524 | | /// Hir::literal([b'a']), |
525 | | /// Hir::literal([b'b']), |
526 | | /// Hir::literal([b'c']), |
527 | | /// Hir::literal([b'd']), |
528 | | /// Hir::literal([b'e']), |
529 | | /// Hir::literal([b'f']), |
530 | | /// ]); |
531 | | /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([ |
532 | | /// ClassUnicodeRange::new('a', 'f'), |
533 | | /// ]))); |
534 | | /// assert_eq!(expected, hir); |
535 | | /// ``` |
536 | | /// |
537 | | /// And another example showing how common prefixes might get factored |
538 | | /// out. |
539 | | /// |
540 | | /// ``` |
541 | | /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; |
542 | | /// |
543 | | /// let hir = Hir::alternation(vec![ |
544 | | /// Hir::concat(vec![ |
545 | | /// Hir::literal("abc".as_bytes()), |
546 | | /// Hir::class(Class::Unicode(ClassUnicode::new([ |
547 | | /// ClassUnicodeRange::new('A', 'Z'), |
548 | | /// ]))), |
549 | | /// ]), |
550 | | /// Hir::concat(vec![ |
551 | | /// Hir::literal("abc".as_bytes()), |
552 | | /// Hir::class(Class::Unicode(ClassUnicode::new([ |
553 | | /// ClassUnicodeRange::new('a', 'z'), |
554 | | /// ]))), |
555 | | /// ]), |
556 | | /// ]); |
557 | | /// let expected = Hir::concat(vec![ |
558 | | /// Hir::literal("abc".as_bytes()), |
559 | | /// Hir::alternation(vec![ |
560 | | /// Hir::class(Class::Unicode(ClassUnicode::new([ |
561 | | /// ClassUnicodeRange::new('A', 'Z'), |
562 | | /// ]))), |
563 | | /// Hir::class(Class::Unicode(ClassUnicode::new([ |
564 | | /// ClassUnicodeRange::new('a', 'z'), |
565 | | /// ]))), |
566 | | /// ]), |
567 | | /// ]); |
568 | | /// assert_eq!(expected, hir); |
569 | | /// ``` |
570 | | /// |
571 | | /// Note that these sorts of simplifications are not guaranteed. |
572 | 70.6k | pub fn alternation(subs: Vec<Hir>) -> Hir { |
573 | 70.6k | // We rebuild the alternation by simplifying it. We proceed similarly |
574 | 70.6k | // as the concatenation case. But in this case, there's no literal |
575 | 70.6k | // simplification happening. We're just flattening alternations. |
576 | 70.6k | let mut new = Vec::with_capacity(subs.len()); |
577 | 11.4M | for sub in subs { |
578 | 11.4M | let (kind, props) = sub.into_parts(); |
579 | 11.4M | match kind { |
580 | 1.29k | HirKind::Alternation(subs2) => { |
581 | 1.29k | new.extend(subs2); |
582 | 1.29k | } |
583 | 11.4M | kind => { |
584 | 11.4M | new.push(Hir { kind, props }); |
585 | 11.4M | } |
586 | | } |
587 | | } |
588 | 70.6k | if new.is_empty() { |
589 | 0 | return Hir::fail(); |
590 | 70.6k | } else if new.len() == 1 { |
591 | 0 | return new.pop().unwrap(); |
592 | 70.6k | } |
593 | | // Now that it's completely flattened, look for the special case of |
594 | | // 'char1|char2|...|charN' and collapse that into a class. Note that |
595 | | // we look for 'char' first and then bytes. The issue here is that if |
596 | | // we find both non-ASCII codepoints and non-ASCII singleton bytes, |
597 | | // then it isn't actually possible to smush them into a single class. |
598 | | // (Because classes are either "all codepoints" or "all bytes." You |
599 | | // can have a class that both matches non-ASCII but valid UTF-8 and |
600 | | // invalid UTF-8.) So we look for all chars and then all bytes, and |
601 | | // don't handle anything else. |
602 | 70.6k | if let Some(singletons) = singleton_chars(&new) { |
603 | 3.65k | let it = singletons |
604 | 3.65k | .into_iter() |
605 | 469k | .map(|ch| ClassUnicodeRange { start: ch, end: ch }); |
606 | 3.65k | return Hir::class(Class::Unicode(ClassUnicode::new(it))); |
607 | 66.9k | } |
608 | 66.9k | if let Some(singletons) = singleton_bytes(&new) { |
609 | 0 | let it = singletons |
610 | 0 | .into_iter() |
611 | 0 | .map(|b| ClassBytesRange { start: b, end: b }); |
612 | 0 | return Hir::class(Class::Bytes(ClassBytes::new(it))); |
613 | 66.9k | } |
614 | | // Similar to singleton chars, we can also look for alternations of |
615 | | // classes. Those can be smushed into a single class. |
616 | 66.9k | if let Some(cls) = class_chars(&new) { |
617 | 78 | return Hir::class(cls); |
618 | 66.8k | } |
619 | 66.8k | if let Some(cls) = class_bytes(&new) { |
620 | 0 | return Hir::class(cls); |
621 | 66.8k | } |
622 | | // Factor out a common prefix if we can, which might potentially |
623 | | // simplify the expression and unlock other optimizations downstream. |
624 | | // It also might generally make NFA matching and DFA construction |
625 | | // faster by reducing the scope of branching in the regex. |
626 | 66.8k | new = match lift_common_prefix(new) { |
627 | 1.71k | Ok(hir) => return hir, |
628 | 65.1k | Err(unchanged) => unchanged, |
629 | 65.1k | }; |
630 | 65.1k | let props = Properties::alternation(&new); |
631 | 65.1k | Hir { kind: HirKind::Alternation(new), props } |
632 | 70.6k | } |
633 | | |
634 | | /// Returns an HIR expression for `.`. |
635 | | /// |
636 | | /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. |
637 | | /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. |
638 | | /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. |
639 | | /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. |
640 | | /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. |
641 | | /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. |
642 | | /// |
643 | | /// # Example |
644 | | /// |
645 | | /// Note that this is a convenience routine for constructing the correct |
646 | | /// character class based on the value of `Dot`. There is no explicit "dot" |
647 | | /// HIR value. It is just an abbreviation for a common character class. |
648 | | /// |
649 | | /// ``` |
650 | | /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; |
651 | | /// |
652 | | /// let hir = Hir::dot(Dot::AnyByte); |
653 | | /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ |
654 | | /// ClassBytesRange::new(0x00, 0xFF), |
655 | | /// ]))); |
656 | | /// assert_eq!(expected, hir); |
657 | | /// ``` |
658 | | #[inline] |
659 | 1.42M | pub fn dot(dot: Dot) -> Hir { |
660 | 1.42M | match dot { |
661 | 1.29k | Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([ |
662 | 1.29k | ClassUnicodeRange::new('\0', '\u{10FFFF}'), |
663 | 1.29k | ]))), |
664 | 221k | Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([ |
665 | 221k | ClassBytesRange::new(b'\0', b'\xFF'), |
666 | 221k | ]))), |
667 | 1.20M | Dot::AnyCharExcept(ch) => { |
668 | 1.20M | let mut cls = |
669 | 1.20M | ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); |
670 | 1.20M | cls.negate(); |
671 | 1.20M | Hir::class(Class::Unicode(cls)) |
672 | | } |
673 | | Dot::AnyCharExceptLF => { |
674 | 0 | Hir::class(Class::Unicode(ClassUnicode::new([ |
675 | 0 | ClassUnicodeRange::new('\0', '\x09'), |
676 | 0 | ClassUnicodeRange::new('\x0B', '\u{10FFFF}'), |
677 | 0 | ]))) |
678 | | } |
679 | | Dot::AnyCharExceptCRLF => { |
680 | 319 | Hir::class(Class::Unicode(ClassUnicode::new([ |
681 | 319 | ClassUnicodeRange::new('\0', '\x09'), |
682 | 319 | ClassUnicodeRange::new('\x0B', '\x0C'), |
683 | 319 | ClassUnicodeRange::new('\x0E', '\u{10FFFF}'), |
684 | 319 | ]))) |
685 | | } |
686 | 0 | Dot::AnyByteExcept(byte) => { |
687 | 0 | let mut cls = |
688 | 0 | ClassBytes::new([ClassBytesRange::new(byte, byte)]); |
689 | 0 | cls.negate(); |
690 | 0 | Hir::class(Class::Bytes(cls)) |
691 | | } |
692 | | Dot::AnyByteExceptLF => { |
693 | 0 | Hir::class(Class::Bytes(ClassBytes::new([ |
694 | 0 | ClassBytesRange::new(b'\0', b'\x09'), |
695 | 0 | ClassBytesRange::new(b'\x0B', b'\xFF'), |
696 | 0 | ]))) |
697 | | } |
698 | | Dot::AnyByteExceptCRLF => { |
699 | 0 | Hir::class(Class::Bytes(ClassBytes::new([ |
700 | 0 | ClassBytesRange::new(b'\0', b'\x09'), |
701 | 0 | ClassBytesRange::new(b'\x0B', b'\x0C'), |
702 | 0 | ClassBytesRange::new(b'\x0E', b'\xFF'), |
703 | 0 | ]))) |
704 | | } |
705 | | } |
706 | 1.42M | } <regex_syntax::hir::Hir>::dot Line | Count | Source | 659 | 221k | pub fn dot(dot: Dot) -> Hir { | 660 | 221k | match dot { | 661 | 0 | Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([ | 662 | 0 | ClassUnicodeRange::new('\0', '\u{10FFFF}'), | 663 | 0 | ]))), | 664 | 221k | Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([ | 665 | 221k | ClassBytesRange::new(b'\0', b'\xFF'), | 666 | 221k | ]))), | 667 | 0 | Dot::AnyCharExcept(ch) => { | 668 | 0 | let mut cls = | 669 | 0 | ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); | 670 | 0 | cls.negate(); | 671 | 0 | Hir::class(Class::Unicode(cls)) | 672 | | } | 673 | | Dot::AnyCharExceptLF => { | 674 | 0 | Hir::class(Class::Unicode(ClassUnicode::new([ | 675 | 0 | ClassUnicodeRange::new('\0', '\x09'), | 676 | 0 | ClassUnicodeRange::new('\x0B', '\u{10FFFF}'), | 677 | 0 | ]))) | 678 | | } | 679 | | Dot::AnyCharExceptCRLF => { | 680 | 0 | Hir::class(Class::Unicode(ClassUnicode::new([ | 681 | 0 | ClassUnicodeRange::new('\0', '\x09'), | 682 | 0 | ClassUnicodeRange::new('\x0B', '\x0C'), | 683 | 0 | ClassUnicodeRange::new('\x0E', '\u{10FFFF}'), | 684 | 0 | ]))) | 685 | | } | 686 | 0 | Dot::AnyByteExcept(byte) => { | 687 | 0 | let mut cls = | 688 | 0 | ClassBytes::new([ClassBytesRange::new(byte, byte)]); | 689 | 0 | cls.negate(); | 690 | 0 | Hir::class(Class::Bytes(cls)) | 691 | | } | 692 | | Dot::AnyByteExceptLF => { | 693 | 0 | Hir::class(Class::Bytes(ClassBytes::new([ | 694 | 0 | ClassBytesRange::new(b'\0', b'\x09'), | 695 | 0 | ClassBytesRange::new(b'\x0B', b'\xFF'), | 696 | 0 | ]))) | 697 | | } | 698 | | Dot::AnyByteExceptCRLF => { | 699 | 0 | Hir::class(Class::Bytes(ClassBytes::new([ | 700 | 0 | ClassBytesRange::new(b'\0', b'\x09'), | 701 | 0 | ClassBytesRange::new(b'\x0B', b'\x0C'), | 702 | 0 | ClassBytesRange::new(b'\x0E', b'\xFF'), | 703 | 0 | ]))) | 704 | | } | 705 | | } | 706 | 221k | } |
<regex_syntax::hir::Hir>::dot Line | Count | Source | 659 | 1.20M | pub fn dot(dot: Dot) -> Hir { | 660 | 1.20M | match dot { | 661 | 1.29k | Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([ | 662 | 1.29k | ClassUnicodeRange::new('\0', '\u{10FFFF}'), | 663 | 1.29k | ]))), | 664 | 0 | Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([ | 665 | 0 | ClassBytesRange::new(b'\0', b'\xFF'), | 666 | 0 | ]))), | 667 | 1.20M | Dot::AnyCharExcept(ch) => { | 668 | 1.20M | let mut cls = | 669 | 1.20M | ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); | 670 | 1.20M | cls.negate(); | 671 | 1.20M | Hir::class(Class::Unicode(cls)) | 672 | | } | 673 | | Dot::AnyCharExceptLF => { | 674 | 0 | Hir::class(Class::Unicode(ClassUnicode::new([ | 675 | 0 | ClassUnicodeRange::new('\0', '\x09'), | 676 | 0 | ClassUnicodeRange::new('\x0B', '\u{10FFFF}'), | 677 | 0 | ]))) | 678 | | } | 679 | | Dot::AnyCharExceptCRLF => { | 680 | 319 | Hir::class(Class::Unicode(ClassUnicode::new([ | 681 | 319 | ClassUnicodeRange::new('\0', '\x09'), | 682 | 319 | ClassUnicodeRange::new('\x0B', '\x0C'), | 683 | 319 | ClassUnicodeRange::new('\x0E', '\u{10FFFF}'), | 684 | 319 | ]))) | 685 | | } | 686 | 0 | Dot::AnyByteExcept(byte) => { | 687 | 0 | let mut cls = | 688 | 0 | ClassBytes::new([ClassBytesRange::new(byte, byte)]); | 689 | 0 | cls.negate(); | 690 | 0 | Hir::class(Class::Bytes(cls)) | 691 | | } | 692 | | Dot::AnyByteExceptLF => { | 693 | 0 | Hir::class(Class::Bytes(ClassBytes::new([ | 694 | 0 | ClassBytesRange::new(b'\0', b'\x09'), | 695 | 0 | ClassBytesRange::new(b'\x0B', b'\xFF'), | 696 | 0 | ]))) | 697 | | } | 698 | | Dot::AnyByteExceptCRLF => { | 699 | 0 | Hir::class(Class::Bytes(ClassBytes::new([ | 700 | 0 | ClassBytesRange::new(b'\0', b'\x09'), | 701 | 0 | ClassBytesRange::new(b'\x0B', b'\x0C'), | 702 | 0 | ClassBytesRange::new(b'\x0E', b'\xFF'), | 703 | 0 | ]))) | 704 | | } | 705 | | } | 706 | 1.20M | } |
|
707 | | } |
708 | | |
709 | | /// The underlying kind of an arbitrary [`Hir`] expression. |
710 | | /// |
711 | | /// An `HirKind` is principally useful for doing case analysis on the type |
712 | | /// of a regular expression. If you're looking to build new `Hir` values, |
713 | | /// then you _must_ use the smart constructors defined on `Hir`, like |
714 | | /// [`Hir::repetition`], to build new `Hir` values. The API intentionally does |
715 | | /// not expose any way of building an `Hir` directly from an `HirKind`. |
716 | | #[derive(Clone, Debug, Eq, PartialEq)] |
717 | | pub enum HirKind { |
718 | | /// The empty regular expression, which matches everything, including the |
719 | | /// empty string. |
720 | | Empty, |
721 | | /// A literalstring that matches exactly these bytes. |
722 | | Literal(Literal), |
723 | | /// A single character class that matches any of the characters in the |
724 | | /// class. A class can either consist of Unicode scalar values as |
725 | | /// characters, or it can use bytes. |
726 | | /// |
727 | | /// A class may be empty. In which case, it matches nothing. |
728 | | Class(Class), |
729 | | /// A look-around assertion. A look-around match always has zero length. |
730 | | Look(Look), |
731 | | /// A repetition operation applied to a sub-expression. |
732 | | Repetition(Repetition), |
733 | | /// A capturing group, which contains a sub-expression. |
734 | | Capture(Capture), |
735 | | /// A concatenation of expressions. |
736 | | /// |
737 | | /// A concatenation matches only if each of its sub-expressions match one |
738 | | /// after the other. |
739 | | /// |
740 | | /// Concatenations are guaranteed by `Hir`'s smart constructors to always |
741 | | /// have at least two sub-expressions. |
742 | | Concat(Vec<Hir>), |
743 | | /// An alternation of expressions. |
744 | | /// |
745 | | /// An alternation matches only if at least one of its sub-expressions |
746 | | /// match. If multiple sub-expressions match, then the leftmost is |
747 | | /// preferred. |
748 | | /// |
749 | | /// Alternations are guaranteed by `Hir`'s smart constructors to always |
750 | | /// have at least two sub-expressions. |
751 | | Alternation(Vec<Hir>), |
752 | | } |
753 | | |
754 | | impl HirKind { |
755 | | /// Returns a slice of this kind's sub-expressions, if any. |
756 | 833k | pub fn subs(&self) -> &[Hir] { |
757 | | use core::slice::from_ref; |
758 | | |
759 | 833k | match *self { |
760 | | HirKind::Empty |
761 | | | HirKind::Literal(_) |
762 | | | HirKind::Class(_) |
763 | 794k | | HirKind::Look(_) => &[], |
764 | 31.8k | HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), |
765 | 547 | HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), |
766 | 3.66k | HirKind::Concat(ref subs) => subs, |
767 | 2.43k | HirKind::Alternation(ref subs) => subs, |
768 | | } |
769 | 833k | } |
770 | | } |
771 | | |
772 | | impl core::fmt::Debug for Hir { |
773 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
774 | 0 | self.kind.fmt(f) |
775 | 0 | } |
776 | | } |
777 | | |
778 | | /// Print a display representation of this Hir. |
779 | | /// |
780 | | /// The result of this is a valid regular expression pattern string. |
781 | | /// |
782 | | /// This implementation uses constant stack space and heap space proportional |
783 | | /// to the size of the `Hir`. |
784 | | impl core::fmt::Display for Hir { |
785 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
786 | 0 | crate::hir::print::Printer::new().print(self, f) |
787 | 0 | } |
788 | | } |
789 | | |
790 | | /// The high-level intermediate representation of a literal. |
791 | | /// |
792 | | /// A literal corresponds to `0` or more bytes that should be matched |
793 | | /// literally. The smart constructors defined on `Hir` will automatically |
794 | | /// concatenate adjacent literals into one literal, and will even automatically |
795 | | /// replace empty literals with `Hir::empty()`. |
796 | | /// |
797 | | /// Note that despite a literal being represented by a sequence of bytes, its |
798 | | /// `Debug` implementation will attempt to print it as a normal string. (That |
799 | | /// is, not a sequence of decimal numbers.) |
800 | | #[derive(Clone, Eq, PartialEq)] |
801 | | pub struct Literal(pub Box<[u8]>); |
802 | | |
803 | | impl core::fmt::Debug for Literal { |
804 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
805 | 0 | crate::debug::Bytes(&self.0).fmt(f) |
806 | 0 | } |
807 | | } |
808 | | |
809 | | /// The high-level intermediate representation of a character class. |
810 | | /// |
811 | | /// A character class corresponds to a set of characters. A character is either |
812 | | /// defined by a Unicode scalar value or a byte. |
813 | | /// |
814 | | /// A character class, regardless of its character type, is represented by a |
815 | | /// sequence of non-overlapping non-adjacent ranges of characters. |
816 | | /// |
817 | | /// There are no guarantees about which class variant is used. Generally |
818 | | /// speaking, the Unicode variat is used whenever a class needs to contain |
819 | | /// non-ASCII Unicode scalar values. But the Unicode variant can be used even |
820 | | /// when Unicode mode is disabled. For example, at the time of writing, the |
821 | | /// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class |
822 | | /// `[a\u00A0]` due to optimizations. |
823 | | /// |
824 | | /// Note that `Bytes` variant may be produced even when it exclusively matches |
825 | | /// valid UTF-8. This is because a `Bytes` variant represents an intention by |
826 | | /// the author of the regular expression to disable Unicode mode, which in turn |
827 | | /// impacts the semantics of case insensitive matching. For example, `(?i)k` |
828 | | /// and `(?i-u)k` will not match the same set of strings. |
829 | | #[derive(Clone, Eq, PartialEq)] |
830 | | pub enum Class { |
831 | | /// A set of characters represented by Unicode scalar values. |
832 | | Unicode(ClassUnicode), |
833 | | /// A set of characters represented by arbitrary bytes (one byte per |
834 | | /// character). |
835 | | Bytes(ClassBytes), |
836 | | } |
837 | | |
838 | | impl Class { |
839 | | /// Apply Unicode simple case folding to this character class, in place. |
840 | | /// The character class will be expanded to include all simple case folded |
841 | | /// character variants. |
842 | | /// |
843 | | /// If this is a byte oriented character class, then this will be limited |
844 | | /// to the ASCII ranges `A-Z` and `a-z`. |
845 | | /// |
846 | | /// # Panics |
847 | | /// |
848 | | /// This routine panics when the case mapping data necessary for this |
849 | | /// routine to complete is unavailable. This occurs when the `unicode-case` |
850 | | /// feature is not enabled and the underlying class is Unicode oriented. |
851 | | /// |
852 | | /// Callers should prefer using `try_case_fold_simple` instead, which will |
853 | | /// return an error instead of panicking. |
854 | 0 | pub fn case_fold_simple(&mut self) { |
855 | 0 | match *self { |
856 | 0 | Class::Unicode(ref mut x) => x.case_fold_simple(), |
857 | 0 | Class::Bytes(ref mut x) => x.case_fold_simple(), |
858 | | } |
859 | 0 | } |
860 | | |
861 | | /// Apply Unicode simple case folding to this character class, in place. |
862 | | /// The character class will be expanded to include all simple case folded |
863 | | /// character variants. |
864 | | /// |
865 | | /// If this is a byte oriented character class, then this will be limited |
866 | | /// to the ASCII ranges `A-Z` and `a-z`. |
867 | | /// |
868 | | /// # Error |
869 | | /// |
870 | | /// This routine returns an error when the case mapping data necessary |
871 | | /// for this routine to complete is unavailable. This occurs when the |
872 | | /// `unicode-case` feature is not enabled and the underlying class is |
873 | | /// Unicode oriented. |
874 | 0 | pub fn try_case_fold_simple( |
875 | 0 | &mut self, |
876 | 0 | ) -> core::result::Result<(), CaseFoldError> { |
877 | 0 | match *self { |
878 | 0 | Class::Unicode(ref mut x) => x.try_case_fold_simple()?, |
879 | 0 | Class::Bytes(ref mut x) => x.case_fold_simple(), |
880 | | } |
881 | 0 | Ok(()) |
882 | 0 | } |
883 | | |
884 | | /// Negate this character class in place. |
885 | | /// |
886 | | /// After completion, this character class will contain precisely the |
887 | | /// characters that weren't previously in the class. |
888 | 0 | pub fn negate(&mut self) { |
889 | 0 | match *self { |
890 | 0 | Class::Unicode(ref mut x) => x.negate(), |
891 | 0 | Class::Bytes(ref mut x) => x.negate(), |
892 | | } |
893 | 0 | } |
894 | | |
895 | | /// Returns true if and only if this character class will only ever match |
896 | | /// valid UTF-8. |
897 | | /// |
898 | | /// A character class can match invalid UTF-8 only when the following |
899 | | /// conditions are met: |
900 | | /// |
901 | | /// 1. The translator was configured to permit generating an expression |
902 | | /// that can match invalid UTF-8. (By default, this is disabled.) |
903 | | /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete |
904 | | /// syntax or in the parser builder. By default, Unicode mode is |
905 | | /// enabled. |
906 | 3.78M | pub fn is_utf8(&self) -> bool { |
907 | 3.78M | match *self { |
908 | 3.39M | Class::Unicode(_) => true, |
909 | 389k | Class::Bytes(ref x) => x.is_ascii(), |
910 | | } |
911 | 3.78M | } |
912 | | |
913 | | /// Returns the length, in bytes, of the smallest string matched by this |
914 | | /// character class. |
915 | | /// |
916 | | /// For non-empty byte oriented classes, this always returns `1`. For |
917 | | /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or |
918 | | /// `4`. For empty classes, `None` is returned. It is impossible for `0` to |
919 | | /// be returned. |
920 | | /// |
921 | | /// # Example |
922 | | /// |
923 | | /// This example shows some examples of regexes and their corresponding |
924 | | /// minimum length, if any. |
925 | | /// |
926 | | /// ``` |
927 | | /// use regex_syntax::{hir::Properties, parse}; |
928 | | /// |
929 | | /// // The empty string has a min length of 0. |
930 | | /// let hir = parse(r"")?; |
931 | | /// assert_eq!(Some(0), hir.properties().minimum_len()); |
932 | | /// // As do other types of regexes that only match the empty string. |
933 | | /// let hir = parse(r"^$\b\B")?; |
934 | | /// assert_eq!(Some(0), hir.properties().minimum_len()); |
935 | | /// // A regex that can match the empty string but match more is still 0. |
936 | | /// let hir = parse(r"a*")?; |
937 | | /// assert_eq!(Some(0), hir.properties().minimum_len()); |
938 | | /// // A regex that matches nothing has no minimum defined. |
939 | | /// let hir = parse(r"[a&&b]")?; |
940 | | /// assert_eq!(None, hir.properties().minimum_len()); |
941 | | /// // Character classes usually have a minimum length of 1. |
942 | | /// let hir = parse(r"\w")?; |
943 | | /// assert_eq!(Some(1), hir.properties().minimum_len()); |
944 | | /// // But sometimes Unicode classes might be bigger! |
945 | | /// let hir = parse(r"\p{Cyrillic}")?; |
946 | | /// assert_eq!(Some(2), hir.properties().minimum_len()); |
947 | | /// |
948 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
949 | | /// ``` |
950 | 3.78M | pub fn minimum_len(&self) -> Option<usize> { |
951 | 3.78M | match *self { |
952 | 3.39M | Class::Unicode(ref x) => x.minimum_len(), |
953 | 389k | Class::Bytes(ref x) => x.minimum_len(), |
954 | | } |
955 | 3.78M | } |
956 | | |
957 | | /// Returns the length, in bytes, of the longest string matched by this |
958 | | /// character class. |
959 | | /// |
960 | | /// For non-empty byte oriented classes, this always returns `1`. For |
961 | | /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or |
962 | | /// `4`. For empty classes, `None` is returned. It is impossible for `0` to |
963 | | /// be returned. |
964 | | /// |
965 | | /// # Example |
966 | | /// |
967 | | /// This example shows some examples of regexes and their corresponding |
968 | | /// maximum length, if any. |
969 | | /// |
970 | | /// ``` |
971 | | /// use regex_syntax::{hir::Properties, parse}; |
972 | | /// |
973 | | /// // The empty string has a max length of 0. |
974 | | /// let hir = parse(r"")?; |
975 | | /// assert_eq!(Some(0), hir.properties().maximum_len()); |
976 | | /// // As do other types of regexes that only match the empty string. |
977 | | /// let hir = parse(r"^$\b\B")?; |
978 | | /// assert_eq!(Some(0), hir.properties().maximum_len()); |
979 | | /// // A regex that matches nothing has no maximum defined. |
980 | | /// let hir = parse(r"[a&&b]")?; |
981 | | /// assert_eq!(None, hir.properties().maximum_len()); |
982 | | /// // Bounded repeats work as you expect. |
983 | | /// let hir = parse(r"x{2,10}")?; |
984 | | /// assert_eq!(Some(10), hir.properties().maximum_len()); |
985 | | /// // An unbounded repeat means there is no maximum. |
986 | | /// let hir = parse(r"x{2,}")?; |
987 | | /// assert_eq!(None, hir.properties().maximum_len()); |
988 | | /// // With Unicode enabled, \w can match up to 4 bytes! |
989 | | /// let hir = parse(r"\w")?; |
990 | | /// assert_eq!(Some(4), hir.properties().maximum_len()); |
991 | | /// // Without Unicode enabled, \w matches at most 1 byte. |
992 | | /// let hir = parse(r"(?-u)\w")?; |
993 | | /// assert_eq!(Some(1), hir.properties().maximum_len()); |
994 | | /// |
995 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
996 | | /// ``` |
997 | 3.78M | pub fn maximum_len(&self) -> Option<usize> { |
998 | 3.78M | match *self { |
999 | 3.39M | Class::Unicode(ref x) => x.maximum_len(), |
1000 | 389k | Class::Bytes(ref x) => x.maximum_len(), |
1001 | | } |
1002 | 3.78M | } |
1003 | | |
1004 | | /// Returns true if and only if this character class is empty. That is, |
1005 | | /// it has no elements. |
1006 | | /// |
1007 | | /// An empty character can never match anything, including an empty string. |
1008 | 3.80M | pub fn is_empty(&self) -> bool { |
1009 | 3.80M | match *self { |
1010 | 3.41M | Class::Unicode(ref x) => x.ranges().is_empty(), |
1011 | 383k | Class::Bytes(ref x) => x.ranges().is_empty(), |
1012 | | } |
1013 | 3.80M | } |
1014 | | |
1015 | | /// If this class consists of exactly one element (whether a codepoint or a |
1016 | | /// byte), then return it as a literal byte string. |
1017 | | /// |
1018 | | /// If this class is empty or contains more than one element, then `None` |
1019 | | /// is returned. |
1020 | 3.78M | pub fn literal(&self) -> Option<Vec<u8>> { |
1021 | 3.78M | match *self { |
1022 | 3.40M | Class::Unicode(ref x) => x.literal(), |
1023 | 379k | Class::Bytes(ref x) => x.literal(), |
1024 | | } |
1025 | 3.78M | } |
1026 | | } |
1027 | | |
1028 | | impl core::fmt::Debug for Class { |
1029 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1030 | | use crate::debug::Byte; |
1031 | | |
1032 | 0 | let mut fmter = f.debug_set(); |
1033 | 0 | match *self { |
1034 | 0 | Class::Unicode(ref cls) => { |
1035 | 0 | for r in cls.ranges().iter() { |
1036 | 0 | fmter.entry(&(r.start..=r.end)); |
1037 | 0 | } |
1038 | | } |
1039 | 0 | Class::Bytes(ref cls) => { |
1040 | 0 | for r in cls.ranges().iter() { |
1041 | 0 | fmter.entry(&(Byte(r.start)..=Byte(r.end))); |
1042 | 0 | } |
1043 | | } |
1044 | | } |
1045 | 0 | fmter.finish() |
1046 | 0 | } |
1047 | | } |
1048 | | |
1049 | | /// A set of characters represented by Unicode scalar values. |
1050 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1051 | | pub struct ClassUnicode { |
1052 | | set: IntervalSet<ClassUnicodeRange>, |
1053 | | } |
1054 | | |
1055 | | impl ClassUnicode { |
1056 | | /// Create a new class from a sequence of ranges. |
1057 | | /// |
1058 | | /// The given ranges do not need to be in any specific order, and ranges |
1059 | | /// may overlap. Ranges will automatically be sorted into a canonical |
1060 | | /// non-overlapping order. |
1061 | 3.16M | pub fn new<I>(ranges: I) -> ClassUnicode |
1062 | 3.16M | where |
1063 | 3.16M | I: IntoIterator<Item = ClassUnicodeRange>, |
1064 | 3.16M | { |
1065 | 3.16M | ClassUnicode { set: IntervalSet::new(ranges) } |
1066 | 3.16M | } <regex_syntax::hir::ClassUnicode>::new::<[regex_syntax::hir::ClassUnicodeRange; 1]> Line | Count | Source | 1061 | 1.20M | pub fn new<I>(ranges: I) -> ClassUnicode | 1062 | 1.20M | where | 1063 | 1.20M | I: IntoIterator<Item = ClassUnicodeRange>, | 1064 | 1.20M | { | 1065 | 1.20M | ClassUnicode { set: IntervalSet::new(ranges) } | 1066 | 1.20M | } |
Unexecuted instantiation: <regex_syntax::hir::ClassUnicode>::new::<[regex_syntax::hir::ClassUnicodeRange; 2]> <regex_syntax::hir::ClassUnicode>::new::<[regex_syntax::hir::ClassUnicodeRange; 3]> Line | Count | Source | 1061 | 319 | pub fn new<I>(ranges: I) -> ClassUnicode | 1062 | 319 | where | 1063 | 319 | I: IntoIterator<Item = ClassUnicodeRange>, | 1064 | 319 | { | 1065 | 319 | ClassUnicode { set: IntervalSet::new(ranges) } | 1066 | 319 | } |
<regex_syntax::hir::ClassUnicode>::new::<alloc::vec::Vec<regex_syntax::hir::ClassUnicodeRange>> Line | Count | Source | 1061 | 1.95M | pub fn new<I>(ranges: I) -> ClassUnicode | 1062 | 1.95M | where | 1063 | 1.95M | I: IntoIterator<Item = ClassUnicodeRange>, | 1064 | 1.95M | { | 1065 | 1.95M | ClassUnicode { set: IntervalSet::new(ranges) } | 1066 | 1.95M | } |
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<(u8, u8)>>, regex_syntax::hir::translate::ascii_class_as_chars::{closure#0}>, <regex_syntax::hir::translate::TranslatorI>::hir_ascii_unicode_class::{closure#0}>> Line | Count | Source | 1061 | 615 | pub fn new<I>(ranges: I) -> ClassUnicode | 1062 | 615 | where | 1063 | 615 | I: IntoIterator<Item = ClassUnicodeRange>, | 1064 | 615 | { | 1065 | 615 | ClassUnicode { set: IntervalSet::new(ranges) } | 1066 | 615 | } |
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::ClassBytesRange>, <regex_syntax::hir::ClassBytes>::to_unicode_class::{closure#0}>> Line | Count | Source | 1061 | 2.11k | pub fn new<I>(ranges: I) -> ClassUnicode | 1062 | 2.11k | where | 1063 | 2.11k | I: IntoIterator<Item = ClassUnicodeRange>, | 1064 | 2.11k | { | 1065 | 2.11k | ClassUnicode { set: IntervalSet::new(ranges) } | 1066 | 2.11k | } |
<regex_syntax::hir::ClassUnicode>::new::<core::iter::adapters::map::Map<alloc::vec::into_iter::IntoIter<char>, <regex_syntax::hir::Hir>::alternation::{closure#0}>> Line | Count | Source | 1061 | 3.65k | pub fn new<I>(ranges: I) -> ClassUnicode | 1062 | 3.65k | where | 1063 | 3.65k | I: IntoIterator<Item = ClassUnicodeRange>, | 1064 | 3.65k | { | 1065 | 3.65k | ClassUnicode { set: IntervalSet::new(ranges) } | 1066 | 3.65k | } |
|
1067 | | |
1068 | | /// Create a new class with no ranges. |
1069 | | /// |
1070 | | /// An empty class matches nothing. That is, it is equivalent to |
1071 | | /// [`Hir::fail`]. |
1072 | 330k | pub fn empty() -> ClassUnicode { |
1073 | 330k | ClassUnicode::new(vec![]) |
1074 | 330k | } |
1075 | | |
1076 | | /// Add a new range to this set. |
1077 | 1.60M | pub fn push(&mut self, range: ClassUnicodeRange) { |
1078 | 1.60M | self.set.push(range); |
1079 | 1.60M | } |
1080 | | |
1081 | | /// Return an iterator over all ranges in this class. |
1082 | | /// |
1083 | | /// The iterator yields ranges in ascending order. |
1084 | 8.92M | pub fn iter(&self) -> ClassUnicodeIter<'_> { |
1085 | 8.92M | ClassUnicodeIter(self.set.iter()) |
1086 | 8.92M | } |
1087 | | |
1088 | | /// Return the underlying ranges as a slice. |
1089 | 16.5M | pub fn ranges(&self) -> &[ClassUnicodeRange] { |
1090 | 16.5M | self.set.intervals() |
1091 | 16.5M | } |
1092 | | |
1093 | | /// Expand this character class such that it contains all case folded |
1094 | | /// characters, according to Unicode's "simple" mapping. For example, if |
1095 | | /// this class consists of the range `a-z`, then applying case folding will |
1096 | | /// result in the class containing both the ranges `a-z` and `A-Z`. |
1097 | | /// |
1098 | | /// # Panics |
1099 | | /// |
1100 | | /// This routine panics when the case mapping data necessary for this |
1101 | | /// routine to complete is unavailable. This occurs when the `unicode-case` |
1102 | | /// feature is not enabled. |
1103 | | /// |
1104 | | /// Callers should prefer using `try_case_fold_simple` instead, which will |
1105 | | /// return an error instead of panicking. |
1106 | 0 | pub fn case_fold_simple(&mut self) { |
1107 | 0 | self.set |
1108 | 0 | .case_fold_simple() |
1109 | 0 | .expect("unicode-case feature must be enabled"); |
1110 | 0 | } |
1111 | | |
1112 | | /// Expand this character class such that it contains all case folded |
1113 | | /// characters, according to Unicode's "simple" mapping. For example, if |
1114 | | /// this class consists of the range `a-z`, then applying case folding will |
1115 | | /// result in the class containing both the ranges `a-z` and `A-Z`. |
1116 | | /// |
1117 | | /// # Error |
1118 | | /// |
1119 | | /// This routine returns an error when the case mapping data necessary |
1120 | | /// for this routine to complete is unavailable. This occurs when the |
1121 | | /// `unicode-case` feature is not enabled. |
1122 | 1.63M | pub fn try_case_fold_simple( |
1123 | 1.63M | &mut self, |
1124 | 1.63M | ) -> core::result::Result<(), CaseFoldError> { |
1125 | 1.63M | self.set.case_fold_simple() |
1126 | 1.63M | } |
1127 | | |
1128 | | /// Negate this character class. |
1129 | | /// |
1130 | | /// For all `c` where `c` is a Unicode scalar value, if `c` was in this |
1131 | | /// set, then it will not be in this set after negation. |
1132 | 1.21M | pub fn negate(&mut self) { |
1133 | 1.21M | self.set.negate(); |
1134 | 1.21M | } |
1135 | | |
1136 | | /// Union this character class with the given character class, in place. |
1137 | 194k | pub fn union(&mut self, other: &ClassUnicode) { |
1138 | 194k | self.set.union(&other.set); |
1139 | 194k | } |
1140 | | |
1141 | | /// Intersect this character class with the given character class, in |
1142 | | /// place. |
1143 | 2.84k | pub fn intersect(&mut self, other: &ClassUnicode) { |
1144 | 2.84k | self.set.intersect(&other.set); |
1145 | 2.84k | } |
1146 | | |
1147 | | /// Subtract the given character class from this character class, in place. |
1148 | 27.9k | pub fn difference(&mut self, other: &ClassUnicode) { |
1149 | 27.9k | self.set.difference(&other.set); |
1150 | 27.9k | } |
1151 | | |
1152 | | /// Compute the symmetric difference of the given character classes, in |
1153 | | /// place. |
1154 | | /// |
1155 | | /// This computes the symmetric difference of two character classes. This |
1156 | | /// removes all elements in this class that are also in the given class, |
1157 | | /// but all adds all elements from the given class that aren't in this |
1158 | | /// class. That is, the class will contain all elements in either class, |
1159 | | /// but will not contain any elements that are in both classes. |
1160 | 42.1k | pub fn symmetric_difference(&mut self, other: &ClassUnicode) { |
1161 | 42.1k | self.set.symmetric_difference(&other.set); |
1162 | 42.1k | } |
1163 | | |
1164 | | /// Returns true if and only if this character class will either match |
1165 | | /// nothing or only ASCII bytes. Stated differently, this returns false |
1166 | | /// if and only if this class contains a non-ASCII codepoint. |
1167 | 8.47M | pub fn is_ascii(&self) -> bool { |
1168 | 8.47M | self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') |
1169 | 8.47M | } |
1170 | | |
1171 | | /// Returns the length, in bytes, of the smallest string matched by this |
1172 | | /// character class. |
1173 | | /// |
1174 | | /// Returns `None` when the class is empty. |
1175 | 3.39M | pub fn minimum_len(&self) -> Option<usize> { |
1176 | 3.39M | let first = self.ranges().get(0)?; |
1177 | | // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). |
1178 | 3.39M | Some(first.start.len_utf8()) |
1179 | 3.39M | } |
1180 | | |
1181 | | /// Returns the length, in bytes, of the longest string matched by this |
1182 | | /// character class. |
1183 | | /// |
1184 | | /// Returns `None` when the class is empty. |
1185 | 3.39M | pub fn maximum_len(&self) -> Option<usize> { |
1186 | 3.39M | let last = self.ranges().last()?; |
1187 | | // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). |
1188 | 3.39M | Some(last.end.len_utf8()) |
1189 | 3.39M | } |
1190 | | |
1191 | | /// If this class consists of exactly one codepoint, then return it as |
1192 | | /// a literal byte string. |
1193 | | /// |
1194 | | /// If this class is empty or contains more than one codepoint, then `None` |
1195 | | /// is returned. |
1196 | 3.40M | pub fn literal(&self) -> Option<Vec<u8>> { |
1197 | 3.40M | let rs = self.ranges(); |
1198 | 3.40M | if rs.len() == 1 && rs[0].start == rs[0].end { |
1199 | 12.3k | Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) |
1200 | | } else { |
1201 | 3.39M | None |
1202 | | } |
1203 | 3.40M | } |
1204 | | |
1205 | | /// If this class consists of only ASCII ranges, then return its |
1206 | | /// corresponding and equivalent byte class. |
1207 | 5.86k | pub fn to_byte_class(&self) -> Option<ClassBytes> { |
1208 | 5.86k | if !self.is_ascii() { |
1209 | 598 | return None; |
1210 | 5.26k | } |
1211 | 38.9k | Some(ClassBytes::new(self.ranges().iter().map(|r| { |
1212 | 38.9k | // Since we are guaranteed that our codepoint range is ASCII, the |
1213 | 38.9k | // 'u8::try_from' calls below are guaranteed to be correct. |
1214 | 38.9k | ClassBytesRange { |
1215 | 38.9k | start: u8::try_from(r.start).unwrap(), |
1216 | 38.9k | end: u8::try_from(r.end).unwrap(), |
1217 | 38.9k | } |
1218 | 38.9k | }))) |
1219 | 5.86k | } |
1220 | | } |
1221 | | |
1222 | | /// An iterator over all ranges in a Unicode character class. |
1223 | | /// |
1224 | | /// The lifetime `'a` refers to the lifetime of the underlying class. |
1225 | | #[derive(Debug)] |
1226 | | pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); |
1227 | | |
1228 | | impl<'a> Iterator for ClassUnicodeIter<'a> { |
1229 | | type Item = &'a ClassUnicodeRange; |
1230 | | |
1231 | 55.0M | fn next(&mut self) -> Option<&'a ClassUnicodeRange> { |
1232 | 55.0M | self.0.next() |
1233 | 55.0M | } |
1234 | | } |
1235 | | |
1236 | | /// A single range of characters represented by Unicode scalar values. |
1237 | | /// |
1238 | | /// The range is closed. That is, the start and end of the range are included |
1239 | | /// in the range. |
1240 | | #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] |
1241 | | pub struct ClassUnicodeRange { |
1242 | | start: char, |
1243 | | end: char, |
1244 | | } |
1245 | | |
1246 | | impl core::fmt::Debug for ClassUnicodeRange { |
1247 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1248 | 0 | let start = if !self.start.is_whitespace() && !self.start.is_control() |
1249 | | { |
1250 | 0 | self.start.to_string() |
1251 | | } else { |
1252 | 0 | format!("0x{:X}", u32::from(self.start)) |
1253 | | }; |
1254 | 0 | let end = if !self.end.is_whitespace() && !self.end.is_control() { |
1255 | 0 | self.end.to_string() |
1256 | | } else { |
1257 | 0 | format!("0x{:X}", u32::from(self.end)) |
1258 | | }; |
1259 | 0 | f.debug_struct("ClassUnicodeRange") |
1260 | 0 | .field("start", &start) |
1261 | 0 | .field("end", &end) |
1262 | 0 | .finish() |
1263 | 0 | } |
1264 | | } |
1265 | | |
1266 | | impl Interval for ClassUnicodeRange { |
1267 | | type Bound = char; |
1268 | | |
1269 | | #[inline] |
1270 | 253M | fn lower(&self) -> char { |
1271 | 253M | self.start |
1272 | 253M | } |
1273 | | #[inline] |
1274 | 257M | fn upper(&self) -> char { |
1275 | 257M | self.end |
1276 | 257M | } |
1277 | | #[inline] |
1278 | 45.4M | fn set_lower(&mut self, bound: char) { |
1279 | 45.4M | self.start = bound; |
1280 | 45.4M | } |
1281 | | #[inline] |
1282 | 45.4M | fn set_upper(&mut self, bound: char) { |
1283 | 45.4M | self.end = bound; |
1284 | 45.4M | } |
1285 | | |
1286 | | /// Apply simple case folding to this Unicode scalar value range. |
1287 | | /// |
1288 | | /// Additional ranges are appended to the given vector. Canonical ordering |
1289 | | /// is *not* maintained in the given vector. |
1290 | 3.72M | fn case_fold_simple( |
1291 | 3.72M | &self, |
1292 | 3.72M | ranges: &mut Vec<ClassUnicodeRange>, |
1293 | 3.72M | ) -> Result<(), unicode::CaseFoldError> { |
1294 | 3.72M | let mut folder = unicode::SimpleCaseFolder::new()?; |
1295 | 3.72M | if !folder.overlaps(self.start, self.end) { |
1296 | 1.25M | return Ok(()); |
1297 | 2.47M | } |
1298 | 2.47M | let (start, end) = (u32::from(self.start), u32::from(self.end)); |
1299 | 1.86G | for cp in (start..=end).filter_map(char::from_u32) { |
1300 | 1.86G | for &cp_folded in folder.mapping(cp) { |
1301 | 11.4M | ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); |
1302 | 11.4M | } |
1303 | | } |
1304 | 2.47M | Ok(()) |
1305 | 3.72M | } |
1306 | | } |
1307 | | |
1308 | | impl ClassUnicodeRange { |
1309 | | /// Create a new Unicode scalar value range for a character class. |
1310 | | /// |
1311 | | /// The returned range is always in a canonical form. That is, the range |
1312 | | /// returned always satisfies the invariant that `start <= end`. |
1313 | 27.8M | pub fn new(start: char, end: char) -> ClassUnicodeRange { |
1314 | 27.8M | ClassUnicodeRange::create(start, end) |
1315 | 27.8M | } |
1316 | | |
1317 | | /// Return the start of this range. |
1318 | | /// |
1319 | | /// The start of a range is always less than or equal to the end of the |
1320 | | /// range. |
1321 | 45.4M | pub fn start(&self) -> char { |
1322 | 45.4M | self.start |
1323 | 45.4M | } |
1324 | | |
1325 | | /// Return the end of this range. |
1326 | | /// |
1327 | | /// The end of a range is always greater than or equal to the start of the |
1328 | | /// range. |
1329 | 45.4M | pub fn end(&self) -> char { |
1330 | 45.4M | self.end |
1331 | 45.4M | } |
1332 | | |
1333 | | /// Returns the number of codepoints in this range. |
1334 | 682k | pub fn len(&self) -> usize { |
1335 | 682k | let diff = 1 + u32::from(self.end) - u32::from(self.start); |
1336 | 682k | // This is likely to panic in 16-bit targets since a usize can only fit |
1337 | 682k | // 2^16. It's not clear what to do here, other than to return an error |
1338 | 682k | // when building a Unicode class that contains a range whose length |
1339 | 682k | // overflows usize. (Which, to be honest, is probably quite common on |
1340 | 682k | // 16-bit targets. For example, this would imply that '.' and '\p{any}' |
1341 | 682k | // would be impossible to build.) |
1342 | 682k | usize::try_from(diff).expect("char class len fits in usize") |
1343 | 682k | } |
1344 | | } |
1345 | | |
1346 | | /// A set of characters represented by arbitrary bytes. |
1347 | | /// |
1348 | | /// Each byte corresponds to one character. |
1349 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1350 | | pub struct ClassBytes { |
1351 | | set: IntervalSet<ClassBytesRange>, |
1352 | | } |
1353 | | |
1354 | | impl ClassBytes { |
1355 | | /// Create a new class from a sequence of ranges. |
1356 | | /// |
1357 | | /// The given ranges do not need to be in any specific order, and ranges |
1358 | | /// may overlap. Ranges will automatically be sorted into a canonical |
1359 | | /// non-overlapping order. |
1360 | 1.23M | pub fn new<I>(ranges: I) -> ClassBytes |
1361 | 1.23M | where |
1362 | 1.23M | I: IntoIterator<Item = ClassBytesRange>, |
1363 | 1.23M | { |
1364 | 1.23M | ClassBytes { set: IntervalSet::new(ranges) } |
1365 | 1.23M | } <regex_syntax::hir::ClassBytes>::new::<[regex_syntax::hir::ClassBytesRange; 1]> Line | Count | Source | 1360 | 221k | pub fn new<I>(ranges: I) -> ClassBytes | 1361 | 221k | where | 1362 | 221k | I: IntoIterator<Item = ClassBytesRange>, | 1363 | 221k | { | 1364 | 221k | ClassBytes { set: IntervalSet::new(ranges) } | 1365 | 221k | } |
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<[regex_syntax::hir::ClassBytesRange; 2]> Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<[regex_syntax::hir::ClassBytesRange; 3]> <regex_syntax::hir::ClassBytes>::new::<alloc::vec::Vec<regex_syntax::hir::ClassBytesRange>> Line | Count | Source | 1360 | 1.00M | pub fn new<I>(ranges: I) -> ClassBytes | 1361 | 1.00M | where | 1362 | 1.00M | I: IntoIterator<Item = ClassBytesRange>, | 1363 | 1.00M | { | 1364 | 1.00M | ClassBytes { set: IntervalSet::new(ranges) } | 1365 | 1.00M | } |
<regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<core::iter::adapters::copied::Copied<core::slice::iter::Iter<(u8, u8)>>, <regex_syntax::hir::translate::TranslatorI>::hir_ascii_byte_class::{closure#0}>> Line | Count | Source | 1360 | 238 | pub fn new<I>(ranges: I) -> ClassBytes | 1361 | 238 | where | 1362 | 238 | I: IntoIterator<Item = ClassBytesRange>, | 1363 | 238 | { | 1364 | 238 | ClassBytes { set: IntervalSet::new(ranges) } | 1365 | 238 | } |
<regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::ClassUnicodeRange>, <regex_syntax::hir::ClassUnicode>::to_byte_class::{closure#0}>> Line | Count | Source | 1360 | 5.26k | pub fn new<I>(ranges: I) -> ClassBytes | 1361 | 5.26k | where | 1362 | 5.26k | I: IntoIterator<Item = ClassBytesRange>, | 1363 | 5.26k | { | 1364 | 5.26k | ClassBytes { set: IntervalSet::new(ranges) } | 1365 | 5.26k | } |
Unexecuted instantiation: <regex_syntax::hir::ClassBytes>::new::<core::iter::adapters::map::Map<alloc::vec::into_iter::IntoIter<u8>, <regex_syntax::hir::Hir>::alternation::{closure#1}>> |
1366 | | |
1367 | | /// Create a new class with no ranges. |
1368 | | /// |
1369 | | /// An empty class matches nothing. That is, it is equivalent to |
1370 | | /// [`Hir::fail`]. |
1371 | 818k | pub fn empty() -> ClassBytes { |
1372 | 818k | ClassBytes::new(vec![]) |
1373 | 818k | } |
1374 | | |
1375 | | /// Add a new range to this set. |
1376 | 6.07M | pub fn push(&mut self, range: ClassBytesRange) { |
1377 | 6.07M | self.set.push(range); |
1378 | 6.07M | } |
1379 | | |
1380 | | /// Return an iterator over all ranges in this class. |
1381 | | /// |
1382 | | /// The iterator yields ranges in ascending order. |
1383 | 2.64M | pub fn iter(&self) -> ClassBytesIter<'_> { |
1384 | 2.64M | ClassBytesIter(self.set.iter()) |
1385 | 2.64M | } |
1386 | | |
1387 | | /// Return the underlying ranges as a slice. |
1388 | 4.14M | pub fn ranges(&self) -> &[ClassBytesRange] { |
1389 | 4.14M | self.set.intervals() |
1390 | 4.14M | } |
1391 | | |
1392 | | /// Expand this character class such that it contains all case folded |
1393 | | /// characters. For example, if this class consists of the range `a-z`, |
1394 | | /// then applying case folding will result in the class containing both the |
1395 | | /// ranges `a-z` and `A-Z`. |
1396 | | /// |
1397 | | /// Note that this only applies ASCII case folding, which is limited to the |
1398 | | /// characters `a-z` and `A-Z`. |
1399 | 898k | pub fn case_fold_simple(&mut self) { |
1400 | 898k | self.set.case_fold_simple().expect("ASCII case folding never fails"); |
1401 | 898k | } |
1402 | | |
1403 | | /// Negate this byte class. |
1404 | | /// |
1405 | | /// For all `b` where `b` is a any byte, if `b` was in this set, then it |
1406 | | /// will not be in this set after negation. |
1407 | 274 | pub fn negate(&mut self) { |
1408 | 274 | self.set.negate(); |
1409 | 274 | } |
1410 | | |
1411 | | /// Union this byte class with the given byte class, in place. |
1412 | 743k | pub fn union(&mut self, other: &ClassBytes) { |
1413 | 743k | self.set.union(&other.set); |
1414 | 743k | } |
1415 | | |
1416 | | /// Intersect this byte class with the given byte class, in place. |
1417 | 1.53k | pub fn intersect(&mut self, other: &ClassBytes) { |
1418 | 1.53k | self.set.intersect(&other.set); |
1419 | 1.53k | } |
1420 | | |
1421 | | /// Subtract the given byte class from this byte class, in place. |
1422 | 20.2k | pub fn difference(&mut self, other: &ClassBytes) { |
1423 | 20.2k | self.set.difference(&other.set); |
1424 | 20.2k | } |
1425 | | |
1426 | | /// Compute the symmetric difference of the given byte classes, in place. |
1427 | | /// |
1428 | | /// This computes the symmetric difference of two byte classes. This |
1429 | | /// removes all elements in this class that are also in the given class, |
1430 | | /// but all adds all elements from the given class that aren't in this |
1431 | | /// class. That is, the class will contain all elements in either class, |
1432 | | /// but will not contain any elements that are in both classes. |
1433 | 32.7k | pub fn symmetric_difference(&mut self, other: &ClassBytes) { |
1434 | 32.7k | self.set.symmetric_difference(&other.set); |
1435 | 32.7k | } |
1436 | | |
1437 | | /// Returns true if and only if this character class will either match |
1438 | | /// nothing or only ASCII bytes. Stated differently, this returns false |
1439 | | /// if and only if this class contains a non-ASCII byte. |
1440 | 1.08M | pub fn is_ascii(&self) -> bool { |
1441 | 1.08M | self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) |
1442 | 1.08M | } |
1443 | | |
1444 | | /// Returns the length, in bytes, of the smallest string matched by this |
1445 | | /// character class. |
1446 | | /// |
1447 | | /// Returns `None` when the class is empty. |
1448 | 389k | pub fn minimum_len(&self) -> Option<usize> { |
1449 | 389k | if self.ranges().is_empty() { |
1450 | 10.9k | None |
1451 | | } else { |
1452 | 378k | Some(1) |
1453 | | } |
1454 | 389k | } |
1455 | | |
1456 | | /// Returns the length, in bytes, of the longest string matched by this |
1457 | | /// character class. |
1458 | | /// |
1459 | | /// Returns `None` when the class is empty. |
1460 | 389k | pub fn maximum_len(&self) -> Option<usize> { |
1461 | 389k | if self.ranges().is_empty() { |
1462 | 10.9k | None |
1463 | | } else { |
1464 | 378k | Some(1) |
1465 | | } |
1466 | 389k | } |
1467 | | |
1468 | | /// If this class consists of exactly one byte, then return it as |
1469 | | /// a literal byte string. |
1470 | | /// |
1471 | | /// If this class is empty or contains more than one byte, then `None` |
1472 | | /// is returned. |
1473 | 379k | pub fn literal(&self) -> Option<Vec<u8>> { |
1474 | 379k | let rs = self.ranges(); |
1475 | 379k | if rs.len() == 1 && rs[0].start == rs[0].end { |
1476 | 1.11k | Some(vec![rs[0].start]) |
1477 | | } else { |
1478 | 378k | None |
1479 | | } |
1480 | 379k | } |
1481 | | |
1482 | | /// If this class consists of only ASCII ranges, then return its |
1483 | | /// corresponding and equivalent Unicode class. |
1484 | 2.11k | pub fn to_unicode_class(&self) -> Option<ClassUnicode> { |
1485 | 2.11k | if !self.is_ascii() { |
1486 | 0 | return None; |
1487 | 2.11k | } |
1488 | 21.6k | Some(ClassUnicode::new(self.ranges().iter().map(|r| { |
1489 | 21.6k | // Since we are guaranteed that our byte range is ASCII, the |
1490 | 21.6k | // 'char::from' calls below are correct and will not erroneously |
1491 | 21.6k | // convert a raw byte value into its corresponding codepoint. |
1492 | 21.6k | ClassUnicodeRange { |
1493 | 21.6k | start: char::from(r.start), |
1494 | 21.6k | end: char::from(r.end), |
1495 | 21.6k | } |
1496 | 21.6k | }))) |
1497 | 2.11k | } |
1498 | | } |
1499 | | |
1500 | | /// An iterator over all ranges in a byte character class. |
1501 | | /// |
1502 | | /// The lifetime `'a` refers to the lifetime of the underlying class. |
1503 | | #[derive(Debug)] |
1504 | | pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); |
1505 | | |
1506 | | impl<'a> Iterator for ClassBytesIter<'a> { |
1507 | | type Item = &'a ClassBytesRange; |
1508 | | |
1509 | 14.6M | fn next(&mut self) -> Option<&'a ClassBytesRange> { |
1510 | 14.6M | self.0.next() |
1511 | 14.6M | } |
1512 | | } |
1513 | | |
1514 | | /// A single range of characters represented by arbitrary bytes. |
1515 | | /// |
1516 | | /// The range is closed. That is, the start and end of the range are included |
1517 | | /// in the range. |
1518 | | #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] |
1519 | | pub struct ClassBytesRange { |
1520 | | start: u8, |
1521 | | end: u8, |
1522 | | } |
1523 | | |
1524 | | impl Interval for ClassBytesRange { |
1525 | | type Bound = u8; |
1526 | | |
1527 | | #[inline] |
1528 | 572M | fn lower(&self) -> u8 { |
1529 | 572M | self.start |
1530 | 572M | } |
1531 | | #[inline] |
1532 | 573M | fn upper(&self) -> u8 { |
1533 | 573M | self.end |
1534 | 573M | } |
1535 | | #[inline] |
1536 | 66.6M | fn set_lower(&mut self, bound: u8) { |
1537 | 66.6M | self.start = bound; |
1538 | 66.6M | } |
1539 | | #[inline] |
1540 | 66.6M | fn set_upper(&mut self, bound: u8) { |
1541 | 66.6M | self.end = bound; |
1542 | 66.6M | } |
1543 | | |
1544 | | /// Apply simple case folding to this byte range. Only ASCII case mappings |
1545 | | /// (for a-z) are applied. |
1546 | | /// |
1547 | | /// Additional ranges are appended to the given vector. Canonical ordering |
1548 | | /// is *not* maintained in the given vector. |
1549 | 17.0M | fn case_fold_simple( |
1550 | 17.0M | &self, |
1551 | 17.0M | ranges: &mut Vec<ClassBytesRange>, |
1552 | 17.0M | ) -> Result<(), unicode::CaseFoldError> { |
1553 | 17.0M | if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { |
1554 | 4.92M | let lower = cmp::max(self.start, b'a'); |
1555 | 4.92M | let upper = cmp::min(self.end, b'z'); |
1556 | 4.92M | ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); |
1557 | 12.1M | } |
1558 | 17.0M | if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { |
1559 | 5.12M | let lower = cmp::max(self.start, b'A'); |
1560 | 5.12M | let upper = cmp::min(self.end, b'Z'); |
1561 | 5.12M | ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); |
1562 | 11.9M | } |
1563 | 17.0M | Ok(()) |
1564 | 17.0M | } |
1565 | | } |
1566 | | |
1567 | | impl ClassBytesRange { |
1568 | | /// Create a new byte range for a character class. |
1569 | | /// |
1570 | | /// The returned range is always in a canonical form. That is, the range |
1571 | | /// returned always satisfies the invariant that `start <= end`. |
1572 | 50.6M | pub fn new(start: u8, end: u8) -> ClassBytesRange { |
1573 | 50.6M | ClassBytesRange::create(start, end) |
1574 | 50.6M | } |
1575 | | |
1576 | | /// Return the start of this range. |
1577 | | /// |
1578 | | /// The start of a range is always less than or equal to the end of the |
1579 | | /// range. |
1580 | 11.9M | pub fn start(&self) -> u8 { |
1581 | 11.9M | self.start |
1582 | 11.9M | } |
1583 | | |
1584 | | /// Return the end of this range. |
1585 | | /// |
1586 | | /// The end of a range is always greater than or equal to the start of the |
1587 | | /// range. |
1588 | 11.9M | pub fn end(&self) -> u8 { |
1589 | 11.9M | self.end |
1590 | 11.9M | } |
1591 | | |
1592 | | /// Returns the number of bytes in this range. |
1593 | 46.0k | pub fn len(&self) -> usize { |
1594 | 46.0k | usize::from(self.end.checked_sub(self.start).unwrap()) |
1595 | 46.0k | .checked_add(1) |
1596 | 46.0k | .unwrap() |
1597 | 46.0k | } |
1598 | | } |
1599 | | |
1600 | | impl core::fmt::Debug for ClassBytesRange { |
1601 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1602 | 0 | f.debug_struct("ClassBytesRange") |
1603 | 0 | .field("start", &crate::debug::Byte(self.start)) |
1604 | 0 | .field("end", &crate::debug::Byte(self.end)) |
1605 | 0 | .finish() |
1606 | 0 | } |
1607 | | } |
1608 | | |
1609 | | /// The high-level intermediate representation for a look-around assertion. |
1610 | | /// |
1611 | | /// An assertion match is always zero-length. Also called an "empty match." |
1612 | | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1613 | | pub enum Look { |
1614 | | /// Match the beginning of text. Specifically, this matches at the starting |
1615 | | /// position of the input. |
1616 | | Start = 1 << 0, |
1617 | | /// Match the end of text. Specifically, this matches at the ending |
1618 | | /// position of the input. |
1619 | | End = 1 << 1, |
1620 | | /// Match the beginning of a line or the beginning of text. Specifically, |
1621 | | /// this matches at the starting position of the input, or at the position |
1622 | | /// immediately following a `\n` character. |
1623 | | StartLF = 1 << 2, |
1624 | | /// Match the end of a line or the end of text. Specifically, this matches |
1625 | | /// at the end position of the input, or at the position immediately |
1626 | | /// preceding a `\n` character. |
1627 | | EndLF = 1 << 3, |
1628 | | /// Match the beginning of a line or the beginning of text. Specifically, |
1629 | | /// this matches at the starting position of the input, or at the position |
1630 | | /// immediately following either a `\r` or `\n` character, but never after |
1631 | | /// a `\r` when a `\n` follows. |
1632 | | StartCRLF = 1 << 4, |
1633 | | /// Match the end of a line or the end of text. Specifically, this matches |
1634 | | /// at the end position of the input, or at the position immediately |
1635 | | /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` |
1636 | | /// precedes it. |
1637 | | EndCRLF = 1 << 5, |
1638 | | /// Match an ASCII-only word boundary. That is, this matches a position |
1639 | | /// where the left adjacent character and right adjacent character |
1640 | | /// correspond to a word and non-word or a non-word and word character. |
1641 | | WordAscii = 1 << 6, |
1642 | | /// Match an ASCII-only negation of a word boundary. |
1643 | | WordAsciiNegate = 1 << 7, |
1644 | | /// Match a Unicode-aware word boundary. That is, this matches a position |
1645 | | /// where the left adjacent character and right adjacent character |
1646 | | /// correspond to a word and non-word or a non-word and word character. |
1647 | | WordUnicode = 1 << 8, |
1648 | | /// Match a Unicode-aware negation of a word boundary. |
1649 | | WordUnicodeNegate = 1 << 9, |
1650 | | /// Match the start of an ASCII-only word boundary. That is, this matches a |
1651 | | /// position at either the beginning of the haystack or where the previous |
1652 | | /// character is not a word character and the following character is a word |
1653 | | /// character. |
1654 | | WordStartAscii = 1 << 10, |
1655 | | /// Match the end of an ASCII-only word boundary. That is, this matches |
1656 | | /// a position at either the end of the haystack or where the previous |
1657 | | /// character is a word character and the following character is not a word |
1658 | | /// character. |
1659 | | WordEndAscii = 1 << 11, |
1660 | | /// Match the start of a Unicode word boundary. That is, this matches a |
1661 | | /// position at either the beginning of the haystack or where the previous |
1662 | | /// character is not a word character and the following character is a word |
1663 | | /// character. |
1664 | | WordStartUnicode = 1 << 12, |
1665 | | /// Match the end of a Unicode word boundary. That is, this matches a |
1666 | | /// position at either the end of the haystack or where the previous |
1667 | | /// character is a word character and the following character is not a word |
1668 | | /// character. |
1669 | | WordEndUnicode = 1 << 13, |
1670 | | /// Match the start half of an ASCII-only word boundary. That is, this |
1671 | | /// matches a position at either the beginning of the haystack or where the |
1672 | | /// previous character is not a word character. |
1673 | | WordStartHalfAscii = 1 << 14, |
1674 | | /// Match the end half of an ASCII-only word boundary. That is, this |
1675 | | /// matches a position at either the end of the haystack or where the |
1676 | | /// following character is not a word character. |
1677 | | WordEndHalfAscii = 1 << 15, |
1678 | | /// Match the start half of a Unicode word boundary. That is, this matches |
1679 | | /// a position at either the beginning of the haystack or where the |
1680 | | /// previous character is not a word character. |
1681 | | WordStartHalfUnicode = 1 << 16, |
1682 | | /// Match the end half of a Unicode word boundary. That is, this matches |
1683 | | /// a position at either the end of the haystack or where the following |
1684 | | /// character is not a word character. |
1685 | | WordEndHalfUnicode = 1 << 17, |
1686 | | } |
1687 | | |
1688 | | impl Look { |
1689 | | /// Flip the look-around assertion to its equivalent for reverse searches. |
1690 | | /// For example, `StartLF` gets translated to `EndLF`. |
1691 | | /// |
1692 | | /// Some assertions, such as `WordUnicode`, remain the same since they |
1693 | | /// match the same positions regardless of the direction of the search. |
1694 | | #[inline] |
1695 | 0 | pub const fn reversed(self) -> Look { |
1696 | 0 | match self { |
1697 | 0 | Look::Start => Look::End, |
1698 | 0 | Look::End => Look::Start, |
1699 | 0 | Look::StartLF => Look::EndLF, |
1700 | 0 | Look::EndLF => Look::StartLF, |
1701 | 0 | Look::StartCRLF => Look::EndCRLF, |
1702 | 0 | Look::EndCRLF => Look::StartCRLF, |
1703 | 0 | Look::WordAscii => Look::WordAscii, |
1704 | 0 | Look::WordAsciiNegate => Look::WordAsciiNegate, |
1705 | 0 | Look::WordUnicode => Look::WordUnicode, |
1706 | 0 | Look::WordUnicodeNegate => Look::WordUnicodeNegate, |
1707 | 0 | Look::WordStartAscii => Look::WordEndAscii, |
1708 | 0 | Look::WordEndAscii => Look::WordStartAscii, |
1709 | 0 | Look::WordStartUnicode => Look::WordEndUnicode, |
1710 | 0 | Look::WordEndUnicode => Look::WordStartUnicode, |
1711 | 0 | Look::WordStartHalfAscii => Look::WordEndHalfAscii, |
1712 | 0 | Look::WordEndHalfAscii => Look::WordStartHalfAscii, |
1713 | 0 | Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, |
1714 | 0 | Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, |
1715 | | } |
1716 | 0 | } |
1717 | | |
1718 | | /// Return the underlying representation of this look-around enumeration |
1719 | | /// as an integer. Giving the return value to the [`Look::from_repr`] |
1720 | | /// constructor is guaranteed to return the same look-around variant that |
1721 | | /// one started with within a semver compatible release of this crate. |
1722 | | #[inline] |
1723 | 2.85M | pub const fn as_repr(self) -> u32 { |
1724 | 2.85M | // AFAIK, 'as' is the only way to zero-cost convert an int enum to an |
1725 | 2.85M | // actual int. |
1726 | 2.85M | self as u32 |
1727 | 2.85M | } <regex_syntax::hir::Look>::as_repr Line | Count | Source | 1723 | 56.5k | pub const fn as_repr(self) -> u32 { | 1724 | 56.5k | // AFAIK, 'as' is the only way to zero-cost convert an int enum to an | 1725 | 56.5k | // actual int. | 1726 | 56.5k | self as u32 | 1727 | 56.5k | } |
Unexecuted instantiation: <regex_syntax::hir::Look>::as_repr <regex_syntax::hir::Look>::as_repr Line | Count | Source | 1723 | 1.12M | pub const fn as_repr(self) -> u32 { | 1724 | 1.12M | // AFAIK, 'as' is the only way to zero-cost convert an int enum to an | 1725 | 1.12M | // actual int. | 1726 | 1.12M | self as u32 | 1727 | 1.12M | } |
<regex_syntax::hir::Look>::as_repr Line | Count | Source | 1723 | 1.67M | pub const fn as_repr(self) -> u32 { | 1724 | 1.67M | // AFAIK, 'as' is the only way to zero-cost convert an int enum to an | 1725 | 1.67M | // actual int. | 1726 | 1.67M | self as u32 | 1727 | 1.67M | } |
|
1728 | | |
1729 | | /// Given the underlying representation of a `Look` value, return the |
1730 | | /// corresponding `Look` value if the representation is valid. Otherwise |
1731 | | /// `None` is returned. |
1732 | | #[inline] |
1733 | 0 | pub const fn from_repr(repr: u32) -> Option<Look> { |
1734 | 0 | match repr { |
1735 | 0 | 0b00_0000_0000_0000_0001 => Some(Look::Start), |
1736 | 0 | 0b00_0000_0000_0000_0010 => Some(Look::End), |
1737 | 0 | 0b00_0000_0000_0000_0100 => Some(Look::StartLF), |
1738 | 0 | 0b00_0000_0000_0000_1000 => Some(Look::EndLF), |
1739 | 0 | 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), |
1740 | 0 | 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), |
1741 | 0 | 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), |
1742 | 0 | 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), |
1743 | 0 | 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), |
1744 | 0 | 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), |
1745 | 0 | 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), |
1746 | 0 | 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), |
1747 | 0 | 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), |
1748 | 0 | 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), |
1749 | 0 | 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), |
1750 | 0 | 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), |
1751 | 0 | 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), |
1752 | 0 | 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), |
1753 | 0 | _ => None, |
1754 | | } |
1755 | 0 | } |
1756 | | |
1757 | | /// Returns a convenient single codepoint representation of this |
1758 | | /// look-around assertion. Each assertion is guaranteed to be represented |
1759 | | /// by a distinct character. |
1760 | | /// |
1761 | | /// This is useful for succinctly representing a look-around assertion in |
1762 | | /// human friendly but succinct output intended for a programmer working on |
1763 | | /// regex internals. |
1764 | | #[inline] |
1765 | 0 | pub const fn as_char(self) -> char { |
1766 | 0 | match self { |
1767 | 0 | Look::Start => 'A', |
1768 | 0 | Look::End => 'z', |
1769 | 0 | Look::StartLF => '^', |
1770 | 0 | Look::EndLF => '$', |
1771 | 0 | Look::StartCRLF => 'r', |
1772 | 0 | Look::EndCRLF => 'R', |
1773 | 0 | Look::WordAscii => 'b', |
1774 | 0 | Look::WordAsciiNegate => 'B', |
1775 | 0 | Look::WordUnicode => '𝛃', |
1776 | 0 | Look::WordUnicodeNegate => '𝚩', |
1777 | 0 | Look::WordStartAscii => '<', |
1778 | 0 | Look::WordEndAscii => '>', |
1779 | 0 | Look::WordStartUnicode => '〈', |
1780 | 0 | Look::WordEndUnicode => '〉', |
1781 | 0 | Look::WordStartHalfAscii => '◁', |
1782 | 0 | Look::WordEndHalfAscii => '▷', |
1783 | 0 | Look::WordStartHalfUnicode => '◀', |
1784 | 0 | Look::WordEndHalfUnicode => '▶', |
1785 | | } |
1786 | 0 | } |
1787 | | } |
1788 | | |
1789 | | /// The high-level intermediate representation for a capturing group. |
1790 | | /// |
1791 | | /// A capturing group always has an index and a child expression. It may |
1792 | | /// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not |
1793 | | /// necessary. |
1794 | | /// |
1795 | | /// Note that there is no explicit representation of a non-capturing group |
1796 | | /// in a `Hir`. Instead, non-capturing grouping is handled automatically by |
1797 | | /// the recursive structure of the `Hir` itself. |
1798 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1799 | | pub struct Capture { |
1800 | | /// The capture index of the capture. |
1801 | | pub index: u32, |
1802 | | /// The name of the capture, if it exists. |
1803 | | pub name: Option<Box<str>>, |
1804 | | /// The expression inside the capturing group, which may be empty. |
1805 | | pub sub: Box<Hir>, |
1806 | | } |
1807 | | |
1808 | | /// The high-level intermediate representation of a repetition operator. |
1809 | | /// |
1810 | | /// A repetition operator permits the repetition of an arbitrary |
1811 | | /// sub-expression. |
1812 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1813 | | pub struct Repetition { |
1814 | | /// The minimum range of the repetition. |
1815 | | /// |
1816 | | /// Note that special cases like `?`, `+` and `*` all get translated into |
1817 | | /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. |
1818 | | /// |
1819 | | /// When `min` is zero, this expression can match the empty string |
1820 | | /// regardless of what its sub-expression is. |
1821 | | pub min: u32, |
1822 | | /// The maximum range of the repetition. |
1823 | | /// |
1824 | | /// Note that when `max` is `None`, `min` acts as a lower bound but where |
1825 | | /// there is no upper bound. For something like `x{5}` where the min and |
1826 | | /// max are equivalent, `min` will be set to `5` and `max` will be set to |
1827 | | /// `Some(5)`. |
1828 | | pub max: Option<u32>, |
1829 | | /// Whether this repetition operator is greedy or not. A greedy operator |
1830 | | /// will match as much as it can. A non-greedy operator will match as |
1831 | | /// little as it can. |
1832 | | /// |
1833 | | /// Typically, operators are greedy by default and are only non-greedy when |
1834 | | /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is |
1835 | | /// not. However, this can be inverted via the `U` "ungreedy" flag. |
1836 | | pub greedy: bool, |
1837 | | /// The expression being repeated. |
1838 | | pub sub: Box<Hir>, |
1839 | | } |
1840 | | |
1841 | | impl Repetition { |
1842 | | /// Returns a new repetition with the same `min`, `max` and `greedy` |
1843 | | /// values, but with its sub-expression replaced with the one given. |
1844 | 249k | pub fn with(&self, sub: Hir) -> Repetition { |
1845 | 249k | Repetition { |
1846 | 249k | min: self.min, |
1847 | 249k | max: self.max, |
1848 | 249k | greedy: self.greedy, |
1849 | 249k | sub: Box::new(sub), |
1850 | 249k | } |
1851 | 249k | } |
1852 | | } |
1853 | | |
1854 | | /// A type describing the different flavors of `.`. |
1855 | | /// |
1856 | | /// This type is meant to be used with [`Hir::dot`], which is a convenience |
1857 | | /// routine for building HIR values derived from the `.` regex. |
1858 | | #[non_exhaustive] |
1859 | | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1860 | | pub enum Dot { |
1861 | | /// Matches the UTF-8 encoding of any Unicode scalar value. |
1862 | | /// |
1863 | | /// This is equivalent to `(?su:.)` and also `\p{any}`. |
1864 | | AnyChar, |
1865 | | /// Matches any byte value. |
1866 | | /// |
1867 | | /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. |
1868 | | AnyByte, |
1869 | | /// Matches the UTF-8 encoding of any Unicode scalar value except for the |
1870 | | /// `char` given. |
1871 | | /// |
1872 | | /// This is equivalent to using `(?u-s:.)` with the line terminator set |
1873 | | /// to a particular ASCII byte. (Because of peculiarities in the regex |
1874 | | /// engines, a line terminator must be a single byte. It follows that when |
1875 | | /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar |
1876 | | /// value. That is, ti must be ASCII.) |
1877 | | /// |
1878 | | /// (This and `AnyCharExceptLF` both exist because of legacy reasons. |
1879 | | /// `AnyCharExceptLF` will be dropped in the next breaking change release.) |
1880 | | AnyCharExcept(char), |
1881 | | /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. |
1882 | | /// |
1883 | | /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. |
1884 | | AnyCharExceptLF, |
1885 | | /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` |
1886 | | /// and `\n`. |
1887 | | /// |
1888 | | /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. |
1889 | | AnyCharExceptCRLF, |
1890 | | /// Matches any byte value except for the `u8` given. |
1891 | | /// |
1892 | | /// This is equivalent to using `(?-us:.)` with the line terminator set |
1893 | | /// to a particular ASCII byte. (Because of peculiarities in the regex |
1894 | | /// engines, a line terminator must be a single byte. It follows that when |
1895 | | /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar |
1896 | | /// value. That is, ti must be ASCII.) |
1897 | | /// |
1898 | | /// (This and `AnyByteExceptLF` both exist because of legacy reasons. |
1899 | | /// `AnyByteExceptLF` will be dropped in the next breaking change release.) |
1900 | | AnyByteExcept(u8), |
1901 | | /// Matches any byte value except for `\n`. |
1902 | | /// |
1903 | | /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. |
1904 | | AnyByteExceptLF, |
1905 | | /// Matches any byte value except for `\r` and `\n`. |
1906 | | /// |
1907 | | /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. |
1908 | | AnyByteExceptCRLF, |
1909 | | } |
1910 | | |
1911 | | /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack |
1912 | | /// space but heap space proportional to the depth of the total `Hir`. |
1913 | | impl Drop for Hir { |
1914 | 37.2M | fn drop(&mut self) { |
1915 | | use core::mem; |
1916 | | |
1917 | 37.2M | match *self.kind() { |
1918 | | HirKind::Empty |
1919 | | | HirKind::Literal(_) |
1920 | | | HirKind::Class(_) |
1921 | 35.8M | | HirKind::Look(_) => return, |
1922 | 104k | HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, |
1923 | 728k | HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { |
1924 | 694k | return |
1925 | | } |
1926 | 435k | HirKind::Concat(ref x) if x.is_empty() => return, |
1927 | 97.3k | HirKind::Alternation(ref x) if x.is_empty() => return, |
1928 | 181k | _ => {} |
1929 | 181k | } |
1930 | 181k | |
1931 | 181k | let mut stack = vec![mem::replace(self, Hir::empty())]; |
1932 | 17.0M | while let Some(mut expr) = stack.pop() { |
1933 | 16.8M | match expr.kind { |
1934 | | HirKind::Empty |
1935 | | | HirKind::Literal(_) |
1936 | | | HirKind::Class(_) |
1937 | 15.6M | | HirKind::Look(_) => {} |
1938 | 97.5k | HirKind::Capture(ref mut x) => { |
1939 | 97.5k | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
1940 | 97.5k | } |
1941 | 669k | HirKind::Repetition(ref mut x) => { |
1942 | 669k | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
1943 | 669k | } |
1944 | 325k | HirKind::Concat(ref mut x) => { |
1945 | 325k | stack.extend(x.drain(..)); |
1946 | 325k | } |
1947 | 63.7k | HirKind::Alternation(ref mut x) => { |
1948 | 63.7k | stack.extend(x.drain(..)); |
1949 | 63.7k | } |
1950 | | } |
1951 | | } |
1952 | 37.2M | } |
1953 | | } |
1954 | | |
1955 | | /// A type that collects various properties of an HIR value. |
1956 | | /// |
1957 | | /// Properties are always scalar values and represent meta data that is |
1958 | | /// computed inductively on an HIR value. Properties are defined for all |
1959 | | /// HIR values. |
1960 | | /// |
1961 | | /// All methods on a `Properties` value take constant time and are meant to |
1962 | | /// be cheap to call. |
1963 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1964 | | pub struct Properties(Box<PropertiesI>); |
1965 | | |
1966 | | /// The property definition. It is split out so that we can box it, and |
1967 | | /// there by make `Properties` use less stack size. This is kind-of important |
1968 | | /// because every HIR value has a `Properties` attached to it. |
1969 | | /// |
1970 | | /// This does have the unfortunate consequence that creating any HIR value |
1971 | | /// always leads to at least one alloc for properties, but this is generally |
1972 | | /// true anyway (for pretty much all HirKinds except for look-arounds). |
1973 | | #[derive(Clone, Debug, Eq, PartialEq)] |
1974 | | struct PropertiesI { |
1975 | | minimum_len: Option<usize>, |
1976 | | maximum_len: Option<usize>, |
1977 | | look_set: LookSet, |
1978 | | look_set_prefix: LookSet, |
1979 | | look_set_suffix: LookSet, |
1980 | | look_set_prefix_any: LookSet, |
1981 | | look_set_suffix_any: LookSet, |
1982 | | utf8: bool, |
1983 | | explicit_captures_len: usize, |
1984 | | static_explicit_captures_len: Option<usize>, |
1985 | | literal: bool, |
1986 | | alternation_literal: bool, |
1987 | | } |
1988 | | |
1989 | | impl Properties { |
1990 | | /// Returns the length (in bytes) of the smallest string matched by this |
1991 | | /// HIR. |
1992 | | /// |
1993 | | /// A return value of `0` is possible and occurs when the HIR can match an |
1994 | | /// empty string. |
1995 | | /// |
1996 | | /// `None` is returned when there is no minimum length. This occurs in |
1997 | | /// precisely the cases where the HIR matches nothing. i.e., The language |
1998 | | /// the regex matches is empty. An example of such a regex is `\P{any}`. |
1999 | | #[inline] |
2000 | 27.4M | pub fn minimum_len(&self) -> Option<usize> { |
2001 | 27.4M | self.0.minimum_len |
2002 | 27.4M | } <regex_syntax::hir::Properties>::minimum_len Line | Count | Source | 2000 | 68.4k | pub fn minimum_len(&self) -> Option<usize> { | 2001 | 68.4k | self.0.minimum_len | 2002 | 68.4k | } |
Unexecuted instantiation: <regex_syntax::hir::Properties>::minimum_len <regex_syntax::hir::Properties>::minimum_len Line | Count | Source | 2000 | 9.70M | pub fn minimum_len(&self) -> Option<usize> { | 2001 | 9.70M | self.0.minimum_len | 2002 | 9.70M | } |
<regex_syntax::hir::Properties>::minimum_len Line | Count | Source | 2000 | 17.6M | pub fn minimum_len(&self) -> Option<usize> { | 2001 | 17.6M | self.0.minimum_len | 2002 | 17.6M | } |
|
2003 | | |
2004 | | /// Returns the length (in bytes) of the longest string matched by this |
2005 | | /// HIR. |
2006 | | /// |
2007 | | /// A return value of `0` is possible and occurs when nothing longer than |
2008 | | /// the empty string is in the language described by this HIR. |
2009 | | /// |
2010 | | /// `None` is returned when there is no longest matching string. This |
2011 | | /// occurs when the HIR matches nothing or when there is no upper bound on |
2012 | | /// the length of matching strings. Example of such regexes are `\P{any}` |
2013 | | /// (matches nothing) and `a+` (has no upper bound). |
2014 | | #[inline] |
2015 | 16.7M | pub fn maximum_len(&self) -> Option<usize> { |
2016 | 16.7M | self.0.maximum_len |
2017 | 16.7M | } <regex_syntax::hir::Properties>::maximum_len Line | Count | Source | 2015 | 1 | pub fn maximum_len(&self) -> Option<usize> { | 2016 | 1 | self.0.maximum_len | 2017 | 1 | } |
Unexecuted instantiation: <regex_syntax::hir::Properties>::maximum_len <regex_syntax::hir::Properties>::maximum_len Line | Count | Source | 2015 | 379k | pub fn maximum_len(&self) -> Option<usize> { | 2016 | 379k | self.0.maximum_len | 2017 | 379k | } |
<regex_syntax::hir::Properties>::maximum_len Line | Count | Source | 2015 | 16.3M | pub fn maximum_len(&self) -> Option<usize> { | 2016 | 16.3M | self.0.maximum_len | 2017 | 16.3M | } |
|
2018 | | |
2019 | | /// Returns a set of all look-around assertions that appear at least once |
2020 | | /// in this HIR value. |
2021 | | #[inline] |
2022 | 18.0M | pub fn look_set(&self) -> LookSet { |
2023 | 18.0M | self.0.look_set |
2024 | 18.0M | } <regex_syntax::hir::Properties>::look_set Line | Count | Source | 2022 | 343k | pub fn look_set(&self) -> LookSet { | 2023 | 343k | self.0.look_set | 2024 | 343k | } |
<regex_syntax::hir::Properties>::look_set Line | Count | Source | 2022 | 17.7M | pub fn look_set(&self) -> LookSet { | 2023 | 17.7M | self.0.look_set | 2024 | 17.7M | } |
|
2025 | | |
2026 | | /// Returns a set of all look-around assertions that appear as a prefix for |
2027 | | /// this HIR value. That is, the set returned corresponds to the set of |
2028 | | /// assertions that must be passed before matching any bytes in a haystack. |
2029 | | /// |
2030 | | /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true |
2031 | | /// if and only if the HIR is fully anchored at the start. |
2032 | | #[inline] |
2033 | 12.0M | pub fn look_set_prefix(&self) -> LookSet { |
2034 | 12.0M | self.0.look_set_prefix |
2035 | 12.0M | } <regex_syntax::hir::Properties>::look_set_prefix Line | Count | Source | 2033 | 56.5k | pub fn look_set_prefix(&self) -> LookSet { | 2034 | 56.5k | self.0.look_set_prefix | 2035 | 56.5k | } |
Unexecuted instantiation: <regex_syntax::hir::Properties>::look_set_prefix <regex_syntax::hir::Properties>::look_set_prefix Line | Count | Source | 2033 | 538k | pub fn look_set_prefix(&self) -> LookSet { | 2034 | 538k | self.0.look_set_prefix | 2035 | 538k | } |
<regex_syntax::hir::Properties>::look_set_prefix Line | Count | Source | 2033 | 11.4M | pub fn look_set_prefix(&self) -> LookSet { | 2034 | 11.4M | self.0.look_set_prefix | 2035 | 11.4M | } |
|
2036 | | |
2037 | | /// Returns a set of all look-around assertions that appear as a _possible_ |
2038 | | /// prefix for this HIR value. That is, the set returned corresponds to the |
2039 | | /// set of assertions that _may_ be passed before matching any bytes in a |
2040 | | /// haystack. |
2041 | | /// |
2042 | | /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns |
2043 | | /// true if and only if it's possible for the regex to match through a |
2044 | | /// anchored assertion before consuming any input. |
2045 | | #[inline] |
2046 | 12.2M | pub fn look_set_prefix_any(&self) -> LookSet { |
2047 | 12.2M | self.0.look_set_prefix_any |
2048 | 12.2M | } <regex_syntax::hir::Properties>::look_set_prefix_any Line | Count | Source | 2046 | 129k | pub fn look_set_prefix_any(&self) -> LookSet { | 2047 | 129k | self.0.look_set_prefix_any | 2048 | 129k | } |
<regex_syntax::hir::Properties>::look_set_prefix_any Line | Count | Source | 2046 | 12.1M | pub fn look_set_prefix_any(&self) -> LookSet { | 2047 | 12.1M | self.0.look_set_prefix_any | 2048 | 12.1M | } |
|
2049 | | |
2050 | | /// Returns a set of all look-around assertions that appear as a suffix for |
2051 | | /// this HIR value. That is, the set returned corresponds to the set of |
2052 | | /// assertions that must be passed in order to be considered a match after |
2053 | | /// all other consuming HIR expressions. |
2054 | | /// |
2055 | | /// For example, `hir.look_set_suffix().contains(Look::End)` returns true |
2056 | | /// if and only if the HIR is fully anchored at the end. |
2057 | | #[inline] |
2058 | 11.7M | pub fn look_set_suffix(&self) -> LookSet { |
2059 | 11.7M | self.0.look_set_suffix |
2060 | 11.7M | } <regex_syntax::hir::Properties>::look_set_suffix Line | Count | Source | 2058 | 4 | pub fn look_set_suffix(&self) -> LookSet { | 2059 | 4 | self.0.look_set_suffix | 2060 | 4 | } |
Unexecuted instantiation: <regex_syntax::hir::Properties>::look_set_suffix <regex_syntax::hir::Properties>::look_set_suffix Line | Count | Source | 2058 | 355k | pub fn look_set_suffix(&self) -> LookSet { | 2059 | 355k | self.0.look_set_suffix | 2060 | 355k | } |
<regex_syntax::hir::Properties>::look_set_suffix Line | Count | Source | 2058 | 11.4M | pub fn look_set_suffix(&self) -> LookSet { | 2059 | 11.4M | self.0.look_set_suffix | 2060 | 11.4M | } |
|
2061 | | |
2062 | | /// Returns a set of all look-around assertions that appear as a _possible_ |
2063 | | /// suffix for this HIR value. That is, the set returned corresponds to the |
2064 | | /// set of assertions that _may_ be passed before matching any bytes in a |
2065 | | /// haystack. |
2066 | | /// |
2067 | | /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns |
2068 | | /// true if and only if it's possible for the regex to match through a |
2069 | | /// anchored assertion at the end of a match without consuming any input. |
2070 | | #[inline] |
2071 | 12.2M | pub fn look_set_suffix_any(&self) -> LookSet { |
2072 | 12.2M | self.0.look_set_suffix_any |
2073 | 12.2M | } <regex_syntax::hir::Properties>::look_set_suffix_any Line | Count | Source | 2071 | 129k | pub fn look_set_suffix_any(&self) -> LookSet { | 2072 | 129k | self.0.look_set_suffix_any | 2073 | 129k | } |
<regex_syntax::hir::Properties>::look_set_suffix_any Line | Count | Source | 2071 | 12.0M | pub fn look_set_suffix_any(&self) -> LookSet { | 2072 | 12.0M | self.0.look_set_suffix_any | 2073 | 12.0M | } |
|
2074 | | |
2075 | | /// Return true if and only if the corresponding HIR will always match |
2076 | | /// valid UTF-8. |
2077 | | /// |
2078 | | /// When this returns false, then it is possible for this HIR expression to |
2079 | | /// match invalid UTF-8, including by matching between the code units of |
2080 | | /// a single UTF-8 encoded codepoint. |
2081 | | /// |
2082 | | /// Note that this returns true even when the corresponding HIR can match |
2083 | | /// the empty string. Since an empty string can technically appear between |
2084 | | /// UTF-8 code units, it is possible for a match to be reported that splits |
2085 | | /// a codepoint which could in turn be considered matching invalid UTF-8. |
2086 | | /// However, it is generally assumed that such empty matches are handled |
2087 | | /// specially by the search routine if it is absolutely required that |
2088 | | /// matches not split a codepoint. |
2089 | | /// |
2090 | | /// # Example |
2091 | | /// |
2092 | | /// This code example shows the UTF-8 property of a variety of patterns. |
2093 | | /// |
2094 | | /// ``` |
2095 | | /// use regex_syntax::{ParserBuilder, parse}; |
2096 | | /// |
2097 | | /// // Examples of 'is_utf8() == true'. |
2098 | | /// assert!(parse(r"a")?.properties().is_utf8()); |
2099 | | /// assert!(parse(r"[^a]")?.properties().is_utf8()); |
2100 | | /// assert!(parse(r".")?.properties().is_utf8()); |
2101 | | /// assert!(parse(r"\W")?.properties().is_utf8()); |
2102 | | /// assert!(parse(r"\b")?.properties().is_utf8()); |
2103 | | /// assert!(parse(r"\B")?.properties().is_utf8()); |
2104 | | /// assert!(parse(r"(?-u)\b")?.properties().is_utf8()); |
2105 | | /// assert!(parse(r"(?-u)\B")?.properties().is_utf8()); |
2106 | | /// // Unicode mode is enabled by default, and in |
2107 | | /// // that mode, all \x hex escapes are treated as |
2108 | | /// // codepoints. So this actually matches the UTF-8 |
2109 | | /// // encoding of U+00FF. |
2110 | | /// assert!(parse(r"\xFF")?.properties().is_utf8()); |
2111 | | /// |
2112 | | /// // Now we show examples of 'is_utf8() == false'. |
2113 | | /// // The only way to do this is to force the parser |
2114 | | /// // to permit invalid UTF-8, otherwise all of these |
2115 | | /// // would fail to parse! |
2116 | | /// let parse = |pattern| { |
2117 | | /// ParserBuilder::new().utf8(false).build().parse(pattern) |
2118 | | /// }; |
2119 | | /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8()); |
2120 | | /// assert!(!parse(r"(?-u).")?.properties().is_utf8()); |
2121 | | /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8()); |
2122 | | /// // Conversely to the equivalent example above, |
2123 | | /// // when Unicode mode is disabled, \x hex escapes |
2124 | | /// // are treated as their raw byte values. |
2125 | | /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8()); |
2126 | | /// // Note that just because we disabled UTF-8 in the |
2127 | | /// // parser doesn't mean we still can't use Unicode. |
2128 | | /// // It is enabled by default, so \xFF is still |
2129 | | /// // equivalent to matching the UTF-8 encoding of |
2130 | | /// // U+00FF by default. |
2131 | | /// assert!(parse(r"\xFF")?.properties().is_utf8()); |
2132 | | /// // Even though we use raw bytes that individually |
2133 | | /// // are not valid UTF-8, when combined together, the |
2134 | | /// // overall expression *does* match valid UTF-8! |
2135 | | /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8()); |
2136 | | /// |
2137 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2138 | | /// ``` |
2139 | | #[inline] |
2140 | 17.8M | pub fn is_utf8(&self) -> bool { |
2141 | 17.8M | self.0.utf8 |
2142 | 17.8M | } <regex_syntax::hir::Properties>::is_utf8 Line | Count | Source | 2140 | 129k | pub fn is_utf8(&self) -> bool { | 2141 | 129k | self.0.utf8 | 2142 | 129k | } |
<regex_syntax::hir::Properties>::is_utf8 Line | Count | Source | 2140 | 17.7M | pub fn is_utf8(&self) -> bool { | 2141 | 17.7M | self.0.utf8 | 2142 | 17.7M | } |
|
2143 | | |
2144 | | /// Returns the total number of explicit capturing groups in the |
2145 | | /// corresponding HIR. |
2146 | | /// |
2147 | | /// Note that this does not include the implicit capturing group |
2148 | | /// corresponding to the entire match that is typically included by regex |
2149 | | /// engines. |
2150 | | /// |
2151 | | /// # Example |
2152 | | /// |
2153 | | /// This method will return `0` for `a` and `1` for `(a)`: |
2154 | | /// |
2155 | | /// ``` |
2156 | | /// use regex_syntax::parse; |
2157 | | /// |
2158 | | /// assert_eq!(0, parse("a")?.properties().explicit_captures_len()); |
2159 | | /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len()); |
2160 | | /// |
2161 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2162 | | /// ``` |
2163 | | #[inline] |
2164 | 18.1M | pub fn explicit_captures_len(&self) -> usize { |
2165 | 18.1M | self.0.explicit_captures_len |
2166 | 18.1M | } <regex_syntax::hir::Properties>::explicit_captures_len Line | Count | Source | 2164 | 347k | pub fn explicit_captures_len(&self) -> usize { | 2165 | 347k | self.0.explicit_captures_len | 2166 | 347k | } |
<regex_syntax::hir::Properties>::explicit_captures_len Line | Count | Source | 2164 | 17.8M | pub fn explicit_captures_len(&self) -> usize { | 2165 | 17.8M | self.0.explicit_captures_len | 2166 | 17.8M | } |
|
2167 | | |
2168 | | /// Returns the total number of explicit capturing groups that appear in |
2169 | | /// every possible match. |
2170 | | /// |
2171 | | /// If the number of capture groups can vary depending on the match, then |
2172 | | /// this returns `None`. That is, a value is only returned when the number |
2173 | | /// of matching groups is invariant or "static." |
2174 | | /// |
2175 | | /// Note that this does not include the implicit capturing group |
2176 | | /// corresponding to the entire match. |
2177 | | /// |
2178 | | /// # Example |
2179 | | /// |
2180 | | /// This shows a few cases where a static number of capture groups is |
2181 | | /// available and a few cases where it is not. |
2182 | | /// |
2183 | | /// ``` |
2184 | | /// use regex_syntax::parse; |
2185 | | /// |
2186 | | /// let len = |pattern| { |
2187 | | /// parse(pattern).map(|h| { |
2188 | | /// h.properties().static_explicit_captures_len() |
2189 | | /// }) |
2190 | | /// }; |
2191 | | /// |
2192 | | /// assert_eq!(Some(0), len("a")?); |
2193 | | /// assert_eq!(Some(1), len("(a)")?); |
2194 | | /// assert_eq!(Some(1), len("(a)|(b)")?); |
2195 | | /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); |
2196 | | /// assert_eq!(None, len("(a)|b")?); |
2197 | | /// assert_eq!(None, len("a|(b)")?); |
2198 | | /// assert_eq!(None, len("(b)*")?); |
2199 | | /// assert_eq!(Some(1), len("(b)+")?); |
2200 | | /// |
2201 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2202 | | /// ``` |
2203 | | #[inline] |
2204 | 18.1M | pub fn static_explicit_captures_len(&self) -> Option<usize> { |
2205 | 18.1M | self.0.static_explicit_captures_len |
2206 | 18.1M | } Unexecuted instantiation: <regex_syntax::hir::Properties>::static_explicit_captures_len Unexecuted instantiation: <regex_syntax::hir::Properties>::static_explicit_captures_len <regex_syntax::hir::Properties>::static_explicit_captures_len Line | Count | Source | 2204 | 259k | pub fn static_explicit_captures_len(&self) -> Option<usize> { | 2205 | 259k | self.0.static_explicit_captures_len | 2206 | 259k | } |
<regex_syntax::hir::Properties>::static_explicit_captures_len Line | Count | Source | 2204 | 17.8M | pub fn static_explicit_captures_len(&self) -> Option<usize> { | 2205 | 17.8M | self.0.static_explicit_captures_len | 2206 | 17.8M | } |
|
2207 | | |
2208 | | /// Return true if and only if this HIR is a simple literal. This is |
2209 | | /// only true when this HIR expression is either itself a `Literal` or a |
2210 | | /// concatenation of only `Literal`s. |
2211 | | /// |
2212 | | /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and |
2213 | | /// the empty string are not (even though they contain sub-expressions that |
2214 | | /// are literals). |
2215 | | #[inline] |
2216 | 9.08M | pub fn is_literal(&self) -> bool { |
2217 | 9.08M | self.0.literal |
2218 | 9.08M | } <regex_syntax::hir::Properties>::is_literal Line | Count | Source | 2216 | 129k | pub fn is_literal(&self) -> bool { | 2217 | 129k | self.0.literal | 2218 | 129k | } |
<regex_syntax::hir::Properties>::is_literal Line | Count | Source | 2216 | 8.95M | pub fn is_literal(&self) -> bool { | 2217 | 8.95M | self.0.literal | 2218 | 8.95M | } |
|
2219 | | |
2220 | | /// Return true if and only if this HIR is either a simple literal or an |
2221 | | /// alternation of simple literals. This is only |
2222 | | /// true when this HIR expression is either itself a `Literal` or a |
2223 | | /// concatenation of only `Literal`s or an alternation of only `Literal`s. |
2224 | | /// |
2225 | | /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation |
2226 | | /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not |
2227 | | /// (even though that contain sub-expressions that are literals). |
2228 | | #[inline] |
2229 | 552k | pub fn is_alternation_literal(&self) -> bool { |
2230 | 552k | self.0.alternation_literal |
2231 | 552k | } <regex_syntax::hir::Properties>::is_alternation_literal Line | Count | Source | 2229 | 69.3k | pub fn is_alternation_literal(&self) -> bool { | 2230 | 69.3k | self.0.alternation_literal | 2231 | 69.3k | } |
<regex_syntax::hir::Properties>::is_alternation_literal Line | Count | Source | 2229 | 483k | pub fn is_alternation_literal(&self) -> bool { | 2230 | 483k | self.0.alternation_literal | 2231 | 483k | } |
|
2232 | | |
2233 | | /// Returns the total amount of heap memory usage, in bytes, used by this |
2234 | | /// `Properties` value. |
2235 | | #[inline] |
2236 | 0 | pub fn memory_usage(&self) -> usize { |
2237 | 0 | core::mem::size_of::<PropertiesI>() |
2238 | 0 | } Unexecuted instantiation: <regex_syntax::hir::Properties>::memory_usage Unexecuted instantiation: <regex_syntax::hir::Properties>::memory_usage |
2239 | | |
2240 | | /// Returns a new set of properties that corresponds to the union of the |
2241 | | /// iterator of properties given. |
2242 | | /// |
2243 | | /// This is useful when one has multiple `Hir` expressions and wants |
2244 | | /// to combine them into a single alternation without constructing the |
2245 | | /// corresponding `Hir`. This routine provides a way of combining the |
2246 | | /// properties of each `Hir` expression into one set of properties |
2247 | | /// representing the union of those expressions. |
2248 | | /// |
2249 | | /// # Example: union with HIRs that never match |
2250 | | /// |
2251 | | /// This example shows that unioning properties together with one that |
2252 | | /// represents a regex that never matches will "poison" certain attributes, |
2253 | | /// like the minimum and maximum lengths. |
2254 | | /// |
2255 | | /// ``` |
2256 | | /// use regex_syntax::{hir::Properties, parse}; |
2257 | | /// |
2258 | | /// let hir1 = parse("ab?c?")?; |
2259 | | /// assert_eq!(Some(1), hir1.properties().minimum_len()); |
2260 | | /// assert_eq!(Some(3), hir1.properties().maximum_len()); |
2261 | | /// |
2262 | | /// let hir2 = parse(r"[a&&b]")?; |
2263 | | /// assert_eq!(None, hir2.properties().minimum_len()); |
2264 | | /// assert_eq!(None, hir2.properties().maximum_len()); |
2265 | | /// |
2266 | | /// let hir3 = parse(r"wxy?z?")?; |
2267 | | /// assert_eq!(Some(2), hir3.properties().minimum_len()); |
2268 | | /// assert_eq!(Some(4), hir3.properties().maximum_len()); |
2269 | | /// |
2270 | | /// let unioned = Properties::union([ |
2271 | | /// hir1.properties(), |
2272 | | /// hir2.properties(), |
2273 | | /// hir3.properties(), |
2274 | | /// ]); |
2275 | | /// assert_eq!(None, unioned.minimum_len()); |
2276 | | /// assert_eq!(None, unioned.maximum_len()); |
2277 | | /// |
2278 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2279 | | /// ``` |
2280 | | /// |
2281 | | /// The maximum length can also be "poisoned" by a pattern that has no |
2282 | | /// upper bound on the length of a match. The minimum length remains |
2283 | | /// unaffected: |
2284 | | /// |
2285 | | /// ``` |
2286 | | /// use regex_syntax::{hir::Properties, parse}; |
2287 | | /// |
2288 | | /// let hir1 = parse("ab?c?")?; |
2289 | | /// assert_eq!(Some(1), hir1.properties().minimum_len()); |
2290 | | /// assert_eq!(Some(3), hir1.properties().maximum_len()); |
2291 | | /// |
2292 | | /// let hir2 = parse(r"a+")?; |
2293 | | /// assert_eq!(Some(1), hir2.properties().minimum_len()); |
2294 | | /// assert_eq!(None, hir2.properties().maximum_len()); |
2295 | | /// |
2296 | | /// let hir3 = parse(r"wxy?z?")?; |
2297 | | /// assert_eq!(Some(2), hir3.properties().minimum_len()); |
2298 | | /// assert_eq!(Some(4), hir3.properties().maximum_len()); |
2299 | | /// |
2300 | | /// let unioned = Properties::union([ |
2301 | | /// hir1.properties(), |
2302 | | /// hir2.properties(), |
2303 | | /// hir3.properties(), |
2304 | | /// ]); |
2305 | | /// assert_eq!(Some(1), unioned.minimum_len()); |
2306 | | /// assert_eq!(None, unioned.maximum_len()); |
2307 | | /// |
2308 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2309 | | /// ``` |
2310 | 194k | pub fn union<I, P>(props: I) -> Properties |
2311 | 194k | where |
2312 | 194k | I: IntoIterator<Item = P>, |
2313 | 194k | P: core::borrow::Borrow<Properties>, |
2314 | 194k | { |
2315 | 194k | let mut it = props.into_iter().peekable(); |
2316 | | // While empty alternations aren't possible, we still behave as if they |
2317 | | // are. When we have an empty alternate, then clearly the look-around |
2318 | | // prefix and suffix is empty. Otherwise, it is the intersection of all |
2319 | | // prefixes and suffixes (respectively) of the branches. |
2320 | 194k | let fix = if it.peek().is_none() { |
2321 | 0 | LookSet::empty() |
2322 | | } else { |
2323 | 194k | LookSet::full() |
2324 | | }; |
2325 | | // And also, an empty alternate means we have 0 static capture groups, |
2326 | | // but we otherwise start with the number corresponding to the first |
2327 | | // alternate. If any subsequent alternate has a different number of |
2328 | | // static capture groups, then we overall have a variation and not a |
2329 | | // static number of groups. |
2330 | 194k | let static_explicit_captures_len = |
2331 | 194k | it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); <regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, ®ex_syntax::hir::Properties>::{closure#0} Line | Count | Source | 2331 | 129k | it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); |
<regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, ®ex_syntax::hir::Properties>::{closure#0} Line | Count | Source | 2331 | 65.1k | it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); |
|
2332 | 194k | // The base case is an empty alternation, which matches nothing. |
2333 | 194k | // Note though that empty alternations aren't possible, because the |
2334 | 194k | // Hir::alternation smart constructor rewrites those as empty character |
2335 | 194k | // classes. |
2336 | 194k | let mut props = PropertiesI { |
2337 | 194k | minimum_len: None, |
2338 | 194k | maximum_len: None, |
2339 | 194k | look_set: LookSet::empty(), |
2340 | 194k | look_set_prefix: fix, |
2341 | 194k | look_set_suffix: fix, |
2342 | 194k | look_set_prefix_any: LookSet::empty(), |
2343 | 194k | look_set_suffix_any: LookSet::empty(), |
2344 | 194k | utf8: true, |
2345 | 194k | explicit_captures_len: 0, |
2346 | 194k | static_explicit_captures_len, |
2347 | 194k | literal: false, |
2348 | 194k | alternation_literal: true, |
2349 | 194k | }; |
2350 | 194k | let (mut min_poisoned, mut max_poisoned) = (false, false); |
2351 | | // Handle properties that need to visit every child hir. |
2352 | 11.2M | for prop in it { |
2353 | 11.0M | let p = prop.borrow(); |
2354 | 11.0M | props.look_set.set_union(p.look_set()); |
2355 | 11.0M | props.look_set_prefix.set_intersect(p.look_set_prefix()); |
2356 | 11.0M | props.look_set_suffix.set_intersect(p.look_set_suffix()); |
2357 | 11.0M | props.look_set_prefix_any.set_union(p.look_set_prefix_any()); |
2358 | 11.0M | props.look_set_suffix_any.set_union(p.look_set_suffix_any()); |
2359 | 11.0M | props.utf8 = props.utf8 && p.is_utf8(); |
2360 | 11.0M | props.explicit_captures_len = props |
2361 | 11.0M | .explicit_captures_len |
2362 | 11.0M | .saturating_add(p.explicit_captures_len()); |
2363 | 11.0M | if props.static_explicit_captures_len |
2364 | 11.0M | != p.static_explicit_captures_len() |
2365 | 134k | { |
2366 | 134k | props.static_explicit_captures_len = None; |
2367 | 10.9M | } |
2368 | | props.alternation_literal = |
2369 | 11.0M | props.alternation_literal && p.is_literal(); |
2370 | 11.0M | if !min_poisoned { |
2371 | 11.0M | if let Some(xmin) = p.minimum_len() { |
2372 | 11.0M | if props.minimum_len.map_or(true, |pmin| xmin < pmin) { Unexecuted instantiation: <regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, ®ex_syntax::hir::Properties>::{closure#1} <regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, ®ex_syntax::hir::Properties>::{closure#1} Line | Count | Source | 2372 | 10.8M | if props.minimum_len.map_or(true, |pmin| xmin < pmin) { |
|
2373 | 223k | props.minimum_len = Some(xmin); |
2374 | 10.8M | } |
2375 | 2.05k | } else { |
2376 | 2.05k | props.minimum_len = None; |
2377 | 2.05k | min_poisoned = true; |
2378 | 2.05k | } |
2379 | 10.0k | } |
2380 | 11.0M | if !max_poisoned { |
2381 | 9.48M | if let Some(xmax) = p.maximum_len() { |
2382 | 9.42M | if props.maximum_len.map_or(true, |pmax| xmax > pmax) { Unexecuted instantiation: <regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, ®ex_syntax::hir::Properties>::{closure#2} <regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, ®ex_syntax::hir::Properties>::{closure#2} Line | Count | Source | 2382 | 9.29M | if props.maximum_len.map_or(true, |pmax| xmax > pmax) { |
|
2383 | 168k | props.maximum_len = Some(xmax); |
2384 | 9.26M | } |
2385 | 60.8k | } else { |
2386 | 60.8k | props.maximum_len = None; |
2387 | 60.8k | max_poisoned = true; |
2388 | 60.8k | } |
2389 | 1.60M | } |
2390 | | } |
2391 | 194k | Properties(Box::new(props)) |
2392 | 194k | } <regex_syntax::hir::Properties>::union::<&alloc::vec::Vec<regex_syntax::hir::Properties>, ®ex_syntax::hir::Properties> Line | Count | Source | 2310 | 129k | pub fn union<I, P>(props: I) -> Properties | 2311 | 129k | where | 2312 | 129k | I: IntoIterator<Item = P>, | 2313 | 129k | P: core::borrow::Borrow<Properties>, | 2314 | 129k | { | 2315 | 129k | let mut it = props.into_iter().peekable(); | 2316 | | // While empty alternations aren't possible, we still behave as if they | 2317 | | // are. When we have an empty alternate, then clearly the look-around | 2318 | | // prefix and suffix is empty. Otherwise, it is the intersection of all | 2319 | | // prefixes and suffixes (respectively) of the branches. | 2320 | 129k | let fix = if it.peek().is_none() { | 2321 | 0 | LookSet::empty() | 2322 | | } else { | 2323 | 129k | LookSet::full() | 2324 | | }; | 2325 | | // And also, an empty alternate means we have 0 static capture groups, | 2326 | | // but we otherwise start with the number corresponding to the first | 2327 | | // alternate. If any subsequent alternate has a different number of | 2328 | | // static capture groups, then we overall have a variation and not a | 2329 | | // static number of groups. | 2330 | 129k | let static_explicit_captures_len = | 2331 | 129k | it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); | 2332 | 129k | // The base case is an empty alternation, which matches nothing. | 2333 | 129k | // Note though that empty alternations aren't possible, because the | 2334 | 129k | // Hir::alternation smart constructor rewrites those as empty character | 2335 | 129k | // classes. | 2336 | 129k | let mut props = PropertiesI { | 2337 | 129k | minimum_len: None, | 2338 | 129k | maximum_len: None, | 2339 | 129k | look_set: LookSet::empty(), | 2340 | 129k | look_set_prefix: fix, | 2341 | 129k | look_set_suffix: fix, | 2342 | 129k | look_set_prefix_any: LookSet::empty(), | 2343 | 129k | look_set_suffix_any: LookSet::empty(), | 2344 | 129k | utf8: true, | 2345 | 129k | explicit_captures_len: 0, | 2346 | 129k | static_explicit_captures_len, | 2347 | 129k | literal: false, | 2348 | 129k | alternation_literal: true, | 2349 | 129k | }; | 2350 | 129k | let (mut min_poisoned, mut max_poisoned) = (false, false); | 2351 | | // Handle properties that need to visit every child hir. | 2352 | 259k | for prop in it { | 2353 | 129k | let p = prop.borrow(); | 2354 | 129k | props.look_set.set_union(p.look_set()); | 2355 | 129k | props.look_set_prefix.set_intersect(p.look_set_prefix()); | 2356 | 129k | props.look_set_suffix.set_intersect(p.look_set_suffix()); | 2357 | 129k | props.look_set_prefix_any.set_union(p.look_set_prefix_any()); | 2358 | 129k | props.look_set_suffix_any.set_union(p.look_set_suffix_any()); | 2359 | 129k | props.utf8 = props.utf8 && p.is_utf8(); | 2360 | 129k | props.explicit_captures_len = props | 2361 | 129k | .explicit_captures_len | 2362 | 129k | .saturating_add(p.explicit_captures_len()); | 2363 | 129k | if props.static_explicit_captures_len | 2364 | 129k | != p.static_explicit_captures_len() | 2365 | 0 | { | 2366 | 0 | props.static_explicit_captures_len = None; | 2367 | 129k | } | 2368 | | props.alternation_literal = | 2369 | 129k | props.alternation_literal && p.is_literal(); | 2370 | 129k | if !min_poisoned { | 2371 | 129k | if let Some(xmin) = p.minimum_len() { | 2372 | 128k | if props.minimum_len.map_or(true, |pmin| xmin < pmin) { | 2373 | 128k | props.minimum_len = Some(xmin); | 2374 | 128k | } | 2375 | 1.47k | } else { | 2376 | 1.47k | props.minimum_len = None; | 2377 | 1.47k | min_poisoned = true; | 2378 | 1.47k | } | 2379 | 0 | } | 2380 | 129k | if !max_poisoned { | 2381 | 129k | if let Some(xmax) = p.maximum_len() { | 2382 | 76.3k | if props.maximum_len.map_or(true, |pmax| xmax > pmax) { | 2383 | 76.3k | props.maximum_len = Some(xmax); | 2384 | 76.3k | } | 2385 | 53.2k | } else { | 2386 | 53.2k | props.maximum_len = None; | 2387 | 53.2k | max_poisoned = true; | 2388 | 53.2k | } | 2389 | 0 | } | 2390 | | } | 2391 | 129k | Properties(Box::new(props)) | 2392 | 129k | } |
<regex_syntax::hir::Properties>::union::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_syntax::hir::Properties>::alternation::{closure#0}>, ®ex_syntax::hir::Properties> Line | Count | Source | 2310 | 65.1k | pub fn union<I, P>(props: I) -> Properties | 2311 | 65.1k | where | 2312 | 65.1k | I: IntoIterator<Item = P>, | 2313 | 65.1k | P: core::borrow::Borrow<Properties>, | 2314 | 65.1k | { | 2315 | 65.1k | let mut it = props.into_iter().peekable(); | 2316 | | // While empty alternations aren't possible, we still behave as if they | 2317 | | // are. When we have an empty alternate, then clearly the look-around | 2318 | | // prefix and suffix is empty. Otherwise, it is the intersection of all | 2319 | | // prefixes and suffixes (respectively) of the branches. | 2320 | 65.1k | let fix = if it.peek().is_none() { | 2321 | 0 | LookSet::empty() | 2322 | | } else { | 2323 | 65.1k | LookSet::full() | 2324 | | }; | 2325 | | // And also, an empty alternate means we have 0 static capture groups, | 2326 | | // but we otherwise start with the number corresponding to the first | 2327 | | // alternate. If any subsequent alternate has a different number of | 2328 | | // static capture groups, then we overall have a variation and not a | 2329 | | // static number of groups. | 2330 | 65.1k | let static_explicit_captures_len = | 2331 | 65.1k | it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); | 2332 | 65.1k | // The base case is an empty alternation, which matches nothing. | 2333 | 65.1k | // Note though that empty alternations aren't possible, because the | 2334 | 65.1k | // Hir::alternation smart constructor rewrites those as empty character | 2335 | 65.1k | // classes. | 2336 | 65.1k | let mut props = PropertiesI { | 2337 | 65.1k | minimum_len: None, | 2338 | 65.1k | maximum_len: None, | 2339 | 65.1k | look_set: LookSet::empty(), | 2340 | 65.1k | look_set_prefix: fix, | 2341 | 65.1k | look_set_suffix: fix, | 2342 | 65.1k | look_set_prefix_any: LookSet::empty(), | 2343 | 65.1k | look_set_suffix_any: LookSet::empty(), | 2344 | 65.1k | utf8: true, | 2345 | 65.1k | explicit_captures_len: 0, | 2346 | 65.1k | static_explicit_captures_len, | 2347 | 65.1k | literal: false, | 2348 | 65.1k | alternation_literal: true, | 2349 | 65.1k | }; | 2350 | 65.1k | let (mut min_poisoned, mut max_poisoned) = (false, false); | 2351 | | // Handle properties that need to visit every child hir. | 2352 | 11.0M | for prop in it { | 2353 | 10.9M | let p = prop.borrow(); | 2354 | 10.9M | props.look_set.set_union(p.look_set()); | 2355 | 10.9M | props.look_set_prefix.set_intersect(p.look_set_prefix()); | 2356 | 10.9M | props.look_set_suffix.set_intersect(p.look_set_suffix()); | 2357 | 10.9M | props.look_set_prefix_any.set_union(p.look_set_prefix_any()); | 2358 | 10.9M | props.look_set_suffix_any.set_union(p.look_set_suffix_any()); | 2359 | 10.9M | props.utf8 = props.utf8 && p.is_utf8(); | 2360 | 10.9M | props.explicit_captures_len = props | 2361 | 10.9M | .explicit_captures_len | 2362 | 10.9M | .saturating_add(p.explicit_captures_len()); | 2363 | 10.9M | if props.static_explicit_captures_len | 2364 | 10.9M | != p.static_explicit_captures_len() | 2365 | 134k | { | 2366 | 134k | props.static_explicit_captures_len = None; | 2367 | 10.8M | } | 2368 | | props.alternation_literal = | 2369 | 10.9M | props.alternation_literal && p.is_literal(); | 2370 | 10.9M | if !min_poisoned { | 2371 | 10.9M | if let Some(xmin) = p.minimum_len() { | 2372 | 10.9M | if props.minimum_len.map_or(true, |pmin| xmin < pmin) { | 2373 | 95.0k | props.minimum_len = Some(xmin); | 2374 | 10.8M | } | 2375 | 575 | } else { | 2376 | 575 | props.minimum_len = None; | 2377 | 575 | min_poisoned = true; | 2378 | 575 | } | 2379 | 10.0k | } | 2380 | 10.9M | if !max_poisoned { | 2381 | 9.35M | if let Some(xmax) = p.maximum_len() { | 2382 | 9.35M | if props.maximum_len.map_or(true, |pmax| xmax > pmax) { | 2383 | 91.6k | props.maximum_len = Some(xmax); | 2384 | 9.26M | } | 2385 | 7.60k | } else { | 2386 | 7.60k | props.maximum_len = None; | 2387 | 7.60k | max_poisoned = true; | 2388 | 7.60k | } | 2389 | 1.60M | } | 2390 | | } | 2391 | 65.1k | Properties(Box::new(props)) | 2392 | 65.1k | } |
|
2393 | | } |
2394 | | |
2395 | | impl Properties { |
2396 | | /// Create a new set of HIR properties for an empty regex. |
2397 | 21.1M | fn empty() -> Properties { |
2398 | 21.1M | let inner = PropertiesI { |
2399 | 21.1M | minimum_len: Some(0), |
2400 | 21.1M | maximum_len: Some(0), |
2401 | 21.1M | look_set: LookSet::empty(), |
2402 | 21.1M | look_set_prefix: LookSet::empty(), |
2403 | 21.1M | look_set_suffix: LookSet::empty(), |
2404 | 21.1M | look_set_prefix_any: LookSet::empty(), |
2405 | 21.1M | look_set_suffix_any: LookSet::empty(), |
2406 | 21.1M | // It is debatable whether an empty regex always matches at valid |
2407 | 21.1M | // UTF-8 boundaries. Strictly speaking, at a byte oriented view, |
2408 | 21.1M | // it is clearly false. There are, for example, many empty strings |
2409 | 21.1M | // between the bytes encoding a '☃'. |
2410 | 21.1M | // |
2411 | 21.1M | // However, when Unicode mode is enabled, the fundamental atom |
2412 | 21.1M | // of matching is really a codepoint. And in that scenario, an |
2413 | 21.1M | // empty regex is defined to only match at valid UTF-8 boundaries |
2414 | 21.1M | // and to never split a codepoint. It just so happens that this |
2415 | 21.1M | // enforcement is somewhat tricky to do for regexes that match |
2416 | 21.1M | // the empty string inside regex engines themselves. It usually |
2417 | 21.1M | // requires some layer above the regex engine to filter out such |
2418 | 21.1M | // matches. |
2419 | 21.1M | // |
2420 | 21.1M | // In any case, 'true' is really the only coherent option. If it |
2421 | 21.1M | // were false, for example, then 'a*' would also need to be false |
2422 | 21.1M | // since it too can match the empty string. |
2423 | 21.1M | utf8: true, |
2424 | 21.1M | explicit_captures_len: 0, |
2425 | 21.1M | static_explicit_captures_len: Some(0), |
2426 | 21.1M | literal: false, |
2427 | 21.1M | alternation_literal: false, |
2428 | 21.1M | }; |
2429 | 21.1M | Properties(Box::new(inner)) |
2430 | 21.1M | } |
2431 | | |
2432 | | /// Create a new set of HIR properties for a literal regex. |
2433 | 12.5M | fn literal(lit: &Literal) -> Properties { |
2434 | 12.5M | let inner = PropertiesI { |
2435 | 12.5M | minimum_len: Some(lit.0.len()), |
2436 | 12.5M | maximum_len: Some(lit.0.len()), |
2437 | 12.5M | look_set: LookSet::empty(), |
2438 | 12.5M | look_set_prefix: LookSet::empty(), |
2439 | 12.5M | look_set_suffix: LookSet::empty(), |
2440 | 12.5M | look_set_prefix_any: LookSet::empty(), |
2441 | 12.5M | look_set_suffix_any: LookSet::empty(), |
2442 | 12.5M | utf8: core::str::from_utf8(&lit.0).is_ok(), |
2443 | 12.5M | explicit_captures_len: 0, |
2444 | 12.5M | static_explicit_captures_len: Some(0), |
2445 | 12.5M | literal: true, |
2446 | 12.5M | alternation_literal: true, |
2447 | 12.5M | }; |
2448 | 12.5M | Properties(Box::new(inner)) |
2449 | 12.5M | } |
2450 | | |
2451 | | /// Create a new set of HIR properties for a character class. |
2452 | 3.78M | fn class(class: &Class) -> Properties { |
2453 | 3.78M | let inner = PropertiesI { |
2454 | 3.78M | minimum_len: class.minimum_len(), |
2455 | 3.78M | maximum_len: class.maximum_len(), |
2456 | 3.78M | look_set: LookSet::empty(), |
2457 | 3.78M | look_set_prefix: LookSet::empty(), |
2458 | 3.78M | look_set_suffix: LookSet::empty(), |
2459 | 3.78M | look_set_prefix_any: LookSet::empty(), |
2460 | 3.78M | look_set_suffix_any: LookSet::empty(), |
2461 | 3.78M | utf8: class.is_utf8(), |
2462 | 3.78M | explicit_captures_len: 0, |
2463 | 3.78M | static_explicit_captures_len: Some(0), |
2464 | 3.78M | literal: false, |
2465 | 3.78M | alternation_literal: false, |
2466 | 3.78M | }; |
2467 | 3.78M | Properties(Box::new(inner)) |
2468 | 3.78M | } |
2469 | | |
2470 | | /// Create a new set of HIR properties for a look-around assertion. |
2471 | 335k | fn look(look: Look) -> Properties { |
2472 | 335k | let inner = PropertiesI { |
2473 | 335k | minimum_len: Some(0), |
2474 | 335k | maximum_len: Some(0), |
2475 | 335k | look_set: LookSet::singleton(look), |
2476 | 335k | look_set_prefix: LookSet::singleton(look), |
2477 | 335k | look_set_suffix: LookSet::singleton(look), |
2478 | 335k | look_set_prefix_any: LookSet::singleton(look), |
2479 | 335k | look_set_suffix_any: LookSet::singleton(look), |
2480 | 335k | // This requires a little explanation. Basically, we don't consider |
2481 | 335k | // matching an empty string to be equivalent to matching invalid |
2482 | 335k | // UTF-8, even though technically matching every empty string will |
2483 | 335k | // split the UTF-8 encoding of a single codepoint when treating a |
2484 | 335k | // UTF-8 encoded string as a sequence of bytes. Our defense here is |
2485 | 335k | // that in such a case, a codepoint should logically be treated as |
2486 | 335k | // the fundamental atom for matching, and thus the only valid match |
2487 | 335k | // points are between codepoints and not bytes. |
2488 | 335k | // |
2489 | 335k | // More practically, this is true here because it's also true |
2490 | 335k | // for 'Hir::empty()', otherwise something like 'a*' would be |
2491 | 335k | // considered to match invalid UTF-8. That in turn makes this |
2492 | 335k | // property borderline useless. |
2493 | 335k | utf8: true, |
2494 | 335k | explicit_captures_len: 0, |
2495 | 335k | static_explicit_captures_len: Some(0), |
2496 | 335k | literal: false, |
2497 | 335k | alternation_literal: false, |
2498 | 335k | }; |
2499 | 335k | Properties(Box::new(inner)) |
2500 | 335k | } |
2501 | | |
2502 | | /// Create a new set of HIR properties for a repetition. |
2503 | 694k | fn repetition(rep: &Repetition) -> Properties { |
2504 | 694k | let p = rep.sub.properties(); |
2505 | 694k | let minimum_len = p.minimum_len().map(|child_min| { |
2506 | 692k | let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); |
2507 | 692k | child_min.saturating_mul(rep_min) |
2508 | 694k | }); |
2509 | 694k | let maximum_len = rep.max.and_then(|rep_max| { |
2510 | 494k | let rep_max = usize::try_from(rep_max).ok()?; |
2511 | 494k | let child_max = p.maximum_len()?; |
2512 | 435k | child_max.checked_mul(rep_max) |
2513 | 694k | }); |
2514 | 694k | |
2515 | 694k | let mut inner = PropertiesI { |
2516 | 694k | minimum_len, |
2517 | 694k | maximum_len, |
2518 | 694k | look_set: p.look_set(), |
2519 | 694k | look_set_prefix: LookSet::empty(), |
2520 | 694k | look_set_suffix: LookSet::empty(), |
2521 | 694k | look_set_prefix_any: p.look_set_prefix_any(), |
2522 | 694k | look_set_suffix_any: p.look_set_suffix_any(), |
2523 | 694k | utf8: p.is_utf8(), |
2524 | 694k | explicit_captures_len: p.explicit_captures_len(), |
2525 | 694k | static_explicit_captures_len: p.static_explicit_captures_len(), |
2526 | 694k | literal: false, |
2527 | 694k | alternation_literal: false, |
2528 | 694k | }; |
2529 | 694k | // If the repetition operator can match the empty string, then its |
2530 | 694k | // lookset prefix and suffixes themselves remain empty since they are |
2531 | 694k | // no longer required to match. |
2532 | 694k | if rep.min > 0 { |
2533 | 48.0k | inner.look_set_prefix = p.look_set_prefix(); |
2534 | 48.0k | inner.look_set_suffix = p.look_set_suffix(); |
2535 | 646k | } |
2536 | | // If the static captures len of the sub-expression is not known or |
2537 | | // is greater than zero, then it automatically propagates to the |
2538 | | // repetition, regardless of the repetition. Otherwise, it might |
2539 | | // change, but only when the repetition can match 0 times. |
2540 | 694k | if rep.min == 0 |
2541 | 646k | && inner.static_explicit_captures_len.map_or(false, |len| len > 0) |
2542 | | { |
2543 | | // If we require a match 0 times, then our captures len is |
2544 | | // guaranteed to be zero. Otherwise, if we *can* match the empty |
2545 | | // string, then it's impossible to know how many captures will be |
2546 | | // in the resulting match. |
2547 | 32.7k | if rep.max == Some(0) { |
2548 | 0 | inner.static_explicit_captures_len = Some(0); |
2549 | 32.7k | } else { |
2550 | 32.7k | inner.static_explicit_captures_len = None; |
2551 | 32.7k | } |
2552 | 662k | } |
2553 | 694k | Properties(Box::new(inner)) |
2554 | 694k | } |
2555 | | |
2556 | | /// Create a new set of HIR properties for a capture. |
2557 | 99.7k | fn capture(capture: &Capture) -> Properties { |
2558 | 99.7k | let p = capture.sub.properties(); |
2559 | 99.7k | Properties(Box::new(PropertiesI { |
2560 | 99.7k | explicit_captures_len: p.explicit_captures_len().saturating_add(1), |
2561 | 99.7k | static_explicit_captures_len: p |
2562 | 99.7k | .static_explicit_captures_len() |
2563 | 99.7k | .map(|len| len.saturating_add(1)), |
2564 | 99.7k | literal: false, |
2565 | 99.7k | alternation_literal: false, |
2566 | 99.7k | ..*p.0.clone() |
2567 | 99.7k | })) |
2568 | 99.7k | } |
2569 | | |
2570 | | /// Create a new set of HIR properties for a concatenation. |
2571 | 382k | fn concat(concat: &[Hir]) -> Properties { |
2572 | 382k | // The base case is an empty concatenation, which matches the empty |
2573 | 382k | // string. Note though that empty concatenations aren't possible, |
2574 | 382k | // because the Hir::concat smart constructor rewrites those as |
2575 | 382k | // Hir::empty. |
2576 | 382k | let mut props = PropertiesI { |
2577 | 382k | minimum_len: Some(0), |
2578 | 382k | maximum_len: Some(0), |
2579 | 382k | look_set: LookSet::empty(), |
2580 | 382k | look_set_prefix: LookSet::empty(), |
2581 | 382k | look_set_suffix: LookSet::empty(), |
2582 | 382k | look_set_prefix_any: LookSet::empty(), |
2583 | 382k | look_set_suffix_any: LookSet::empty(), |
2584 | 382k | utf8: true, |
2585 | 382k | explicit_captures_len: 0, |
2586 | 382k | static_explicit_captures_len: Some(0), |
2587 | 382k | literal: true, |
2588 | 382k | alternation_literal: true, |
2589 | 382k | }; |
2590 | | // Handle properties that need to visit every child hir. |
2591 | 6.05M | for x in concat.iter() { |
2592 | 6.05M | let p = x.properties(); |
2593 | 6.05M | props.look_set.set_union(p.look_set()); |
2594 | 6.05M | props.utf8 = props.utf8 && p.is_utf8(); |
2595 | 6.05M | props.explicit_captures_len = props |
2596 | 6.05M | .explicit_captures_len |
2597 | 6.05M | .saturating_add(p.explicit_captures_len()); |
2598 | 6.05M | props.static_explicit_captures_len = p |
2599 | 6.05M | .static_explicit_captures_len() |
2600 | 6.05M | .and_then(|len1| { |
2601 | 6.01M | Some((len1, props.static_explicit_captures_len?)) |
2602 | 6.05M | }) |
2603 | 6.05M | .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); |
2604 | 6.05M | props.literal = props.literal && p.is_literal(); |
2605 | | props.alternation_literal = |
2606 | 6.05M | props.alternation_literal && p.is_alternation_literal(); |
2607 | 6.05M | if let Some(minimum_len) = props.minimum_len { |
2608 | 5.98M | match p.minimum_len() { |
2609 | 7.63k | None => props.minimum_len = None, |
2610 | 5.97M | Some(len) => { |
2611 | 5.97M | // We use saturating arithmetic here because the |
2612 | 5.97M | // minimum is just a lower bound. We can't go any |
2613 | 5.97M | // higher than what our number types permit. |
2614 | 5.97M | props.minimum_len = |
2615 | 5.97M | Some(minimum_len.saturating_add(len)); |
2616 | 5.97M | } |
2617 | | } |
2618 | 70.6k | } |
2619 | 6.05M | if let Some(maximum_len) = props.maximum_len { |
2620 | 5.20M | match p.maximum_len() { |
2621 | 114k | None => props.maximum_len = None, |
2622 | 5.09M | Some(len) => { |
2623 | 5.09M | props.maximum_len = maximum_len.checked_add(len) |
2624 | | } |
2625 | | } |
2626 | 846k | } |
2627 | | } |
2628 | | // Handle the prefix properties, which only requires visiting |
2629 | | // child exprs until one matches more than the empty string. |
2630 | 382k | let mut it = concat.iter(); |
2631 | 461k | while let Some(x) = it.next() { |
2632 | 458k | props.look_set_prefix.set_union(x.properties().look_set_prefix()); |
2633 | 458k | props |
2634 | 458k | .look_set_prefix_any |
2635 | 458k | .set_union(x.properties().look_set_prefix_any()); |
2636 | 458k | if x.properties().maximum_len().map_or(true, |x| x > 0) { |
2637 | 379k | break; |
2638 | 79.3k | } |
2639 | | } |
2640 | | // Same thing for the suffix properties, but in reverse. |
2641 | 382k | let mut it = concat.iter().rev(); |
2642 | 420k | while let Some(x) = it.next() { |
2643 | 417k | props.look_set_suffix.set_union(x.properties().look_set_suffix()); |
2644 | 417k | props |
2645 | 417k | .look_set_suffix_any |
2646 | 417k | .set_union(x.properties().look_set_suffix_any()); |
2647 | 417k | if x.properties().maximum_len().map_or(true, |x| x > 0) { |
2648 | 379k | break; |
2649 | 38.5k | } |
2650 | | } |
2651 | 382k | Properties(Box::new(props)) |
2652 | 382k | } |
2653 | | |
2654 | | /// Create a new set of HIR properties for a concatenation. |
2655 | 65.1k | fn alternation(alts: &[Hir]) -> Properties { |
2656 | 10.9M | Properties::union(alts.iter().map(|hir| hir.properties())) |
2657 | 65.1k | } |
2658 | | } |
2659 | | |
2660 | | /// A set of look-around assertions. |
2661 | | /// |
2662 | | /// This is useful for efficiently tracking look-around assertions. For |
2663 | | /// example, an [`Hir`] provides properties that return `LookSet`s. |
2664 | | #[derive(Clone, Copy, Default, Eq, PartialEq)] |
2665 | | pub struct LookSet { |
2666 | | /// The underlying representation this set is exposed to make it possible |
2667 | | /// to store it somewhere efficiently. The representation is that |
2668 | | /// of a bitset, where each assertion occupies bit `i` where `i = |
2669 | | /// Look::as_repr()`. |
2670 | | /// |
2671 | | /// Note that users of this internal representation must permit the full |
2672 | | /// range of `u16` values to be represented. For example, even if the |
2673 | | /// current implementation only makes use of the 10 least significant bits, |
2674 | | /// it may use more bits in a future semver compatible release. |
2675 | | pub bits: u32, |
2676 | | } |
2677 | | |
2678 | | impl LookSet { |
2679 | | /// Create an empty set of look-around assertions. |
2680 | | #[inline] |
2681 | 193M | pub fn empty() -> LookSet { |
2682 | 193M | LookSet { bits: 0 } |
2683 | 193M | } <regex_syntax::hir::LookSet>::empty Line | Count | Source | 2681 | 388k | pub fn empty() -> LookSet { | 2682 | 388k | LookSet { bits: 0 } | 2683 | 388k | } |
<regex_syntax::hir::LookSet>::empty Line | Count | Source | 2681 | 192M | pub fn empty() -> LookSet { | 2682 | 192M | LookSet { bits: 0 } | 2683 | 192M | } |
|
2684 | | |
2685 | | /// Create a full set of look-around assertions. |
2686 | | /// |
2687 | | /// This set contains all possible look-around assertions. |
2688 | | #[inline] |
2689 | 194k | pub fn full() -> LookSet { |
2690 | 194k | LookSet { bits: !0 } |
2691 | 194k | } <regex_syntax::hir::LookSet>::full Line | Count | Source | 2689 | 129k | pub fn full() -> LookSet { | 2690 | 129k | LookSet { bits: !0 } | 2691 | 129k | } |
<regex_syntax::hir::LookSet>::full Line | Count | Source | 2689 | 65.1k | pub fn full() -> LookSet { | 2690 | 65.1k | LookSet { bits: !0 } | 2691 | 65.1k | } |
|
2692 | | |
2693 | | /// Create a look-around set containing the look-around assertion given. |
2694 | | /// |
2695 | | /// This is a convenience routine for creating an empty set and inserting |
2696 | | /// one look-around assertions. |
2697 | | #[inline] |
2698 | 1.67M | pub fn singleton(look: Look) -> LookSet { |
2699 | 1.67M | LookSet::empty().insert(look) |
2700 | 1.67M | } |
2701 | | |
2702 | | /// Returns the total number of look-around assertions in this set. |
2703 | | #[inline] |
2704 | 131k | pub fn len(self) -> usize { |
2705 | 131k | // OK because max value always fits in a u8, which in turn always |
2706 | 131k | // fits in a usize, regardless of target. |
2707 | 131k | usize::try_from(self.bits.count_ones()).unwrap() |
2708 | 131k | } <regex_syntax::hir::LookSet>::len Line | Count | Source | 2704 | 131k | pub fn len(self) -> usize { | 2705 | 131k | // OK because max value always fits in a u8, which in turn always | 2706 | 131k | // fits in a usize, regardless of target. | 2707 | 131k | usize::try_from(self.bits.count_ones()).unwrap() | 2708 | 131k | } |
Unexecuted instantiation: <regex_syntax::hir::LookSet>::len |
2709 | | |
2710 | | /// Returns true if and only if this set is empty. |
2711 | | #[inline] |
2712 | 131k | pub fn is_empty(self) -> bool { |
2713 | 131k | self.len() == 0 |
2714 | 131k | } <regex_syntax::hir::LookSet>::is_empty Line | Count | Source | 2712 | 131k | pub fn is_empty(self) -> bool { | 2713 | 131k | self.len() == 0 | 2714 | 131k | } |
Unexecuted instantiation: <regex_syntax::hir::LookSet>::is_empty |
2715 | | |
2716 | | /// Returns true if and only if the given look-around assertion is in this |
2717 | | /// set. |
2718 | | #[inline] |
2719 | 1.17M | pub fn contains(self, look: Look) -> bool { |
2720 | 1.17M | self.bits & look.as_repr() != 0 |
2721 | 1.17M | } <regex_syntax::hir::LookSet>::contains Line | Count | Source | 2719 | 56.5k | pub fn contains(self, look: Look) -> bool { | 2720 | 56.5k | self.bits & look.as_repr() != 0 | 2721 | 56.5k | } |
Unexecuted instantiation: <regex_syntax::hir::LookSet>::contains <regex_syntax::hir::LookSet>::contains Line | Count | Source | 2719 | 1.12M | pub fn contains(self, look: Look) -> bool { | 2720 | 1.12M | self.bits & look.as_repr() != 0 | 2721 | 1.12M | } |
Unexecuted instantiation: <regex_syntax::hir::LookSet>::contains |
2722 | | |
2723 | | /// Returns true if and only if this set contains any anchor assertions. |
2724 | | /// This includes both "start/end of haystack" and "start/end of line." |
2725 | | #[inline] |
2726 | 0 | pub fn contains_anchor(&self) -> bool { |
2727 | 0 | self.contains_anchor_haystack() || self.contains_anchor_line() |
2728 | 0 | } |
2729 | | |
2730 | | /// Returns true if and only if this set contains any "start/end of |
2731 | | /// haystack" anchors. This doesn't include "start/end of line" anchors. |
2732 | | #[inline] |
2733 | 0 | pub fn contains_anchor_haystack(&self) -> bool { |
2734 | 0 | self.contains(Look::Start) || self.contains(Look::End) |
2735 | 0 | } |
2736 | | |
2737 | | /// Returns true if and only if this set contains any "start/end of line" |
2738 | | /// anchors. This doesn't include "start/end of haystack" anchors. This |
2739 | | /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. |
2740 | | #[inline] |
2741 | 0 | pub fn contains_anchor_line(&self) -> bool { |
2742 | 0 | self.contains(Look::StartLF) |
2743 | 0 | || self.contains(Look::EndLF) |
2744 | 0 | || self.contains(Look::StartCRLF) |
2745 | 0 | || self.contains(Look::EndCRLF) |
2746 | 0 | } |
2747 | | |
2748 | | /// Returns true if and only if this set contains any "start/end of line" |
2749 | | /// anchors that only treat `\n` as line terminators. This does not include |
2750 | | /// haystack anchors or CRLF aware line anchors. |
2751 | | #[inline] |
2752 | 0 | pub fn contains_anchor_lf(&self) -> bool { |
2753 | 0 | self.contains(Look::StartLF) || self.contains(Look::EndLF) |
2754 | 0 | } |
2755 | | |
2756 | | /// Returns true if and only if this set contains any "start/end of line" |
2757 | | /// anchors that are CRLF-aware. This doesn't include "start/end of |
2758 | | /// haystack" or "start/end of line-feed" anchors. |
2759 | | #[inline] |
2760 | 0 | pub fn contains_anchor_crlf(&self) -> bool { |
2761 | 0 | self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) |
2762 | 0 | } |
2763 | | |
2764 | | /// Returns true if and only if this set contains any word boundary or |
2765 | | /// negated word boundary assertions. This include both Unicode and ASCII |
2766 | | /// word boundaries. |
2767 | | #[inline] |
2768 | 0 | pub fn contains_word(self) -> bool { |
2769 | 0 | self.contains_word_unicode() || self.contains_word_ascii() |
2770 | 0 | } |
2771 | | |
2772 | | /// Returns true if and only if this set contains any Unicode word boundary |
2773 | | /// or negated Unicode word boundary assertions. |
2774 | | #[inline] |
2775 | 82.7k | pub fn contains_word_unicode(self) -> bool { |
2776 | 82.7k | self.contains(Look::WordUnicode) |
2777 | 82.4k | || self.contains(Look::WordUnicodeNegate) |
2778 | 81.5k | || self.contains(Look::WordStartUnicode) |
2779 | 81.4k | || self.contains(Look::WordEndUnicode) |
2780 | 80.3k | || self.contains(Look::WordStartHalfUnicode) |
2781 | 80.3k | || self.contains(Look::WordEndHalfUnicode) |
2782 | 82.7k | } <regex_syntax::hir::LookSet>::contains_word_unicode Line | Count | Source | 2775 | 82.7k | pub fn contains_word_unicode(self) -> bool { | 2776 | 82.7k | self.contains(Look::WordUnicode) | 2777 | 82.4k | || self.contains(Look::WordUnicodeNegate) | 2778 | 81.5k | || self.contains(Look::WordStartUnicode) | 2779 | 81.4k | || self.contains(Look::WordEndUnicode) | 2780 | 80.3k | || self.contains(Look::WordStartHalfUnicode) | 2781 | 80.3k | || self.contains(Look::WordEndHalfUnicode) | 2782 | 82.7k | } |
Unexecuted instantiation: <regex_syntax::hir::LookSet>::contains_word_unicode |
2783 | | |
2784 | | /// Returns true if and only if this set contains any ASCII word boundary |
2785 | | /// or negated ASCII word boundary assertions. |
2786 | | #[inline] |
2787 | 0 | pub fn contains_word_ascii(self) -> bool { |
2788 | 0 | self.contains(Look::WordAscii) |
2789 | 0 | || self.contains(Look::WordAsciiNegate) |
2790 | 0 | || self.contains(Look::WordStartAscii) |
2791 | 0 | || self.contains(Look::WordEndAscii) |
2792 | 0 | || self.contains(Look::WordStartHalfAscii) |
2793 | 0 | || self.contains(Look::WordEndHalfAscii) |
2794 | 0 | } |
2795 | | |
2796 | | /// Returns an iterator over all of the look-around assertions in this set. |
2797 | | #[inline] |
2798 | 0 | pub fn iter(self) -> LookSetIter { |
2799 | 0 | LookSetIter { set: self } |
2800 | 0 | } |
2801 | | |
2802 | | /// Return a new set that is equivalent to the original, but with the given |
2803 | | /// assertion added to it. If the assertion is already in the set, then the |
2804 | | /// returned set is equivalent to the original. |
2805 | | #[inline] |
2806 | 1.67M | pub fn insert(self, look: Look) -> LookSet { |
2807 | 1.67M | LookSet { bits: self.bits | look.as_repr() } |
2808 | 1.67M | } |
2809 | | |
2810 | | /// Updates this set in place with the result of inserting the given |
2811 | | /// assertion into this set. |
2812 | | #[inline] |
2813 | 0 | pub fn set_insert(&mut self, look: Look) { |
2814 | 0 | *self = self.insert(look); |
2815 | 0 | } |
2816 | | |
2817 | | /// Return a new set that is equivalent to the original, but with the given |
2818 | | /// assertion removed from it. If the assertion is not in the set, then the |
2819 | | /// returned set is equivalent to the original. |
2820 | | #[inline] |
2821 | 0 | pub fn remove(self, look: Look) -> LookSet { |
2822 | 0 | LookSet { bits: self.bits & !look.as_repr() } |
2823 | 0 | } |
2824 | | |
2825 | | /// Updates this set in place with the result of removing the given |
2826 | | /// assertion from this set. |
2827 | | #[inline] |
2828 | 0 | pub fn set_remove(&mut self, look: Look) { |
2829 | 0 | *self = self.remove(look); |
2830 | 0 | } |
2831 | | |
2832 | | /// Returns a new set that is the result of subtracting the given set from |
2833 | | /// this set. |
2834 | | #[inline] |
2835 | 0 | pub fn subtract(self, other: LookSet) -> LookSet { |
2836 | 0 | LookSet { bits: self.bits & !other.bits } |
2837 | 0 | } |
2838 | | |
2839 | | /// Updates this set in place with the result of subtracting the given set |
2840 | | /// from this set. |
2841 | | #[inline] |
2842 | 0 | pub fn set_subtract(&mut self, other: LookSet) { |
2843 | 0 | *self = self.subtract(other); |
2844 | 0 | } |
2845 | | |
2846 | | /// Returns a new set that is the union of this and the one given. |
2847 | | #[inline] |
2848 | 41.0M | pub fn union(self, other: LookSet) -> LookSet { |
2849 | 41.0M | LookSet { bits: self.bits | other.bits } |
2850 | 41.0M | } <regex_syntax::hir::LookSet>::union Line | Count | Source | 2848 | 388k | pub fn union(self, other: LookSet) -> LookSet { | 2849 | 388k | LookSet { bits: self.bits | other.bits } | 2850 | 388k | } |
<regex_syntax::hir::LookSet>::union Line | Count | Source | 2848 | 40.7M | pub fn union(self, other: LookSet) -> LookSet { | 2849 | 40.7M | LookSet { bits: self.bits | other.bits } | 2850 | 40.7M | } |
|
2851 | | |
2852 | | /// Updates this set in place with the result of unioning it with the one |
2853 | | /// given. |
2854 | | #[inline] |
2855 | 41.0M | pub fn set_union(&mut self, other: LookSet) { |
2856 | 41.0M | *self = self.union(other); |
2857 | 41.0M | } <regex_syntax::hir::LookSet>::set_union Line | Count | Source | 2855 | 388k | pub fn set_union(&mut self, other: LookSet) { | 2856 | 388k | *self = self.union(other); | 2857 | 388k | } |
<regex_syntax::hir::LookSet>::set_union Line | Count | Source | 2855 | 40.7M | pub fn set_union(&mut self, other: LookSet) { | 2856 | 40.7M | *self = self.union(other); | 2857 | 40.7M | } |
|
2858 | | |
2859 | | /// Returns a new set that is the intersection of this and the one given. |
2860 | | #[inline] |
2861 | 22.1M | pub fn intersect(self, other: LookSet) -> LookSet { |
2862 | 22.1M | LookSet { bits: self.bits & other.bits } |
2863 | 22.1M | } <regex_syntax::hir::LookSet>::intersect Line | Count | Source | 2861 | 259k | pub fn intersect(self, other: LookSet) -> LookSet { | 2862 | 259k | LookSet { bits: self.bits & other.bits } | 2863 | 259k | } |
<regex_syntax::hir::LookSet>::intersect Line | Count | Source | 2861 | 21.9M | pub fn intersect(self, other: LookSet) -> LookSet { | 2862 | 21.9M | LookSet { bits: self.bits & other.bits } | 2863 | 21.9M | } |
|
2864 | | |
2865 | | /// Updates this set in place with the result of intersecting it with the |
2866 | | /// one given. |
2867 | | #[inline] |
2868 | 22.1M | pub fn set_intersect(&mut self, other: LookSet) { |
2869 | 22.1M | *self = self.intersect(other); |
2870 | 22.1M | } <regex_syntax::hir::LookSet>::set_intersect Line | Count | Source | 2868 | 259k | pub fn set_intersect(&mut self, other: LookSet) { | 2869 | 259k | *self = self.intersect(other); | 2870 | 259k | } |
<regex_syntax::hir::LookSet>::set_intersect Line | Count | Source | 2868 | 21.9M | pub fn set_intersect(&mut self, other: LookSet) { | 2869 | 21.9M | *self = self.intersect(other); | 2870 | 21.9M | } |
|
2871 | | |
2872 | | /// Return a `LookSet` from the slice given as a native endian 32-bit |
2873 | | /// integer. |
2874 | | /// |
2875 | | /// # Panics |
2876 | | /// |
2877 | | /// This panics if `slice.len() < 4`. |
2878 | | #[inline] |
2879 | 0 | pub fn read_repr(slice: &[u8]) -> LookSet { |
2880 | 0 | let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); |
2881 | 0 | LookSet { bits } |
2882 | 0 | } |
2883 | | |
2884 | | /// Write a `LookSet` as a native endian 32-bit integer to the beginning |
2885 | | /// of the slice given. |
2886 | | /// |
2887 | | /// # Panics |
2888 | | /// |
2889 | | /// This panics if `slice.len() < 4`. |
2890 | | #[inline] |
2891 | 0 | pub fn write_repr(self, slice: &mut [u8]) { |
2892 | 0 | let raw = self.bits.to_ne_bytes(); |
2893 | 0 | slice[0] = raw[0]; |
2894 | 0 | slice[1] = raw[1]; |
2895 | 0 | slice[2] = raw[2]; |
2896 | 0 | slice[3] = raw[3]; |
2897 | 0 | } |
2898 | | } |
2899 | | |
2900 | | impl core::fmt::Debug for LookSet { |
2901 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
2902 | 0 | if self.is_empty() { |
2903 | 0 | return write!(f, "∅"); |
2904 | 0 | } |
2905 | 0 | for look in self.iter() { |
2906 | 0 | write!(f, "{}", look.as_char())?; |
2907 | | } |
2908 | 0 | Ok(()) |
2909 | 0 | } |
2910 | | } |
2911 | | |
2912 | | /// An iterator over all look-around assertions in a [`LookSet`]. |
2913 | | /// |
2914 | | /// This iterator is created by [`LookSet::iter`]. |
2915 | | #[derive(Clone, Debug)] |
2916 | | pub struct LookSetIter { |
2917 | | set: LookSet, |
2918 | | } |
2919 | | |
2920 | | impl Iterator for LookSetIter { |
2921 | | type Item = Look; |
2922 | | |
2923 | | #[inline] |
2924 | 0 | fn next(&mut self) -> Option<Look> { |
2925 | 0 | if self.set.is_empty() { |
2926 | 0 | return None; |
2927 | 0 | } |
2928 | 0 | // We'll never have more than u8::MAX distinct look-around assertions, |
2929 | 0 | // so 'bit' will always fit into a u16. |
2930 | 0 | let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); |
2931 | 0 | let look = Look::from_repr(1 << bit)?; |
2932 | 0 | self.set = self.set.remove(look); |
2933 | 0 | Some(look) |
2934 | 0 | } |
2935 | | } |
2936 | | |
2937 | | /// Given a sequence of HIR values where each value corresponds to a Unicode |
2938 | | /// class (or an all-ASCII byte class), return a single Unicode class |
2939 | | /// corresponding to the union of the classes found. |
2940 | 66.9k | fn class_chars(hirs: &[Hir]) -> Option<Class> { |
2941 | 66.9k | let mut cls = ClassUnicode::new(vec![]); |
2942 | 75.7k | for hir in hirs.iter() { |
2943 | 75.7k | match *hir.kind() { |
2944 | 6.70k | HirKind::Class(Class::Unicode(ref cls2)) => { |
2945 | 6.70k | cls.union(cls2); |
2946 | 6.70k | } |
2947 | 2.11k | HirKind::Class(Class::Bytes(ref cls2)) => { |
2948 | 2.11k | cls.union(&cls2.to_unicode_class()?); |
2949 | | } |
2950 | 66.8k | _ => return None, |
2951 | | }; |
2952 | | } |
2953 | 78 | Some(Class::Unicode(cls)) |
2954 | 66.9k | } |
2955 | | |
2956 | | /// Given a sequence of HIR values where each value corresponds to a byte class |
2957 | | /// (or an all-ASCII Unicode class), return a single byte class corresponding |
2958 | | /// to the union of the classes found. |
2959 | 66.8k | fn class_bytes(hirs: &[Hir]) -> Option<Class> { |
2960 | 66.8k | let mut cls = ClassBytes::new(vec![]); |
2961 | 74.2k | for hir in hirs.iter() { |
2962 | 74.2k | match *hir.kind() { |
2963 | 5.86k | HirKind::Class(Class::Unicode(ref cls2)) => { |
2964 | 5.86k | cls.union(&cls2.to_byte_class()?); |
2965 | | } |
2966 | 2.09k | HirKind::Class(Class::Bytes(ref cls2)) => { |
2967 | 2.09k | cls.union(cls2); |
2968 | 2.09k | } |
2969 | 66.3k | _ => return None, |
2970 | | }; |
2971 | | } |
2972 | 0 | Some(Class::Bytes(cls)) |
2973 | 66.8k | } |
2974 | | |
2975 | | /// Given a sequence of HIR values where each value corresponds to a literal |
2976 | | /// that is a single `char`, return that sequence of `char`s. Otherwise return |
2977 | | /// None. No deduplication is done. |
2978 | 70.6k | fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> { |
2979 | 70.6k | let mut singletons = vec![]; |
2980 | 664k | for hir in hirs.iter() { |
2981 | 664k | let literal = match *hir.kind() { |
2982 | 617k | HirKind::Literal(Literal(ref bytes)) => bytes, |
2983 | 46.6k | _ => return None, |
2984 | | }; |
2985 | 617k | let ch = match crate::debug::utf8_decode(literal) { |
2986 | 0 | None => return None, |
2987 | 0 | Some(Err(_)) => return None, |
2988 | 617k | Some(Ok(ch)) => ch, |
2989 | 617k | }; |
2990 | 617k | if literal.len() != ch.len_utf8() { |
2991 | 20.3k | return None; |
2992 | 597k | } |
2993 | 597k | singletons.push(ch); |
2994 | | } |
2995 | 3.65k | Some(singletons) |
2996 | 70.6k | } |
2997 | | |
2998 | | /// Given a sequence of HIR values where each value corresponds to a literal |
2999 | | /// that is a single byte, return that sequence of bytes. Otherwise return |
3000 | | /// None. No deduplication is done. |
3001 | 66.9k | fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> { |
3002 | 66.9k | let mut singletons = vec![]; |
3003 | 194k | for hir in hirs.iter() { |
3004 | 194k | let literal = match *hir.kind() { |
3005 | 148k | HirKind::Literal(Literal(ref bytes)) => bytes, |
3006 | 46.2k | _ => return None, |
3007 | | }; |
3008 | 148k | if literal.len() != 1 { |
3009 | 20.7k | return None; |
3010 | 127k | } |
3011 | 127k | singletons.push(literal[0]); |
3012 | | } |
3013 | 0 | Some(singletons) |
3014 | 66.9k | } |
3015 | | |
3016 | | /// Looks for a common prefix in the list of alternation branches given. If one |
3017 | | /// is found, then an equivalent but (hopefully) simplified Hir is returned. |
3018 | | /// Otherwise, the original given list of branches is returned unmodified. |
3019 | | /// |
3020 | | /// This is not quite as good as it could be. Right now, it requires that |
3021 | | /// all branches are 'Concat' expressions. It also doesn't do well with |
3022 | | /// literals. For example, given 'foofoo|foobar', it will not refactor it to |
3023 | | /// 'foo(?:foo|bar)' because literals are flattened into their own special |
3024 | | /// concatenation. (One wonders if perhaps 'Literal' should be a single atom |
3025 | | /// instead of a string of bytes because of this. Otherwise, handling the |
3026 | | /// current representation in this routine will be pretty gnarly. Sigh.) |
3027 | 66.8k | fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> { |
3028 | 66.8k | if hirs.len() <= 1 { |
3029 | 0 | return Err(hirs); |
3030 | 66.8k | } |
3031 | 66.8k | let mut prefix = match hirs[0].kind() { |
3032 | 18.0k | HirKind::Concat(ref xs) => &**xs, |
3033 | 48.8k | _ => return Err(hirs), |
3034 | | }; |
3035 | 18.0k | if prefix.is_empty() { |
3036 | 0 | return Err(hirs); |
3037 | 18.0k | } |
3038 | 25.3k | for h in hirs.iter().skip(1) { |
3039 | 25.3k | let concat = match h.kind() { |
3040 | 20.2k | HirKind::Concat(ref xs) => xs, |
3041 | 5.11k | _ => return Err(hirs), |
3042 | | }; |
3043 | 20.2k | let common_len = prefix |
3044 | 20.2k | .iter() |
3045 | 20.2k | .zip(concat.iter()) |
3046 | 29.5k | .take_while(|(x, y)| x == y) |
3047 | 20.2k | .count(); |
3048 | 20.2k | prefix = &prefix[..common_len]; |
3049 | 20.2k | if prefix.is_empty() { |
3050 | 11.1k | return Err(hirs); |
3051 | 9.07k | } |
3052 | | } |
3053 | 1.71k | let len = prefix.len(); |
3054 | 1.71k | assert_ne!(0, len); |
3055 | 1.71k | let mut prefix_concat = vec![]; |
3056 | 1.71k | let mut suffix_alts = vec![]; |
3057 | 9.19k | for h in hirs { |
3058 | 7.48k | let mut concat = match h.into_kind() { |
3059 | 7.48k | HirKind::Concat(xs) => xs, |
3060 | | // We required all sub-expressions to be |
3061 | | // concats above, so we're only here if we |
3062 | | // have a concat. |
3063 | 0 | _ => unreachable!(), |
3064 | | }; |
3065 | 7.48k | suffix_alts.push(Hir::concat(concat.split_off(len))); |
3066 | 7.48k | if prefix_concat.is_empty() { |
3067 | 1.71k | prefix_concat = concat; |
3068 | 5.77k | } |
3069 | | } |
3070 | 1.71k | let mut concat = prefix_concat; |
3071 | 1.71k | concat.push(Hir::alternation(suffix_alts)); |
3072 | 1.71k | Ok(Hir::concat(concat)) |
3073 | 66.8k | } |
3074 | | |
3075 | | #[cfg(test)] |
3076 | | mod tests { |
3077 | | use super::*; |
3078 | | |
3079 | | fn uclass(ranges: &[(char, char)]) -> ClassUnicode { |
3080 | | let ranges: Vec<ClassUnicodeRange> = ranges |
3081 | | .iter() |
3082 | | .map(|&(s, e)| ClassUnicodeRange::new(s, e)) |
3083 | | .collect(); |
3084 | | ClassUnicode::new(ranges) |
3085 | | } |
3086 | | |
3087 | | fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { |
3088 | | let ranges: Vec<ClassBytesRange> = |
3089 | | ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); |
3090 | | ClassBytes::new(ranges) |
3091 | | } |
3092 | | |
3093 | | fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { |
3094 | | cls.iter().map(|x| (x.start(), x.end())).collect() |
3095 | | } |
3096 | | |
3097 | | #[cfg(feature = "unicode-case")] |
3098 | | fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { |
3099 | | let mut cls_ = cls.clone(); |
3100 | | cls_.case_fold_simple(); |
3101 | | cls_ |
3102 | | } |
3103 | | |
3104 | | fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { |
3105 | | let mut cls_ = cls1.clone(); |
3106 | | cls_.union(cls2); |
3107 | | cls_ |
3108 | | } |
3109 | | |
3110 | | fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { |
3111 | | let mut cls_ = cls1.clone(); |
3112 | | cls_.intersect(cls2); |
3113 | | cls_ |
3114 | | } |
3115 | | |
3116 | | fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { |
3117 | | let mut cls_ = cls1.clone(); |
3118 | | cls_.difference(cls2); |
3119 | | cls_ |
3120 | | } |
3121 | | |
3122 | | fn usymdifference( |
3123 | | cls1: &ClassUnicode, |
3124 | | cls2: &ClassUnicode, |
3125 | | ) -> ClassUnicode { |
3126 | | let mut cls_ = cls1.clone(); |
3127 | | cls_.symmetric_difference(cls2); |
3128 | | cls_ |
3129 | | } |
3130 | | |
3131 | | fn unegate(cls: &ClassUnicode) -> ClassUnicode { |
3132 | | let mut cls_ = cls.clone(); |
3133 | | cls_.negate(); |
3134 | | cls_ |
3135 | | } |
3136 | | |
3137 | | fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { |
3138 | | cls.iter().map(|x| (x.start(), x.end())).collect() |
3139 | | } |
3140 | | |
3141 | | fn bcasefold(cls: &ClassBytes) -> ClassBytes { |
3142 | | let mut cls_ = cls.clone(); |
3143 | | cls_.case_fold_simple(); |
3144 | | cls_ |
3145 | | } |
3146 | | |
3147 | | fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
3148 | | let mut cls_ = cls1.clone(); |
3149 | | cls_.union(cls2); |
3150 | | cls_ |
3151 | | } |
3152 | | |
3153 | | fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
3154 | | let mut cls_ = cls1.clone(); |
3155 | | cls_.intersect(cls2); |
3156 | | cls_ |
3157 | | } |
3158 | | |
3159 | | fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
3160 | | let mut cls_ = cls1.clone(); |
3161 | | cls_.difference(cls2); |
3162 | | cls_ |
3163 | | } |
3164 | | |
3165 | | fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { |
3166 | | let mut cls_ = cls1.clone(); |
3167 | | cls_.symmetric_difference(cls2); |
3168 | | cls_ |
3169 | | } |
3170 | | |
3171 | | fn bnegate(cls: &ClassBytes) -> ClassBytes { |
3172 | | let mut cls_ = cls.clone(); |
3173 | | cls_.negate(); |
3174 | | cls_ |
3175 | | } |
3176 | | |
3177 | | #[test] |
3178 | | fn class_range_canonical_unicode() { |
3179 | | let range = ClassUnicodeRange::new('\u{00FF}', '\0'); |
3180 | | assert_eq!('\0', range.start()); |
3181 | | assert_eq!('\u{00FF}', range.end()); |
3182 | | } |
3183 | | |
3184 | | #[test] |
3185 | | fn class_range_canonical_bytes() { |
3186 | | let range = ClassBytesRange::new(b'\xFF', b'\0'); |
3187 | | assert_eq!(b'\0', range.start()); |
3188 | | assert_eq!(b'\xFF', range.end()); |
3189 | | } |
3190 | | |
3191 | | #[test] |
3192 | | fn class_canonicalize_unicode() { |
3193 | | let cls = uclass(&[('a', 'c'), ('x', 'z')]); |
3194 | | let expected = vec![('a', 'c'), ('x', 'z')]; |
3195 | | assert_eq!(expected, uranges(&cls)); |
3196 | | |
3197 | | let cls = uclass(&[('x', 'z'), ('a', 'c')]); |
3198 | | let expected = vec![('a', 'c'), ('x', 'z')]; |
3199 | | assert_eq!(expected, uranges(&cls)); |
3200 | | |
3201 | | let cls = uclass(&[('x', 'z'), ('w', 'y')]); |
3202 | | let expected = vec![('w', 'z')]; |
3203 | | assert_eq!(expected, uranges(&cls)); |
3204 | | |
3205 | | let cls = uclass(&[ |
3206 | | ('c', 'f'), |
3207 | | ('a', 'g'), |
3208 | | ('d', 'j'), |
3209 | | ('a', 'c'), |
3210 | | ('m', 'p'), |
3211 | | ('l', 's'), |
3212 | | ]); |
3213 | | let expected = vec![('a', 'j'), ('l', 's')]; |
3214 | | assert_eq!(expected, uranges(&cls)); |
3215 | | |
3216 | | let cls = uclass(&[('x', 'z'), ('u', 'w')]); |
3217 | | let expected = vec![('u', 'z')]; |
3218 | | assert_eq!(expected, uranges(&cls)); |
3219 | | |
3220 | | let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); |
3221 | | let expected = vec![('\x00', '\u{10FFFF}')]; |
3222 | | assert_eq!(expected, uranges(&cls)); |
3223 | | |
3224 | | let cls = uclass(&[('a', 'a'), ('b', 'b')]); |
3225 | | let expected = vec![('a', 'b')]; |
3226 | | assert_eq!(expected, uranges(&cls)); |
3227 | | } |
3228 | | |
3229 | | #[test] |
3230 | | fn class_canonicalize_bytes() { |
3231 | | let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); |
3232 | | let expected = vec![(b'a', b'c'), (b'x', b'z')]; |
3233 | | assert_eq!(expected, branges(&cls)); |
3234 | | |
3235 | | let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); |
3236 | | let expected = vec![(b'a', b'c'), (b'x', b'z')]; |
3237 | | assert_eq!(expected, branges(&cls)); |
3238 | | |
3239 | | let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); |
3240 | | let expected = vec![(b'w', b'z')]; |
3241 | | assert_eq!(expected, branges(&cls)); |
3242 | | |
3243 | | let cls = bclass(&[ |
3244 | | (b'c', b'f'), |
3245 | | (b'a', b'g'), |
3246 | | (b'd', b'j'), |
3247 | | (b'a', b'c'), |
3248 | | (b'm', b'p'), |
3249 | | (b'l', b's'), |
3250 | | ]); |
3251 | | let expected = vec![(b'a', b'j'), (b'l', b's')]; |
3252 | | assert_eq!(expected, branges(&cls)); |
3253 | | |
3254 | | let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); |
3255 | | let expected = vec![(b'u', b'z')]; |
3256 | | assert_eq!(expected, branges(&cls)); |
3257 | | |
3258 | | let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); |
3259 | | let expected = vec![(b'\x00', b'\xFF')]; |
3260 | | assert_eq!(expected, branges(&cls)); |
3261 | | |
3262 | | let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); |
3263 | | let expected = vec![(b'a', b'b')]; |
3264 | | assert_eq!(expected, branges(&cls)); |
3265 | | } |
3266 | | |
3267 | | #[test] |
3268 | | #[cfg(feature = "unicode-case")] |
3269 | | fn class_case_fold_unicode() { |
3270 | | let cls = uclass(&[ |
3271 | | ('C', 'F'), |
3272 | | ('A', 'G'), |
3273 | | ('D', 'J'), |
3274 | | ('A', 'C'), |
3275 | | ('M', 'P'), |
3276 | | ('L', 'S'), |
3277 | | ('c', 'f'), |
3278 | | ]); |
3279 | | let expected = uclass(&[ |
3280 | | ('A', 'J'), |
3281 | | ('L', 'S'), |
3282 | | ('a', 'j'), |
3283 | | ('l', 's'), |
3284 | | ('\u{17F}', '\u{17F}'), |
3285 | | ]); |
3286 | | assert_eq!(expected, ucasefold(&cls)); |
3287 | | |
3288 | | let cls = uclass(&[('A', 'Z')]); |
3289 | | let expected = uclass(&[ |
3290 | | ('A', 'Z'), |
3291 | | ('a', 'z'), |
3292 | | ('\u{17F}', '\u{17F}'), |
3293 | | ('\u{212A}', '\u{212A}'), |
3294 | | ]); |
3295 | | assert_eq!(expected, ucasefold(&cls)); |
3296 | | |
3297 | | let cls = uclass(&[('a', 'z')]); |
3298 | | let expected = uclass(&[ |
3299 | | ('A', 'Z'), |
3300 | | ('a', 'z'), |
3301 | | ('\u{17F}', '\u{17F}'), |
3302 | | ('\u{212A}', '\u{212A}'), |
3303 | | ]); |
3304 | | assert_eq!(expected, ucasefold(&cls)); |
3305 | | |
3306 | | let cls = uclass(&[('A', 'A'), ('_', '_')]); |
3307 | | let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); |
3308 | | assert_eq!(expected, ucasefold(&cls)); |
3309 | | |
3310 | | let cls = uclass(&[('A', 'A'), ('=', '=')]); |
3311 | | let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); |
3312 | | assert_eq!(expected, ucasefold(&cls)); |
3313 | | |
3314 | | let cls = uclass(&[('\x00', '\x10')]); |
3315 | | assert_eq!(cls, ucasefold(&cls)); |
3316 | | |
3317 | | let cls = uclass(&[('k', 'k')]); |
3318 | | let expected = |
3319 | | uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); |
3320 | | assert_eq!(expected, ucasefold(&cls)); |
3321 | | |
3322 | | let cls = uclass(&[('@', '@')]); |
3323 | | assert_eq!(cls, ucasefold(&cls)); |
3324 | | } |
3325 | | |
3326 | | #[test] |
3327 | | #[cfg(not(feature = "unicode-case"))] |
3328 | | fn class_case_fold_unicode_disabled() { |
3329 | | let mut cls = uclass(&[ |
3330 | | ('C', 'F'), |
3331 | | ('A', 'G'), |
3332 | | ('D', 'J'), |
3333 | | ('A', 'C'), |
3334 | | ('M', 'P'), |
3335 | | ('L', 'S'), |
3336 | | ('c', 'f'), |
3337 | | ]); |
3338 | | assert!(cls.try_case_fold_simple().is_err()); |
3339 | | } |
3340 | | |
3341 | | #[test] |
3342 | | #[should_panic] |
3343 | | #[cfg(not(feature = "unicode-case"))] |
3344 | | fn class_case_fold_unicode_disabled_panics() { |
3345 | | let mut cls = uclass(&[ |
3346 | | ('C', 'F'), |
3347 | | ('A', 'G'), |
3348 | | ('D', 'J'), |
3349 | | ('A', 'C'), |
3350 | | ('M', 'P'), |
3351 | | ('L', 'S'), |
3352 | | ('c', 'f'), |
3353 | | ]); |
3354 | | cls.case_fold_simple(); |
3355 | | } |
3356 | | |
3357 | | #[test] |
3358 | | fn class_case_fold_bytes() { |
3359 | | let cls = bclass(&[ |
3360 | | (b'C', b'F'), |
3361 | | (b'A', b'G'), |
3362 | | (b'D', b'J'), |
3363 | | (b'A', b'C'), |
3364 | | (b'M', b'P'), |
3365 | | (b'L', b'S'), |
3366 | | (b'c', b'f'), |
3367 | | ]); |
3368 | | let expected = |
3369 | | bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); |
3370 | | assert_eq!(expected, bcasefold(&cls)); |
3371 | | |
3372 | | let cls = bclass(&[(b'A', b'Z')]); |
3373 | | let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); |
3374 | | assert_eq!(expected, bcasefold(&cls)); |
3375 | | |
3376 | | let cls = bclass(&[(b'a', b'z')]); |
3377 | | let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); |
3378 | | assert_eq!(expected, bcasefold(&cls)); |
3379 | | |
3380 | | let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); |
3381 | | let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); |
3382 | | assert_eq!(expected, bcasefold(&cls)); |
3383 | | |
3384 | | let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); |
3385 | | let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); |
3386 | | assert_eq!(expected, bcasefold(&cls)); |
3387 | | |
3388 | | let cls = bclass(&[(b'\x00', b'\x10')]); |
3389 | | assert_eq!(cls, bcasefold(&cls)); |
3390 | | |
3391 | | let cls = bclass(&[(b'k', b'k')]); |
3392 | | let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); |
3393 | | assert_eq!(expected, bcasefold(&cls)); |
3394 | | |
3395 | | let cls = bclass(&[(b'@', b'@')]); |
3396 | | assert_eq!(cls, bcasefold(&cls)); |
3397 | | } |
3398 | | |
3399 | | #[test] |
3400 | | fn class_negate_unicode() { |
3401 | | let cls = uclass(&[('a', 'a')]); |
3402 | | let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); |
3403 | | assert_eq!(expected, unegate(&cls)); |
3404 | | |
3405 | | let cls = uclass(&[('a', 'a'), ('b', 'b')]); |
3406 | | let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); |
3407 | | assert_eq!(expected, unegate(&cls)); |
3408 | | |
3409 | | let cls = uclass(&[('a', 'c'), ('x', 'z')]); |
3410 | | let expected = uclass(&[ |
3411 | | ('\x00', '\x60'), |
3412 | | ('\x64', '\x77'), |
3413 | | ('\x7B', '\u{10FFFF}'), |
3414 | | ]); |
3415 | | assert_eq!(expected, unegate(&cls)); |
3416 | | |
3417 | | let cls = uclass(&[('\x00', 'a')]); |
3418 | | let expected = uclass(&[('\x62', '\u{10FFFF}')]); |
3419 | | assert_eq!(expected, unegate(&cls)); |
3420 | | |
3421 | | let cls = uclass(&[('a', '\u{10FFFF}')]); |
3422 | | let expected = uclass(&[('\x00', '\x60')]); |
3423 | | assert_eq!(expected, unegate(&cls)); |
3424 | | |
3425 | | let cls = uclass(&[('\x00', '\u{10FFFF}')]); |
3426 | | let expected = uclass(&[]); |
3427 | | assert_eq!(expected, unegate(&cls)); |
3428 | | |
3429 | | let cls = uclass(&[]); |
3430 | | let expected = uclass(&[('\x00', '\u{10FFFF}')]); |
3431 | | assert_eq!(expected, unegate(&cls)); |
3432 | | |
3433 | | let cls = |
3434 | | uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); |
3435 | | let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); |
3436 | | assert_eq!(expected, unegate(&cls)); |
3437 | | |
3438 | | let cls = uclass(&[('\x00', '\u{D7FF}')]); |
3439 | | let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); |
3440 | | assert_eq!(expected, unegate(&cls)); |
3441 | | |
3442 | | let cls = uclass(&[('\x00', '\u{D7FE}')]); |
3443 | | let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); |
3444 | | assert_eq!(expected, unegate(&cls)); |
3445 | | |
3446 | | let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); |
3447 | | let expected = uclass(&[('\x00', '\u{D7FF}')]); |
3448 | | assert_eq!(expected, unegate(&cls)); |
3449 | | |
3450 | | let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); |
3451 | | let expected = uclass(&[('\x00', '\u{E000}')]); |
3452 | | assert_eq!(expected, unegate(&cls)); |
3453 | | } |
3454 | | |
3455 | | #[test] |
3456 | | fn class_negate_bytes() { |
3457 | | let cls = bclass(&[(b'a', b'a')]); |
3458 | | let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); |
3459 | | assert_eq!(expected, bnegate(&cls)); |
3460 | | |
3461 | | let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); |
3462 | | let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); |
3463 | | assert_eq!(expected, bnegate(&cls)); |
3464 | | |
3465 | | let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); |
3466 | | let expected = bclass(&[ |
3467 | | (b'\x00', b'\x60'), |
3468 | | (b'\x64', b'\x77'), |
3469 | | (b'\x7B', b'\xFF'), |
3470 | | ]); |
3471 | | assert_eq!(expected, bnegate(&cls)); |
3472 | | |
3473 | | let cls = bclass(&[(b'\x00', b'a')]); |
3474 | | let expected = bclass(&[(b'\x62', b'\xFF')]); |
3475 | | assert_eq!(expected, bnegate(&cls)); |
3476 | | |
3477 | | let cls = bclass(&[(b'a', b'\xFF')]); |
3478 | | let expected = bclass(&[(b'\x00', b'\x60')]); |
3479 | | assert_eq!(expected, bnegate(&cls)); |
3480 | | |
3481 | | let cls = bclass(&[(b'\x00', b'\xFF')]); |
3482 | | let expected = bclass(&[]); |
3483 | | assert_eq!(expected, bnegate(&cls)); |
3484 | | |
3485 | | let cls = bclass(&[]); |
3486 | | let expected = bclass(&[(b'\x00', b'\xFF')]); |
3487 | | assert_eq!(expected, bnegate(&cls)); |
3488 | | |
3489 | | let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); |
3490 | | let expected = bclass(&[(b'\xFE', b'\xFE')]); |
3491 | | assert_eq!(expected, bnegate(&cls)); |
3492 | | } |
3493 | | |
3494 | | #[test] |
3495 | | fn class_union_unicode() { |
3496 | | let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); |
3497 | | let cls2 = uclass(&[('a', 'z')]); |
3498 | | let expected = uclass(&[('a', 'z'), ('A', 'C')]); |
3499 | | assert_eq!(expected, uunion(&cls1, &cls2)); |
3500 | | } |
3501 | | |
3502 | | #[test] |
3503 | | fn class_union_bytes() { |
3504 | | let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); |
3505 | | let cls2 = bclass(&[(b'a', b'z')]); |
3506 | | let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); |
3507 | | assert_eq!(expected, bunion(&cls1, &cls2)); |
3508 | | } |
3509 | | |
3510 | | #[test] |
3511 | | fn class_intersect_unicode() { |
3512 | | let cls1 = uclass(&[]); |
3513 | | let cls2 = uclass(&[('a', 'a')]); |
3514 | | let expected = uclass(&[]); |
3515 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3516 | | |
3517 | | let cls1 = uclass(&[('a', 'a')]); |
3518 | | let cls2 = uclass(&[('a', 'a')]); |
3519 | | let expected = uclass(&[('a', 'a')]); |
3520 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3521 | | |
3522 | | let cls1 = uclass(&[('a', 'a')]); |
3523 | | let cls2 = uclass(&[('b', 'b')]); |
3524 | | let expected = uclass(&[]); |
3525 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3526 | | |
3527 | | let cls1 = uclass(&[('a', 'a')]); |
3528 | | let cls2 = uclass(&[('a', 'c')]); |
3529 | | let expected = uclass(&[('a', 'a')]); |
3530 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3531 | | |
3532 | | let cls1 = uclass(&[('a', 'b')]); |
3533 | | let cls2 = uclass(&[('a', 'c')]); |
3534 | | let expected = uclass(&[('a', 'b')]); |
3535 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3536 | | |
3537 | | let cls1 = uclass(&[('a', 'b')]); |
3538 | | let cls2 = uclass(&[('b', 'c')]); |
3539 | | let expected = uclass(&[('b', 'b')]); |
3540 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3541 | | |
3542 | | let cls1 = uclass(&[('a', 'b')]); |
3543 | | let cls2 = uclass(&[('c', 'd')]); |
3544 | | let expected = uclass(&[]); |
3545 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3546 | | |
3547 | | let cls1 = uclass(&[('b', 'c')]); |
3548 | | let cls2 = uclass(&[('a', 'd')]); |
3549 | | let expected = uclass(&[('b', 'c')]); |
3550 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3551 | | |
3552 | | let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); |
3553 | | let cls2 = uclass(&[('a', 'h')]); |
3554 | | let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); |
3555 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3556 | | |
3557 | | let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); |
3558 | | let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); |
3559 | | let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); |
3560 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3561 | | |
3562 | | let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); |
3563 | | let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); |
3564 | | let expected = uclass(&[]); |
3565 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3566 | | |
3567 | | let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); |
3568 | | let cls2 = uclass(&[('h', 'h')]); |
3569 | | let expected = uclass(&[('h', 'h')]); |
3570 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3571 | | |
3572 | | let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); |
3573 | | let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); |
3574 | | let expected = uclass(&[]); |
3575 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3576 | | |
3577 | | let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); |
3578 | | let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); |
3579 | | let expected = uclass(&[('b', 'f')]); |
3580 | | assert_eq!(expected, uintersect(&cls1, &cls2)); |
3581 | | } |
3582 | | |
3583 | | #[test] |
3584 | | fn class_intersect_bytes() { |
3585 | | let cls1 = bclass(&[]); |
3586 | | let cls2 = bclass(&[(b'a', b'a')]); |
3587 | | let expected = bclass(&[]); |
3588 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3589 | | |
3590 | | let cls1 = bclass(&[(b'a', b'a')]); |
3591 | | let cls2 = bclass(&[(b'a', b'a')]); |
3592 | | let expected = bclass(&[(b'a', b'a')]); |
3593 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3594 | | |
3595 | | let cls1 = bclass(&[(b'a', b'a')]); |
3596 | | let cls2 = bclass(&[(b'b', b'b')]); |
3597 | | let expected = bclass(&[]); |
3598 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3599 | | |
3600 | | let cls1 = bclass(&[(b'a', b'a')]); |
3601 | | let cls2 = bclass(&[(b'a', b'c')]); |
3602 | | let expected = bclass(&[(b'a', b'a')]); |
3603 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3604 | | |
3605 | | let cls1 = bclass(&[(b'a', b'b')]); |
3606 | | let cls2 = bclass(&[(b'a', b'c')]); |
3607 | | let expected = bclass(&[(b'a', b'b')]); |
3608 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3609 | | |
3610 | | let cls1 = bclass(&[(b'a', b'b')]); |
3611 | | let cls2 = bclass(&[(b'b', b'c')]); |
3612 | | let expected = bclass(&[(b'b', b'b')]); |
3613 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3614 | | |
3615 | | let cls1 = bclass(&[(b'a', b'b')]); |
3616 | | let cls2 = bclass(&[(b'c', b'd')]); |
3617 | | let expected = bclass(&[]); |
3618 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3619 | | |
3620 | | let cls1 = bclass(&[(b'b', b'c')]); |
3621 | | let cls2 = bclass(&[(b'a', b'd')]); |
3622 | | let expected = bclass(&[(b'b', b'c')]); |
3623 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3624 | | |
3625 | | let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); |
3626 | | let cls2 = bclass(&[(b'a', b'h')]); |
3627 | | let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); |
3628 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3629 | | |
3630 | | let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); |
3631 | | let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); |
3632 | | let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); |
3633 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3634 | | |
3635 | | let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); |
3636 | | let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); |
3637 | | let expected = bclass(&[]); |
3638 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3639 | | |
3640 | | let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); |
3641 | | let cls2 = bclass(&[(b'h', b'h')]); |
3642 | | let expected = bclass(&[(b'h', b'h')]); |
3643 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3644 | | |
3645 | | let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); |
3646 | | let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); |
3647 | | let expected = bclass(&[]); |
3648 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3649 | | |
3650 | | let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); |
3651 | | let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); |
3652 | | let expected = bclass(&[(b'b', b'f')]); |
3653 | | assert_eq!(expected, bintersect(&cls1, &cls2)); |
3654 | | } |
3655 | | |
3656 | | #[test] |
3657 | | fn class_difference_unicode() { |
3658 | | let cls1 = uclass(&[('a', 'a')]); |
3659 | | let cls2 = uclass(&[('a', 'a')]); |
3660 | | let expected = uclass(&[]); |
3661 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3662 | | |
3663 | | let cls1 = uclass(&[('a', 'a')]); |
3664 | | let cls2 = uclass(&[]); |
3665 | | let expected = uclass(&[('a', 'a')]); |
3666 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3667 | | |
3668 | | let cls1 = uclass(&[]); |
3669 | | let cls2 = uclass(&[('a', 'a')]); |
3670 | | let expected = uclass(&[]); |
3671 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3672 | | |
3673 | | let cls1 = uclass(&[('a', 'z')]); |
3674 | | let cls2 = uclass(&[('a', 'a')]); |
3675 | | let expected = uclass(&[('b', 'z')]); |
3676 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3677 | | |
3678 | | let cls1 = uclass(&[('a', 'z')]); |
3679 | | let cls2 = uclass(&[('z', 'z')]); |
3680 | | let expected = uclass(&[('a', 'y')]); |
3681 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3682 | | |
3683 | | let cls1 = uclass(&[('a', 'z')]); |
3684 | | let cls2 = uclass(&[('m', 'm')]); |
3685 | | let expected = uclass(&[('a', 'l'), ('n', 'z')]); |
3686 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3687 | | |
3688 | | let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); |
3689 | | let cls2 = uclass(&[('a', 'z')]); |
3690 | | let expected = uclass(&[]); |
3691 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3692 | | |
3693 | | let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); |
3694 | | let cls2 = uclass(&[('d', 'v')]); |
3695 | | let expected = uclass(&[('a', 'c')]); |
3696 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3697 | | |
3698 | | let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); |
3699 | | let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); |
3700 | | let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); |
3701 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3702 | | |
3703 | | let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); |
3704 | | let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); |
3705 | | let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); |
3706 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3707 | | |
3708 | | let cls1 = uclass(&[('x', 'z')]); |
3709 | | let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); |
3710 | | let expected = uclass(&[('x', 'z')]); |
3711 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3712 | | |
3713 | | let cls1 = uclass(&[('a', 'z')]); |
3714 | | let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); |
3715 | | let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); |
3716 | | assert_eq!(expected, udifference(&cls1, &cls2)); |
3717 | | } |
3718 | | |
3719 | | #[test] |
3720 | | fn class_difference_bytes() { |
3721 | | let cls1 = bclass(&[(b'a', b'a')]); |
3722 | | let cls2 = bclass(&[(b'a', b'a')]); |
3723 | | let expected = bclass(&[]); |
3724 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3725 | | |
3726 | | let cls1 = bclass(&[(b'a', b'a')]); |
3727 | | let cls2 = bclass(&[]); |
3728 | | let expected = bclass(&[(b'a', b'a')]); |
3729 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3730 | | |
3731 | | let cls1 = bclass(&[]); |
3732 | | let cls2 = bclass(&[(b'a', b'a')]); |
3733 | | let expected = bclass(&[]); |
3734 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3735 | | |
3736 | | let cls1 = bclass(&[(b'a', b'z')]); |
3737 | | let cls2 = bclass(&[(b'a', b'a')]); |
3738 | | let expected = bclass(&[(b'b', b'z')]); |
3739 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3740 | | |
3741 | | let cls1 = bclass(&[(b'a', b'z')]); |
3742 | | let cls2 = bclass(&[(b'z', b'z')]); |
3743 | | let expected = bclass(&[(b'a', b'y')]); |
3744 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3745 | | |
3746 | | let cls1 = bclass(&[(b'a', b'z')]); |
3747 | | let cls2 = bclass(&[(b'm', b'm')]); |
3748 | | let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); |
3749 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3750 | | |
3751 | | let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); |
3752 | | let cls2 = bclass(&[(b'a', b'z')]); |
3753 | | let expected = bclass(&[]); |
3754 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3755 | | |
3756 | | let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); |
3757 | | let cls2 = bclass(&[(b'd', b'v')]); |
3758 | | let expected = bclass(&[(b'a', b'c')]); |
3759 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3760 | | |
3761 | | let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); |
3762 | | let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); |
3763 | | let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); |
3764 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3765 | | |
3766 | | let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); |
3767 | | let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); |
3768 | | let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); |
3769 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3770 | | |
3771 | | let cls1 = bclass(&[(b'x', b'z')]); |
3772 | | let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); |
3773 | | let expected = bclass(&[(b'x', b'z')]); |
3774 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3775 | | |
3776 | | let cls1 = bclass(&[(b'a', b'z')]); |
3777 | | let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); |
3778 | | let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); |
3779 | | assert_eq!(expected, bdifference(&cls1, &cls2)); |
3780 | | } |
3781 | | |
3782 | | #[test] |
3783 | | fn class_symmetric_difference_unicode() { |
3784 | | let cls1 = uclass(&[('a', 'm')]); |
3785 | | let cls2 = uclass(&[('g', 't')]); |
3786 | | let expected = uclass(&[('a', 'f'), ('n', 't')]); |
3787 | | assert_eq!(expected, usymdifference(&cls1, &cls2)); |
3788 | | } |
3789 | | |
3790 | | #[test] |
3791 | | fn class_symmetric_difference_bytes() { |
3792 | | let cls1 = bclass(&[(b'a', b'm')]); |
3793 | | let cls2 = bclass(&[(b'g', b't')]); |
3794 | | let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); |
3795 | | assert_eq!(expected, bsymdifference(&cls1, &cls2)); |
3796 | | } |
3797 | | |
3798 | | // We use a thread with an explicit stack size to test that our destructor |
3799 | | // for Hir can handle arbitrarily sized expressions in constant stack |
3800 | | // space. In case we run on a platform without threads (WASM?), we limit |
3801 | | // this test to Windows/Unix. |
3802 | | #[test] |
3803 | | #[cfg(any(unix, windows))] |
3804 | | fn no_stack_overflow_on_drop() { |
3805 | | use std::thread; |
3806 | | |
3807 | | let run = || { |
3808 | | let mut expr = Hir::empty(); |
3809 | | for _ in 0..100 { |
3810 | | expr = Hir::capture(Capture { |
3811 | | index: 1, |
3812 | | name: None, |
3813 | | sub: Box::new(expr), |
3814 | | }); |
3815 | | expr = Hir::repetition(Repetition { |
3816 | | min: 0, |
3817 | | max: Some(1), |
3818 | | greedy: true, |
3819 | | sub: Box::new(expr), |
3820 | | }); |
3821 | | |
3822 | | expr = Hir { |
3823 | | kind: HirKind::Concat(vec![expr]), |
3824 | | props: Properties::empty(), |
3825 | | }; |
3826 | | expr = Hir { |
3827 | | kind: HirKind::Alternation(vec![expr]), |
3828 | | props: Properties::empty(), |
3829 | | }; |
3830 | | } |
3831 | | assert!(!matches!(*expr.kind(), HirKind::Empty)); |
3832 | | }; |
3833 | | |
3834 | | // We run our test on a thread with a small stack size so we can |
3835 | | // force the issue more easily. |
3836 | | // |
3837 | | // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' |
3838 | | // for context on the specific stack size chosen here. |
3839 | | thread::Builder::new() |
3840 | | .stack_size(16 << 10) |
3841 | | .spawn(run) |
3842 | | .unwrap() |
3843 | | .join() |
3844 | | .unwrap(); |
3845 | | } |
3846 | | |
3847 | | #[test] |
3848 | | fn look_set_iter() { |
3849 | | let set = LookSet::empty(); |
3850 | | assert_eq!(0, set.iter().count()); |
3851 | | |
3852 | | let set = LookSet::full(); |
3853 | | assert_eq!(18, set.iter().count()); |
3854 | | |
3855 | | let set = |
3856 | | LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); |
3857 | | assert_eq!(2, set.iter().count()); |
3858 | | |
3859 | | let set = LookSet::empty().insert(Look::StartLF); |
3860 | | assert_eq!(1, set.iter().count()); |
3861 | | |
3862 | | let set = LookSet::empty().insert(Look::WordAsciiNegate); |
3863 | | assert_eq!(1, set.iter().count()); |
3864 | | } |
3865 | | |
3866 | | #[test] |
3867 | | fn look_set_debug() { |
3868 | | let res = format!("{:?}", LookSet::empty()); |
3869 | | assert_eq!("∅", res); |
3870 | | let res = format!("{:?}", LookSet::full()); |
3871 | | assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); |
3872 | | } |
3873 | | } |