/src/regex/regex-lite/src/nfa.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use core::{cell::RefCell, mem::size_of}; |
2 | | |
3 | | use alloc::{string::String, sync::Arc, vec, vec::Vec}; |
4 | | |
5 | | use crate::{ |
6 | | error::Error, |
7 | | hir::{self, Hir, HirKind}, |
8 | | int::U32, |
9 | | }; |
10 | | |
11 | | pub(crate) type StateID = u32; |
12 | | |
13 | | #[derive(Clone, Copy, Debug)] |
14 | | pub(crate) struct Config { |
15 | | pub(crate) size_limit: Option<usize>, |
16 | | } |
17 | | |
18 | | impl Default for Config { |
19 | 7.54k | fn default() -> Config { |
20 | 7.54k | Config { size_limit: Some(10 * (1 << 20)) } |
21 | 7.54k | } |
22 | | } |
23 | | |
24 | | #[derive(Clone)] |
25 | | pub(crate) struct NFA { |
26 | | /// The pattern string this NFA was generated from. |
27 | | /// |
28 | | /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯ |
29 | | pattern: String, |
30 | | /// The states that make up this NFA. |
31 | | states: Vec<State>, |
32 | | /// The ID of the start state. |
33 | | start: StateID, |
34 | | /// Whether this NFA can only match at the beginning of a haystack. |
35 | | is_start_anchored: bool, |
36 | | /// Whether this NFA can match the empty string. |
37 | | is_match_empty: bool, |
38 | | /// If every match has the same number of matching capture groups, then |
39 | | /// this corresponds to the number of groups. |
40 | | static_explicit_captures_len: Option<usize>, |
41 | | /// A map from capture group name to its corresponding index. |
42 | | cap_name_to_index: CaptureNameMap, |
43 | | /// A map from capture group index to the corresponding name, if one |
44 | | /// exists. |
45 | | cap_index_to_name: Vec<Option<Arc<str>>>, |
46 | | /// Heap memory used indirectly by NFA states and other things (like the |
47 | | /// various capturing group representations above). Since each state |
48 | | /// might use a different amount of heap, we need to keep track of this |
49 | | /// incrementally. |
50 | | memory_extra: usize, |
51 | | } |
52 | | |
53 | | impl NFA { |
54 | | /// Creates a new NFA from the given configuration and HIR. |
55 | 5.43k | pub(crate) fn new( |
56 | 5.43k | config: Config, |
57 | 5.43k | pattern: String, |
58 | 5.43k | hir: &Hir, |
59 | 5.43k | ) -> Result<NFA, Error> { |
60 | 5.43k | Compiler::new(config, pattern).compile(hir) |
61 | 5.43k | } |
62 | | |
63 | | /// Returns the pattern string used to construct this NFA. |
64 | 0 | pub(crate) fn pattern(&self) -> &str { |
65 | 0 | &self.pattern |
66 | 0 | } |
67 | | |
68 | | /// Returns the state corresponding to the given ID. |
69 | | /// |
70 | | /// # Panics |
71 | | /// |
72 | | /// If the ID does not refer to a valid state, then this panics. |
73 | 76.8M | pub(crate) fn state(&self, id: StateID) -> &State { |
74 | 76.8M | &self.states[id.as_usize()] |
75 | 76.8M | } |
76 | | |
77 | | /// Returns the total number of states in this NFA. |
78 | 20.3k | pub(crate) fn len(&self) -> usize { |
79 | 20.3k | self.states.len() |
80 | 20.3k | } |
81 | | |
82 | | /// Returns the ID of the starting state for this NFA. |
83 | 5.09k | pub(crate) fn start(&self) -> StateID { |
84 | 5.09k | self.start |
85 | 5.09k | } |
86 | | |
87 | | /// Returns the capture group index for the corresponding named group. |
88 | | /// If no such group with the given name exists, then `None` is returned. |
89 | 0 | pub(crate) fn to_index(&self, name: &str) -> Option<usize> { |
90 | 0 | self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) |
91 | 0 | } |
92 | | |
93 | | /* |
94 | | /// Returns the capture group name for the corresponding index. |
95 | | /// If no such group with the given index, then `None` is returned. |
96 | | pub(crate) fn to_name(&self, index: usize) -> Option<&str> { |
97 | | self.cap_index_to_name.get(index)?.as_deref() |
98 | | } |
99 | | */ |
100 | | |
101 | | /// Returns an iterator over all of the capture groups, along with their |
102 | | /// names if they exist, in this NFA. |
103 | 0 | pub(crate) fn capture_names(&self) -> CaptureNames<'_> { |
104 | 0 | CaptureNames { it: self.cap_index_to_name.iter() } |
105 | 0 | } |
106 | | |
107 | | /// Returns the total number of capture groups, including the first and |
108 | | /// implicit group, in this NFA. |
109 | 10.1k | pub(crate) fn group_len(&self) -> usize { |
110 | 10.1k | self.cap_index_to_name.len() |
111 | 10.1k | } |
112 | | |
113 | | /// Returns true if and only if this NFA can only match at the beginning of |
114 | | /// a haystack. |
115 | 5.09k | pub(crate) fn is_start_anchored(&self) -> bool { |
116 | 5.09k | self.is_start_anchored |
117 | 5.09k | } |
118 | | |
119 | | /// If the pattern always reports the same number of matching capture groups |
120 | | /// for every match, then this returns the number of those groups. This |
121 | | /// doesn't include the implicit group found in every pattern. |
122 | 0 | pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
123 | 0 | self.static_explicit_captures_len |
124 | 0 | } |
125 | | |
126 | | /// Returns the heap memory usage, in bytes, used by this NFA. |
127 | 1.64M | fn memory_usage(&self) -> usize { |
128 | 1.64M | (self.states.len() * size_of::<State>()) |
129 | 1.64M | + (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>()) |
130 | 1.64M | + self.memory_extra |
131 | 1.64M | } |
132 | | } |
133 | | |
134 | | impl core::fmt::Debug for NFA { |
135 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
136 | 0 | writeln!(f, "NFA(")?; |
137 | 0 | writeln!(f, "pattern: {}", self.pattern)?; |
138 | 0 | for (sid, state) in self.states.iter().enumerate() { |
139 | 0 | writeln!(f, "{:07?}: {:?}", sid, state)?; |
140 | | } |
141 | 0 | writeln!(f, ")")?; |
142 | 0 | Ok(()) |
143 | 0 | } |
144 | | } |
145 | | |
146 | | /// An iterator over all capture groups in an NFA. |
147 | | /// |
148 | | /// If a particular group has a name, then it is yielded. Otherwise, `None` |
149 | | /// is yielded. |
150 | | #[derive(Clone, Debug)] |
151 | | pub(crate) struct CaptureNames<'a> { |
152 | | it: core::slice::Iter<'a, Option<Arc<str>>>, |
153 | | } |
154 | | |
155 | | impl<'a> Iterator for CaptureNames<'a> { |
156 | | type Item = Option<&'a str>; |
157 | | |
158 | 0 | fn next(&mut self) -> Option<Option<&'a str>> { |
159 | 0 | self.it.next().map(|n| n.as_deref()) |
160 | 0 | } |
161 | | } |
162 | | |
163 | | #[derive(Clone, Eq, PartialEq)] |
164 | | pub(crate) enum State { |
165 | | Char { target: StateID, ch: char }, |
166 | | Ranges { target: StateID, ranges: Vec<(char, char)> }, |
167 | | Splits { targets: Vec<StateID>, reverse: bool }, |
168 | | Goto { target: StateID, look: Option<hir::Look> }, |
169 | | Capture { target: StateID, slot: u32 }, |
170 | | Fail, |
171 | | Match, |
172 | | } |
173 | | |
174 | | impl State { |
175 | | /// Returns the heap memory usage of this NFA state in bytes. |
176 | 1.19M | fn memory_usage(&self) -> usize { |
177 | 1.19M | match *self { |
178 | | State::Char { .. } |
179 | | | State::Goto { .. } |
180 | | | State::Capture { .. } |
181 | | | State::Fail { .. } |
182 | 873k | | State::Match => 0, |
183 | 200k | State::Splits { ref targets, .. } => { |
184 | 200k | targets.len() * size_of::<StateID>() |
185 | | } |
186 | 119k | State::Ranges { ref ranges, .. } => { |
187 | 119k | ranges.len() * size_of::<(char, char)>() |
188 | | } |
189 | | } |
190 | 1.19M | } |
191 | | |
192 | | /// Returns an iterator over the given split targets. The order of the |
193 | | /// iterator yields elements in reverse when `reverse` is true. |
194 | 0 | pub(crate) fn iter_splits<'a>( |
195 | 0 | splits: &'a [StateID], |
196 | 0 | reverse: bool, |
197 | 0 | ) -> impl Iterator<Item = StateID> + 'a { |
198 | 0 | let mut it = splits.iter(); |
199 | 0 | core::iter::from_fn(move || { |
200 | 0 | if reverse { it.next_back() } else { it.next() }.copied() |
201 | 0 | }) |
202 | 0 | } |
203 | | } |
204 | | |
205 | | impl core::fmt::Debug for State { |
206 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
207 | 0 | match *self { |
208 | 0 | State::Char { target, ch } => { |
209 | 0 | write!(f, "{:?} => {:?}", ch, target) |
210 | | } |
211 | 0 | State::Ranges { target, ref ranges } => { |
212 | 0 | for (i, &(start, end)) in ranges.iter().enumerate() { |
213 | 0 | if i > 0 { |
214 | 0 | write!(f, ", ")?; |
215 | 0 | } |
216 | 0 | write!(f, "{:?}-{:?} => {:?}", start, end, target)?; |
217 | | } |
218 | 0 | Ok(()) |
219 | | } |
220 | 0 | State::Splits { ref targets, reverse } => { |
221 | 0 | write!(f, "splits(")?; |
222 | 0 | for (i, sid) in |
223 | 0 | State::iter_splits(targets, reverse).enumerate() |
224 | | { |
225 | 0 | if i > 0 { |
226 | 0 | write!(f, ", ")?; |
227 | 0 | } |
228 | 0 | write!(f, "{:?}", sid)?; |
229 | | } |
230 | 0 | write!(f, ")") |
231 | | } |
232 | 0 | State::Goto { target, look: None } => { |
233 | 0 | write!(f, "goto({:?})", target) |
234 | | } |
235 | 0 | State::Goto { target, look: Some(look) } => { |
236 | 0 | write!(f, "{:?} => {:?}", look, target) |
237 | | } |
238 | 0 | State::Capture { target, slot } => { |
239 | 0 | write!(f, "capture(slot={:?}) => {:?}", slot, target,) |
240 | | } |
241 | 0 | State::Fail => write!(f, "FAIL"), |
242 | | State::Match => { |
243 | 0 | write!(f, "MATCH") |
244 | | } |
245 | | } |
246 | 0 | } |
247 | | } |
248 | | |
249 | | /// A map from capture group name to its corresponding capture group index. |
250 | | /// |
251 | | /// We define a type alias here so that we can transparently use a `HashMap` |
252 | | /// whenever it's available. We do so presumably because it's faster, although |
253 | | /// there are no benchmarks verifying this. |
254 | | #[cfg(feature = "std")] |
255 | | type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>; |
256 | | #[cfg(not(feature = "std"))] |
257 | | type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>; |
258 | | |
259 | | #[derive(Debug)] |
260 | | struct Compiler { |
261 | | config: Config, |
262 | | nfa: RefCell<NFA>, |
263 | | } |
264 | | |
265 | | impl Compiler { |
266 | 5.43k | fn new(config: Config, pattern: String) -> Compiler { |
267 | 5.43k | let nfa = RefCell::new(NFA { |
268 | 5.43k | pattern, |
269 | 5.43k | states: vec![], |
270 | 5.43k | start: 0, |
271 | 5.43k | is_start_anchored: false, |
272 | 5.43k | is_match_empty: false, |
273 | 5.43k | static_explicit_captures_len: None, |
274 | 5.43k | cap_name_to_index: CaptureNameMap::default(), |
275 | 5.43k | cap_index_to_name: vec![], |
276 | 5.43k | memory_extra: 0, |
277 | 5.43k | }); |
278 | 5.43k | Compiler { config, nfa } |
279 | 5.43k | } |
280 | | |
281 | 5.43k | fn compile(self, hir: &Hir) -> Result<NFA, Error> { |
282 | 5.43k | self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); |
283 | 5.43k | self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); |
284 | 5.43k | self.nfa.borrow_mut().static_explicit_captures_len = |
285 | 5.43k | hir.static_explicit_captures_len(); |
286 | 5.43k | let compiled = self.c_capture(0, None, hir)?; |
287 | 5.10k | let mat = self.add(State::Match)?; |
288 | 5.09k | self.patch(compiled.end, mat)?; |
289 | 5.09k | self.nfa.borrow_mut().start = compiled.start; |
290 | 5.09k | Ok(self.nfa.into_inner()) |
291 | 5.43k | } |
292 | | |
293 | 970k | fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> { |
294 | 970k | match *hir.kind() { |
295 | 102k | HirKind::Empty => self.c_empty(), |
296 | 436k | HirKind::Char(ch) => self.c_char(ch), |
297 | 121k | HirKind::Class(ref class) => self.c_class(class), |
298 | 124k | HirKind::Look(ref look) => self.c_look(look), |
299 | 87.6k | HirKind::Repetition(ref rep) => self.c_repetition(rep), |
300 | 70.0k | HirKind::Capture(ref cap) => { |
301 | 70.0k | self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) |
302 | | } |
303 | 19.7k | HirKind::Concat(ref subs) => { |
304 | 333k | self.c_concat(subs.iter().map(|s| self.c(s))) |
305 | | } |
306 | 7.47k | HirKind::Alternation(ref subs) => { |
307 | 65.4k | self.c_alternation(subs.iter().map(|s| self.c(s))) |
308 | | } |
309 | | } |
310 | 970k | } |
311 | | |
312 | | /// Compile a "fail" state that can never be transitioned out of. |
313 | 0 | fn c_fail(&self) -> Result<ThompsonRef, Error> { |
314 | 0 | let id = self.add(State::Fail)?; |
315 | 0 | Ok(ThompsonRef { start: id, end: id }) |
316 | 0 | } |
317 | | |
318 | | /// Compile an "empty" state with one unconditional epsilon transition. |
319 | | /// |
320 | | /// Both the `start` and `end` locations point to the state created. |
321 | | /// Callers will likely want to keep the `start`, but patch the `end` to |
322 | | /// point to some other state. |
323 | 103k | fn c_empty(&self) -> Result<ThompsonRef, Error> { |
324 | 103k | let id = self.add_empty()?; |
325 | 103k | Ok(ThompsonRef { start: id, end: id }) |
326 | 103k | } |
327 | | |
328 | | /// Compile the given literal char to an NFA. |
329 | 436k | fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> { |
330 | 436k | let id = self.add(State::Char { target: 0, ch })?; |
331 | 436k | Ok(ThompsonRef { start: id, end: id }) |
332 | 436k | } |
333 | | |
334 | | /// Compile the given character class into an NFA. |
335 | | /// |
336 | | /// If the class is empty, then this compiles to a `Fail` state. |
337 | 121k | fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> { |
338 | 121k | let id = if class.ranges.is_empty() { |
339 | | // Technically using an explicit fail state probably isn't |
340 | | // necessary. Because if you try to match against an empty Ranges, |
341 | | // then it should turn up with nothing regardless of input, and |
342 | | // thus "acts" like a Fail state. But it's better to be more |
343 | | // explicit, and there's no real cost to doing so. |
344 | 1.80k | self.add(State::Fail) |
345 | | } else { |
346 | 119k | let ranges = |
347 | 362k | class.ranges.iter().map(|r| (r.start, r.end)).collect(); |
348 | 119k | self.add(State::Ranges { target: 0, ranges }) |
349 | 30 | }?; |
350 | 121k | Ok(ThompsonRef { start: id, end: id }) |
351 | 121k | } |
352 | | |
353 | | /// Compile the given HIR look-around assertion to an NFA look-around |
354 | | /// assertion. |
355 | 124k | fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> { |
356 | 124k | let id = self.add(State::Goto { target: 0, look: Some(*look) })?; |
357 | 124k | Ok(ThompsonRef { start: id, end: id }) |
358 | 124k | } |
359 | | |
360 | | /// Compile the given repetition expression. This handles all types of |
361 | | /// repetitions and greediness. |
362 | 87.6k | fn c_repetition( |
363 | 87.6k | &self, |
364 | 87.6k | rep: &hir::Repetition, |
365 | 87.6k | ) -> Result<ThompsonRef, Error> { |
366 | 87.6k | match (rep.min, rep.max) { |
367 | 26.5k | (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), |
368 | 51.0k | (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), |
369 | 10.0k | (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), |
370 | 2.86k | (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), |
371 | | } |
372 | 87.6k | } |
373 | | |
374 | | /// Compile the given expression such that it matches at least `min` times, |
375 | | /// but no more than `max` times. |
376 | | /// |
377 | | /// When `greedy` is true, then the preference is for the expression to |
378 | | /// match as much as possible. Otherwise, it will match as little as |
379 | | /// possible. |
380 | 2.86k | fn c_bounded( |
381 | 2.86k | &self, |
382 | 2.86k | hir: &Hir, |
383 | 2.86k | greedy: bool, |
384 | 2.86k | min: u32, |
385 | 2.86k | max: u32, |
386 | 2.86k | ) -> Result<ThompsonRef, Error> { |
387 | 2.86k | let prefix = self.c_exactly(hir, min)?; |
388 | 2.79k | if min == max { |
389 | 0 | return Ok(prefix); |
390 | 2.79k | } |
391 | | |
392 | | // It is tempting here to compile the rest here as a concatenation |
393 | | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
394 | | // were `aaa?a?a?`. The problem here is that it leads to this program: |
395 | | // |
396 | | // >000000: 61 => 01 |
397 | | // 000001: 61 => 02 |
398 | | // 000002: union(03, 04) |
399 | | // 000003: 61 => 04 |
400 | | // 000004: union(05, 06) |
401 | | // 000005: 61 => 06 |
402 | | // 000006: union(07, 08) |
403 | | // 000007: 61 => 08 |
404 | | // 000008: MATCH |
405 | | // |
406 | | // And effectively, once you hit state 2, the epsilon closure will |
407 | | // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better |
408 | | // to instead compile it like so: |
409 | | // |
410 | | // >000000: 61 => 01 |
411 | | // 000001: 61 => 02 |
412 | | // 000002: union(03, 08) |
413 | | // 000003: 61 => 04 |
414 | | // 000004: union(05, 08) |
415 | | // 000005: 61 => 06 |
416 | | // 000006: union(07, 08) |
417 | | // 000007: 61 => 08 |
418 | | // 000008: MATCH |
419 | | // |
420 | | // So that the epsilon closure of state 2 is now just 3 and 8. |
421 | 2.79k | let empty = self.add_empty()?; |
422 | 2.78k | let mut prev_end = prefix.end; |
423 | 2.78k | for _ in min..max { |
424 | 101k | let splits = |
425 | 101k | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
426 | 101k | let compiled = self.c(hir)?; |
427 | 101k | self.patch(prev_end, splits)?; |
428 | 101k | self.patch(splits, compiled.start)?; |
429 | 101k | self.patch(splits, empty)?; |
430 | 101k | prev_end = compiled.end; |
431 | | } |
432 | 2.64k | self.patch(prev_end, empty)?; |
433 | 2.64k | Ok(ThompsonRef { start: prefix.start, end: empty }) |
434 | 2.86k | } |
435 | | |
436 | | /// Compile the given expression such that it may be matched `n` or more |
437 | | /// times, where `n` can be any integer. (Although a particularly large |
438 | | /// integer is likely to run afoul of any configured size limits.) |
439 | | /// |
440 | | /// When `greedy` is true, then the preference is for the expression to |
441 | | /// match as much as possible. Otherwise, it will match as little as |
442 | | /// possible. |
443 | 51.0k | fn c_at_least( |
444 | 51.0k | &self, |
445 | 51.0k | hir: &Hir, |
446 | 51.0k | greedy: bool, |
447 | 51.0k | n: u32, |
448 | 51.0k | ) -> Result<ThompsonRef, Error> { |
449 | 51.0k | if n == 0 { |
450 | | // When the expression cannot match the empty string, then we |
451 | | // can get away with something much simpler: just one 'alt' |
452 | | // instruction that optionally repeats itself. But if the expr |
453 | | // can match the empty string... see below. |
454 | 37.0k | if !hir.is_match_empty() { |
455 | 22.2k | let splits = self.add(State::Splits { |
456 | 22.2k | targets: vec![], |
457 | 22.2k | reverse: !greedy, |
458 | 22.2k | })?; |
459 | 22.2k | let compiled = self.c(hir)?; |
460 | 22.1k | self.patch(splits, compiled.start)?; |
461 | 22.1k | self.patch(compiled.end, splits)?; |
462 | 22.1k | return Ok(ThompsonRef { start: splits, end: splits }); |
463 | 14.8k | } |
464 | | |
465 | | // What's going on here? Shouldn't x* be simpler than this? It |
466 | | // turns out that when implementing leftmost-first (Perl-like) |
467 | | // match semantics, x* results in an incorrect preference order |
468 | | // when computing the transitive closure of states if and only if |
469 | | // 'x' can match the empty string. So instead, we compile x* as |
470 | | // (x+)?, which preserves the correct preference order. |
471 | | // |
472 | | // See: https://github.com/rust-lang/regex/issues/779 |
473 | 14.8k | let compiled = self.c(hir)?; |
474 | 14.7k | let plus = |
475 | 14.7k | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
476 | 14.7k | self.patch(compiled.end, plus)?; |
477 | 14.7k | self.patch(plus, compiled.start)?; |
478 | | |
479 | 14.7k | let question = |
480 | 14.7k | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
481 | 14.7k | let empty = self.add_empty()?; |
482 | 14.7k | self.patch(question, compiled.start)?; |
483 | 14.7k | self.patch(question, empty)?; |
484 | 14.7k | self.patch(plus, empty)?; |
485 | 14.7k | Ok(ThompsonRef { start: question, end: empty }) |
486 | 13.9k | } else if n == 1 { |
487 | 10.8k | let compiled = self.c(hir)?; |
488 | 10.6k | let splits = |
489 | 10.6k | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
490 | 10.6k | self.patch(compiled.end, splits)?; |
491 | 10.6k | self.patch(splits, compiled.start)?; |
492 | 10.6k | Ok(ThompsonRef { start: compiled.start, end: splits }) |
493 | | } else { |
494 | 3.17k | let prefix = self.c_exactly(hir, n - 1)?; |
495 | 3.08k | let last = self.c(hir)?; |
496 | 3.07k | let splits = |
497 | 3.07k | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
498 | 3.07k | self.patch(prefix.end, last.start)?; |
499 | 3.07k | self.patch(last.end, splits)?; |
500 | 3.07k | self.patch(splits, last.start)?; |
501 | 3.07k | Ok(ThompsonRef { start: prefix.start, end: splits }) |
502 | | } |
503 | 51.0k | } |
504 | | |
505 | | /// Compile the given expression such that it may be matched zero or one |
506 | | /// times. |
507 | | /// |
508 | | /// When `greedy` is true, then the preference is for the expression to |
509 | | /// match as much as possible. Otherwise, it will match as little as |
510 | | /// possible. |
511 | 26.5k | fn c_zero_or_one( |
512 | 26.5k | &self, |
513 | 26.5k | hir: &Hir, |
514 | 26.5k | greedy: bool, |
515 | 26.5k | ) -> Result<ThompsonRef, Error> { |
516 | 26.5k | let splits = |
517 | 26.5k | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
518 | 26.5k | let compiled = self.c(hir)?; |
519 | 26.4k | let empty = self.add_empty()?; |
520 | 26.4k | self.patch(splits, compiled.start)?; |
521 | 26.4k | self.patch(splits, empty)?; |
522 | 26.4k | self.patch(compiled.end, empty)?; |
523 | 26.4k | Ok(ThompsonRef { start: splits, end: empty }) |
524 | 26.5k | } |
525 | | |
526 | | /// Compile the given HIR expression exactly `n` times. |
527 | 13.2k | fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> { |
528 | 317k | self.c_concat((0..n).map(|_| self.c(hir))) |
529 | 13.2k | } |
530 | | |
531 | | /// Compile the given expression and insert capturing states at the |
532 | | /// beginning and end of it. The slot for the capture states is computed |
533 | | /// from the index. |
534 | 75.4k | fn c_capture( |
535 | 75.4k | &self, |
536 | 75.4k | index: u32, |
537 | 75.4k | name: Option<&str>, |
538 | 75.4k | hir: &Hir, |
539 | 75.4k | ) -> Result<ThompsonRef, Error> { |
540 | 75.4k | // For discontiguous indices, push placeholders for earlier capture |
541 | 75.4k | // groups that weren't explicitly added. This can happen, for example, |
542 | 75.4k | // with patterns like '(a){0}(a)' where '(a){0}' is completely removed |
543 | 75.4k | // from the pattern. |
544 | 75.4k | let existing_groups_len = self.nfa.borrow().cap_index_to_name.len(); |
545 | 142k | for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) { |
546 | 142k | self.nfa.borrow_mut().cap_index_to_name.push(None); |
547 | 142k | } |
548 | 75.4k | if index.as_usize() >= existing_groups_len { |
549 | 26.7k | if let Some(name) = name { |
550 | 6.52k | let name = Arc::from(name); |
551 | 6.52k | let mut nfa = self.nfa.borrow_mut(); |
552 | 6.52k | nfa.cap_name_to_index.insert(Arc::clone(&name), index); |
553 | 6.52k | nfa.cap_index_to_name.push(Some(Arc::clone(&name))); |
554 | 6.52k | // This is an approximation. |
555 | 6.52k | nfa.memory_extra += name.len() + size_of::<u32>(); |
556 | 20.2k | } else { |
557 | 20.2k | self.nfa.borrow_mut().cap_index_to_name.push(None); |
558 | 20.2k | } |
559 | 48.6k | } |
560 | | |
561 | 75.4k | let Some(slot) = index.checked_mul(2) else { |
562 | 0 | return Err(Error::new("capture group slots exhausted")); |
563 | | }; |
564 | 75.4k | let start = self.add(State::Capture { target: 0, slot })?; |
565 | 75.4k | let inner = self.c(hir)?; |
566 | 74.9k | let Some(slot) = slot.checked_add(1) else { |
567 | 0 | return Err(Error::new("capture group slots exhausted")); |
568 | | }; |
569 | 74.9k | let end = self.add(State::Capture { target: 0, slot })?; |
570 | 74.9k | self.patch(start, inner.start)?; |
571 | 74.9k | self.patch(inner.end, end)?; |
572 | | |
573 | 74.9k | Ok(ThompsonRef { start, end }) |
574 | 75.4k | } |
575 | | |
576 | | /// Compile a concatenation of the sub-expressions yielded by the given |
577 | | /// iterator. If the iterator yields no elements, then this compiles down |
578 | | /// to an "empty" state that always matches. |
579 | 32.9k | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
580 | 32.9k | where |
581 | 32.9k | I: Iterator<Item = Result<ThompsonRef, Error>>, |
582 | 32.9k | { |
583 | 32.9k | let ThompsonRef { start, mut end } = match it.next() { |
584 | 31.9k | Some(result) => result?, |
585 | 1.06k | None => return self.c_empty(), |
586 | | }; |
587 | 650k | for result in it { |
588 | 619k | let compiled = result?; |
589 | 618k | self.patch(end, compiled.start)?; |
590 | 618k | end = compiled.end; |
591 | | } |
592 | 31.3k | Ok(ThompsonRef { start, end }) |
593 | 32.9k | } <regex_lite::nfa::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_lite::nfa::Compiler>::c_exactly::{closure#0}>> Line | Count | Source | 579 | 13.2k | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> | 580 | 13.2k | where | 581 | 13.2k | I: Iterator<Item = Result<ThompsonRef, Error>>, | 582 | 13.2k | { | 583 | 13.2k | let ThompsonRef { start, mut end } = match it.next() { | 584 | 12.2k | Some(result) => result?, | 585 | 1.06k | None => return self.c_empty(), | 586 | | }; | 587 | 317k | for result in it { | 588 | 305k | let compiled = result?; | 589 | 305k | self.patch(end, compiled.start)?; | 590 | 305k | end = compiled.end; | 591 | | } | 592 | 11.7k | Ok(ThompsonRef { start, end }) | 593 | 13.2k | } |
<regex_lite::nfa::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_lite::hir::Hir>, <regex_lite::nfa::Compiler>::c::{closure#0}>> Line | Count | Source | 579 | 19.7k | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> | 580 | 19.7k | where | 581 | 19.7k | I: Iterator<Item = Result<ThompsonRef, Error>>, | 582 | 19.7k | { | 583 | 19.7k | let ThompsonRef { start, mut end } = match it.next() { | 584 | 19.7k | Some(result) => result?, | 585 | 0 | None => return self.c_empty(), | 586 | | }; | 587 | 332k | for result in it { | 588 | 313k | let compiled = result?; | 589 | 313k | self.patch(end, compiled.start)?; | 590 | 313k | end = compiled.end; | 591 | | } | 592 | 19.5k | Ok(ThompsonRef { start, end }) | 593 | 19.7k | } |
|
594 | | |
595 | | /// Compile an alternation, where each element yielded by the given |
596 | | /// iterator represents an item in the alternation. If the iterator yields |
597 | | /// no elements, then this compiles down to a "fail" state. |
598 | | /// |
599 | | /// In an alternation, expressions appearing earlier are "preferred" at |
600 | | /// match time over expressions appearing later. (This is currently always |
601 | | /// true, as this crate only supports leftmost-first semantics.) |
602 | 7.47k | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
603 | 7.47k | where |
604 | 7.47k | I: Iterator<Item = Result<ThompsonRef, Error>>, |
605 | 7.47k | { |
606 | 7.47k | let first = match it.next() { |
607 | 0 | None => return self.c_fail(), |
608 | 7.47k | Some(result) => result?, |
609 | | }; |
610 | 7.43k | let second = match it.next() { |
611 | 0 | None => return Ok(first), |
612 | 7.43k | Some(result) => result?, |
613 | | }; |
614 | | |
615 | 7.40k | let splits = |
616 | 7.40k | self.add(State::Splits { targets: vec![], reverse: false })?; |
617 | 7.40k | let end = self.add_empty()?; |
618 | 7.40k | self.patch(splits, first.start)?; |
619 | 7.40k | self.patch(first.end, end)?; |
620 | 7.40k | self.patch(splits, second.start)?; |
621 | 7.40k | self.patch(second.end, end)?; |
622 | 57.8k | for result in it { |
623 | 50.5k | let compiled = result?; |
624 | 50.4k | self.patch(splits, compiled.start)?; |
625 | 50.4k | self.patch(compiled.end, end)?; |
626 | | } |
627 | 7.33k | Ok(ThompsonRef { start: splits, end }) |
628 | 7.47k | } |
629 | | |
630 | | /// A convenience routine for adding an empty state, also known as an |
631 | | /// unconditional epsilon transition. These are quite useful for making |
632 | | /// NFA construction simpler. |
633 | | /// |
634 | | /// (In the regex crate, we do a second pass to remove these, but don't |
635 | | /// bother with that here.) |
636 | 155k | fn add_empty(&self) -> Result<StateID, Error> { |
637 | 155k | self.add(State::Goto { target: 0, look: None }) |
638 | 155k | } |
639 | | |
640 | | /// The common implementation of "add a state." It handles the common |
641 | | /// error cases of state ID exhausting (by owning state ID allocation) and |
642 | | /// whether the size limit has been exceeded. |
643 | 1.19M | fn add(&self, state: State) -> Result<StateID, Error> { |
644 | 1.19M | let id = u32::try_from(self.nfa.borrow().states.len()) |
645 | 1.19M | .map_err(|_| Error::new("exhausted state IDs, too many states"))?; |
646 | 1.19M | self.nfa.borrow_mut().memory_extra += state.memory_usage(); |
647 | 1.19M | self.nfa.borrow_mut().states.push(state); |
648 | 1.19M | self.check_size_limit()?; |
649 | 1.19M | Ok(id) |
650 | 1.19M | } |
651 | | |
652 | | /// Add a transition from one state to another. |
653 | | /// |
654 | | /// This routine is called "patch" since it is very common to add the |
655 | | /// states you want, typically with "dummy" state ID transitions, and then |
656 | | /// "patch" in the real state IDs later. This is because you don't always |
657 | | /// know all of the necessary state IDs to add because they might not |
658 | | /// exist yet. |
659 | | /// |
660 | | /// # Errors |
661 | | /// |
662 | | /// This may error if patching leads to an increase in heap usage beyond |
663 | | /// the configured size limit. Heap usage only grows when patching adds a |
664 | | /// new transition (as in the case of a "splits" state). |
665 | 1.43M | fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { |
666 | 1.43M | let mut new_memory_extra = self.nfa.borrow().memory_extra; |
667 | 1.43M | match self.nfa.borrow_mut().states[from.as_usize()] { |
668 | 436k | State::Char { ref mut target, .. } => { |
669 | 436k | *target = to; |
670 | 436k | } |
671 | 119k | State::Ranges { ref mut target, .. } => { |
672 | 119k | *target = to; |
673 | 119k | } |
674 | 451k | State::Splits { ref mut targets, .. } => { |
675 | 451k | targets.push(to); |
676 | 451k | new_memory_extra += size_of::<StateID>(); |
677 | 451k | } |
678 | 279k | State::Goto { ref mut target, .. } => { |
679 | 279k | *target = to; |
680 | 279k | } |
681 | 149k | State::Capture { ref mut target, .. } => { |
682 | 149k | *target = to; |
683 | 149k | } |
684 | 1.80k | State::Fail | State::Match => {} |
685 | | } |
686 | 1.43M | if new_memory_extra != self.nfa.borrow().memory_extra { |
687 | 451k | self.nfa.borrow_mut().memory_extra = new_memory_extra; |
688 | 451k | self.check_size_limit()?; |
689 | 987k | } |
690 | 1.43M | Ok(()) |
691 | 1.43M | } |
692 | | |
693 | | /// Checks that the current heap memory usage of the NFA being compiled |
694 | | /// doesn't exceed the configured size limit. If it does, an error is |
695 | | /// returned. |
696 | 1.64M | fn check_size_limit(&self) -> Result<(), Error> { |
697 | 1.64M | if let Some(limit) = self.config.size_limit { |
698 | 1.64M | if self.nfa.borrow().memory_usage() > limit { |
699 | 339 | return Err(Error::new("compiled regex exceeded size limit")); |
700 | 1.64M | } |
701 | 0 | } |
702 | 1.64M | Ok(()) |
703 | 1.64M | } |
704 | | } |
705 | | |
706 | | /// A value that represents the result of compiling a sub-expression of a |
707 | | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
708 | | /// has an initial state at `start` and a final state at `end`. |
709 | | #[derive(Clone, Copy, Debug)] |
710 | | struct ThompsonRef { |
711 | | start: StateID, |
712 | | end: StateID, |
713 | | } |