/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.1.10/src/nfa/compiler.rs
Line | Count | Source |
1 | | // This module provides an NFA compiler using Thompson's construction |
2 | | // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA |
3 | | // graph as output. The NFA graph is structured in a way that permits it to be |
4 | | // executed by a virtual machine and also used to efficiently build a DFA. |
5 | | // |
6 | | // The compiler deals with a slightly expanded set of NFA states that notably |
7 | | // includes an empty node that has exactly one epsilon transition to the next |
8 | | // state. In other words, it's a "goto" instruction if one views Thompson's NFA |
9 | | // as a set of bytecode instructions. These goto instructions are removed in |
10 | | // a subsequent phase before returning the NFA to the caller. The purpose of |
11 | | // these empty nodes is that they make the construction algorithm substantially |
12 | | // simpler to implement. We remove them before returning to the caller because |
13 | | // they can represent substantial overhead when traversing the NFA graph |
14 | | // (either while searching using the NFA directly or while building a DFA). |
15 | | // |
16 | | // In the future, it would be nice to provide a Glushkov compiler as well, |
17 | | // as it would work well as a bit-parallel NFA for smaller regexes. But |
18 | | // the Thompson construction is one I'm more familiar with and seems more |
19 | | // straightforward to deal with when it comes to large Unicode character |
20 | | // classes. |
21 | | // |
22 | | // Internally, the compiler uses interior mutability to improve composition |
23 | | // in the face of the borrow checker. In particular, we'd really like to be |
24 | | // able to write things like this: |
25 | | // |
26 | | // self.c_concat(exprs.iter().map(|e| self.c(e))) |
27 | | // |
28 | | // Which elegantly uses iterators to build up a sequence of compiled regex |
29 | | // sub-expressions and then hands it off to the concatenating compiler |
30 | | // routine. Without interior mutability, the borrow checker won't let us |
31 | | // borrow `self` mutably both inside and outside the closure at the same |
32 | | // time. |
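To make the interior-mutability point above concrete, here is a stripped-down sketch (a hypothetical `Toy` type, not part of this crate) showing why keeping the state in a `RefCell` lets the `c_concat(... .map(|e| self.c(e)))` pattern compile, where a `&mut self` design would trip the borrow checker:

use std::cell::RefCell;

struct Toy {
    // With `states: Vec<u32>` and `&mut self` methods, the closure below
    // would need a second mutable borrow of `self` while `c_concat` already
    // holds one. Wrapping the state in `RefCell` keeps every method on
    // `&self` and defers the exclusivity check to runtime.
    states: RefCell<Vec<u32>>,
}

impl Toy {
    // Analogue of `Compiler::c`: push a state and return its ID.
    fn c(&self, v: u32) -> usize {
        let mut states = self.states.borrow_mut();
        states.push(v);
        states.len() - 1
    }

    // Analogue of `Compiler::c_concat`: consume already-compiled pieces.
    fn c_concat<I: Iterator<Item = usize>>(&self, it: I) -> Vec<usize> {
        it.collect()
    }

    fn compile_all(&self, exprs: &[u32]) -> Vec<usize> {
        // Both calls only need `&self`, so this composes cleanly, and the
        // short-lived `borrow_mut` inside `c` never overlaps another borrow.
        self.c_concat(exprs.iter().map(|&e| self.c(e)))
    }
}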
33 | | |
34 | | use std::cell::RefCell; |
35 | | use std::mem; |
36 | | |
37 | | use regex_syntax::hir::{self, Hir, HirKind}; |
38 | | use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; |
39 | | |
40 | | use classes::ByteClassSet; |
41 | | use error::{Error, Result}; |
42 | | use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; |
43 | | use nfa::range_trie::RangeTrie; |
44 | | use nfa::{State, StateID, Transition, NFA}; |
45 | | |
46 | | /// Config knobs for the NFA compiler. See the builder's methods for more |
47 | | /// docs on each one. |
48 | | #[derive(Clone, Copy, Debug)] |
49 | | struct Config { |
50 | | anchored: bool, |
51 | | allow_invalid_utf8: bool, |
52 | | reverse: bool, |
53 | | shrink: bool, |
54 | | } |
55 | | |
56 | | impl Default for Config { |
57 | 0 | fn default() -> Config { |
58 | 0 | Config { |
59 | 0 | anchored: false, |
60 | 0 | allow_invalid_utf8: false, |
61 | 0 | reverse: false, |
62 | 0 | shrink: true, |
63 | 0 | } |
64 | 0 | } |
65 | | } |
66 | | |
67 | | /// A builder for compiling an NFA. |
68 | | #[derive(Clone, Debug)] |
69 | | pub struct Builder { |
70 | | config: Config, |
71 | | } |
72 | | |
73 | | impl Builder { |
74 | | /// Create a new NFA builder with its default configuration. |
75 | 0 | pub fn new() -> Builder { |
76 | 0 | Builder { config: Config::default() } |
77 | 0 | } |
78 | | |
79 | | /// Compile the given high level intermediate representation of a regular |
80 | | /// expression into an NFA. |
81 | | /// |
82 | | /// If there was a problem building the NFA, then an error is returned. |
83 | | /// For example, if the regex uses unsupported features (such as zero-width |
84 | | /// assertions), then an error is returned. |
85 | 0 | pub fn build(&self, expr: &Hir) -> Result<NFA> { |
86 | 0 | let mut nfa = NFA::always_match(); |
87 | 0 | self.build_with(&mut Compiler::new(), &mut nfa, expr)?; |
88 | 0 | Ok(nfa) |
89 | 0 | } |
90 | | |
91 | | /// Compile the given high level intermediate representation of a regular |
92 | | /// expression into the given NFA using the given compiler. Callers may |
93 | | /// prefer this over `build` if they would like to reuse allocations while |
94 | | /// compiling many regular expressions. |
95 | | /// |
96 | | /// On success, the given NFA is completely overwritten with the NFA |
97 | | /// produced by the compiler. |
98 | | /// |
99 | | /// If there was a problem building the NFA, then an error is returned. For |
100 | | /// example, if the regex uses unsupported features (such as zero-width |
101 | | /// assertions), then an error is returned. When an error is returned, |
102 | | /// the contents of `nfa` are unspecified and should not be relied upon. |
103 | | /// However, it can still be reused in subsequent calls to this method. |
104 | 0 | pub fn build_with( |
105 | 0 | &self, |
106 | 0 | compiler: &mut Compiler, |
107 | 0 | nfa: &mut NFA, |
108 | 0 | expr: &Hir, |
109 | 0 | ) -> Result<()> { |
110 | 0 | compiler.clear(); |
111 | 0 | compiler.configure(self.config); |
112 | 0 | compiler.compile(nfa, expr) |
113 | 0 | } |
114 | | |
115 | | /// Set whether matching must be anchored at the beginning of the input. |
116 | | /// |
117 | | /// When enabled, a match must begin at the start of the input. When |
118 | | /// disabled, the NFA will act as if the pattern started with a `.*?`, |
119 | | /// which enables a match to appear anywhere. |
120 | | /// |
121 | | /// By default this is disabled. |
122 | 0 | pub fn anchored(&mut self, yes: bool) -> &mut Builder { |
123 | 0 | self.config.anchored = yes; |
124 | 0 | self |
125 | 0 | } |
126 | | |
127 | | /// When enabled, the builder will permit the construction of an NFA that |
128 | | /// may match invalid UTF-8. |
129 | | /// |
130 | | /// When disabled (the default), the builder is guaranteed to produce a |
131 | | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
132 | | /// will return an error). |
133 | 0 | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { |
134 | 0 | self.config.allow_invalid_utf8 = yes; |
135 | 0 | self |
136 | 0 | } |
137 | | |
138 | | /// Reverse the NFA. |
139 | | /// |
140 | | /// An NFA reversal is performed by reversing all of the concatenated |
141 | | /// sub-expressions in the original pattern, recursively. The resulting |
142 | | /// NFA can be used to match the pattern starting from the end of a string |
143 | | /// instead of the beginning of a string. |
144 | | /// |
145 | | /// Reversing the NFA is useful for building a reverse DFA, which is most |
146 | | /// useful for finding the start of a match. |
147 | 0 | pub fn reverse(&mut self, yes: bool) -> &mut Builder { |
148 | 0 | self.config.reverse = yes; |
149 | 0 | self |
150 | 0 | } |
151 | | |
152 | | /// Apply best effort heuristics to shrink the NFA at the expense of more |
153 | | /// time/memory. |
154 | | /// |
155 | | /// This is enabled by default. Generally speaking, if one is using an NFA |
156 | | /// to compile a DFA, then the extra time used to shrink the NFA will be |
157 | | /// more than made up for during DFA construction (potentially by a lot). |
158 | | /// In other words, enabling this can substantially decrease the overall |
159 | | /// amount of time it takes to build a DFA. |
160 | | /// |
161 | | /// The only reason to disable this is if you want to compile an NFA and |
162 | | /// start using it as quickly as possible without needing to build a DFA. |
163 | 0 | pub fn shrink(&mut self, yes: bool) -> &mut Builder { |
164 | 0 | self.config.shrink = yes; |
165 | 0 | self |
166 | 0 | } |
167 | | } |
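A minimal usage sketch, mirroring the `parse`/`build` helpers in the test module at the bottom of this file (the `build_anchored_nfa` name is hypothetical; `unwrap` is for brevity):

use regex_syntax::ParserBuilder;

fn build_anchored_nfa(pattern: &str) -> NFA {
    // Parse the pattern into regex-syntax's HIR, then compile it into an
    // anchored NFA. Unsupported features (anchors, word boundaries) make
    // `build` return an error, as documented above.
    let hir = ParserBuilder::new().build().parse(pattern).unwrap();
    Builder::new().anchored(true).build(&hir).unwrap()
}

Callers compiling many patterns can instead reuse a `Compiler` via `build_with`, as described in its documentation above, to amortize allocations.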
168 | | |
169 | | /// A compiler that converts a regex abstract syntax to an NFA via Thompson's |
170 | | /// construction. Namely, this compiler permits epsilon transitions between |
171 | | /// states. |
172 | | /// |
173 | | /// Users of this crate cannot use a compiler directly. Instead, all one can |
174 | | /// do is create one and use it via the |
175 | | /// [`Builder::build_with`](struct.Builder.html#method.build_with) |
176 | | /// method. This permits callers to reuse compilers in order to amortize |
177 | | /// allocations. |
178 | | #[derive(Clone, Debug)] |
179 | | pub struct Compiler { |
180 | | /// The set of compiled NFA states. Once a state is compiled, it is |
181 | | /// assigned a state ID equivalent to its index in this list. Subsequent |
182 | | /// compilation can modify previous states by adding new transitions. |
183 | | states: RefCell<Vec<CState>>, |
184 | | /// The configuration from the builder. |
185 | | config: Config, |
186 | | /// State used for compiling character classes to UTF-8 byte automata. |
187 | | /// State is not retained between character class compilations. This just |
188 | | /// serves to amortize allocation to the extent possible. |
189 | | utf8_state: RefCell<Utf8State>, |
190 | | /// State used for arranging character classes in reverse into a trie. |
191 | | trie_state: RefCell<RangeTrie>, |
192 | | /// State used for caching common suffixes when compiling reverse UTF-8 |
193 | | /// automata (for Unicode character classes). |
194 | | utf8_suffix: RefCell<Utf8SuffixMap>, |
195 | | /// A map used to re-map state IDs when translating the compiler's internal |
196 | | /// NFA state representation to the external NFA representation. |
197 | | remap: RefCell<Vec<StateID>>, |
198 | | /// A set of compiler internal state IDs that correspond to states that are |
199 | | /// exclusively epsilon transitions, i.e., goto instructions, combined with |
200 | | /// the state that they point to. This is used to record said states while |
201 | | /// transforming the compiler's internal NFA representation to the external |
202 | | /// form. |
203 | | empties: RefCell<Vec<(StateID, StateID)>>, |
204 | | } |
205 | | |
206 | | /// A compiler intermediate state representation for an NFA that is only used |
207 | | /// during compilation. Once compilation is done, `CState`s are converted to |
208 | | /// `State`s, which have a much simpler representation. |
209 | | #[derive(Clone, Debug, Eq, PartialEq)] |
210 | | enum CState { |
211 | | /// An empty state whose only purpose is to forward the automaton to |
212 | | /// another state via an epsilon transition. These are useful during |
213 | | /// compilation but are otherwise removed at the end. |
214 | | Empty { next: StateID }, |
215 | | /// A state that only transitions to `next` if the current input byte is |
216 | | /// in the range `[start, end]` (inclusive on both ends). |
217 | | Range { range: Transition }, |
218 | | /// A state with possibly many transitions, represented in a sparse |
219 | | /// fashion. Transitions are ordered lexicographically by input range. |
220 | | /// As such, this may only be used when every transition has equal |
221 | | /// priority. (In practice, this is only used for encoding large UTF-8 |
222 | | /// automata.) |
223 | | Sparse { ranges: Vec<Transition> }, |
224 | | /// An alternation such that there exists an epsilon transition to all |
225 | | /// states in `alternates`, where matches found via earlier transitions |
226 | | /// are preferred over later transitions. |
227 | | Union { alternates: Vec<StateID> }, |
228 | | /// An alternation such that there exists an epsilon transition to all |
229 | | /// states in `alternates`, where matches found via later transitions |
230 | | /// are preferred over earlier transitions. |
231 | | /// |
232 | | /// This "reverse" state exists for convenience during compilation that |
233 | | /// permits easy construction of non-greedy combinations of NFA states. |
234 | | /// At the end of compilation, Union and UnionReverse states are merged |
235 | | /// into one Union type of state, where the latter has its epsilon |
236 | | /// transitions reversed to reflect the priority inversion. |
237 | | UnionReverse { alternates: Vec<StateID> }, |
238 | | /// A match state. There is exactly one such occurrence of this state in |
239 | | /// an NFA. |
240 | | Match, |
241 | | } |
242 | | |
243 | | /// A value that represents the result of compiling a sub-expression of a |
244 | | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
245 | | /// has an initial state at `start` and a final state at `end`. |
246 | | #[derive(Clone, Copy, Debug)] |
247 | | pub struct ThompsonRef { |
248 | | start: StateID, |
249 | | end: StateID, |
250 | | } |
251 | | |
252 | | impl Compiler { |
253 | | /// Create a new compiler. |
254 | 0 | pub fn new() -> Compiler { |
255 | 0 | Compiler { |
256 | 0 | states: RefCell::new(vec![]), |
257 | 0 | config: Config::default(), |
258 | 0 | utf8_state: RefCell::new(Utf8State::new()), |
259 | 0 | trie_state: RefCell::new(RangeTrie::new()), |
260 | 0 | utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), |
261 | 0 | remap: RefCell::new(vec![]), |
262 | 0 | empties: RefCell::new(vec![]), |
263 | 0 | } |
264 | 0 | } |
265 | | |
266 | | /// Clear any memory used by this compiler such that it is ready to compile |
267 | | /// a new regex. |
268 | | /// |
269 | | /// It is preferable to reuse a compiler if possible in order to reuse |
270 | | /// allocations. |
271 | 0 | fn clear(&self) { |
272 | 0 | self.states.borrow_mut().clear(); |
273 | | // We don't need to clear anything else since they are cleared on |
274 | | // their own and only when they are used. |
275 | 0 | } |
276 | | |
277 | | /// Configure this compiler from the builder's knobs. |
278 | | /// |
279 | | /// The compiler is always reconfigured by the builder before using it to |
280 | | /// build an NFA. |
281 | 0 | fn configure(&mut self, config: Config) { |
282 | 0 | self.config = config; |
283 | 0 | } |
284 | | |
285 | | /// Convert the current intermediate NFA to its final compiled form. |
286 | 0 | fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { |
287 | 0 | nfa.anchored = self.config.anchored; |
288 | | |
289 | 0 | let mut start = self.add_empty(); |
290 | 0 | if !nfa.anchored { |
291 | 0 | let compiled = if self.config.allow_invalid_utf8 { |
292 | 0 | self.c_unanchored_prefix_invalid_utf8()? |
293 | | } else { |
294 | 0 | self.c_unanchored_prefix_valid_utf8()? |
295 | | }; |
296 | 0 | self.patch(start, compiled.start); |
297 | 0 | start = compiled.end; |
298 | 0 | } |
299 | 0 | let compiled = self.c(&expr)?; |
300 | 0 | let match_id = self.add_match(); |
301 | 0 | self.patch(start, compiled.start); |
302 | 0 | self.patch(compiled.end, match_id); |
303 | 0 | self.finish(nfa); |
304 | 0 | Ok(()) |
305 | 0 | } |
306 | | |
307 | | /// Finishes the compilation process and populates the provided NFA with |
308 | | /// the final graph. |
309 | 0 | fn finish(&self, nfa: &mut NFA) { |
310 | 0 | let mut bstates = self.states.borrow_mut(); |
311 | 0 | let mut remap = self.remap.borrow_mut(); |
312 | 0 | remap.resize(bstates.len(), 0); |
313 | 0 | let mut empties = self.empties.borrow_mut(); |
314 | 0 | empties.clear(); |
315 | | |
316 | | // We don't reuse allocations here because this is what we're |
317 | | // returning. |
318 | 0 | nfa.states.clear(); |
319 | 0 | let mut byteset = ByteClassSet::new(); |
320 | | |
321 | | // The idea here is to convert our intermediate states to their final |
322 | | // form. The only real complexity here is the process of converting |
323 | | // transitions, which are expressed in terms of state IDs. The new |
324 | | // set of states will be smaller because of partial epsilon removal, |
325 | | // so the state IDs will not be the same. |
326 | 0 | for (id, bstate) in bstates.iter_mut().enumerate() { |
327 | 0 | match *bstate { |
328 | 0 | CState::Empty { next } => { |
329 | 0 | // Since we're removing empty states, we need to handle |
330 | 0 | // them later since we don't yet know which new state this |
331 | 0 | // empty state will be mapped to. |
332 | 0 | empties.push((id, next)); |
333 | 0 | } |
334 | 0 | CState::Range { ref range } => { |
335 | 0 | remap[id] = nfa.states.len(); |
336 | 0 | byteset.set_range(range.start, range.end); |
337 | 0 | nfa.states.push(State::Range { range: range.clone() }); |
338 | 0 | } |
339 | 0 | CState::Sparse { ref mut ranges } => { |
340 | 0 | remap[id] = nfa.states.len(); |
341 | | |
342 | 0 | let ranges = mem::replace(ranges, vec![]); |
343 | 0 | for r in &ranges { |
344 | 0 | byteset.set_range(r.start, r.end); |
345 | 0 | } |
346 | 0 | nfa.states.push(State::Sparse { |
347 | 0 | ranges: ranges.into_boxed_slice(), |
348 | 0 | }); |
349 | | } |
350 | 0 | CState::Union { ref mut alternates } => { |
351 | 0 | remap[id] = nfa.states.len(); |
352 | 0 |
353 | 0 | let alternates = mem::replace(alternates, vec![]); |
354 | 0 | nfa.states.push(State::Union { |
355 | 0 | alternates: alternates.into_boxed_slice(), |
356 | 0 | }); |
357 | 0 | } |
358 | 0 | CState::UnionReverse { ref mut alternates } => { |
359 | 0 | remap[id] = nfa.states.len(); |
360 | 0 |
361 | 0 | let mut alternates = mem::replace(alternates, vec![]); |
362 | 0 | alternates.reverse(); |
363 | 0 | nfa.states.push(State::Union { |
364 | 0 | alternates: alternates.into_boxed_slice(), |
365 | 0 | }); |
366 | 0 | } |
367 | 0 | CState::Match => { |
368 | 0 | remap[id] = nfa.states.len(); |
369 | 0 | nfa.states.push(State::Match); |
370 | 0 | } |
371 | | } |
372 | | } |
373 | 0 | for &(empty_id, mut empty_next) in empties.iter() { |
374 | | // empty states can point to other empty states, forming a chain. |
375 | | // So we must follow the chain until the end, which must end at |
376 | | // a non-empty state, and therefore, a state that is correctly |
377 | | // remapped. We are guaranteed to terminate because our compiler |
378 | | // never builds a loop among empty states. |
379 | 0 | while let CState::Empty { next } = bstates[empty_next] { |
380 | 0 | empty_next = next; |
381 | 0 | } |
382 | 0 | remap[empty_id] = remap[empty_next]; |
383 | | } |
384 | 0 | for state in &mut nfa.states { |
385 | 0 | state.remap(&remap); |
386 | 0 | } |
387 | | // The compiler always begins the NFA at the first state. |
388 | 0 | nfa.start = remap[0]; |
389 | 0 | nfa.byte_classes = byteset.byte_classes(); |
390 | 0 | } |
391 | | |
392 | 0 | fn c(&self, expr: &Hir) -> Result<ThompsonRef> { |
393 | 0 | match *expr.kind() { |
394 | | HirKind::Empty => { |
395 | 0 | let id = self.add_empty(); |
396 | 0 | Ok(ThompsonRef { start: id, end: id }) |
397 | | } |
398 | 0 | HirKind::Literal(hir::Literal::Unicode(ch)) => { |
399 | 0 | let mut buf = [0; 4]; |
400 | 0 | let it = ch |
401 | 0 | .encode_utf8(&mut buf) |
402 | 0 | .as_bytes() |
403 | 0 | .iter() |
404 | 0 | .map(|&b| Ok(self.c_range(b, b))); |
405 | 0 | self.c_concat(it) |
406 | | } |
407 | 0 | HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), |
408 | 0 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
409 | 0 | self.c_byte_class(cls) |
410 | | } |
411 | 0 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
412 | 0 | self.c_unicode_class(cls) |
413 | | } |
414 | 0 | HirKind::Repetition(ref rep) => self.c_repetition(rep), |
415 | 0 | HirKind::Group(ref group) => self.c(&*group.hir), |
416 | 0 | HirKind::Concat(ref exprs) => { |
417 | 0 | self.c_concat(exprs.iter().map(|e| self.c(e))) |
418 | | } |
419 | 0 | HirKind::Alternation(ref exprs) => { |
420 | 0 | self.c_alternation(exprs.iter().map(|e| self.c(e))) |
421 | | } |
422 | 0 | HirKind::Anchor(_) => Err(Error::unsupported_anchor()), |
423 | 0 | HirKind::WordBoundary(_) => Err(Error::unsupported_word()), |
424 | | } |
425 | 0 | } |
426 | | |
427 | 0 | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> |
428 | 0 | where |
429 | 0 | I: DoubleEndedIterator<Item = Result<ThompsonRef>>, |
430 | | { |
431 | 0 | let first = |
432 | 0 | if self.config.reverse { it.next_back() } else { it.next() }; |
433 | 0 | let ThompsonRef { start, mut end } = match first { |
434 | 0 | Some(result) => result?, |
435 | 0 | None => return Ok(self.c_empty()), |
436 | | }; |
437 | | loop { |
438 | 0 | let next = |
439 | 0 | if self.config.reverse { it.next_back() } else { it.next() }; |
440 | 0 | let compiled = match next { |
441 | 0 | Some(result) => result?, |
442 | 0 | None => break, |
443 | | }; |
444 | 0 | self.patch(end, compiled.start); |
445 | 0 | end = compiled.end; |
446 | | } |
447 | 0 | Ok(ThompsonRef { start, end }) |
448 | 0 | } |
| | Unexecuted instantiation: <regex_automata::nfa::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::ops::range::Range<u32>, <regex_automata::nfa::compiler::Compiler>::c_exactly::{closure#0}>> |
| | Unexecuted instantiation: <regex_automata::nfa::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<regex_syntax::hir::Hir>, <regex_automata::nfa::compiler::Compiler>::c::{closure#1}>> |
| | Unexecuted instantiation: <regex_automata::nfa::compiler::Compiler>::c_concat::<core::iter::adapters::map::Map<core::slice::iter::Iter<u8>, <regex_automata::nfa::compiler::Compiler>::c::{closure#0}>> |
449 | | |
450 | 0 | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> |
451 | 0 | where |
452 | 0 | I: Iterator<Item = Result<ThompsonRef>>, |
453 | | { |
454 | 0 | let first = it.next().expect("alternations must be non-empty")?; |
455 | 0 | let second = match it.next() { |
456 | 0 | None => return Ok(first), |
457 | 0 | Some(result) => result?, |
458 | | }; |
459 | | |
460 | 0 | let union = self.add_union(); |
461 | 0 | let end = self.add_empty(); |
462 | 0 | self.patch(union, first.start); |
463 | 0 | self.patch(first.end, end); |
464 | 0 | self.patch(union, second.start); |
465 | 0 | self.patch(second.end, end); |
466 | 0 | for result in it { |
467 | 0 | let compiled = result?; |
468 | 0 | self.patch(union, compiled.start); |
469 | 0 | self.patch(compiled.end, end); |
470 | | } |
471 | 0 | Ok(ThompsonRef { start: union, end }) |
472 | 0 | } |
473 | | |
474 | 0 | fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { |
475 | 0 | match rep.kind { |
476 | | hir::RepetitionKind::ZeroOrOne => { |
477 | 0 | self.c_zero_or_one(&rep.hir, rep.greedy) |
478 | | } |
479 | | hir::RepetitionKind::ZeroOrMore => { |
480 | 0 | self.c_at_least(&rep.hir, rep.greedy, 0) |
481 | | } |
482 | | hir::RepetitionKind::OneOrMore => { |
483 | 0 | self.c_at_least(&rep.hir, rep.greedy, 1) |
484 | | } |
485 | 0 | hir::RepetitionKind::Range(ref rng) => match *rng { |
486 | 0 | hir::RepetitionRange::Exactly(count) => { |
487 | 0 | self.c_exactly(&rep.hir, count) |
488 | | } |
489 | 0 | hir::RepetitionRange::AtLeast(m) => { |
490 | 0 | self.c_at_least(&rep.hir, rep.greedy, m) |
491 | | } |
492 | 0 | hir::RepetitionRange::Bounded(min, max) => { |
493 | 0 | self.c_bounded(&rep.hir, rep.greedy, min, max) |
494 | | } |
495 | | }, |
496 | | } |
497 | 0 | } |
498 | | |
499 | 0 | fn c_bounded( |
500 | 0 | &self, |
501 | 0 | expr: &Hir, |
502 | 0 | greedy: bool, |
503 | 0 | min: u32, |
504 | 0 | max: u32, |
505 | 0 | ) -> Result<ThompsonRef> { |
506 | 0 | let prefix = self.c_exactly(expr, min)?; |
507 | 0 | if min == max { |
508 | 0 | return Ok(prefix); |
509 | 0 | } |
510 | | |
511 | | // It is tempting here to compile the rest as a concatenation |
512 | | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
513 | | // were `aaa?a?a?`. The problem here is that it leads to this program: |
514 | | // |
515 | | // >000000: 61 => 01 |
516 | | // 000001: 61 => 02 |
517 | | // 000002: alt(03, 04) |
518 | | // 000003: 61 => 04 |
519 | | // 000004: alt(05, 06) |
520 | | // 000005: 61 => 06 |
521 | | // 000006: alt(07, 08) |
522 | | // 000007: 61 => 08 |
523 | | // 000008: MATCH |
524 | | // |
525 | | // And effectively, once you hit state 2, the epsilon closure will |
526 | | // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is |
527 | | // better to instead compile it like so: |
528 | | // |
529 | | // >000000: 61 => 01 |
530 | | // 000001: 61 => 02 |
531 | | // 000002: alt(03, 08) |
532 | | // 000003: 61 => 04 |
533 | | // 000004: alt(05, 08) |
534 | | // 000005: 61 => 06 |
535 | | // 000006: alt(07, 08) |
536 | | // 000007: 61 => 08 |
537 | | // 000008: MATCH |
538 | | // |
539 | | // So that the epsilon closure of state 2 is now just 3 and 8. |
540 | 0 | let empty = self.add_empty(); |
541 | 0 | let mut prev_end = prefix.end; |
542 | 0 | for _ in min..max { |
543 | 0 | let union = if greedy { |
544 | 0 | self.add_union() |
545 | | } else { |
546 | 0 | self.add_reverse_union() |
547 | | }; |
548 | 0 | let compiled = self.c(expr)?; |
549 | 0 | self.patch(prev_end, union); |
550 | 0 | self.patch(union, compiled.start); |
551 | 0 | self.patch(union, empty); |
552 | 0 | prev_end = compiled.end; |
553 | | } |
554 | 0 | self.patch(prev_end, empty); |
555 | 0 | Ok(ThompsonRef { start: prefix.start, end: empty }) |
556 | 0 | } |
557 | | |
558 | 0 | fn c_at_least( |
559 | 0 | &self, |
560 | 0 | expr: &Hir, |
561 | 0 | greedy: bool, |
562 | 0 | n: u32, |
563 | 0 | ) -> Result<ThompsonRef> { |
564 | 0 | if n == 0 { |
565 | 0 | let union = if greedy { |
566 | 0 | self.add_union() |
567 | | } else { |
568 | 0 | self.add_reverse_union() |
569 | | }; |
570 | 0 | let compiled = self.c(expr)?; |
571 | 0 | self.patch(union, compiled.start); |
572 | 0 | self.patch(compiled.end, union); |
573 | 0 | Ok(ThompsonRef { start: union, end: union }) |
574 | 0 | } else if n == 1 { |
575 | 0 | let compiled = self.c(expr)?; |
576 | 0 | let union = if greedy { |
577 | 0 | self.add_union() |
578 | | } else { |
579 | 0 | self.add_reverse_union() |
580 | | }; |
581 | 0 | self.patch(compiled.end, union); |
582 | 0 | self.patch(union, compiled.start); |
583 | 0 | Ok(ThompsonRef { start: compiled.start, end: union }) |
584 | | } else { |
585 | 0 | let prefix = self.c_exactly(expr, n - 1)?; |
586 | 0 | let last = self.c(expr)?; |
587 | 0 | let union = if greedy { |
588 | 0 | self.add_union() |
589 | | } else { |
590 | 0 | self.add_reverse_union() |
591 | | }; |
592 | 0 | self.patch(prefix.end, last.start); |
593 | 0 | self.patch(last.end, union); |
594 | 0 | self.patch(union, last.start); |
595 | 0 | Ok(ThompsonRef { start: prefix.start, end: union }) |
596 | | } |
597 | 0 | } |
598 | | |
599 | 0 | fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { |
600 | 0 | let union = |
601 | 0 | if greedy { self.add_union() } else { self.add_reverse_union() }; |
602 | 0 | let compiled = self.c(expr)?; |
603 | 0 | let empty = self.add_empty(); |
604 | 0 | self.patch(union, compiled.start); |
605 | 0 | self.patch(union, empty); |
606 | 0 | self.patch(compiled.end, empty); |
607 | 0 | Ok(ThompsonRef { start: union, end: empty }) |
608 | 0 | } |
609 | | |
610 | 0 | fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { |
611 | 0 | let it = (0..n).map(|_| self.c(expr)); |
612 | 0 | self.c_concat(it) |
613 | 0 | } |
614 | | |
615 | 0 | fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { |
616 | 0 | let end = self.add_empty(); |
617 | 0 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
618 | 0 | for r in cls.iter() { |
619 | 0 | trans.push(Transition { |
620 | 0 | start: r.start(), |
621 | 0 | end: r.end(), |
622 | 0 | next: end, |
623 | 0 | }); |
624 | 0 | } |
625 | 0 | Ok(ThompsonRef { start: self.add_sparse(trans), end }) |
626 | 0 | } |
627 | | |
628 | 0 | fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { |
629 | | // If all we have are ASCII ranges wrapped in a Unicode package, then |
630 | | // there is zero reason to bring out the big guns. We can fit all ASCII |
631 | | // ranges within a single sparse transition. |
632 | 0 | if cls.is_all_ascii() { |
633 | 0 | let end = self.add_empty(); |
634 | 0 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
635 | 0 | for r in cls.iter() { |
636 | 0 | assert!(r.start() <= '\x7F'); |
637 | 0 | assert!(r.end() <= '\x7F'); |
638 | 0 | trans.push(Transition { |
639 | 0 | start: r.start() as u8, |
640 | 0 | end: r.end() as u8, |
641 | 0 | next: end, |
642 | 0 | }); |
643 | | } |
644 | 0 | Ok(ThompsonRef { start: self.add_sparse(trans), end }) |
645 | 0 | } else if self.config.reverse { |
646 | 0 | if !self.config.shrink { |
647 | | // When we don't want to spend the extra time shrinking, we |
648 | | // compile the UTF-8 automaton in reverse using something like |
649 | | // the "naive" approach, but will attempt to re-use common |
650 | | // suffixes. |
651 | 0 | self.c_unicode_class_reverse_with_suffix(cls) |
652 | | } else { |
653 | | // When we want to shrink our NFA for reverse UTF-8 automata, |
654 | | // we cannot feed UTF-8 sequences directly to the UTF-8 |
655 | | // compiler, since the UTF-8 compiler requires all sequences |
656 | | // to be lexicographically sorted. Instead, we organize our |
657 | | // sequences into a range trie, which can then output our |
658 | | // sequences in the correct order. Unfortunately, building the |
659 | | // range trie is fairly expensive (but not nearly as expensive |
660 | | // as building a DFA). Hence the reason why the 'shrink' option |
661 | | // exists, so that this path can be toggled off. |
662 | 0 | let mut trie = self.trie_state.borrow_mut(); |
663 | 0 | trie.clear(); |
664 | | |
665 | 0 | for rng in cls.iter() { |
666 | 0 | for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { |
667 | 0 | seq.reverse(); |
668 | 0 | trie.insert(seq.as_slice()); |
669 | 0 | } |
670 | | } |
671 | 0 | let mut utf8_state = self.utf8_state.borrow_mut(); |
672 | 0 | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); |
673 | 0 | trie.iter(|seq| { |
674 | 0 | utf8c.add(&seq); |
675 | 0 | }); |
676 | 0 | Ok(utf8c.finish()) |
677 | | } |
678 | | } else { |
679 | | // In the forward direction, we always shrink our UTF-8 automata |
680 | | // because we can stream it right into the UTF-8 compiler. There |
681 | | // is almost no downside (in either memory or time) to using this |
682 | | // approach. |
683 | 0 | let mut utf8_state = self.utf8_state.borrow_mut(); |
684 | 0 | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); |
685 | 0 | for rng in cls.iter() { |
686 | 0 | for seq in Utf8Sequences::new(rng.start(), rng.end()) { |
687 | 0 | utf8c.add(seq.as_slice()); |
688 | 0 | } |
689 | | } |
690 | 0 | Ok(utf8c.finish()) |
691 | | } |
692 | | |
693 | | // For reference, the code below is the "naive" version of compiling a |
694 | | // UTF-8 automaton. It is deliciously simple (and works for both the |
695 | | // forward and reverse cases), but will unfortunately produce very |
696 | | // large NFAs. When compiling a forward automaton, the size difference |
697 | | // can sometimes be an order of magnitude. For example, the '\w' regex |
698 | | // will generate about ~3000 NFA states using the naive approach below, |
699 | | // but only 283 states when using the approach above. This is because |
700 | | // the approach above actually compiles a *minimal* (or near minimal, |
701 | | // because of the bounded hashmap) UTF-8 automaton. |
702 | | // |
703 | | // The code below is kept as a reference point in order to make it |
704 | | // easier to understand the higher level goal here. |
705 | | /* |
706 | | let it = cls |
707 | | .iter() |
708 | | .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) |
709 | | .map(|seq| { |
710 | | let it = seq |
711 | | .as_slice() |
712 | | .iter() |
713 | | .map(|rng| Ok(self.c_range(rng.start, rng.end))); |
714 | | self.c_concat(it) |
715 | | }); |
716 | | self.c_alternation(it); |
717 | | */ |
718 | 0 | } |
719 | | |
720 | 0 | fn c_unicode_class_reverse_with_suffix( |
721 | 0 | &self, |
722 | 0 | cls: &hir::ClassUnicode, |
723 | 0 | ) -> Result<ThompsonRef> { |
724 | | // N.B. It would likely be better to cache common *prefixes* in the |
725 | | // reverse direction, but it's not quite clear how to do that. The |
726 | | // advantage of caching suffixes is that it does give us a win, and |
727 | | // has a very small additional overhead. |
728 | 0 | let mut cache = self.utf8_suffix.borrow_mut(); |
729 | 0 | cache.clear(); |
730 | | |
731 | 0 | let union = self.add_union(); |
732 | 0 | let alt_end = self.add_empty(); |
733 | 0 | for urng in cls.iter() { |
734 | 0 | for seq in Utf8Sequences::new(urng.start(), urng.end()) { |
735 | 0 | let mut end = alt_end; |
736 | 0 | for brng in seq.as_slice() { |
737 | 0 | let key = Utf8SuffixKey { |
738 | 0 | from: end, |
739 | 0 | start: brng.start, |
740 | 0 | end: brng.end, |
741 | 0 | }; |
742 | 0 | let hash = cache.hash(&key); |
743 | 0 | if let Some(id) = cache.get(&key, hash) { |
744 | 0 | end = id; |
745 | 0 | continue; |
746 | 0 | } |
747 | | |
748 | 0 | let compiled = self.c_range(brng.start, brng.end); |
749 | 0 | self.patch(compiled.end, end); |
750 | 0 | end = compiled.start; |
751 | 0 | cache.set(key, hash, end); |
752 | | } |
753 | 0 | self.patch(union, end); |
754 | | } |
755 | | } |
756 | 0 | Ok(ThompsonRef { start: union, end: alt_end }) |
757 | 0 | } |
758 | | |
759 | 0 | fn c_range(&self, start: u8, end: u8) -> ThompsonRef { |
760 | 0 | let id = self.add_range(start, end); |
761 | 0 | ThompsonRef { start: id, end: id } |
762 | 0 | } |
763 | | |
764 | 0 | fn c_empty(&self) -> ThompsonRef { |
765 | 0 | let id = self.add_empty(); |
766 | 0 | ThompsonRef { start: id, end: id } |
767 | 0 | } |
768 | | |
769 | 0 | fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { |
770 | 0 | self.c(&Hir::repetition(hir::Repetition { |
771 | 0 | kind: hir::RepetitionKind::ZeroOrMore, |
772 | 0 | greedy: false, |
773 | 0 | hir: Box::new(Hir::any(false)), |
774 | 0 | })) |
775 | 0 | } |
776 | | |
777 | 0 | fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { |
778 | 0 | self.c(&Hir::repetition(hir::Repetition { |
779 | 0 | kind: hir::RepetitionKind::ZeroOrMore, |
780 | 0 | greedy: false, |
781 | 0 | hir: Box::new(Hir::any(true)), |
782 | 0 | })) |
783 | 0 | } |
784 | | |
785 | 0 | fn patch(&self, from: StateID, to: StateID) { |
786 | 0 | match self.states.borrow_mut()[from] { |
787 | 0 | CState::Empty { ref mut next } => { |
788 | 0 | *next = to; |
789 | 0 | } |
790 | 0 | CState::Range { ref mut range } => { |
791 | 0 | range.next = to; |
792 | 0 | } |
793 | | CState::Sparse { .. } => { |
794 | 0 | panic!("cannot patch from a sparse NFA state") |
795 | | } |
796 | 0 | CState::Union { ref mut alternates } => { |
797 | 0 | alternates.push(to); |
798 | 0 | } |
799 | 0 | CState::UnionReverse { ref mut alternates } => { |
800 | 0 | alternates.push(to); |
801 | 0 | } |
802 | 0 | CState::Match => {} |
803 | | } |
804 | 0 | } |
805 | | |
806 | 0 | fn add_empty(&self) -> StateID { |
807 | 0 | let id = self.states.borrow().len(); |
808 | 0 | self.states.borrow_mut().push(CState::Empty { next: 0 }); |
809 | 0 | id |
810 | 0 | } |
811 | | |
812 | 0 | fn add_range(&self, start: u8, end: u8) -> StateID { |
813 | 0 | let id = self.states.borrow().len(); |
814 | 0 | let trans = Transition { start, end, next: 0 }; |
815 | 0 | let state = CState::Range { range: trans }; |
816 | 0 | self.states.borrow_mut().push(state); |
817 | 0 | id |
818 | 0 | } |
819 | | |
820 | 0 | fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { |
821 | 0 | if ranges.len() == 1 { |
822 | 0 | let id = self.states.borrow().len(); |
823 | 0 | let state = CState::Range { range: ranges[0] }; |
824 | 0 | self.states.borrow_mut().push(state); |
825 | 0 | return id; |
826 | 0 | } |
827 | 0 | let id = self.states.borrow().len(); |
828 | 0 | let state = CState::Sparse { ranges }; |
829 | 0 | self.states.borrow_mut().push(state); |
830 | 0 | id |
831 | 0 | } |
832 | | |
833 | 0 | fn add_union(&self) -> StateID { |
834 | 0 | let id = self.states.borrow().len(); |
835 | 0 | let state = CState::Union { alternates: vec![] }; |
836 | 0 | self.states.borrow_mut().push(state); |
837 | 0 | id |
838 | 0 | } |
839 | | |
840 | 0 | fn add_reverse_union(&self) -> StateID { |
841 | 0 | let id = self.states.borrow().len(); |
842 | 0 | let state = CState::UnionReverse { alternates: vec![] }; |
843 | 0 | self.states.borrow_mut().push(state); |
844 | 0 | id |
845 | 0 | } |
846 | | |
847 | 0 | fn add_match(&self) -> StateID { |
848 | 0 | let id = self.states.borrow().len(); |
849 | 0 | self.states.borrow_mut().push(CState::Match); |
850 | 0 | id |
851 | 0 | } |
852 | | } |
853 | | |
854 | | #[derive(Debug)] |
855 | | struct Utf8Compiler<'a> { |
856 | | nfac: &'a Compiler, |
857 | | state: &'a mut Utf8State, |
858 | | target: StateID, |
859 | | } |
860 | | |
861 | | #[derive(Clone, Debug)] |
862 | | struct Utf8State { |
863 | | compiled: Utf8BoundedMap, |
864 | | uncompiled: Vec<Utf8Node>, |
865 | | } |
866 | | |
867 | | #[derive(Clone, Debug)] |
868 | | struct Utf8Node { |
869 | | trans: Vec<Transition>, |
870 | | last: Option<Utf8LastTransition>, |
871 | | } |
872 | | |
873 | | #[derive(Clone, Debug)] |
874 | | struct Utf8LastTransition { |
875 | | start: u8, |
876 | | end: u8, |
877 | | } |
878 | | |
879 | | impl Utf8State { |
880 | 0 | fn new() -> Utf8State { |
881 | 0 | Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } |
882 | 0 | } |
883 | | |
884 | 0 | fn clear(&mut self) { |
885 | 0 | self.compiled.clear(); |
886 | 0 | self.uncompiled.clear(); |
887 | 0 | } |
888 | | } |
889 | | |
890 | | impl<'a> Utf8Compiler<'a> { |
891 | 0 | fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { |
892 | 0 | let target = nfac.add_empty(); |
893 | 0 | state.clear(); |
894 | 0 | let mut utf8c = Utf8Compiler { nfac, state, target }; |
895 | 0 | utf8c.add_empty(); |
896 | 0 | utf8c |
897 | 0 | } |
898 | | |
899 | 0 | fn finish(&mut self) -> ThompsonRef { |
900 | 0 | self.compile_from(0); |
901 | 0 | let node = self.pop_root(); |
902 | 0 | let start = self.compile(node); |
903 | 0 | ThompsonRef { start, end: self.target } |
904 | 0 | } |
905 | | |
906 | 0 | fn add(&mut self, ranges: &[Utf8Range]) { |
907 | 0 | let prefix_len = ranges |
908 | 0 | .iter() |
909 | 0 | .zip(&self.state.uncompiled) |
910 | 0 | .take_while(|&(range, node)| { |
911 | 0 | node.last.as_ref().map_or(false, |t| { |
912 | 0 | (t.start, t.end) == (range.start, range.end) |
913 | 0 | }) |
914 | 0 | }) |
915 | 0 | .count(); |
916 | 0 | assert!(prefix_len < ranges.len()); |
917 | 0 | self.compile_from(prefix_len); |
918 | 0 | self.add_suffix(&ranges[prefix_len..]); |
919 | 0 | } |
920 | | |
921 | 0 | fn compile_from(&mut self, from: usize) { |
922 | 0 | let mut next = self.target; |
923 | 0 | while from + 1 < self.state.uncompiled.len() { |
924 | 0 | let node = self.pop_freeze(next); |
925 | 0 | next = self.compile(node); |
926 | 0 | } |
927 | 0 | self.top_last_freeze(next); |
928 | 0 | } |
929 | | |
930 | 0 | fn compile(&mut self, node: Vec<Transition>) -> StateID { |
931 | 0 | let hash = self.state.compiled.hash(&node); |
932 | 0 | if let Some(id) = self.state.compiled.get(&node, hash) { |
933 | 0 | return id; |
934 | 0 | } |
935 | 0 | let id = self.nfac.add_sparse(node.clone()); |
936 | 0 | self.state.compiled.set(node, hash, id); |
937 | 0 | id |
938 | 0 | } |
939 | | |
940 | 0 | fn add_suffix(&mut self, ranges: &[Utf8Range]) { |
941 | 0 | assert!(!ranges.is_empty()); |
942 | 0 | let last = self |
943 | 0 | .state |
944 | 0 | .uncompiled |
945 | 0 | .len() |
946 | 0 | .checked_sub(1) |
947 | 0 | .expect("non-empty nodes"); |
948 | 0 | assert!(self.state.uncompiled[last].last.is_none()); |
949 | 0 | self.state.uncompiled[last].last = Some(Utf8LastTransition { |
950 | 0 | start: ranges[0].start, |
951 | 0 | end: ranges[0].end, |
952 | 0 | }); |
953 | 0 | for r in &ranges[1..] { |
954 | 0 | self.state.uncompiled.push(Utf8Node { |
955 | 0 | trans: vec![], |
956 | 0 | last: Some(Utf8LastTransition { start: r.start, end: r.end }), |
957 | 0 | }); |
958 | 0 | } |
959 | 0 | } |
960 | | |
961 | 0 | fn add_empty(&mut self) { |
962 | 0 | self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); |
963 | 0 | } |
964 | | |
965 | 0 | fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { |
966 | 0 | let mut uncompiled = self.state.uncompiled.pop().unwrap(); |
967 | 0 | uncompiled.set_last_transition(next); |
968 | 0 | uncompiled.trans |
969 | 0 | } |
970 | | |
971 | 0 | fn pop_root(&mut self) -> Vec<Transition> { |
972 | 0 | assert_eq!(self.state.uncompiled.len(), 1); |
973 | 0 | assert!(self.state.uncompiled[0].last.is_none()); |
974 | 0 | self.state.uncompiled.pop().expect("non-empty nodes").trans |
975 | 0 | } |
976 | | |
977 | 0 | fn top_last_freeze(&mut self, next: StateID) { |
978 | 0 | let last = self |
979 | 0 | .state |
980 | 0 | .uncompiled |
981 | 0 | .len() |
982 | 0 | .checked_sub(1) |
983 | 0 | .expect("non-empty nodes"); |
984 | 0 | self.state.uncompiled[last].set_last_transition(next); |
985 | 0 | } |
986 | | } |
987 | | |
988 | | impl Utf8Node { |
989 | 0 | fn set_last_transition(&mut self, next: StateID) { |
990 | 0 | if let Some(last) = self.last.take() { |
991 | 0 | self.trans.push(Transition { |
992 | 0 | start: last.start, |
993 | 0 | end: last.end, |
994 | 0 | next, |
995 | 0 | }); |
996 | 0 | } |
997 | 0 | } |
998 | | } |
999 | | |
1000 | | #[cfg(test)] |
1001 | | mod tests { |
1002 | | use regex_syntax::hir::Hir; |
1003 | | use regex_syntax::ParserBuilder; |
1004 | | |
1005 | | use super::{Builder, State, StateID, Transition, NFA}; |
1006 | | |
1007 | | fn parse(pattern: &str) -> Hir { |
1008 | | ParserBuilder::new().build().parse(pattern).unwrap() |
1009 | | } |
1010 | | |
1011 | | fn build(pattern: &str) -> NFA { |
1012 | | Builder::new().anchored(true).build(&parse(pattern)).unwrap() |
1013 | | } |
1014 | | |
1015 | | fn s_byte(byte: u8, next: StateID) -> State { |
1016 | | let trans = Transition { start: byte, end: byte, next }; |
1017 | | State::Range { range: trans } |
1018 | | } |
1019 | | |
1020 | | fn s_range(start: u8, end: u8, next: StateID) -> State { |
1021 | | let trans = Transition { start, end, next }; |
1022 | | State::Range { range: trans } |
1023 | | } |
1024 | | |
1025 | | fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { |
1026 | | let ranges = ranges |
1027 | | .iter() |
1028 | | .map(|&(start, end, next)| Transition { start, end, next }) |
1029 | | .collect(); |
1030 | | State::Sparse { ranges } |
1031 | | } |
1032 | | |
1033 | | fn s_union(alts: &[StateID]) -> State { |
1034 | | State::Union { alternates: alts.to_vec().into_boxed_slice() } |
1035 | | } |
1036 | | |
1037 | | fn s_match() -> State { |
1038 | | State::Match |
1039 | | } |
1040 | | |
1041 | | #[test] |
1042 | | fn errors() { |
1043 | | // unsupported anchors |
1044 | | assert!(Builder::new().build(&parse(r"^")).is_err()); |
1045 | | assert!(Builder::new().build(&parse(r"$")).is_err()); |
1046 | | assert!(Builder::new().build(&parse(r"\A")).is_err()); |
1047 | | assert!(Builder::new().build(&parse(r"\z")).is_err()); |
1048 | | |
1049 | | // unsupported word boundaries |
1050 | | assert!(Builder::new().build(&parse(r"\b")).is_err()); |
1051 | | assert!(Builder::new().build(&parse(r"\B")).is_err()); |
1052 | | assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); |
1053 | | } |
1054 | | |
1055 | | // Test that building an unanchored NFA has an appropriate `.*?` prefix. |
1056 | | #[test] |
1057 | | fn compile_unanchored_prefix() { |
1058 | | // When the machine can only match valid UTF-8. |
1059 | | let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); |
1060 | | // There should be many states since the `.` in `.*?` matches any |
1061 | | // Unicode scalar value. |
1062 | | assert_eq!(11, nfa.len()); |
1063 | | assert_eq!(nfa.states[10], s_match()); |
1064 | | assert_eq!(nfa.states[9], s_byte(b'a', 10)); |
1065 | | |
1066 | | // When the machine can match invalid UTF-8. |
1067 | | let nfa = Builder::new() |
1068 | | .anchored(false) |
1069 | | .allow_invalid_utf8(true) |
1070 | | .build(&parse(r"a")) |
1071 | | .unwrap(); |
1072 | | assert_eq!( |
1073 | | nfa.states, |
1074 | | &[ |
1075 | | s_union(&[2, 1]), |
1076 | | s_range(0, 255, 0), |
1077 | | s_byte(b'a', 3), |
1078 | | s_match(), |
1079 | | ] |
1080 | | ); |
1081 | | } |
1082 | | |
1083 | | #[test] |
1084 | | fn compile_empty() { |
1085 | | assert_eq!(build("").states, &[s_match(),]); |
1086 | | } |
1087 | | |
1088 | | #[test] |
1089 | | fn compile_literal() { |
1090 | | assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); |
1091 | | assert_eq!( |
1092 | | build("ab").states, |
1093 | | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] |
1094 | | ); |
1095 | | assert_eq!( |
1096 | | build("ā").states, |
1097 | | &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] |
1098 | | ); |
1099 | | |
1100 | | // Check that non-UTF-8 literals work. |
1101 | | let hir = ParserBuilder::new() |
1102 | | .allow_invalid_utf8(true) |
1103 | | .build() |
1104 | | .parse(r"(?-u)\xFF") |
1105 | | .unwrap(); |
1106 | | let nfa = Builder::new() |
1107 | | .anchored(true) |
1108 | | .allow_invalid_utf8(true) |
1109 | | .build(&hir) |
1110 | | .unwrap(); |
1111 | | assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); |
1112 | | } |
1113 | | |
1114 | | #[test] |
1115 | | fn compile_class() { |
1116 | | assert_eq!( |
1117 | | build(r"[a-z]").states, |
1118 | | &[s_range(b'a', b'z', 1), s_match(),] |
1119 | | ); |
1120 | | assert_eq!( |
1121 | | build(r"[x-za-c]").states, |
1122 | | &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] |
1123 | | ); |
1124 | | assert_eq!( |
1125 | | build(r"[\u03B1-\u03B4]").states, |
1126 | | &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] |
1127 | | ); |
1128 | | assert_eq!( |
1129 | | build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, |
1130 | | &[ |
1131 | | s_range(0xB1, 0xB4, 5), |
1132 | | s_range(0x99, 0x9E, 5), |
1133 | | s_byte(0xA4, 1), |
1134 | | s_byte(0x9F, 2), |
1135 | | s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), |
1136 | | s_match(), |
1137 | | ] |
1138 | | ); |
1139 | | assert_eq!( |
1140 | | build(r"[a-zā]").states, |
1141 | | &[ |
1142 | | s_byte(0x83, 3), |
1143 | | s_byte(0x98, 0), |
1144 | | s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), |
1145 | | s_match(), |
1146 | | ] |
1147 | | ); |
1148 | | } |
1149 | | |
1150 | | #[test] |
1151 | | fn compile_repetition() { |
1152 | | assert_eq!( |
1153 | | build(r"a?").states, |
1154 | | &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] |
1155 | | ); |
1156 | | assert_eq!( |
1157 | | build(r"a??").states, |
1158 | | &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] |
1159 | | ); |
1160 | | } |
1161 | | |
1162 | | #[test] |
1163 | | fn compile_group() { |
1164 | | assert_eq!( |
1165 | | build(r"ab+").states, |
1166 | | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] |
1167 | | ); |
1168 | | assert_eq!( |
1169 | | build(r"(ab)").states, |
1170 | | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] |
1171 | | ); |
1172 | | assert_eq!( |
1173 | | build(r"(ab)+").states, |
1174 | | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] |
1175 | | ); |
1176 | | } |
1177 | | |
1178 | | #[test] |
1179 | | fn compile_alternation() { |
1180 | | assert_eq!( |
1181 | | build(r"a|b").states, |
1182 | | &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] |
1183 | | ); |
1184 | | assert_eq!( |
1185 | | build(r"|b").states, |
1186 | | &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] |
1187 | | ); |
1188 | | assert_eq!( |
1189 | | build(r"a|").states, |
1190 | | &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] |
1191 | | ); |
1192 | | } |
1193 | | } |