/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-1.8.1/src/exec.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use std::cell::RefCell; |
2 | | use std::collections::HashMap; |
3 | | use std::panic::AssertUnwindSafe; |
4 | | use std::sync::Arc; |
5 | | |
6 | | #[cfg(feature = "perf-literal")] |
7 | | use aho_corasick::{AhoCorasick, MatchKind}; |
8 | | use regex_syntax::hir::literal; |
9 | | use regex_syntax::hir::{Hir, Look}; |
10 | | use regex_syntax::ParserBuilder; |
11 | | |
12 | | use crate::backtrack; |
13 | | use crate::compile::Compiler; |
14 | | #[cfg(feature = "perf-dfa")] |
15 | | use crate::dfa; |
16 | | use crate::error::Error; |
17 | | use crate::input::{ByteInput, CharInput}; |
18 | | use crate::literal::LiteralSearcher; |
19 | | use crate::pikevm; |
20 | | use crate::pool::{Pool, PoolGuard}; |
21 | | use crate::prog::Program; |
22 | | use crate::re_builder::RegexOptions; |
23 | | use crate::re_bytes; |
24 | | use crate::re_set; |
25 | | use crate::re_trait::{Locations, RegularExpression, Slot}; |
26 | | use crate::re_unicode; |
27 | | use crate::utf8::next_utf8; |
28 | | |
29 | | /// `Exec` manages the execution of a regular expression. |
30 | | /// |
31 | | /// In particular, this manages the various compiled forms of a single regular |
32 | | /// expression and the choice of which matching engine to use to execute a |
33 | | /// regular expression. |
34 | 0 | #[derive(Debug)] |
35 | | pub struct Exec { |
36 | | /// All read only state. |
37 | | ro: Arc<ExecReadOnly>, |
38 | | /// A pool of reusable values for the various matching engines. |
39 | | /// |
40 | | /// Note that boxing this value is not strictly necessary, but it is an |
41 | | /// easy way to ensure that T does not bloat the stack sized used by a pool |
42 | | /// in the case where T is big. And this turns out to be the case at the |
43 | | /// time of writing for regex's use of this pool. At the time of writing, |
44 | | /// the size of a Regex on the stack is 856 bytes. Boxing this value |
45 | | /// reduces that size to 16 bytes. |
46 | | pool: Box<Pool<ProgramCache>>, |
47 | | } |
48 | | |
49 | | /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This |
50 | | /// means it is no longer Sync, but we can now avoid the overhead of |
51 | | /// synchronization to fetch the cache. |
52 | 0 | #[derive(Debug)] |
53 | | pub struct ExecNoSync<'c> { |
54 | | /// All read only state. |
55 | | ro: &'c Arc<ExecReadOnly>, |
56 | | /// Caches for the various matching engines. |
57 | | cache: PoolGuard<'c, ProgramCache>, |
58 | | } |
59 | | |
60 | | /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. |
61 | 0 | #[derive(Debug)] |
62 | | pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); |
63 | | |
64 | | /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such |
65 | | /// state is determined at compile time and never changes during search. |
66 | 0 | #[derive(Debug)] |
67 | | struct ExecReadOnly { |
68 | | /// The original regular expressions given by the caller to compile. |
69 | | res: Vec<String>, |
70 | | /// A compiled program that is used in the NFA simulation and backtracking. |
71 | | /// It can be byte-based or Unicode codepoint based. |
72 | | /// |
73 | | /// N.B. It is not possibly to make this byte-based from the public API. |
74 | | /// It is only used for testing byte based programs in the NFA simulations. |
75 | | nfa: Program, |
76 | | /// A compiled byte based program for DFA execution. This is only used |
77 | | /// if a DFA can be executed. (Currently, only word boundary assertions are |
78 | | /// not supported.) Note that this program contains an embedded `.*?` |
79 | | /// preceding the first capture group, unless the regex is anchored at the |
80 | | /// beginning. |
81 | | #[allow(dead_code)] |
82 | | dfa: Program, |
83 | | /// The same as above, except the program is reversed (and there is no |
84 | | /// preceding `.*?`). This is used by the DFA to find the starting location |
85 | | /// of matches. |
86 | | #[allow(dead_code)] |
87 | | dfa_reverse: Program, |
88 | | /// A set of suffix literals extracted from the regex. |
89 | | /// |
90 | | /// Prefix literals are stored on the `Program`, since they are used inside |
91 | | /// the matching engines. |
92 | | #[allow(dead_code)] |
93 | | suffixes: LiteralSearcher, |
94 | | /// An Aho-Corasick automaton with leftmost-first match semantics. |
95 | | /// |
96 | | /// This is only set when the entire regex is a simple unanchored |
97 | | /// alternation of literals. We could probably use it more circumstances, |
98 | | /// but this is already hacky enough in this architecture. |
99 | | /// |
100 | | /// N.B. We use u32 as a state ID representation under the assumption that |
101 | | /// if we were to exhaust the ID space, we probably would have long |
102 | | /// surpassed the compilation size limit. |
103 | | #[cfg(feature = "perf-literal")] |
104 | | ac: Option<AhoCorasick>, |
105 | | /// match_type encodes as much upfront knowledge about how we're going to |
106 | | /// execute a search as possible. |
107 | | match_type: MatchType, |
108 | | } |
109 | | |
110 | | /// Facilitates the construction of an executor by exposing various knobs |
111 | | /// to control how a regex is executed and what kinds of resources it's |
112 | | /// permitted to use. |
113 | | // `ExecBuilder` is only public via the `internal` module, so avoid deriving |
114 | | // `Debug`. |
115 | | #[allow(missing_debug_implementations)] |
116 | | pub struct ExecBuilder { |
117 | | options: RegexOptions, |
118 | | match_type: Option<MatchType>, |
119 | | bytes: bool, |
120 | | only_utf8: bool, |
121 | | } |
122 | | |
123 | | /// Parsed represents a set of parsed regular expressions and their detected |
124 | | /// literals. |
125 | | struct Parsed { |
126 | | exprs: Vec<Hir>, |
127 | | prefixes: literal::Seq, |
128 | | suffixes: literal::Seq, |
129 | | bytes: bool, |
130 | | } |
131 | | |
132 | | impl ExecBuilder { |
133 | | /// Create a regex execution builder. |
134 | | /// |
135 | | /// This uses default settings for everything except the regex itself, |
136 | | /// which must be provided. Further knobs can be set by calling methods, |
137 | | /// and then finally, `build` to actually create the executor. |
138 | 0 | pub fn new(re: &str) -> Self { |
139 | 0 | Self::new_many(&[re]) |
140 | 0 | } |
141 | | |
142 | | /// Like new, but compiles the union of the given regular expressions. |
143 | | /// |
144 | | /// Note that when compiling 2 or more regular expressions, capture groups |
145 | | /// are completely unsupported. (This means both `find` and `captures` |
146 | | /// won't work.) |
147 | 0 | pub fn new_many<I, S>(res: I) -> Self |
148 | 0 | where |
149 | 0 | S: AsRef<str>, |
150 | 0 | I: IntoIterator<Item = S>, |
151 | 0 | { |
152 | 0 | let mut opts = RegexOptions::default(); |
153 | 0 | opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); |
154 | 0 | Self::new_options(opts) |
155 | 0 | } |
156 | | |
157 | | /// Create a regex execution builder. |
158 | 0 | pub fn new_options(opts: RegexOptions) -> Self { |
159 | 0 | ExecBuilder { |
160 | 0 | options: opts, |
161 | 0 | match_type: None, |
162 | 0 | bytes: false, |
163 | 0 | only_utf8: true, |
164 | 0 | } |
165 | 0 | } |
166 | | |
167 | | /// Set the matching engine to be automatically determined. |
168 | | /// |
169 | | /// This is the default state and will apply whatever optimizations are |
170 | | /// possible, such as running a DFA. |
171 | | /// |
172 | | /// This overrides whatever was previously set via the `nfa` or |
173 | | /// `bounded_backtracking` methods. |
174 | 0 | pub fn automatic(mut self) -> Self { |
175 | 0 | self.match_type = None; |
176 | 0 | self |
177 | 0 | } |
178 | | |
179 | | /// Sets the matching engine to use the NFA algorithm no matter what |
180 | | /// optimizations are possible. |
181 | | /// |
182 | | /// This overrides whatever was previously set via the `automatic` or |
183 | | /// `bounded_backtracking` methods. |
184 | 0 | pub fn nfa(mut self) -> Self { |
185 | 0 | self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); |
186 | 0 | self |
187 | 0 | } |
188 | | |
189 | | /// Sets the matching engine to use a bounded backtracking engine no |
190 | | /// matter what optimizations are possible. |
191 | | /// |
192 | | /// One must use this with care, since the bounded backtracking engine |
193 | | /// uses memory proportion to `len(regex) * len(text)`. |
194 | | /// |
195 | | /// This overrides whatever was previously set via the `automatic` or |
196 | | /// `nfa` methods. |
197 | 0 | pub fn bounded_backtracking(mut self) -> Self { |
198 | 0 | self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); |
199 | 0 | self |
200 | 0 | } |
201 | | |
202 | | /// Compiles byte based programs for use with the NFA matching engines. |
203 | | /// |
204 | | /// By default, the NFA engines match on Unicode scalar values. They can |
205 | | /// be made to use byte based programs instead. In general, the byte based |
206 | | /// programs are slower because of a less efficient encoding of character |
207 | | /// classes. |
208 | | /// |
209 | | /// Note that this does not impact DFA matching engines, which always |
210 | | /// execute on bytes. |
211 | 0 | pub fn bytes(mut self, yes: bool) -> Self { |
212 | 0 | self.bytes = yes; |
213 | 0 | self |
214 | 0 | } |
215 | | |
216 | | /// When disabled, the program compiled may match arbitrary bytes. |
217 | | /// |
218 | | /// When enabled (the default), all compiled programs exclusively match |
219 | | /// valid UTF-8 bytes. |
220 | 0 | pub fn only_utf8(mut self, yes: bool) -> Self { |
221 | 0 | self.only_utf8 = yes; |
222 | 0 | self |
223 | 0 | } |
224 | | |
225 | | /// Set the Unicode flag. |
226 | 0 | pub fn unicode(mut self, yes: bool) -> Self { |
227 | 0 | self.options.unicode = yes; |
228 | 0 | self |
229 | 0 | } |
230 | | |
231 | | /// Parse the current set of patterns into their AST and extract literals. |
232 | 0 | fn parse(&self) -> Result<Parsed, Error> { |
233 | 0 | let mut exprs = Vec::with_capacity(self.options.pats.len()); |
234 | 0 | let mut prefixes = Some(literal::Seq::empty()); |
235 | 0 | let mut suffixes = Some(literal::Seq::empty()); |
236 | 0 | let mut bytes = false; |
237 | 0 | let is_set = self.options.pats.len() > 1; |
238 | | // If we're compiling a regex set and that set has any anchored |
239 | | // expressions, then disable all literal optimizations. |
240 | 0 | for pat in &self.options.pats { |
241 | 0 | let mut parser = ParserBuilder::new() |
242 | 0 | .octal(self.options.octal) |
243 | 0 | .case_insensitive(self.options.case_insensitive) |
244 | 0 | .multi_line(self.options.multi_line) |
245 | 0 | .dot_matches_new_line(self.options.dot_matches_new_line) |
246 | 0 | .swap_greed(self.options.swap_greed) |
247 | 0 | .ignore_whitespace(self.options.ignore_whitespace) |
248 | 0 | .unicode(self.options.unicode) |
249 | 0 | .utf8(self.only_utf8) |
250 | 0 | .nest_limit(self.options.nest_limit) |
251 | 0 | .build(); |
252 | 0 | let expr = |
253 | 0 | parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; |
254 | 0 | let props = expr.properties(); |
255 | 0 | // This used to just check whether the HIR matched valid UTF-8 |
256 | 0 | // or not, but in regex-syntax 0.7, we changed our definition of |
257 | 0 | // "matches valid UTF-8" to exclude zero-width matches. And in |
258 | 0 | // particular, previously, we considered WordAsciiNegate (that |
259 | 0 | // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our |
260 | 0 | // matcher engines were built under this assumption and fixing |
261 | 0 | // them is not worth it with the imminent plan to switch over to |
262 | 0 | // regex-automata. So for now, we retain the previous behavior by |
263 | 0 | // just explicitly treating the presence of a negated ASCII word |
264 | 0 | // boundary as forcing use to use a byte oriented automaton. |
265 | 0 | bytes = bytes |
266 | 0 | || !props.is_utf8() |
267 | 0 | || props.look_set().contains(Look::WordAsciiNegate); |
268 | | |
269 | 0 | if cfg!(feature = "perf-literal") { |
270 | 0 | if !props.look_set_prefix().contains(Look::Start) |
271 | 0 | && props.look_set().contains(Look::Start) |
272 | 0 | { |
273 | 0 | // Partial anchors unfortunately make it hard to use |
274 | 0 | // prefixes, so disable them. |
275 | 0 | prefixes = None; |
276 | 0 | } else if is_set |
277 | 0 | && props.look_set_prefix_any().contains(Look::Start) |
278 | 0 | { |
279 | 0 | // Regex sets with anchors do not go well with literal |
280 | 0 | // optimizations. |
281 | 0 | prefixes = None; |
282 | 0 | } else if props.look_set_prefix_any().contains_word() { |
283 | 0 | // The new literal extractor ignores look-around while |
284 | 0 | // the old one refused to extract prefixes from regexes |
285 | 0 | // that began with a \b. These old creaky regex internals |
286 | 0 | // can't deal with it, so we drop it. |
287 | 0 | prefixes = None; |
288 | 0 | } else if props.look_set_prefix_any().contains(Look::StartLF) { |
289 | 0 | // Similar to the reasoning for word boundaries, this old |
290 | 0 | // regex engine can't handle literal prefixes with '(?m:^)' |
291 | 0 | // at the beginning of a regex. |
292 | 0 | prefixes = None; |
293 | 0 | } |
294 | | |
295 | 0 | if !props.look_set_suffix().contains(Look::End) |
296 | 0 | && props.look_set().contains(Look::End) |
297 | 0 | { |
298 | 0 | // Partial anchors unfortunately make it hard to use |
299 | 0 | // suffixes, so disable them. |
300 | 0 | suffixes = None; |
301 | 0 | } else if is_set |
302 | 0 | && props.look_set_suffix_any().contains(Look::End) |
303 | 0 | { |
304 | 0 | // Regex sets with anchors do not go well with literal |
305 | 0 | // optimizations. |
306 | 0 | suffixes = None; |
307 | 0 | } else if props.look_set_suffix_any().contains_word() { |
308 | 0 | // See the prefix case for reasoning here. |
309 | 0 | suffixes = None; |
310 | 0 | } else if props.look_set_suffix_any().contains(Look::EndLF) { |
311 | 0 | // See the prefix case for reasoning here. |
312 | 0 | suffixes = None; |
313 | 0 | } |
314 | | |
315 | 0 | let (mut pres, mut suffs) = |
316 | 0 | if prefixes.is_none() && suffixes.is_none() { |
317 | 0 | (literal::Seq::infinite(), literal::Seq::infinite()) |
318 | | } else { |
319 | 0 | literal_analysis(&expr) |
320 | | }; |
321 | | // These old creaky regex internals can't handle cases where |
322 | | // the literal sequences are exact but there are look-around |
323 | | // assertions. So we make sure the sequences are inexact if |
324 | | // there are look-around assertions anywhere. This forces the |
325 | | // regex engines to run instead of assuming that a literal |
326 | | // match implies an overall match. |
327 | 0 | if !props.look_set().is_empty() { |
328 | 0 | pres.make_inexact(); |
329 | 0 | suffs.make_inexact(); |
330 | 0 | } |
331 | 0 | prefixes = prefixes.and_then(|mut prefixes| { |
332 | 0 | prefixes.union(&mut pres); |
333 | 0 | Some(prefixes) |
334 | 0 | }); |
335 | 0 | suffixes = suffixes.and_then(|mut suffixes| { |
336 | 0 | suffixes.union(&mut suffs); |
337 | 0 | Some(suffixes) |
338 | 0 | }); |
339 | 0 | } |
340 | 0 | exprs.push(expr); |
341 | | } |
342 | 0 | Ok(Parsed { |
343 | 0 | exprs, |
344 | 0 | prefixes: prefixes.unwrap_or_else(literal::Seq::empty), |
345 | 0 | suffixes: suffixes.unwrap_or_else(literal::Seq::empty), |
346 | 0 | bytes, |
347 | 0 | }) |
348 | 0 | } |
349 | | |
350 | | /// Build an executor that can run a regular expression. |
351 | 0 | pub fn build(self) -> Result<Exec, Error> { |
352 | 0 | // Special case when we have no patterns to compile. |
353 | 0 | // This can happen when compiling a regex set. |
354 | 0 | if self.options.pats.is_empty() { |
355 | 0 | let ro = Arc::new(ExecReadOnly { |
356 | 0 | res: vec![], |
357 | 0 | nfa: Program::new(), |
358 | 0 | dfa: Program::new(), |
359 | 0 | dfa_reverse: Program::new(), |
360 | 0 | suffixes: LiteralSearcher::empty(), |
361 | 0 | #[cfg(feature = "perf-literal")] |
362 | 0 | ac: None, |
363 | 0 | match_type: MatchType::Nothing, |
364 | 0 | }); |
365 | 0 | let pool = ExecReadOnly::new_pool(&ro); |
366 | 0 | return Ok(Exec { ro, pool }); |
367 | 0 | } |
368 | 0 | let parsed = self.parse()?; |
369 | 0 | let mut nfa = Compiler::new() |
370 | 0 | .size_limit(self.options.size_limit) |
371 | 0 | .bytes(self.bytes || parsed.bytes) |
372 | 0 | .only_utf8(self.only_utf8) |
373 | 0 | .compile(&parsed.exprs)?; |
374 | 0 | let mut dfa = Compiler::new() |
375 | 0 | .size_limit(self.options.size_limit) |
376 | 0 | .dfa(true) |
377 | 0 | .only_utf8(self.only_utf8) |
378 | 0 | .compile(&parsed.exprs)?; |
379 | 0 | let mut dfa_reverse = Compiler::new() |
380 | 0 | .size_limit(self.options.size_limit) |
381 | 0 | .dfa(true) |
382 | 0 | .only_utf8(self.only_utf8) |
383 | 0 | .reverse(true) |
384 | 0 | .compile(&parsed.exprs)?; |
385 | | |
386 | | #[cfg(feature = "perf-literal")] |
387 | 0 | let ac = self.build_aho_corasick(&parsed); |
388 | 0 | nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); |
389 | 0 | dfa.prefixes = nfa.prefixes.clone(); |
390 | 0 | dfa.dfa_size_limit = self.options.dfa_size_limit; |
391 | 0 | dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; |
392 | 0 |
|
393 | 0 | let mut ro = ExecReadOnly { |
394 | 0 | res: self.options.pats, |
395 | 0 | nfa, |
396 | 0 | dfa, |
397 | 0 | dfa_reverse, |
398 | 0 | suffixes: LiteralSearcher::suffixes(parsed.suffixes), |
399 | 0 | #[cfg(feature = "perf-literal")] |
400 | 0 | ac, |
401 | 0 | match_type: MatchType::Nothing, |
402 | 0 | }; |
403 | 0 | ro.match_type = ro.choose_match_type(self.match_type); |
404 | 0 |
|
405 | 0 | let ro = Arc::new(ro); |
406 | 0 | let pool = ExecReadOnly::new_pool(&ro); |
407 | 0 | Ok(Exec { ro, pool }) |
408 | 0 | } |
409 | | |
410 | | #[cfg(feature = "perf-literal")] |
411 | 0 | fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> { |
412 | 0 | if parsed.exprs.len() != 1 { |
413 | 0 | return None; |
414 | 0 | } |
415 | 0 | let lits = match alternation_literals(&parsed.exprs[0]) { |
416 | 0 | None => return None, |
417 | 0 | Some(lits) => lits, |
418 | 0 | }; |
419 | 0 | // If we have a small number of literals, then let Teddy handle |
420 | 0 | // things (see literal/mod.rs). |
421 | 0 | if lits.len() <= 32 { |
422 | 0 | return None; |
423 | 0 | } |
424 | 0 | Some( |
425 | 0 | AhoCorasick::builder() |
426 | 0 | .match_kind(MatchKind::LeftmostFirst) |
427 | 0 | .build(&lits) |
428 | 0 | // This should never happen because we'd long exceed the |
429 | 0 | // compilation limit for regexes first. |
430 | 0 | .expect("AC automaton too big"), |
431 | 0 | ) |
432 | 0 | } |
433 | | } |
434 | | |
435 | | impl<'c> RegularExpression for ExecNoSyncStr<'c> { |
436 | | type Text = str; |
437 | | |
438 | 0 | fn slots_len(&self) -> usize { |
439 | 0 | self.0.slots_len() |
440 | 0 | } |
441 | | |
442 | 0 | fn next_after_empty(&self, text: &str, i: usize) -> usize { |
443 | 0 | next_utf8(text.as_bytes(), i) |
444 | 0 | } |
445 | | |
446 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
447 | 0 | fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { |
448 | 0 | self.0.shortest_match_at(text.as_bytes(), start) |
449 | 0 | } |
450 | | |
451 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
452 | 0 | fn is_match_at(&self, text: &str, start: usize) -> bool { |
453 | 0 | self.0.is_match_at(text.as_bytes(), start) |
454 | 0 | } |
455 | | |
456 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
457 | 0 | fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { |
458 | 0 | self.0.find_at(text.as_bytes(), start) |
459 | 0 | } |
460 | | |
461 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
462 | 0 | fn captures_read_at( |
463 | 0 | &self, |
464 | 0 | locs: &mut Locations, |
465 | 0 | text: &str, |
466 | 0 | start: usize, |
467 | 0 | ) -> Option<(usize, usize)> { |
468 | 0 | self.0.captures_read_at(locs, text.as_bytes(), start) |
469 | 0 | } |
470 | | } |
471 | | |
472 | | impl<'c> RegularExpression for ExecNoSync<'c> { |
473 | | type Text = [u8]; |
474 | | |
475 | | /// Returns the number of capture slots in the regular expression. (There |
476 | | /// are two slots for every capture group, corresponding to possibly empty |
477 | | /// start and end locations of the capture.) |
478 | 0 | fn slots_len(&self) -> usize { |
479 | 0 | self.ro.nfa.captures.len() * 2 |
480 | 0 | } |
481 | | |
482 | 0 | fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { |
483 | 0 | i + 1 |
484 | 0 | } |
485 | | |
486 | | /// Returns the end of a match location, possibly occurring before the |
487 | | /// end location of the correct leftmost-first match. |
488 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
489 | 0 | fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { |
490 | 0 | if !self.is_anchor_end_match(text) { |
491 | 0 | return None; |
492 | 0 | } |
493 | 0 | match self.ro.match_type { |
494 | | #[cfg(feature = "perf-literal")] |
495 | 0 | MatchType::Literal(ty) => { |
496 | 0 | self.find_literals(ty, text, start).map(|(_, e)| e) |
497 | | } |
498 | | #[cfg(feature = "perf-dfa")] |
499 | | MatchType::Dfa | MatchType::DfaMany => { |
500 | 0 | match self.shortest_dfa(text, start) { |
501 | 0 | dfa::Result::Match(end) => Some(end), |
502 | 0 | dfa::Result::NoMatch(_) => None, |
503 | 0 | dfa::Result::Quit => self.shortest_nfa(text, start), |
504 | | } |
505 | | } |
506 | | #[cfg(feature = "perf-dfa")] |
507 | | MatchType::DfaAnchoredReverse => { |
508 | 0 | match dfa::Fsm::reverse( |
509 | 0 | &self.ro.dfa_reverse, |
510 | 0 | self.cache.value(), |
511 | 0 | true, |
512 | 0 | &text[start..], |
513 | 0 | text.len() - start, |
514 | 0 | ) { |
515 | 0 | dfa::Result::Match(_) => Some(text.len()), |
516 | 0 | dfa::Result::NoMatch(_) => None, |
517 | 0 | dfa::Result::Quit => self.shortest_nfa(text, start), |
518 | | } |
519 | | } |
520 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
521 | | MatchType::DfaSuffix => { |
522 | 0 | match self.shortest_dfa_reverse_suffix(text, start) { |
523 | 0 | dfa::Result::Match(e) => Some(e), |
524 | 0 | dfa::Result::NoMatch(_) => None, |
525 | 0 | dfa::Result::Quit => self.shortest_nfa(text, start), |
526 | | } |
527 | | } |
528 | 0 | MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), |
529 | 0 | MatchType::Nothing => None, |
530 | | } |
531 | 0 | } |
532 | | |
533 | | /// Returns true if and only if the regex matches text. |
534 | | /// |
535 | | /// For single regular expressions, this is equivalent to calling |
536 | | /// shortest_match(...).is_some(). |
537 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
538 | 0 | fn is_match_at(&self, text: &[u8], start: usize) -> bool { |
539 | 0 | if !self.is_anchor_end_match(text) { |
540 | 0 | return false; |
541 | 0 | } |
542 | 0 | // We need to do this dance because shortest_match relies on the NFA |
543 | 0 | // filling in captures[1], but a RegexSet has no captures. In other |
544 | 0 | // words, a RegexSet can't (currently) use shortest_match. ---AG |
545 | 0 | match self.ro.match_type { |
546 | | #[cfg(feature = "perf-literal")] |
547 | 0 | MatchType::Literal(ty) => { |
548 | 0 | self.find_literals(ty, text, start).is_some() |
549 | | } |
550 | | #[cfg(feature = "perf-dfa")] |
551 | | MatchType::Dfa | MatchType::DfaMany => { |
552 | 0 | match self.shortest_dfa(text, start) { |
553 | 0 | dfa::Result::Match(_) => true, |
554 | 0 | dfa::Result::NoMatch(_) => false, |
555 | 0 | dfa::Result::Quit => self.match_nfa(text, start), |
556 | | } |
557 | | } |
558 | | #[cfg(feature = "perf-dfa")] |
559 | | MatchType::DfaAnchoredReverse => { |
560 | 0 | match dfa::Fsm::reverse( |
561 | 0 | &self.ro.dfa_reverse, |
562 | 0 | self.cache.value(), |
563 | 0 | true, |
564 | 0 | &text[start..], |
565 | 0 | text.len() - start, |
566 | 0 | ) { |
567 | 0 | dfa::Result::Match(_) => true, |
568 | 0 | dfa::Result::NoMatch(_) => false, |
569 | 0 | dfa::Result::Quit => self.match_nfa(text, start), |
570 | | } |
571 | | } |
572 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
573 | | MatchType::DfaSuffix => { |
574 | 0 | match self.shortest_dfa_reverse_suffix(text, start) { |
575 | 0 | dfa::Result::Match(_) => true, |
576 | 0 | dfa::Result::NoMatch(_) => false, |
577 | 0 | dfa::Result::Quit => self.match_nfa(text, start), |
578 | | } |
579 | | } |
580 | 0 | MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), |
581 | 0 | MatchType::Nothing => false, |
582 | | } |
583 | 0 | } |
584 | | |
585 | | /// Finds the start and end location of the leftmost-first match, starting |
586 | | /// at the given location. |
587 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
588 | 0 | fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { |
589 | 0 | if !self.is_anchor_end_match(text) { |
590 | 0 | return None; |
591 | 0 | } |
592 | 0 | match self.ro.match_type { |
593 | | #[cfg(feature = "perf-literal")] |
594 | 0 | MatchType::Literal(ty) => self.find_literals(ty, text, start), |
595 | | #[cfg(feature = "perf-dfa")] |
596 | 0 | MatchType::Dfa => match self.find_dfa_forward(text, start) { |
597 | 0 | dfa::Result::Match((s, e)) => Some((s, e)), |
598 | 0 | dfa::Result::NoMatch(_) => None, |
599 | | dfa::Result::Quit => { |
600 | 0 | self.find_nfa(MatchNfaType::Auto, text, start) |
601 | | } |
602 | | }, |
603 | | #[cfg(feature = "perf-dfa")] |
604 | | MatchType::DfaAnchoredReverse => { |
605 | 0 | match self.find_dfa_anchored_reverse(text, start) { |
606 | 0 | dfa::Result::Match((s, e)) => Some((s, e)), |
607 | 0 | dfa::Result::NoMatch(_) => None, |
608 | | dfa::Result::Quit => { |
609 | 0 | self.find_nfa(MatchNfaType::Auto, text, start) |
610 | | } |
611 | | } |
612 | | } |
613 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
614 | | MatchType::DfaSuffix => { |
615 | 0 | match self.find_dfa_reverse_suffix(text, start) { |
616 | 0 | dfa::Result::Match((s, e)) => Some((s, e)), |
617 | 0 | dfa::Result::NoMatch(_) => None, |
618 | | dfa::Result::Quit => { |
619 | 0 | self.find_nfa(MatchNfaType::Auto, text, start) |
620 | | } |
621 | | } |
622 | | } |
623 | 0 | MatchType::Nfa(ty) => self.find_nfa(ty, text, start), |
624 | 0 | MatchType::Nothing => None, |
625 | | #[cfg(feature = "perf-dfa")] |
626 | | MatchType::DfaMany => { |
627 | 0 | unreachable!("BUG: RegexSet cannot be used with find") |
628 | | } |
629 | | } |
630 | 0 | } |
631 | | |
632 | | /// Finds the start and end location of the leftmost-first match and also |
633 | | /// fills in all matching capture groups. |
634 | | /// |
635 | | /// The number of capture slots given should be equal to the total number |
636 | | /// of capture slots in the compiled program. |
637 | | /// |
638 | | /// Note that the first two slots always correspond to the start and end |
639 | | /// locations of the overall match. |
640 | 0 | fn captures_read_at( |
641 | 0 | &self, |
642 | 0 | locs: &mut Locations, |
643 | 0 | text: &[u8], |
644 | 0 | start: usize, |
645 | 0 | ) -> Option<(usize, usize)> { |
646 | 0 | let slots = locs.as_slots(); |
647 | 0 | for slot in slots.iter_mut() { |
648 | 0 | *slot = None; |
649 | 0 | } |
650 | | // If the caller unnecessarily uses this, then we try to save them |
651 | | // from themselves. |
652 | 0 | match slots.len() { |
653 | 0 | 0 => return self.find_at(text, start), |
654 | | 2 => { |
655 | 0 | return self.find_at(text, start).map(|(s, e)| { |
656 | 0 | slots[0] = Some(s); |
657 | 0 | slots[1] = Some(e); |
658 | 0 | (s, e) |
659 | 0 | }); |
660 | | } |
661 | 0 | _ => {} // fallthrough |
662 | 0 | } |
663 | 0 | if !self.is_anchor_end_match(text) { |
664 | 0 | return None; |
665 | 0 | } |
666 | 0 | match self.ro.match_type { |
667 | | #[cfg(feature = "perf-literal")] |
668 | 0 | MatchType::Literal(ty) => { |
669 | 0 | self.find_literals(ty, text, start).and_then(|(s, e)| { |
670 | 0 | self.captures_nfa_type( |
671 | 0 | MatchNfaType::Auto, |
672 | 0 | slots, |
673 | 0 | text, |
674 | 0 | s, |
675 | 0 | e, |
676 | 0 | ) |
677 | 0 | }) |
678 | | } |
679 | | #[cfg(feature = "perf-dfa")] |
680 | | MatchType::Dfa => { |
681 | 0 | if self.ro.nfa.is_anchored_start { |
682 | 0 | self.captures_nfa(slots, text, start) |
683 | | } else { |
684 | 0 | match self.find_dfa_forward(text, start) { |
685 | 0 | dfa::Result::Match((s, e)) => self.captures_nfa_type( |
686 | 0 | MatchNfaType::Auto, |
687 | 0 | slots, |
688 | 0 | text, |
689 | 0 | s, |
690 | 0 | e, |
691 | 0 | ), |
692 | 0 | dfa::Result::NoMatch(_) => None, |
693 | | dfa::Result::Quit => { |
694 | 0 | self.captures_nfa(slots, text, start) |
695 | | } |
696 | | } |
697 | | } |
698 | | } |
699 | | #[cfg(feature = "perf-dfa")] |
700 | | MatchType::DfaAnchoredReverse => { |
701 | 0 | match self.find_dfa_anchored_reverse(text, start) { |
702 | 0 | dfa::Result::Match((s, e)) => self.captures_nfa_type( |
703 | 0 | MatchNfaType::Auto, |
704 | 0 | slots, |
705 | 0 | text, |
706 | 0 | s, |
707 | 0 | e, |
708 | 0 | ), |
709 | 0 | dfa::Result::NoMatch(_) => None, |
710 | 0 | dfa::Result::Quit => self.captures_nfa(slots, text, start), |
711 | | } |
712 | | } |
713 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
714 | | MatchType::DfaSuffix => { |
715 | 0 | match self.find_dfa_reverse_suffix(text, start) { |
716 | 0 | dfa::Result::Match((s, e)) => self.captures_nfa_type( |
717 | 0 | MatchNfaType::Auto, |
718 | 0 | slots, |
719 | 0 | text, |
720 | 0 | s, |
721 | 0 | e, |
722 | 0 | ), |
723 | 0 | dfa::Result::NoMatch(_) => None, |
724 | 0 | dfa::Result::Quit => self.captures_nfa(slots, text, start), |
725 | | } |
726 | | } |
727 | 0 | MatchType::Nfa(ty) => { |
728 | 0 | self.captures_nfa_type(ty, slots, text, start, text.len()) |
729 | | } |
730 | 0 | MatchType::Nothing => None, |
731 | | #[cfg(feature = "perf-dfa")] |
732 | | MatchType::DfaMany => { |
733 | 0 | unreachable!("BUG: RegexSet cannot be used with captures") |
734 | | } |
735 | | } |
736 | 0 | } |
737 | | } |
738 | | |
739 | | impl<'c> ExecNoSync<'c> { |
740 | | /// Finds the leftmost-first match using only literal search. |
741 | | #[cfg(feature = "perf-literal")] |
742 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
743 | 0 | fn find_literals( |
744 | 0 | &self, |
745 | 0 | ty: MatchLiteralType, |
746 | 0 | text: &[u8], |
747 | 0 | start: usize, |
748 | 0 | ) -> Option<(usize, usize)> { |
749 | 0 | use self::MatchLiteralType::*; |
750 | 0 | match ty { |
751 | | Unanchored => { |
752 | 0 | let lits = &self.ro.nfa.prefixes; |
753 | 0 | lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) |
754 | | } |
755 | | AnchoredStart => { |
756 | 0 | let lits = &self.ro.nfa.prefixes; |
757 | 0 | if start == 0 || !self.ro.nfa.is_anchored_start { |
758 | 0 | lits.find_start(&text[start..]) |
759 | 0 | .map(|(s, e)| (start + s, start + e)) |
760 | | } else { |
761 | 0 | None |
762 | | } |
763 | | } |
764 | | AnchoredEnd => { |
765 | 0 | let lits = &self.ro.suffixes; |
766 | 0 | lits.find_end(&text[start..]) |
767 | 0 | .map(|(s, e)| (start + s, start + e)) |
768 | | } |
769 | 0 | AhoCorasick => self |
770 | 0 | .ro |
771 | 0 | .ac |
772 | 0 | .as_ref() |
773 | 0 | .unwrap() |
774 | 0 | .find(&text[start..]) |
775 | 0 | .map(|m| (start + m.start(), start + m.end())), |
776 | | } |
777 | 0 | } |
778 | | |
779 | | /// Finds the leftmost-first match (start and end) using only the DFA. |
780 | | /// |
781 | | /// If the result returned indicates that the DFA quit, then another |
782 | | /// matching engine should be used. |
783 | | #[cfg(feature = "perf-dfa")] |
784 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
785 | 0 | fn find_dfa_forward( |
786 | 0 | &self, |
787 | 0 | text: &[u8], |
788 | 0 | start: usize, |
789 | 0 | ) -> dfa::Result<(usize, usize)> { |
790 | | use crate::dfa::Result::*; |
791 | 0 | let end = match dfa::Fsm::forward( |
792 | 0 | &self.ro.dfa, |
793 | 0 | self.cache.value(), |
794 | 0 | false, |
795 | 0 | text, |
796 | 0 | start, |
797 | | ) { |
798 | 0 | NoMatch(i) => return NoMatch(i), |
799 | 0 | Quit => return Quit, |
800 | 0 | Match(end) if start == end => return Match((start, start)), |
801 | 0 | Match(end) => end, |
802 | 0 | }; |
803 | 0 | // Now run the DFA in reverse to find the start of the match. |
804 | 0 | match dfa::Fsm::reverse( |
805 | 0 | &self.ro.dfa_reverse, |
806 | 0 | self.cache.value(), |
807 | 0 | false, |
808 | 0 | &text[start..], |
809 | 0 | end - start, |
810 | 0 | ) { |
811 | 0 | Match(s) => Match((start + s, end)), |
812 | 0 | NoMatch(i) => NoMatch(i), |
813 | 0 | Quit => Quit, |
814 | | } |
815 | 0 | } |
816 | | |
817 | | /// Finds the leftmost-first match (start and end) using only the DFA, |
818 | | /// but assumes the regex is anchored at the end and therefore starts at |
819 | | /// the end of the regex and matches in reverse. |
820 | | /// |
821 | | /// If the result returned indicates that the DFA quit, then another |
822 | | /// matching engine should be used. |
823 | | #[cfg(feature = "perf-dfa")] |
824 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
825 | 0 | fn find_dfa_anchored_reverse( |
826 | 0 | &self, |
827 | 0 | text: &[u8], |
828 | 0 | start: usize, |
829 | 0 | ) -> dfa::Result<(usize, usize)> { |
830 | 0 | use crate::dfa::Result::*; |
831 | 0 | match dfa::Fsm::reverse( |
832 | 0 | &self.ro.dfa_reverse, |
833 | 0 | self.cache.value(), |
834 | 0 | false, |
835 | 0 | &text[start..], |
836 | 0 | text.len() - start, |
837 | 0 | ) { |
838 | 0 | Match(s) => Match((start + s, text.len())), |
839 | 0 | NoMatch(i) => NoMatch(i), |
840 | 0 | Quit => Quit, |
841 | | } |
842 | 0 | } |
843 | | |
844 | | /// Finds the end of the shortest match using only the DFA. |
845 | | #[cfg(feature = "perf-dfa")] |
846 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
847 | 0 | fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { |
848 | 0 | dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) |
849 | 0 | } |
850 | | |
851 | | /// Finds the end of the shortest match using only the DFA by scanning for |
852 | | /// suffix literals. |
853 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
854 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
855 | 0 | fn shortest_dfa_reverse_suffix( |
856 | 0 | &self, |
857 | 0 | text: &[u8], |
858 | 0 | start: usize, |
859 | 0 | ) -> dfa::Result<usize> { |
860 | 0 | match self.exec_dfa_reverse_suffix(text, start) { |
861 | 0 | None => self.shortest_dfa(text, start), |
862 | 0 | Some(r) => r.map(|(_, end)| end), |
863 | | } |
864 | 0 | } |
865 | | |
866 | | /// Finds the end of the shortest match using only the DFA by scanning for |
867 | | /// suffix literals. It also reports the start of the match. |
868 | | /// |
869 | | /// Note that if None is returned, then the optimization gave up to avoid |
870 | | /// worst case quadratic behavior. A forward scanning DFA should be tried |
871 | | /// next. |
872 | | /// |
873 | | /// If a match is returned and the full leftmost-first match is desired, |
874 | | /// then a forward scan starting from the beginning of the match must be |
875 | | /// done. |
876 | | /// |
877 | | /// If the result returned indicates that the DFA quit, then another |
878 | | /// matching engine should be used. |
879 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
880 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
881 | 0 | fn exec_dfa_reverse_suffix( |
882 | 0 | &self, |
883 | 0 | text: &[u8], |
884 | 0 | original_start: usize, |
885 | 0 | ) -> Option<dfa::Result<(usize, usize)>> { |
886 | 0 | use crate::dfa::Result::*; |
887 | 0 |
|
888 | 0 | let lcs = self.ro.suffixes.lcs(); |
889 | 0 | debug_assert!(lcs.len() >= 1); |
890 | 0 | let mut start = original_start; |
891 | 0 | let mut end = start; |
892 | 0 | let mut last_literal = start; |
893 | 0 | while end <= text.len() { |
894 | 0 | last_literal += match lcs.find(&text[last_literal..]) { |
895 | 0 | None => return Some(NoMatch(text.len())), |
896 | 0 | Some(i) => i, |
897 | 0 | }; |
898 | 0 | end = last_literal + lcs.len(); |
899 | 0 | match dfa::Fsm::reverse( |
900 | 0 | &self.ro.dfa_reverse, |
901 | 0 | self.cache.value(), |
902 | 0 | false, |
903 | 0 | &text[start..end], |
904 | 0 | end - start, |
905 | 0 | ) { |
906 | 0 | Match(0) | NoMatch(0) => return None, |
907 | 0 | Match(i) => return Some(Match((start + i, end))), |
908 | 0 | NoMatch(i) => { |
909 | 0 | start += i; |
910 | 0 | last_literal += 1; |
911 | 0 | continue; |
912 | | } |
913 | 0 | Quit => return Some(Quit), |
914 | | }; |
915 | | } |
916 | 0 | Some(NoMatch(text.len())) |
917 | 0 | } |
918 | | |
919 | | /// Finds the leftmost-first match (start and end) using only the DFA |
920 | | /// by scanning for suffix literals. |
921 | | /// |
922 | | /// If the result returned indicates that the DFA quit, then another |
923 | | /// matching engine should be used. |
924 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
925 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
926 | 0 | fn find_dfa_reverse_suffix( |
927 | 0 | &self, |
928 | 0 | text: &[u8], |
929 | 0 | start: usize, |
930 | 0 | ) -> dfa::Result<(usize, usize)> { |
931 | | use crate::dfa::Result::*; |
932 | | |
933 | 0 | let match_start = match self.exec_dfa_reverse_suffix(text, start) { |
934 | 0 | None => return self.find_dfa_forward(text, start), |
935 | 0 | Some(Match((start, _))) => start, |
936 | 0 | Some(r) => return r, |
937 | | }; |
938 | | // At this point, we've found a match. The only way to quit now |
939 | | // without a match is if the DFA gives up (seems unlikely). |
940 | | // |
941 | | // Now run the DFA forwards to find the proper end of the match. |
942 | | // (The suffix literal match can only indicate the earliest |
943 | | // possible end location, which may appear before the end of the |
944 | | // leftmost-first match.) |
945 | 0 | match dfa::Fsm::forward( |
946 | 0 | &self.ro.dfa, |
947 | 0 | self.cache.value(), |
948 | 0 | false, |
949 | 0 | text, |
950 | 0 | match_start, |
951 | 0 | ) { |
952 | 0 | NoMatch(_) => panic!("BUG: reverse match implies forward match"), |
953 | 0 | Quit => Quit, |
954 | 0 | Match(e) => Match((match_start, e)), |
955 | | } |
956 | 0 | } |
957 | | |
958 | | /// Executes the NFA engine to return whether there is a match or not. |
959 | | /// |
960 | | /// Ideally, we could use shortest_nfa(...).is_some() and get the same |
961 | | /// performance characteristics, but regex sets don't have captures, which |
962 | | /// shortest_nfa depends on. |
963 | | #[cfg(feature = "perf-dfa")] |
964 | 0 | fn match_nfa(&self, text: &[u8], start: usize) -> bool { |
965 | 0 | self.match_nfa_type(MatchNfaType::Auto, text, start) |
966 | 0 | } |
967 | | |
968 | | /// Like match_nfa, but allows specification of the type of NFA engine. |
969 | 0 | fn match_nfa_type( |
970 | 0 | &self, |
971 | 0 | ty: MatchNfaType, |
972 | 0 | text: &[u8], |
973 | 0 | start: usize, |
974 | 0 | ) -> bool { |
975 | 0 | self.exec_nfa( |
976 | 0 | ty, |
977 | 0 | &mut [false], |
978 | 0 | &mut [], |
979 | 0 | true, |
980 | 0 | false, |
981 | 0 | text, |
982 | 0 | start, |
983 | 0 | text.len(), |
984 | 0 | ) |
985 | 0 | } |
986 | | |
987 | | /// Finds the shortest match using an NFA. |
988 | | #[cfg(feature = "perf-dfa")] |
989 | 0 | fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { |
990 | 0 | self.shortest_nfa_type(MatchNfaType::Auto, text, start) |
991 | 0 | } |
992 | | |
993 | | /// Like shortest_nfa, but allows specification of the type of NFA engine. |
994 | 0 | fn shortest_nfa_type( |
995 | 0 | &self, |
996 | 0 | ty: MatchNfaType, |
997 | 0 | text: &[u8], |
998 | 0 | start: usize, |
999 | 0 | ) -> Option<usize> { |
1000 | 0 | let mut slots = [None, None]; |
1001 | 0 | if self.exec_nfa( |
1002 | 0 | ty, |
1003 | 0 | &mut [false], |
1004 | 0 | &mut slots, |
1005 | 0 | true, |
1006 | 0 | true, |
1007 | 0 | text, |
1008 | 0 | start, |
1009 | 0 | text.len(), |
1010 | 0 | ) { |
1011 | 0 | slots[1] |
1012 | | } else { |
1013 | 0 | None |
1014 | | } |
1015 | 0 | } |
1016 | | |
1017 | | /// Like find, but executes an NFA engine. |
1018 | 0 | fn find_nfa( |
1019 | 0 | &self, |
1020 | 0 | ty: MatchNfaType, |
1021 | 0 | text: &[u8], |
1022 | 0 | start: usize, |
1023 | 0 | ) -> Option<(usize, usize)> { |
1024 | 0 | let mut slots = [None, None]; |
1025 | 0 | if self.exec_nfa( |
1026 | 0 | ty, |
1027 | 0 | &mut [false], |
1028 | 0 | &mut slots, |
1029 | 0 | false, |
1030 | 0 | false, |
1031 | 0 | text, |
1032 | 0 | start, |
1033 | 0 | text.len(), |
1034 | 0 | ) { |
1035 | 0 | match (slots[0], slots[1]) { |
1036 | 0 | (Some(s), Some(e)) => Some((s, e)), |
1037 | 0 | _ => None, |
1038 | | } |
1039 | | } else { |
1040 | 0 | None |
1041 | | } |
1042 | 0 | } |
1043 | | |
1044 | | /// Like find_nfa, but fills in captures. |
1045 | | /// |
1046 | | /// `slots` should have length equal to `2 * nfa.captures.len()`. |
1047 | | #[cfg(feature = "perf-dfa")] |
1048 | 0 | fn captures_nfa( |
1049 | 0 | &self, |
1050 | 0 | slots: &mut [Slot], |
1051 | 0 | text: &[u8], |
1052 | 0 | start: usize, |
1053 | 0 | ) -> Option<(usize, usize)> { |
1054 | 0 | self.captures_nfa_type( |
1055 | 0 | MatchNfaType::Auto, |
1056 | 0 | slots, |
1057 | 0 | text, |
1058 | 0 | start, |
1059 | 0 | text.len(), |
1060 | 0 | ) |
1061 | 0 | } |
1062 | | |
1063 | | /// Like captures_nfa, but allows specification of type of NFA engine. |
1064 | 0 | fn captures_nfa_type( |
1065 | 0 | &self, |
1066 | 0 | ty: MatchNfaType, |
1067 | 0 | slots: &mut [Slot], |
1068 | 0 | text: &[u8], |
1069 | 0 | start: usize, |
1070 | 0 | end: usize, |
1071 | 0 | ) -> Option<(usize, usize)> { |
1072 | 0 | if self.exec_nfa( |
1073 | 0 | ty, |
1074 | 0 | &mut [false], |
1075 | 0 | slots, |
1076 | 0 | false, |
1077 | 0 | false, |
1078 | 0 | text, |
1079 | 0 | start, |
1080 | 0 | end, |
1081 | 0 | ) { |
1082 | 0 | match (slots[0], slots[1]) { |
1083 | 0 | (Some(s), Some(e)) => Some((s, e)), |
1084 | 0 | _ => None, |
1085 | | } |
1086 | | } else { |
1087 | 0 | None |
1088 | | } |
1089 | 0 | } |
1090 | | |
1091 | 0 | fn exec_nfa( |
1092 | 0 | &self, |
1093 | 0 | mut ty: MatchNfaType, |
1094 | 0 | matches: &mut [bool], |
1095 | 0 | slots: &mut [Slot], |
1096 | 0 | quit_after_match: bool, |
1097 | 0 | quit_after_match_with_pos: bool, |
1098 | 0 | text: &[u8], |
1099 | 0 | start: usize, |
1100 | 0 | end: usize, |
1101 | 0 | ) -> bool { |
1102 | 0 | use self::MatchNfaType::*; |
1103 | 0 | if let Auto = ty { |
1104 | 0 | if backtrack::should_exec(self.ro.nfa.len(), text.len()) { |
1105 | 0 | ty = Backtrack; |
1106 | 0 | } else { |
1107 | 0 | ty = PikeVM; |
1108 | 0 | } |
1109 | 0 | } |
1110 | | // The backtracker can't return the shortest match position as it is |
1111 | | // implemented today. So if someone calls `shortest_match` and we need |
1112 | | // to run an NFA, then use the PikeVM. |
1113 | 0 | if quit_after_match_with_pos || ty == PikeVM { |
1114 | 0 | self.exec_pikevm( |
1115 | 0 | matches, |
1116 | 0 | slots, |
1117 | 0 | quit_after_match, |
1118 | 0 | text, |
1119 | 0 | start, |
1120 | 0 | end, |
1121 | 0 | ) |
1122 | | } else { |
1123 | 0 | self.exec_backtrack(matches, slots, text, start, end) |
1124 | | } |
1125 | 0 | } |
1126 | | |
1127 | | /// Always run the NFA algorithm. |
1128 | 0 | fn exec_pikevm( |
1129 | 0 | &self, |
1130 | 0 | matches: &mut [bool], |
1131 | 0 | slots: &mut [Slot], |
1132 | 0 | quit_after_match: bool, |
1133 | 0 | text: &[u8], |
1134 | 0 | start: usize, |
1135 | 0 | end: usize, |
1136 | 0 | ) -> bool { |
1137 | 0 | if self.ro.nfa.uses_bytes() { |
1138 | 0 | pikevm::Fsm::exec( |
1139 | 0 | &self.ro.nfa, |
1140 | 0 | self.cache.value(), |
1141 | 0 | matches, |
1142 | 0 | slots, |
1143 | 0 | quit_after_match, |
1144 | 0 | ByteInput::new(text, self.ro.nfa.only_utf8), |
1145 | 0 | start, |
1146 | 0 | end, |
1147 | 0 | ) |
1148 | | } else { |
1149 | 0 | pikevm::Fsm::exec( |
1150 | 0 | &self.ro.nfa, |
1151 | 0 | self.cache.value(), |
1152 | 0 | matches, |
1153 | 0 | slots, |
1154 | 0 | quit_after_match, |
1155 | 0 | CharInput::new(text), |
1156 | 0 | start, |
1157 | 0 | end, |
1158 | 0 | ) |
1159 | | } |
1160 | 0 | } |
1161 | | |
1162 | | /// Always runs the NFA using bounded backtracking. |
1163 | 0 | fn exec_backtrack( |
1164 | 0 | &self, |
1165 | 0 | matches: &mut [bool], |
1166 | 0 | slots: &mut [Slot], |
1167 | 0 | text: &[u8], |
1168 | 0 | start: usize, |
1169 | 0 | end: usize, |
1170 | 0 | ) -> bool { |
1171 | 0 | if self.ro.nfa.uses_bytes() { |
1172 | 0 | backtrack::Bounded::exec( |
1173 | 0 | &self.ro.nfa, |
1174 | 0 | self.cache.value(), |
1175 | 0 | matches, |
1176 | 0 | slots, |
1177 | 0 | ByteInput::new(text, self.ro.nfa.only_utf8), |
1178 | 0 | start, |
1179 | 0 | end, |
1180 | 0 | ) |
1181 | | } else { |
1182 | 0 | backtrack::Bounded::exec( |
1183 | 0 | &self.ro.nfa, |
1184 | 0 | self.cache.value(), |
1185 | 0 | matches, |
1186 | 0 | slots, |
1187 | 0 | CharInput::new(text), |
1188 | 0 | start, |
1189 | 0 | end, |
1190 | 0 | ) |
1191 | | } |
1192 | 0 | } |
1193 | | |
1194 | | /// Finds which regular expressions match the given text. |
1195 | | /// |
1196 | | /// `matches` should have length equal to the number of regexes being |
1197 | | /// searched. |
1198 | | /// |
1199 | | /// This is only useful when one wants to know which regexes in a set |
1200 | | /// match some text. |
1201 | 0 | pub fn many_matches_at( |
1202 | 0 | &self, |
1203 | 0 | matches: &mut [bool], |
1204 | 0 | text: &[u8], |
1205 | 0 | start: usize, |
1206 | 0 | ) -> bool { |
1207 | 0 | use self::MatchType::*; |
1208 | 0 | if !self.is_anchor_end_match(text) { |
1209 | 0 | return false; |
1210 | 0 | } |
1211 | 0 | match self.ro.match_type { |
1212 | | #[cfg(feature = "perf-literal")] |
1213 | 0 | Literal(ty) => { |
1214 | 0 | debug_assert_eq!(matches.len(), 1); |
1215 | 0 | matches[0] = self.find_literals(ty, text, start).is_some(); |
1216 | 0 | matches[0] |
1217 | | } |
1218 | | #[cfg(feature = "perf-dfa")] |
1219 | | Dfa | DfaAnchoredReverse | DfaMany => { |
1220 | 0 | match dfa::Fsm::forward_many( |
1221 | 0 | &self.ro.dfa, |
1222 | 0 | self.cache.value(), |
1223 | 0 | matches, |
1224 | 0 | text, |
1225 | 0 | start, |
1226 | 0 | ) { |
1227 | 0 | dfa::Result::Match(_) => true, |
1228 | 0 | dfa::Result::NoMatch(_) => false, |
1229 | 0 | dfa::Result::Quit => self.exec_nfa( |
1230 | 0 | MatchNfaType::Auto, |
1231 | 0 | matches, |
1232 | 0 | &mut [], |
1233 | 0 | false, |
1234 | 0 | false, |
1235 | 0 | text, |
1236 | 0 | start, |
1237 | 0 | text.len(), |
1238 | 0 | ), |
1239 | | } |
1240 | | } |
1241 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
1242 | | DfaSuffix => { |
1243 | 0 | match dfa::Fsm::forward_many( |
1244 | 0 | &self.ro.dfa, |
1245 | 0 | self.cache.value(), |
1246 | 0 | matches, |
1247 | 0 | text, |
1248 | 0 | start, |
1249 | 0 | ) { |
1250 | 0 | dfa::Result::Match(_) => true, |
1251 | 0 | dfa::Result::NoMatch(_) => false, |
1252 | 0 | dfa::Result::Quit => self.exec_nfa( |
1253 | 0 | MatchNfaType::Auto, |
1254 | 0 | matches, |
1255 | 0 | &mut [], |
1256 | 0 | false, |
1257 | 0 | false, |
1258 | 0 | text, |
1259 | 0 | start, |
1260 | 0 | text.len(), |
1261 | 0 | ), |
1262 | | } |
1263 | | } |
1264 | 0 | Nfa(ty) => self.exec_nfa( |
1265 | 0 | ty, |
1266 | 0 | matches, |
1267 | 0 | &mut [], |
1268 | 0 | false, |
1269 | 0 | false, |
1270 | 0 | text, |
1271 | 0 | start, |
1272 | 0 | text.len(), |
1273 | 0 | ), |
1274 | 0 | Nothing => false, |
1275 | | } |
1276 | 0 | } |
1277 | | |
1278 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1279 | 0 | fn is_anchor_end_match(&self, text: &[u8]) -> bool { |
1280 | 0 | #[cfg(not(feature = "perf-literal"))] |
1281 | 0 | fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { |
1282 | 0 | true |
1283 | 0 | } |
1284 | 0 |
|
1285 | 0 | #[cfg(feature = "perf-literal")] |
1286 | 0 | fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { |
1287 | 0 | // Only do this check if the haystack is big (>1MB). |
1288 | 0 | if text.len() > (1 << 20) && ro.nfa.is_anchored_end { |
1289 | 0 | let lcs = ro.suffixes.lcs(); |
1290 | 0 | if lcs.len() >= 1 && !lcs.is_suffix(text) { |
1291 | 0 | return false; |
1292 | 0 | } |
1293 | 0 | } |
1294 | 0 | true |
1295 | 0 | } |
1296 | 0 |
|
1297 | 0 | imp(&self.ro, text) |
1298 | 0 | } |
1299 | | |
1300 | 0 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { |
1301 | 0 | &self.ro.nfa.capture_name_idx |
1302 | 0 | } |
1303 | | } |
1304 | | |
1305 | | impl<'c> ExecNoSyncStr<'c> { |
1306 | 0 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { |
1307 | 0 | self.0.capture_name_idx() |
1308 | 0 | } |
1309 | | } |
1310 | | |
1311 | | impl Exec { |
1312 | | /// Get a searcher that isn't Sync. |
1313 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1314 | 0 | pub fn searcher(&self) -> ExecNoSync<'_> { |
1315 | 0 | ExecNoSync { |
1316 | 0 | ro: &self.ro, // a clone is too expensive here! (and not needed) |
1317 | 0 | cache: self.pool.get(), |
1318 | 0 | } |
1319 | 0 | } |
1320 | | |
1321 | | /// Get a searcher that isn't Sync and can match on &str. |
1322 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1323 | 0 | pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { |
1324 | 0 | ExecNoSyncStr(self.searcher()) |
1325 | 0 | } |
1326 | | |
1327 | | /// Build a Regex from this executor. |
1328 | 0 | pub fn into_regex(self) -> re_unicode::Regex { |
1329 | 0 | re_unicode::Regex::from(self) |
1330 | 0 | } |
1331 | | |
1332 | | /// Build a RegexSet from this executor. |
1333 | 0 | pub fn into_regex_set(self) -> re_set::unicode::RegexSet { |
1334 | 0 | re_set::unicode::RegexSet::from(self) |
1335 | 0 | } |
1336 | | |
1337 | | /// Build a Regex from this executor that can match arbitrary bytes. |
1338 | 0 | pub fn into_byte_regex(self) -> re_bytes::Regex { |
1339 | 0 | re_bytes::Regex::from(self) |
1340 | 0 | } |
1341 | | |
1342 | | /// Build a RegexSet from this executor that can match arbitrary bytes. |
1343 | 0 | pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { |
1344 | 0 | re_set::bytes::RegexSet::from(self) |
1345 | 0 | } |
1346 | | |
1347 | | /// The original regular expressions given by the caller that were |
1348 | | /// compiled. |
1349 | 0 | pub fn regex_strings(&self) -> &[String] { |
1350 | 0 | &self.ro.res |
1351 | 0 | } |
1352 | | |
1353 | | /// Return a slice of capture names. |
1354 | | /// |
1355 | | /// Any capture that isn't named is None. |
1356 | 0 | pub fn capture_names(&self) -> &[Option<String>] { |
1357 | 0 | &self.ro.nfa.captures |
1358 | 0 | } |
1359 | | |
1360 | | /// Return a reference to named groups mapping (from group name to |
1361 | | /// group position). |
1362 | 0 | pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { |
1363 | 0 | &self.ro.nfa.capture_name_idx |
1364 | 0 | } |
1365 | | |
1366 | | /// If the number of capture groups in every match is always the same, then |
1367 | | /// return that number. Otherwise return `None`. |
1368 | 0 | pub fn static_captures_len(&self) -> Option<usize> { |
1369 | 0 | self.ro.nfa.static_captures_len |
1370 | 0 | } |
1371 | | } |
1372 | | |
1373 | | impl Clone for Exec { |
1374 | 0 | fn clone(&self) -> Exec { |
1375 | 0 | let pool = ExecReadOnly::new_pool(&self.ro); |
1376 | 0 | Exec { ro: self.ro.clone(), pool } |
1377 | 0 | } |
1378 | | } |
1379 | | |
1380 | | impl ExecReadOnly { |
1381 | | fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { |
1382 | 0 | if let Some(MatchType::Nfa(_)) = hint { |
1383 | 0 | return hint.unwrap(); |
1384 | 0 | } |
1385 | 0 | // If the NFA is empty, then we'll never match anything. |
1386 | 0 | if self.nfa.insts.is_empty() { |
1387 | 0 | return MatchType::Nothing; |
1388 | 0 | } |
1389 | 0 | if let Some(literalty) = self.choose_literal_match_type() { |
1390 | 0 | return literalty; |
1391 | 0 | } |
1392 | 0 | if let Some(dfaty) = self.choose_dfa_match_type() { |
1393 | 0 | return dfaty; |
1394 | 0 | } |
1395 | 0 | // We're so totally hosed. |
1396 | 0 | MatchType::Nfa(MatchNfaType::Auto) |
1397 | 0 | } |
1398 | | |
1399 | | /// If a plain literal scan can be used, then a corresponding literal |
1400 | | /// search type is returned. |
1401 | 0 | fn choose_literal_match_type(&self) -> Option<MatchType> { |
1402 | 0 | #[cfg(not(feature = "perf-literal"))] |
1403 | 0 | fn imp(_: &ExecReadOnly) -> Option<MatchType> { |
1404 | 0 | None |
1405 | 0 | } |
1406 | 0 |
|
1407 | 0 | #[cfg(feature = "perf-literal")] |
1408 | 0 | fn imp(ro: &ExecReadOnly) -> Option<MatchType> { |
1409 | 0 | // If our set of prefixes is complete, then we can use it to find |
1410 | 0 | // a match in lieu of a regex engine. This doesn't quite work well |
1411 | 0 | // in the presence of multiple regexes, so only do it when there's |
1412 | 0 | // one. |
1413 | 0 | // |
1414 | 0 | // TODO(burntsushi): Also, don't try to match literals if the regex |
1415 | 0 | // is partially anchored. We could technically do it, but we'd need |
1416 | 0 | // to create two sets of literals: all of them and then the subset |
1417 | 0 | // that aren't anchored. We would then only search for all of them |
1418 | 0 | // when at the beginning of the input and use the subset in all |
1419 | 0 | // other cases. |
1420 | 0 | if ro.res.len() != 1 { |
1421 | 0 | return None; |
1422 | 0 | } |
1423 | 0 | if ro.ac.is_some() { |
1424 | 0 | return Some(MatchType::Literal( |
1425 | 0 | MatchLiteralType::AhoCorasick, |
1426 | 0 | )); |
1427 | 0 | } |
1428 | 0 | if ro.nfa.prefixes.complete() { |
1429 | 0 | return if ro.nfa.is_anchored_start { |
1430 | 0 | Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) |
1431 | 0 | } else { |
1432 | 0 | Some(MatchType::Literal(MatchLiteralType::Unanchored)) |
1433 | 0 | }; |
1434 | 0 | } |
1435 | 0 | if ro.suffixes.complete() { |
1436 | 0 | return if ro.nfa.is_anchored_end { |
1437 | 0 | Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) |
1438 | 0 | } else { |
1439 | 0 | // This case shouldn't happen. When the regex isn't |
1440 | 0 | // anchored, then complete prefixes should imply complete |
1441 | 0 | // suffixes. |
1442 | 0 | Some(MatchType::Literal(MatchLiteralType::Unanchored)) |
1443 | 0 | }; |
1444 | 0 | } |
1445 | 0 | None |
1446 | 0 | } |
1447 | 0 |
|
1448 | 0 | imp(self) |
1449 | 0 | } |
1450 | | |
1451 | | /// If a DFA scan can be used, then choose the appropriate DFA strategy. |
1452 | 0 | fn choose_dfa_match_type(&self) -> Option<MatchType> { |
1453 | 0 | #[cfg(not(feature = "perf-dfa"))] |
1454 | 0 | fn imp(_: &ExecReadOnly) -> Option<MatchType> { |
1455 | 0 | None |
1456 | 0 | } |
1457 | 0 |
|
1458 | 0 | #[cfg(feature = "perf-dfa")] |
1459 | 0 | fn imp(ro: &ExecReadOnly) -> Option<MatchType> { |
1460 | 0 | if !dfa::can_exec(&ro.dfa) { |
1461 | 0 | return None; |
1462 | 0 | } |
1463 | 0 | // Regex sets require a slightly specialized path. |
1464 | 0 | if ro.res.len() >= 2 { |
1465 | 0 | return Some(MatchType::DfaMany); |
1466 | 0 | } |
1467 | 0 | // If the regex is anchored at the end but not the start, then |
1468 | 0 | // just match in reverse from the end of the haystack. |
1469 | 0 | if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { |
1470 | 0 | return Some(MatchType::DfaAnchoredReverse); |
1471 | 0 | } |
1472 | 0 | #[cfg(feature = "perf-literal")] |
1473 | 0 | { |
1474 | 0 | // If there's a longish suffix literal, then it might be faster |
1475 | 0 | // to look for that first. |
1476 | 0 | if ro.should_suffix_scan() { |
1477 | 0 | return Some(MatchType::DfaSuffix); |
1478 | 0 | } |
1479 | 0 | } |
1480 | 0 | // Fall back to your garden variety forward searching lazy DFA. |
1481 | 0 | Some(MatchType::Dfa) |
1482 | 0 | } |
1483 | 0 |
|
1484 | 0 | imp(self) |
1485 | 0 | } |
1486 | | |
1487 | | /// Returns true if the program is amenable to suffix scanning. |
1488 | | /// |
1489 | | /// When this is true, as a heuristic, we assume it is OK to quickly scan |
1490 | | /// for suffix literals and then do a *reverse* DFA match from any matches |
1491 | | /// produced by the literal scan. (And then followed by a forward DFA |
1492 | | /// search, since the previously found suffix literal maybe not actually be |
1493 | | /// the end of a match.) |
1494 | | /// |
1495 | | /// This is a bit of a specialized optimization, but can result in pretty |
1496 | | /// big performance wins if 1) there are no prefix literals and 2) the |
1497 | | /// suffix literals are pretty rare in the text. (1) is obviously easy to |
1498 | | /// account for but (2) is harder. As a proxy, we assume that longer |
1499 | | /// strings are generally rarer, so we only enable this optimization when |
1500 | | /// we have a meaty suffix. |
1501 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
1502 | 0 | fn should_suffix_scan(&self) -> bool { |
1503 | 0 | if self.suffixes.is_empty() { |
1504 | 0 | return false; |
1505 | 0 | } |
1506 | 0 | let lcs_len = self.suffixes.lcs().char_len(); |
1507 | 0 | lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() |
1508 | 0 | } |
1509 | | |
1510 | 0 | fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { |
1511 | 0 | let ro = ro.clone(); |
1512 | 0 | Box::new(Pool::new(Box::new(move || { |
1513 | 0 | AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) |
1514 | 0 | }))) |
1515 | 0 | } |
1516 | | } |
1517 | | |
1518 | 0 | #[derive(Clone, Copy, Debug)] |
1519 | | enum MatchType { |
1520 | | /// A single or multiple literal search. This is only used when the regex |
1521 | | /// can be decomposed into a literal search. |
1522 | | #[cfg(feature = "perf-literal")] |
1523 | | Literal(MatchLiteralType), |
1524 | | /// A normal DFA search. |
1525 | | #[cfg(feature = "perf-dfa")] |
1526 | | Dfa, |
1527 | | /// A reverse DFA search starting from the end of a haystack. |
1528 | | #[cfg(feature = "perf-dfa")] |
1529 | | DfaAnchoredReverse, |
1530 | | /// A reverse DFA search with suffix literal scanning. |
1531 | | #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
1532 | | DfaSuffix, |
1533 | | /// Use the DFA on two or more regular expressions. |
1534 | | #[cfg(feature = "perf-dfa")] |
1535 | | DfaMany, |
1536 | | /// An NFA variant. |
1537 | | Nfa(MatchNfaType), |
1538 | | /// No match is ever possible, so don't ever try to search. |
1539 | | Nothing, |
1540 | | } |
1541 | | |
1542 | 0 | #[derive(Clone, Copy, Debug)] |
1543 | | #[cfg(feature = "perf-literal")] |
1544 | | enum MatchLiteralType { |
1545 | | /// Match literals anywhere in text. |
1546 | | Unanchored, |
1547 | | /// Match literals only at the start of text. |
1548 | | AnchoredStart, |
1549 | | /// Match literals only at the end of text. |
1550 | | AnchoredEnd, |
1551 | | /// Use an Aho-Corasick automaton. This requires `ac` to be Some on |
1552 | | /// ExecReadOnly. |
1553 | | AhoCorasick, |
1554 | | } |
1555 | | |
1556 | 0 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1557 | | enum MatchNfaType { |
1558 | | /// Choose between Backtrack and PikeVM. |
1559 | | Auto, |
1560 | | /// NFA bounded backtracking. |
1561 | | /// |
1562 | | /// (This is only set by tests, since it never makes sense to always want |
1563 | | /// backtracking.) |
1564 | | Backtrack, |
1565 | | /// The Pike VM. |
1566 | | /// |
1567 | | /// (This is only set by tests, since it never makes sense to always want |
1568 | | /// the Pike VM.) |
1569 | | PikeVM, |
1570 | | } |
1571 | | |
1572 | | /// `ProgramCache` maintains reusable allocations for each matching engine |
1573 | | /// available to a particular program. |
1574 | | /// |
1575 | | /// We declare this as unwind safe since it's a cache that's only used for |
1576 | | /// performance purposes. If a panic occurs, it is (or should be) always safe |
1577 | | /// to continue using the same regex object. |
1578 | | pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>; |
1579 | | |
1580 | 0 | #[derive(Debug)] |
1581 | | pub struct ProgramCacheInner { |
1582 | | pub pikevm: pikevm::Cache, |
1583 | | pub backtrack: backtrack::Cache, |
1584 | | #[cfg(feature = "perf-dfa")] |
1585 | | pub dfa: dfa::Cache, |
1586 | | #[cfg(feature = "perf-dfa")] |
1587 | | pub dfa_reverse: dfa::Cache, |
1588 | | } |
1589 | | |
1590 | | impl ProgramCacheInner { |
1591 | 0 | fn new(ro: &ExecReadOnly) -> Self { |
1592 | 0 | ProgramCacheInner { |
1593 | 0 | pikevm: pikevm::Cache::new(&ro.nfa), |
1594 | 0 | backtrack: backtrack::Cache::new(&ro.nfa), |
1595 | 0 | #[cfg(feature = "perf-dfa")] |
1596 | 0 | dfa: dfa::Cache::new(&ro.dfa), |
1597 | 0 | #[cfg(feature = "perf-dfa")] |
1598 | 0 | dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), |
1599 | 0 | } |
1600 | 0 | } |
1601 | | } |
1602 | | |
1603 | | /// Alternation literals checks if the given HIR is a simple alternation of |
1604 | | /// literals, and if so, returns them. Otherwise, this returns None. |
1605 | | #[cfg(feature = "perf-literal")] |
1606 | 0 | fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { |
1607 | 0 | use regex_syntax::hir::{HirKind, Literal}; |
1608 | 0 |
|
1609 | 0 | // This is pretty hacky, but basically, if `is_alternation_literal` is |
1610 | 0 | // true, then we can make several assumptions about the structure of our |
1611 | 0 | // HIR. This is what justifies the `unreachable!` statements below. |
1612 | 0 | // |
1613 | 0 | // This code should be refactored once we overhaul this crate's |
1614 | 0 | // optimization pipeline, because this is a terribly inflexible way to go |
1615 | 0 | // about things. |
1616 | 0 |
|
1617 | 0 | if !expr.properties().is_alternation_literal() { |
1618 | 0 | return None; |
1619 | 0 | } |
1620 | 0 | let alts = match *expr.kind() { |
1621 | 0 | HirKind::Alternation(ref alts) => alts, |
1622 | 0 | _ => return None, // one literal isn't worth it |
1623 | | }; |
1624 | | |
1625 | 0 | let mut lits = vec![]; |
1626 | 0 | for alt in alts { |
1627 | 0 | let mut lit = vec![]; |
1628 | 0 | match *alt.kind() { |
1629 | 0 | HirKind::Literal(Literal(ref bytes)) => { |
1630 | 0 | lit.extend_from_slice(bytes) |
1631 | | } |
1632 | 0 | HirKind::Concat(ref exprs) => { |
1633 | 0 | for e in exprs { |
1634 | 0 | match *e.kind() { |
1635 | 0 | HirKind::Literal(Literal(ref bytes)) => { |
1636 | 0 | lit.extend_from_slice(bytes); |
1637 | 0 | } |
1638 | 0 | _ => unreachable!("expected literal, got {:?}", e), |
1639 | | } |
1640 | | } |
1641 | | } |
1642 | 0 | _ => unreachable!("expected literal or concat, got {:?}", alt), |
1643 | | } |
1644 | 0 | lits.push(lit); |
1645 | | } |
1646 | 0 | Some(lits) |
1647 | 0 | } |
1648 | | |
1649 | | #[cfg(not(feature = "perf-literal"))] |
1650 | | fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) { |
1651 | | (literal::Seq::infinite(), literal::Seq::infinite()) |
1652 | | } |
1653 | | |
1654 | | #[cfg(feature = "perf-literal")] |
1655 | 0 | fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) { |
1656 | 0 | const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)]; |
1657 | 0 |
|
1658 | 0 | let mut prefixes = literal::Extractor::new() |
1659 | 0 | .kind(literal::ExtractKind::Prefix) |
1660 | 0 | .extract(expr); |
1661 | 0 | for (keep, limit) in ATTEMPTS { |
1662 | 0 | let len = match prefixes.len() { |
1663 | 0 | None => break, |
1664 | 0 | Some(len) => len, |
1665 | 0 | }; |
1666 | 0 | if len <= limit { |
1667 | 0 | break; |
1668 | 0 | } |
1669 | 0 | prefixes.keep_first_bytes(keep); |
1670 | 0 | prefixes.minimize_by_preference(); |
1671 | | } |
1672 | | |
1673 | 0 | let mut suffixes = literal::Extractor::new() |
1674 | 0 | .kind(literal::ExtractKind::Suffix) |
1675 | 0 | .extract(expr); |
1676 | 0 | for (keep, limit) in ATTEMPTS { |
1677 | 0 | let len = match suffixes.len() { |
1678 | 0 | None => break, |
1679 | 0 | Some(len) => len, |
1680 | 0 | }; |
1681 | 0 | if len <= limit { |
1682 | 0 | break; |
1683 | 0 | } |
1684 | 0 | suffixes.keep_last_bytes(keep); |
1685 | 0 | suffixes.minimize_by_preference(); |
1686 | | } |
1687 | | |
1688 | 0 | (prefixes, suffixes) |
1689 | 0 | } |
1690 | | |
1691 | | #[cfg(test)] |
1692 | | mod test { |
1693 | | #[test] |
1694 | | fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { |
1695 | | use crate::internal::ExecBuilder; |
1696 | | |
1697 | | let backtrack_bytes_re = ExecBuilder::new("^S") |
1698 | | .bounded_backtracking() |
1699 | | .only_utf8(false) |
1700 | | .build() |
1701 | | .map(|exec| exec.into_byte_regex()) |
1702 | | .map_err(|err| format!("{}", err)) |
1703 | | .unwrap(); |
1704 | | |
1705 | | let default_bytes_re = ExecBuilder::new("^S") |
1706 | | .only_utf8(false) |
1707 | | .build() |
1708 | | .map(|exec| exec.into_byte_regex()) |
1709 | | .map_err(|err| format!("{}", err)) |
1710 | | .unwrap(); |
1711 | | |
1712 | | let input = vec![83, 83]; |
1713 | | |
1714 | | let s1 = backtrack_bytes_re.split(&input); |
1715 | | let s2 = default_bytes_re.split(&input); |
1716 | | for (chunk1, chunk2) in s1.zip(s2) { |
1717 | | assert_eq!(chunk1, chunk2); |
1718 | | } |
1719 | | } |
1720 | | |
1721 | | #[test] |
1722 | | fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { |
1723 | | use crate::internal::ExecBuilder; |
1724 | | |
1725 | | let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") |
1726 | | .bounded_backtracking() |
1727 | | .bytes(true) |
1728 | | .build() |
1729 | | .map(|exec| exec.into_regex()) |
1730 | | .map_err(|err| format!("{}", err)) |
1731 | | .unwrap(); |
1732 | | |
1733 | | let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") |
1734 | | .bytes(true) |
1735 | | .build() |
1736 | | .map(|exec| exec.into_regex()) |
1737 | | .map_err(|err| format!("{}", err)) |
1738 | | .unwrap(); |
1739 | | |
1740 | | let input = "**"; |
1741 | | |
1742 | | let s1 = backtrack_bytes_re.split(input); |
1743 | | let s2 = default_bytes_re.split(input); |
1744 | | for (chunk1, chunk2) in s1.zip(s2) { |
1745 | | assert_eq!(chunk1, chunk2); |
1746 | | } |
1747 | | } |
1748 | | } |