/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/meta/literal.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use alloc::{vec, vec::Vec}; |
2 | | |
3 | | use regex_syntax::hir::Hir; |
4 | | |
5 | | use crate::{meta::regex::RegexInfo, util::search::MatchKind}; |
6 | | |
7 | | /// Pull out an alternation of literals from the given sequence of HIR |
8 | | /// expressions. |
9 | | /// |
10 | | /// There are numerous ways for this to fail. Generally, this only applies |
11 | | /// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there |
12 | | /// are "too few" alternates, in which case, the regex engine is likely faster. |
13 | | /// |
14 | | /// And currently, this only returns something when 'hirs.len() == 1'. |
15 | 0 | pub(crate) fn alternation_literals( |
16 | 0 | info: &RegexInfo, |
17 | 0 | hirs: &[&Hir], |
18 | 0 | ) -> Option<Vec<Vec<u8>>> { |
19 | | use regex_syntax::hir::{HirKind, Literal}; |
20 | | |
21 | | // Might as well skip the work below if we know we can't build an |
22 | | // Aho-Corasick searcher. |
23 | 0 | if !cfg!(feature = "perf-literal-multisubstring") { |
24 | 0 | return None; |
25 | 0 | } |
26 | 0 | // This is pretty hacky, but basically, if `is_alternation_literal` is |
27 | 0 | // true, then we can make several assumptions about the structure of our |
28 | 0 | // HIR. This is what justifies the `unreachable!` statements below. |
29 | 0 | if hirs.len() != 1 |
30 | 0 | || !info.props()[0].look_set().is_empty() |
31 | 0 | || info.props()[0].explicit_captures_len() > 0 |
32 | 0 | || !info.props()[0].is_alternation_literal() |
33 | 0 | || info.config().get_match_kind() != MatchKind::LeftmostFirst |
34 | | { |
35 | 0 | return None; |
36 | 0 | } |
37 | 0 | let hir = &hirs[0]; |
38 | 0 | let alts = match *hir.kind() { |
39 | 0 | HirKind::Alternation(ref alts) => alts, |
40 | 0 | _ => return None, // one literal isn't worth it |
41 | | }; |
42 | | |
43 | 0 | let mut lits = vec![]; |
44 | 0 | for alt in alts { |
45 | 0 | let mut lit = vec![]; |
46 | 0 | match *alt.kind() { |
47 | 0 | HirKind::Literal(Literal(ref bytes)) => { |
48 | 0 | lit.extend_from_slice(bytes) |
49 | | } |
50 | 0 | HirKind::Concat(ref exprs) => { |
51 | 0 | for e in exprs { |
52 | 0 | match *e.kind() { |
53 | 0 | HirKind::Literal(Literal(ref bytes)) => { |
54 | 0 | lit.extend_from_slice(bytes); |
55 | 0 | } |
56 | 0 | _ => unreachable!("expected literal, got {:?}", e), |
57 | | } |
58 | | } |
59 | | } |
60 | 0 | _ => unreachable!("expected literal or concat, got {:?}", alt), |
61 | | } |
62 | 0 | lits.push(lit); |
63 | | } |
64 | | // Why do this? Well, when the number of literals is small, it's likely |
65 | | // that we'll use the lazy DFA which is in turn likely to be faster than |
66 | | // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have |
67 | | // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use |
68 | | // the latter because it is so hungry (in time and space), and the former |
69 | | // is decently fast, but not as fast as a well oiled lazy DFA. |
70 | | // |
71 | | // However, once the number starts getting large, the lazy DFA is likely |
72 | | // to start thrashing because of the modest default cache size. When |
73 | | // exactly does this happen? Dunno. But at whatever point that is (we make |
74 | | // a guess below based on ad hoc benchmarking), we'll want to cut over to |
75 | | // Aho-Corasick, where even the contiguous NFA is likely to do much better. |
76 | 0 | if lits.len() < 3000 { |
77 | | debug!("skipping Aho-Corasick because there are too few literals"); |
78 | 0 | return None; |
79 | 0 | } |
80 | 0 | Some(lits) |
81 | 0 | } |