Coverage Report

Created: 2025-12-31 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/meta/literal.rs
Line
Count
Source
1
use alloc::{vec, vec::Vec};
2
3
use regex_syntax::hir::Hir;
4
5
use crate::{meta::regex::RegexInfo, util::search::MatchKind};
6
7
/// Pull out an alternation of literals from the given sequence of HIR
8
/// expressions.
9
///
10
/// There are numerous ways for this to fail. Generally, this only applies
11
/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
12
/// are "too few" alternates, in which case, the regex engine is likely faster.
13
///
14
/// And currently, this only returns something when 'hirs.len() == 1'.
15
68.4k
pub(crate) fn alternation_literals(
16
68.4k
    info: &RegexInfo,
17
68.4k
    hirs: &[&Hir],
18
68.4k
) -> Option<Vec<Vec<u8>>> {
19
    use regex_syntax::hir::{HirKind, Literal};
20
21
    // Might as well skip the work below if we know we can't build an
22
    // Aho-Corasick searcher.
23
68.4k
    if !cfg!(feature = "perf-literal-multisubstring") {
24
0
        return None;
25
68.4k
    }
26
    // This is pretty hacky, but basically, if `is_alternation_literal` is
27
    // true, then we can make several assumptions about the structure of our
28
    // HIR. This is what justifies the `unreachable!` statements below.
29
68.4k
    if hirs.len() != 1
30
68.4k
        || !info.props()[0].look_set().is_empty()
31
36.5k
        || info.props()[0].explicit_captures_len() > 0
32
30.7k
        || !info.props()[0].is_alternation_literal()
33
516
        || info.config().get_match_kind() != MatchKind::LeftmostFirst
34
    {
35
67.8k
        return None;
36
516
    }
37
516
    let hir = &hirs[0];
38
516
    let alts = match *hir.kind() {
39
459
        HirKind::Alternation(ref alts) => alts,
40
57
        _ => return None, // one literal isn't worth it
41
    };
42
43
459
    let mut lits = vec![];
44
822k
    for alt in alts {
45
822k
        let mut lit = vec![];
46
822k
        match *alt.kind() {
47
822k
            HirKind::Literal(Literal(ref bytes)) => {
48
822k
                lit.extend_from_slice(bytes)
49
            }
50
0
            HirKind::Concat(ref exprs) => {
51
0
                for e in exprs {
52
0
                    match *e.kind() {
53
0
                        HirKind::Literal(Literal(ref bytes)) => {
54
0
                            lit.extend_from_slice(bytes);
55
0
                        }
56
0
                        _ => unreachable!("expected literal, got {e:?}"),
57
                    }
58
                }
59
            }
60
0
            _ => unreachable!("expected literal or concat, got {alt:?}"),
61
        }
62
822k
        lits.push(lit);
63
    }
64
    // Why do this? Well, when the number of literals is small, it's likely
65
    // that we'll use the lazy DFA which is in turn likely to be faster than
66
    // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
67
    // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
68
    // the latter because it is so hungry (in time and space), and the former
69
    // is decently fast, but not as fast as a well oiled lazy DFA.
70
    //
71
    // However, once the number starts getting large, the lazy DFA is likely
72
    // to start thrashing because of the modest default cache size. When
73
    // exactly does this happen? Dunno. But at whatever point that is (we make
74
    // a guess below based on ad hoc benchmarking), we'll want to cut over to
75
    // Aho-Corasick, where even the contiguous NFA is likely to do much better.
76
459
    if lits.len() < 3000 {
77
209
        debug!("skipping Aho-Corasick because there are too few literals");
78
209
        return None;
79
250
    }
80
250
    Some(lits)
81
68.4k
}