/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/meta/literal.rs

Source (jump to first uncovered line)
use alloc::{vec, vec::Vec};

use regex_syntax::hir::Hir;

use crate::{meta::regex::RegexInfo, util::search::MatchKind};

/// Pull out an alternation of literals from the given sequence of HIR
/// expressions.
///
/// There are numerous ways for this to fail. Generally, this only applies
/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
/// are "too few" alternates, in which case, the regex engine is likely faster.
///
/// And currently, this only returns something when 'hirs.len() == 1'.
pub(crate) fn alternation_literals(
    info: &RegexInfo,
    hirs: &[&Hir],
) -> Option<Vec<Vec<u8>>> {
    use regex_syntax::hir::{HirKind, Literal};

    // Might as well skip the work below if we know we can't build an
    // Aho-Corasick searcher.
    if !cfg!(feature = "perf-literal-multisubstring") {
        return None;
    }
    // This is pretty hacky, but basically, if `is_alternation_literal` is
    // true, then we can make several assumptions about the structure of our
    // HIR. This is what justifies the `unreachable!` statements below.
    if hirs.len() != 1
        || !info.props()[0].look_set().is_empty()
        || info.props()[0].explicit_captures_len() > 0
        || !info.props()[0].is_alternation_literal()
        || info.config().get_match_kind() != MatchKind::LeftmostFirst
    {
        return None;
    }
    let hir = &hirs[0];
    let alts = match *hir.kind() {
        HirKind::Alternation(ref alts) => alts,
        _ => return None, // one literal isn't worth it
    };

    let mut lits = vec![];
    for alt in alts {
        let mut lit = vec![];
        match *alt.kind() {
            HirKind::Literal(Literal(ref bytes)) => {
                lit.extend_from_slice(bytes)
            }
            HirKind::Concat(ref exprs) => {
                for e in exprs {
                    match *e.kind() {
                        HirKind::Literal(Literal(ref bytes)) => {
                            lit.extend_from_slice(bytes);
                        }
                        _ => unreachable!("expected literal, got {:?}", e),
                    }
                }
            }
            _ => unreachable!("expected literal or concat, got {:?}", alt),
        }
        lits.push(lit);
    }
    // Why do this? Well, when the number of literals is small, it's likely
    // that we'll use the lazy DFA which is in turn likely to be faster than
    // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
    // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
    // the latter because it is so hungry (in time and space), and the former
    // is decently fast, but not as fast as a well oiled lazy DFA.
    //
    // However, once the number starts getting large, the lazy DFA is likely
    // to start thrashing because of the modest default cache size. When
    // exactly does this happen? Dunno. But at whatever point that is (we make
    // a guess below based on ad hoc benchmarking), we'll want to cut over to
    // Aho-Corasick, where even the contiguous NFA is likely to do much better.
    if lits.len() < 3000 {
        debug!("skipping Aho-Corasick because there are too few literals");
        return None;
    }
    Some(lits)
}

Coverage Report

Created: 2025-07-23 06:05

Line	Count	Source (jump to first uncovered line)
1		use alloc::{vec, vec::Vec};
2
3		use regex_syntax::hir::Hir;
4
5		use crate::{meta::regex::RegexInfo, util::search::MatchKind};
6
7		/// Pull out an alternation of literals from the given sequence of HIR
8		/// expressions.
9		///
10		/// There are numerous ways for this to fail. Generally, this only applies
11		/// to regexes of the form 'foo\|bar\|baz\|...\|quux'. It can also fail if there
12		/// are "too few" alternates, in which case, the regex engine is likely faster.
13		///
14		/// And currently, this only returns something when 'hirs.len() == 1'.
15	0	pub(crate) fn alternation_literals(
16	0	info: &RegexInfo,
17	0	hirs: &[&Hir],
18	0	) -> Option<Vec<Vec<u8>>> {
19		use regex_syntax::hir::{HirKind, Literal};
20
21		// Might as well skip the work below if we know we can't build an
22		// Aho-Corasick searcher.
23	0	if !cfg!(feature = "perf-literal-multisubstring") {
24	0	return None;
25	0	}
26	0	// This is pretty hacky, but basically, if `is_alternation_literal` is
27	0	// true, then we can make several assumptions about the structure of our
28	0	// HIR. This is what justifies the `unreachable!` statements below.
29	0	if hirs.len() != 1
30	0	\|\| !info.props()[0].look_set().is_empty()
31	0	\|\| info.props()[0].explicit_captures_len() > 0
32	0	\|\| !info.props()[0].is_alternation_literal()
33	0	\|\| info.config().get_match_kind() != MatchKind::LeftmostFirst
34		{
35	0	return None;
36	0	}
37	0	let hir = &hirs[0];
38	0	let alts = match *hir.kind() {
39	0	HirKind::Alternation(ref alts) => alts,
40	0	_ => return None, // one literal isn't worth it
41		};
42
43	0	let mut lits = vec![];
44	0	for alt in alts {
45	0	let mut lit = vec![];
46	0	match *alt.kind() {
47	0	HirKind::Literal(Literal(ref bytes)) => {
48	0	lit.extend_from_slice(bytes)
49		}
50	0	HirKind::Concat(ref exprs) => {
51	0	for e in exprs {
52	0	match *e.kind() {
53	0	HirKind::Literal(Literal(ref bytes)) => {
54	0	lit.extend_from_slice(bytes);
55	0	}
56	0	_ => unreachable!("expected literal, got {:?}", e),
57		}
58		}
59		}
60	0	_ => unreachable!("expected literal or concat, got {:?}", alt),
61		}
62	0	lits.push(lit);
63		}
64		// Why do this? Well, when the number of literals is small, it's likely
65		// that we'll use the lazy DFA which is in turn likely to be faster than
66		// Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
67		// a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
68		// the latter because it is so hungry (in time and space), and the former
69		// is decently fast, but not as fast as a well oiled lazy DFA.
70		//
71		// However, once the number starts getting large, the lazy DFA is likely
72		// to start thrashing because of the modest default cache size. When
73		// exactly does this happen? Dunno. But at whatever point that is (we make
74		// a guess below based on ad hoc benchmarking), we'll want to cut over to
75		// Aho-Corasick, where even the contiguous NFA is likely to do much better.
76	0	if lits.len() < 3000 {
77		debug!("skipping Aho-Corasick because there are too few literals");
78	0	return None;
79	0	}
80	0	Some(lits)
81	0	}