Coverage Report

Created: 2025-12-12 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/regex/regex-automata/src/meta/reverse_inner.rs
Line
Count
Source
1
/*!
2
A module dedicated to plucking inner literals out of a regex pattern, and
3
then constructing a prefilter for them. We also include a regex pattern
4
"prefix" that corresponds to the bits of the regex that need to match before
5
the literals do. The reverse inner optimization then proceeds by looking for
6
matches of the inner literal(s), and then doing a reverse search of the prefix
7
from the start of the literal match to find the overall start position of the
8
match.
9
10
The essential invariant we want to uphold here is that the literals we return
11
reflect a set where *at least* one of them must match in order for the overall
12
regex to match. We also need to maintain the invariant that the regex prefix
13
returned corresponds to the entirety of the regex up until the literals we
14
return.
15
16
This somewhat limits what we can do. That is, if we a regex like
17
`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
18
from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
19
we have a regex like \d+@!|\w+%%`, then we get kind of stymied. Technically,
20
we could still extract `{@!, %%}`, and it is true that at least of them must
21
match. But then, what is our regex prefix? Again, in theory, that could be
22
`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
23
matches, and `\w+` only matches when `%%` matches.
24
25
All of that is technically possible to do, but it seemingly requires a lot of
26
sophistication and machinery. Probably the way to tackle that is with some kind
27
of formalism and approach this problem more generally.
28
29
For now, the code below basically just looks for a top-level concatenation.
30
And if it can find one, it looks for literals in each of the direct child
31
sub-expressions of that concatenation. If some good ones are found, we return
32
those and a concatenation of the Hir expressions seen up to that point.
33
*/
34
35
use alloc::vec::Vec;
36
37
use regex_syntax::hir::{self, literal, Hir, HirKind};
38
39
use crate::{util::prefilter::Prefilter, MatchKind};
40
41
/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
42
/// one was found, then a concatenation of the HIR expressions that precede it
43
/// is returned.
44
///
45
/// The idea here is that the prefilter returned can be used to find candidate
46
/// matches. And then the HIR returned can be used to build a reverse regex
47
/// matcher, which will find the start of the candidate match. Finally, the
48
/// match still has to be confirmed with a normal anchored forward scan to find
49
/// the end position of the match.
50
///
51
/// Note that this assumes leftmost-first match semantics, so callers must
52
/// not call this otherwise.
53
44.1k
pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
54
44.1k
    if hirs.len() != 1 {
55
0
        debug!(
56
0
            "skipping reverse inner optimization since it only \
57
0
       supports 1 pattern, {} were given",
58
0
            hirs.len(),
59
        );
60
0
        return None;
61
44.1k
    }
62
44.1k
    let mut concat = match top_concat(hirs[0]) {
63
17.5k
        Some(concat) => concat,
64
        None => {
65
26.6k
            debug!(
66
0
                "skipping reverse inner optimization because a top-level \
67
0
           concatenation could not found",
68
            );
69
26.6k
            return None;
70
        }
71
    };
72
    // We skip the first HIR because if it did have a prefix prefilter in it,
73
    // we probably wouldn't be here looking for an inner prefilter.
74
131k
    for i in 1..concat.len() {
75
131k
        let hir = &concat[i];
76
131k
        let pre = match prefilter(hir) {
77
74.5k
            None => continue,
78
56.7k
            Some(pre) => pre,
79
        };
80
        // Even if we got a prefilter, if it isn't consider "fast," then we
81
        // probably don't want to bother with it. Namely, since the reverse
82
        // inner optimization requires some overhead, it likely only makes
83
        // sense if the prefilter scan itself is (believed) to be much faster
84
        // than the regex engine.
85
56.7k
        if !pre.is_fast() {
86
51.9k
            debug!(
87
0
                "skipping extracted inner prefilter because \
88
0
         it probably isn't fast"
89
            );
90
51.9k
            continue;
91
4.80k
        }
92
4.80k
        let concat_suffix = Hir::concat(concat.split_off(i));
93
4.80k
        let concat_prefix = Hir::concat(concat);
94
        // Look for a prefilter again. Why? Because above we only looked for
95
        // a prefilter on the individual 'hir', but we might be able to find
96
        // something better and more discriminatory by looking at the entire
97
        // suffix. We don't do this above to avoid making this loop worst case
98
        // quadratic in the length of 'concat'.
99
4.80k
        let pre2 = match prefilter(&concat_suffix) {
100
199
            None => pre,
101
4.60k
            Some(pre2) => {
102
4.60k
                if pre2.is_fast() {
103
4.31k
                    pre2
104
                } else {
105
284
                    pre
106
                }
107
            }
108
        };
109
4.80k
        return Some((concat_prefix, pre2));
110
    }
111
12.7k
    debug!(
112
0
        "skipping reverse inner optimization because a top-level \
113
0
       sub-expression with a fast prefilter could not be found"
114
    );
115
12.7k
    None
116
44.1k
}
117
118
/// Attempt to extract a prefilter from an HIR expression.
119
///
120
/// We do a little massaging here to do our best that the prefilter we get out
121
/// of this is *probably* fast. Basically, the false positive rate has a much
122
/// higher impact for things like the reverse inner optimization because more
123
/// work needs to potentially be done for each candidate match.
124
///
125
/// Note that this assumes leftmost-first match semantics, so callers must
126
/// not call this otherwise.
127
136k
fn prefilter(hir: &Hir) -> Option<Prefilter> {
128
136k
    let mut extractor = literal::Extractor::new();
129
136k
    extractor.kind(literal::ExtractKind::Prefix);
130
136k
    let mut prefixes = extractor.extract(hir);
131
136k
    debug!(
132
0
        "inner prefixes (len={:?}) extracted before optimization: {:?}",
133
0
        prefixes.len(),
134
        prefixes
135
    );
136
    // Since these are inner literals, we know they cannot be exact. But the
137
    // extractor doesn't know this. We mark them as inexact because this might
138
    // impact literal optimization. Namely, optimization weights "all literals
139
    // are exact" as very high, because it presumes that any match results in
140
    // an overall match. But of course, that is not the case here.
141
    //
142
    // In practice, this avoids plucking out a ASCII-only \s as an alternation
143
    // of single-byte whitespace characters.
144
136k
    prefixes.make_inexact();
145
136k
    prefixes.optimize_for_prefix_by_preference();
146
136k
    debug!(
147
0
        "inner prefixes (len={:?}) extracted after optimization: {:?}",
148
0
        prefixes.len(),
149
        prefixes
150
    );
151
136k
    prefixes
152
136k
        .literals()
153
136k
        .and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
154
136k
}
155
156
/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
157
/// try to return one even if it's embedded in a capturing group, but is
158
/// otherwise pretty conservative in what is returned.
159
///
160
/// The HIR returned is a complete copy of the concat with all capturing
161
/// groups removed. In effect, the concat returned is "flattened" with respect
162
/// to capturing groups. This makes the detection logic above for prefixes
163
/// a bit simpler, and it works because 1) capturing groups never influence
164
/// whether a match occurs or not and 2) capturing groups are not used when
165
/// doing the reverse inner search to find the start of the match.
166
44.1k
fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
167
    loop {
168
56.9k
        hir = match hir.kind() {
169
            HirKind::Empty
170
            | HirKind::Literal(_)
171
            | HirKind::Class(_)
172
            | HirKind::Look(_)
173
            | HirKind::Repetition(_)
174
26.4k
            | HirKind::Alternation(_) => return None,
175
12.7k
            HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
176
17.7k
            HirKind::Concat(ref subs) => {
177
                // We are careful to only do the flattening/copy when we know
178
                // we have a "top level" concat we can inspect. This avoids
179
                // doing extra work in cases where we definitely won't use it.
180
                // (This might still be wasted work if we can't go on to find
181
                // some literals to extract.)
182
17.7k
                let concat =
183
193k
                    Hir::concat(subs.iter().map(|h| flatten(h)).collect());
184
17.7k
                return match concat.into_kind() {
185
17.5k
                    HirKind::Concat(xs) => Some(xs),
186
                    // It is actually possible for this case to occur, because
187
                    // 'Hir::concat' might simplify the expression to the point
188
                    // that concatenations are actually removed. One wonders
189
                    // whether this leads to other cases where we should be
190
                    // extracting literals, but in theory, I believe if we do
191
                    // get here, then it means that a "real" prefilter failed
192
                    // to be extracted and we should probably leave well enough
193
                    // alone. (A "real" prefilter is unbothered by "top-level
194
                    // concats" and "capturing groups.")
195
262
                    _ => return None,
196
                };
197
            }
198
        };
199
    }
200
44.1k
}
201
202
/// Returns a copy of the given HIR but with all capturing groups removed.
203
503k
fn flatten(hir: &Hir) -> Hir {
204
503k
    match hir.kind() {
205
17.7k
        HirKind::Empty => Hir::empty(),
206
53.3k
        HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
207
228k
        HirKind::Class(ref x) => Hir::class(x.clone()),
208
77.3k
        HirKind::Look(ref x) => Hir::look(x.clone()),
209
49.3k
        HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
210
        // This is the interesting case. We just drop the group information
211
        // entirely and use the child HIR itself.
212
34.7k
        HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub),
213
17.9k
        HirKind::Alternation(ref xs) => {
214
68.0k
            Hir::alternation(xs.iter().map(|x| flatten(x)).collect())
215
        }
216
23.9k
        HirKind::Concat(ref xs) => {
217
157k
            Hir::concat(xs.iter().map(|x| flatten(x)).collect())
218
        }
219
    }
220
503k
}