Coverage Report

Created: 2023-04-25 07:07

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-1.5.5/src/pikevm.rs
Line
Count
Source (jump to first uncovered line)
1
// This module implements the Pike VM. That is, it guarantees linear time
2
// search of a regex on any text with memory use proportional to the size of
3
// the regex.
4
//
5
// It is equal in power to the backtracking engine in this crate, except the
6
// backtracking engine is typically faster on small regexes/texts at the
7
// expense of a bigger memory footprint.
8
//
9
// It can do more than the DFA can (specifically, record capture locations
10
// and execute Unicode word boundary assertions), but at a slower speed.
11
// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12
// epsilon transitions. That is, the Pike VM engine can be in multiple states
13
// at once where as the DFA is only ever in one state at a time.
14
//
15
// Therefore, the Pike VM is generally treated as the fallback when the other
16
// matching engines either aren't feasible to run or are insufficient.
17
18
use std::mem;
19
20
use crate::exec::ProgramCache;
21
use crate::input::{Input, InputAt};
22
use crate::prog::{InstPtr, Program};
23
use crate::re_trait::Slot;
24
use crate::sparse::SparseSet;
25
26
/// An NFA simulation matching engine.
27
0
#[derive(Debug)]
28
pub struct Fsm<'r, I> {
29
    /// The sequence of opcodes (among other things) that is actually executed.
30
    ///
31
    /// The program may be byte oriented or Unicode codepoint oriented.
32
    prog: &'r Program,
33
    /// An explicit stack used for following epsilon transitions. (This is
34
    /// borrowed from the cache.)
35
    stack: &'r mut Vec<FollowEpsilon>,
36
    /// The input to search.
37
    input: I,
38
}
39
40
/// A cached allocation that can be reused on each execution.
41
0
#[derive(Clone, Debug)]
42
pub struct Cache {
43
    /// A pair of ordered sets for tracking NFA states.
44
    clist: Threads,
45
    nlist: Threads,
46
    /// An explicit stack used for following epsilon transitions.
47
    stack: Vec<FollowEpsilon>,
48
}
49
50
/// An ordered set of NFA states and their captures.
51
0
#[derive(Clone, Debug)]
52
struct Threads {
53
    /// An ordered set of opcodes (each opcode is an NFA state).
54
    set: SparseSet,
55
    /// Captures for every NFA state.
56
    ///
57
    /// It is stored in row-major order, where the columns are the capture
58
    /// slots and the rows are the states.
59
    caps: Vec<Slot>,
60
    /// The number of capture slots stored per thread. (Every capture has
61
    /// two slots.)
62
    slots_per_thread: usize,
63
}
64
65
/// A representation of an explicit stack frame when following epsilon
66
/// transitions. This is used to avoid recursion.
67
0
#[derive(Clone, Debug)]
68
enum FollowEpsilon {
69
    /// Follow transitions at the given instruction pointer.
70
    IP(InstPtr),
71
    /// Restore the capture slot with the given position in the input.
72
    Capture { slot: usize, pos: Slot },
73
}
74
75
impl Cache {
76
    /// Create a new allocation used by the NFA machine to record execution
77
    /// and captures.
78
0
    pub fn new(_prog: &Program) -> Self {
79
0
        Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80
0
    }
81
}
82
83
impl<'r, I: Input> Fsm<'r, I> {
84
    /// Execute the NFA matching engine.
85
    ///
86
    /// If there's a match, `exec` returns `true` and populates the given
87
    /// captures accordingly.
88
0
    pub fn exec(
89
0
        prog: &'r Program,
90
0
        cache: &ProgramCache,
91
0
        matches: &mut [bool],
92
0
        slots: &mut [Slot],
93
0
        quit_after_match: bool,
94
0
        input: I,
95
0
        start: usize,
96
0
        end: usize,
97
0
    ) -> bool {
98
0
        let mut cache = cache.borrow_mut();
99
0
        let cache = &mut cache.pikevm;
100
0
        cache.clist.resize(prog.len(), prog.captures.len());
101
0
        cache.nlist.resize(prog.len(), prog.captures.len());
102
0
        let at = input.at(start);
103
0
        Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
104
0
            &mut cache.clist,
105
0
            &mut cache.nlist,
106
0
            matches,
107
0
            slots,
108
0
            quit_after_match,
109
0
            at,
110
0
            end,
111
0
        )
112
0
    }
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec
113
114
0
    fn exec_(
115
0
        &mut self,
116
0
        mut clist: &mut Threads,
117
0
        mut nlist: &mut Threads,
118
0
        matches: &mut [bool],
119
0
        slots: &mut [Slot],
120
0
        quit_after_match: bool,
121
0
        mut at: InputAt,
122
0
        end: usize,
123
0
    ) -> bool {
124
0
        let mut matched = false;
125
0
        let mut all_matched = false;
126
0
        clist.set.clear();
127
0
        nlist.set.clear();
128
        'LOOP: loop {
129
0
            if clist.set.is_empty() {
130
                // Three ways to bail out when our current set of threads is
131
                // empty.
132
                //
133
                // 1. We have a match---so we're done exploring any possible
134
                //    alternatives. Time to quit. (We can't do this if we're
135
                //    looking for matches for multiple regexes, unless we know
136
                //    they all matched.)
137
                //
138
                // 2. If the expression starts with a '^' we can terminate as
139
                //    soon as the last thread dies.
140
0
                if (matched && matches.len() <= 1)
141
0
                    || all_matched
142
0
                    || (!at.is_start() && self.prog.is_anchored_start)
143
                {
144
0
                    break;
145
0
                }
146
0
147
0
                // 3. If there's a literal prefix for the program, try to
148
0
                //    jump ahead quickly. If it can't be found, then we can
149
0
                //    bail out early.
150
0
                if !self.prog.prefixes.is_empty() {
151
0
                    at = match self.input.prefix_at(&self.prog.prefixes, at) {
152
0
                        None => break,
153
0
                        Some(at) => at,
154
                    };
155
0
                }
156
0
            }
157
158
            // This simulates a preceding '.*?' for every regex by adding
159
            // a state starting at the current position in the input for the
160
            // beginning of the program only if we don't already have a match.
161
0
            if clist.set.is_empty()
162
0
                || (!self.prog.is_anchored_start && !all_matched)
163
0
            {
164
0
                self.add(&mut clist, slots, 0, at);
165
0
            }
166
            // The previous call to "add" actually inspects the position just
167
            // before the current character. For stepping through the machine,
168
            // we can to look at the current character, so we advance the
169
            // input.
170
0
            let at_next = self.input.at(at.next_pos());
171
0
            for i in 0..clist.set.len() {
172
0
                let ip = clist.set[i];
173
0
                if self.step(
174
0
                    &mut nlist,
175
0
                    matches,
176
0
                    slots,
177
0
                    clist.caps(ip),
178
0
                    ip,
179
0
                    at,
180
0
                    at_next,
181
0
                ) {
182
0
                    matched = true;
183
0
                    all_matched = all_matched || matches.iter().all(|&b| b);
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec_::{closure#0}
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec_::{closure#0}
184
0
                    if quit_after_match {
185
                        // If we only care if a match occurs (not its
186
                        // position), then we can quit right now.
187
0
                        break 'LOOP;
188
0
                    }
189
0
                    if self.prog.matches.len() == 1 {
190
                        // We don't need to check the rest of the threads
191
                        // in this set because we've matched something
192
                        // ("leftmost-first"). However, we still need to check
193
                        // threads in the next set to support things like
194
                        // greedy matching.
195
                        //
196
                        // This is only true on normal regexes. For regex sets,
197
                        // we need to mush on to observe other matches.
198
0
                        break;
199
0
                    }
200
0
                }
201
            }
202
0
            if at.pos() >= end {
203
0
                break;
204
0
            }
205
0
            at = at_next;
206
0
            mem::swap(clist, nlist);
207
0
            nlist.set.clear();
208
        }
209
0
        matched
210
0
    }
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec_
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec_
211
212
    /// Step through the input, one token (byte or codepoint) at a time.
213
    ///
214
    /// nlist is the set of states that will be processed on the next token
215
    /// in the input.
216
    ///
217
    /// caps is the set of captures passed by the caller of the NFA. They are
218
    /// written to only when a match state is visited.
219
    ///
220
    /// thread_caps is the set of captures set for the current NFA state, ip.
221
    ///
222
    /// at and at_next are the current and next positions in the input. at or
223
    /// at_next may be EOF.
224
0
    fn step(
225
0
        &mut self,
226
0
        nlist: &mut Threads,
227
0
        matches: &mut [bool],
228
0
        slots: &mut [Slot],
229
0
        thread_caps: &mut [Option<usize>],
230
0
        ip: usize,
231
0
        at: InputAt,
232
0
        at_next: InputAt,
233
0
    ) -> bool {
234
0
        use crate::prog::Inst::*;
235
0
        match self.prog[ip] {
236
0
            Match(match_slot) => {
237
0
                if match_slot < matches.len() {
238
0
                    matches[match_slot] = true;
239
0
                }
240
0
                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241
0
                    *slot = *val;
242
0
                }
243
0
                true
244
            }
245
0
            Char(ref inst) => {
246
0
                if inst.c == at.char() {
247
0
                    self.add(nlist, thread_caps, inst.goto, at_next);
248
0
                }
249
0
                false
250
            }
251
0
            Ranges(ref inst) => {
252
0
                if inst.matches(at.char()) {
253
0
                    self.add(nlist, thread_caps, inst.goto, at_next);
254
0
                }
255
0
                false
256
            }
257
0
            Bytes(ref inst) => {
258
0
                if let Some(b) = at.byte() {
259
0
                    if inst.matches(b) {
260
0
                        self.add(nlist, thread_caps, inst.goto, at_next);
261
0
                    }
262
0
                }
263
0
                false
264
            }
265
0
            EmptyLook(_) | Save(_) | Split(_) => false,
266
        }
267
0
    }
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::step
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::step
268
269
    /// Follows epsilon transitions and adds them for processing to nlist,
270
    /// starting at and including ip.
271
0
    fn add(
272
0
        &mut self,
273
0
        nlist: &mut Threads,
274
0
        thread_caps: &mut [Option<usize>],
275
0
        ip: usize,
276
0
        at: InputAt,
277
0
    ) {
278
0
        self.stack.push(FollowEpsilon::IP(ip));
279
0
        while let Some(frame) = self.stack.pop() {
280
0
            match frame {
281
0
                FollowEpsilon::IP(ip) => {
282
0
                    self.add_step(nlist, thread_caps, ip, at);
283
0
                }
284
0
                FollowEpsilon::Capture { slot, pos } => {
285
0
                    thread_caps[slot] = pos;
286
0
                }
287
            }
288
        }
289
0
    }
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::add
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::add
290
291
    /// A helper function for add that avoids excessive pushing to the stack.
292
0
    fn add_step(
293
0
        &mut self,
294
0
        nlist: &mut Threads,
295
0
        thread_caps: &mut [Option<usize>],
296
0
        mut ip: usize,
297
0
        at: InputAt,
298
0
    ) {
299
        // Instead of pushing and popping to the stack, we mutate ip as we
300
        // traverse the set of states. We only push to the stack when we
301
        // absolutely need recursion (restoring captures or following a
302
        // branch).
303
        use crate::prog::Inst::*;
304
0
        loop {
305
0
            // Don't visit states we've already added.
306
0
            if nlist.set.contains(ip) {
307
0
                return;
308
0
            }
309
0
            nlist.set.insert(ip);
310
0
            match self.prog[ip] {
311
0
                EmptyLook(ref inst) => {
312
0
                    if self.input.is_empty_match(at, inst) {
313
0
                        ip = inst.goto;
314
0
                    }
315
                }
316
0
                Save(ref inst) => {
317
0
                    if inst.slot < thread_caps.len() {
318
0
                        self.stack.push(FollowEpsilon::Capture {
319
0
                            slot: inst.slot,
320
0
                            pos: thread_caps[inst.slot],
321
0
                        });
322
0
                        thread_caps[inst.slot] = Some(at.pos());
323
0
                    }
324
0
                    ip = inst.goto;
325
                }
326
0
                Split(ref inst) => {
327
0
                    self.stack.push(FollowEpsilon::IP(inst.goto2));
328
0
                    ip = inst.goto1;
329
0
                }
330
                Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
331
0
                    let t = &mut nlist.caps(ip);
332
0
                    for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333
0
                        *slot = *val;
334
0
                    }
335
0
                    return;
336
                }
337
            }
338
        }
339
0
    }
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::add_step
Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::add_step
340
}
341
342
impl Threads {
343
0
    fn new() -> Self {
344
0
        Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
345
0
    }
346
347
0
    fn resize(&mut self, num_insts: usize, ncaps: usize) {
348
0
        if num_insts == self.set.capacity() {
349
0
            return;
350
0
        }
351
0
        self.slots_per_thread = ncaps * 2;
352
0
        self.set = SparseSet::new(num_insts);
353
0
        self.caps = vec![None; self.slots_per_thread * num_insts];
354
0
    }
355
356
0
    fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357
0
        let i = pc * self.slots_per_thread;
358
0
        &mut self.caps[i..i + self.slots_per_thread]
359
0
    }
360
}