/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-1.5.5/src/pikevm.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This module implements the Pike VM. That is, it guarantees linear time |
2 | | // search of a regex on any text with memory use proportional to the size of |
3 | | // the regex. |
4 | | // |
5 | | // It is equal in power to the backtracking engine in this crate, except the |
6 | | // backtracking engine is typically faster on small regexes/texts at the |
7 | | // expense of a bigger memory footprint. |
8 | | // |
9 | | // It can do more than the DFA can (specifically, record capture locations |
10 | | // and execute Unicode word boundary assertions), but at a slower speed. |
11 | | // Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding |
12 | | // epsilon transitions. That is, the Pike VM engine can be in multiple states |
13 | | // at once where as the DFA is only ever in one state at a time. |
14 | | // |
15 | | // Therefore, the Pike VM is generally treated as the fallback when the other |
16 | | // matching engines either aren't feasible to run or are insufficient. |
17 | | |
18 | | use std::mem; |
19 | | |
20 | | use crate::exec::ProgramCache; |
21 | | use crate::input::{Input, InputAt}; |
22 | | use crate::prog::{InstPtr, Program}; |
23 | | use crate::re_trait::Slot; |
24 | | use crate::sparse::SparseSet; |
25 | | |
26 | | /// An NFA simulation matching engine. |
27 | 0 | #[derive(Debug)] |
28 | | pub struct Fsm<'r, I> { |
29 | | /// The sequence of opcodes (among other things) that is actually executed. |
30 | | /// |
31 | | /// The program may be byte oriented or Unicode codepoint oriented. |
32 | | prog: &'r Program, |
33 | | /// An explicit stack used for following epsilon transitions. (This is |
34 | | /// borrowed from the cache.) |
35 | | stack: &'r mut Vec<FollowEpsilon>, |
36 | | /// The input to search. |
37 | | input: I, |
38 | | } |
39 | | |
40 | | /// A cached allocation that can be reused on each execution. |
41 | 0 | #[derive(Clone, Debug)] |
42 | | pub struct Cache { |
43 | | /// A pair of ordered sets for tracking NFA states. |
44 | | clist: Threads, |
45 | | nlist: Threads, |
46 | | /// An explicit stack used for following epsilon transitions. |
47 | | stack: Vec<FollowEpsilon>, |
48 | | } |
49 | | |
50 | | /// An ordered set of NFA states and their captures. |
51 | 0 | #[derive(Clone, Debug)] |
52 | | struct Threads { |
53 | | /// An ordered set of opcodes (each opcode is an NFA state). |
54 | | set: SparseSet, |
55 | | /// Captures for every NFA state. |
56 | | /// |
57 | | /// It is stored in row-major order, where the columns are the capture |
58 | | /// slots and the rows are the states. |
59 | | caps: Vec<Slot>, |
60 | | /// The number of capture slots stored per thread. (Every capture has |
61 | | /// two slots.) |
62 | | slots_per_thread: usize, |
63 | | } |
64 | | |
65 | | /// A representation of an explicit stack frame when following epsilon |
66 | | /// transitions. This is used to avoid recursion. |
67 | 0 | #[derive(Clone, Debug)] |
68 | | enum FollowEpsilon { |
69 | | /// Follow transitions at the given instruction pointer. |
70 | | IP(InstPtr), |
71 | | /// Restore the capture slot with the given position in the input. |
72 | | Capture { slot: usize, pos: Slot }, |
73 | | } |
74 | | |
75 | | impl Cache { |
76 | | /// Create a new allocation used by the NFA machine to record execution |
77 | | /// and captures. |
78 | 0 | pub fn new(_prog: &Program) -> Self { |
79 | 0 | Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } |
80 | 0 | } |
81 | | } |
82 | | |
83 | | impl<'r, I: Input> Fsm<'r, I> { |
84 | | /// Execute the NFA matching engine. |
85 | | /// |
86 | | /// If there's a match, `exec` returns `true` and populates the given |
87 | | /// captures accordingly. |
88 | 0 | pub fn exec( |
89 | 0 | prog: &'r Program, |
90 | 0 | cache: &ProgramCache, |
91 | 0 | matches: &mut [bool], |
92 | 0 | slots: &mut [Slot], |
93 | 0 | quit_after_match: bool, |
94 | 0 | input: I, |
95 | 0 | start: usize, |
96 | 0 | end: usize, |
97 | 0 | ) -> bool { |
98 | 0 | let mut cache = cache.borrow_mut(); |
99 | 0 | let cache = &mut cache.pikevm; |
100 | 0 | cache.clist.resize(prog.len(), prog.captures.len()); |
101 | 0 | cache.nlist.resize(prog.len(), prog.captures.len()); |
102 | 0 | let at = input.at(start); |
103 | 0 | Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_( |
104 | 0 | &mut cache.clist, |
105 | 0 | &mut cache.nlist, |
106 | 0 | matches, |
107 | 0 | slots, |
108 | 0 | quit_after_match, |
109 | 0 | at, |
110 | 0 | end, |
111 | 0 | ) |
112 | 0 | } Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec |
113 | | |
114 | 0 | fn exec_( |
115 | 0 | &mut self, |
116 | 0 | mut clist: &mut Threads, |
117 | 0 | mut nlist: &mut Threads, |
118 | 0 | matches: &mut [bool], |
119 | 0 | slots: &mut [Slot], |
120 | 0 | quit_after_match: bool, |
121 | 0 | mut at: InputAt, |
122 | 0 | end: usize, |
123 | 0 | ) -> bool { |
124 | 0 | let mut matched = false; |
125 | 0 | let mut all_matched = false; |
126 | 0 | clist.set.clear(); |
127 | 0 | nlist.set.clear(); |
128 | | 'LOOP: loop { |
129 | 0 | if clist.set.is_empty() { |
130 | | // Three ways to bail out when our current set of threads is |
131 | | // empty. |
132 | | // |
133 | | // 1. We have a match---so we're done exploring any possible |
134 | | // alternatives. Time to quit. (We can't do this if we're |
135 | | // looking for matches for multiple regexes, unless we know |
136 | | // they all matched.) |
137 | | // |
138 | | // 2. If the expression starts with a '^' we can terminate as |
139 | | // soon as the last thread dies. |
140 | 0 | if (matched && matches.len() <= 1) |
141 | 0 | || all_matched |
142 | 0 | || (!at.is_start() && self.prog.is_anchored_start) |
143 | | { |
144 | 0 | break; |
145 | 0 | } |
146 | 0 |
|
147 | 0 | // 3. If there's a literal prefix for the program, try to |
148 | 0 | // jump ahead quickly. If it can't be found, then we can |
149 | 0 | // bail out early. |
150 | 0 | if !self.prog.prefixes.is_empty() { |
151 | 0 | at = match self.input.prefix_at(&self.prog.prefixes, at) { |
152 | 0 | None => break, |
153 | 0 | Some(at) => at, |
154 | | }; |
155 | 0 | } |
156 | 0 | } |
157 | | |
158 | | // This simulates a preceding '.*?' for every regex by adding |
159 | | // a state starting at the current position in the input for the |
160 | | // beginning of the program only if we don't already have a match. |
161 | 0 | if clist.set.is_empty() |
162 | 0 | || (!self.prog.is_anchored_start && !all_matched) |
163 | 0 | { |
164 | 0 | self.add(&mut clist, slots, 0, at); |
165 | 0 | } |
166 | | // The previous call to "add" actually inspects the position just |
167 | | // before the current character. For stepping through the machine, |
168 | | // we can to look at the current character, so we advance the |
169 | | // input. |
170 | 0 | let at_next = self.input.at(at.next_pos()); |
171 | 0 | for i in 0..clist.set.len() { |
172 | 0 | let ip = clist.set[i]; |
173 | 0 | if self.step( |
174 | 0 | &mut nlist, |
175 | 0 | matches, |
176 | 0 | slots, |
177 | 0 | clist.caps(ip), |
178 | 0 | ip, |
179 | 0 | at, |
180 | 0 | at_next, |
181 | 0 | ) { |
182 | 0 | matched = true; |
183 | 0 | all_matched = all_matched || matches.iter().all(|&b| b); Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec_::{closure#0}Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec_::{closure#0} |
184 | 0 | if quit_after_match { |
185 | | // If we only care if a match occurs (not its |
186 | | // position), then we can quit right now. |
187 | 0 | break 'LOOP; |
188 | 0 | } |
189 | 0 | if self.prog.matches.len() == 1 { |
190 | | // We don't need to check the rest of the threads |
191 | | // in this set because we've matched something |
192 | | // ("leftmost-first"). However, we still need to check |
193 | | // threads in the next set to support things like |
194 | | // greedy matching. |
195 | | // |
196 | | // This is only true on normal regexes. For regex sets, |
197 | | // we need to mush on to observe other matches. |
198 | 0 | break; |
199 | 0 | } |
200 | 0 | } |
201 | | } |
202 | 0 | if at.pos() >= end { |
203 | 0 | break; |
204 | 0 | } |
205 | 0 | at = at_next; |
206 | 0 | mem::swap(clist, nlist); |
207 | 0 | nlist.set.clear(); |
208 | | } |
209 | 0 | matched |
210 | 0 | } Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec_ Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec_ |
211 | | |
212 | | /// Step through the input, one token (byte or codepoint) at a time. |
213 | | /// |
214 | | /// nlist is the set of states that will be processed on the next token |
215 | | /// in the input. |
216 | | /// |
217 | | /// caps is the set of captures passed by the caller of the NFA. They are |
218 | | /// written to only when a match state is visited. |
219 | | /// |
220 | | /// thread_caps is the set of captures set for the current NFA state, ip. |
221 | | /// |
222 | | /// at and at_next are the current and next positions in the input. at or |
223 | | /// at_next may be EOF. |
224 | 0 | fn step( |
225 | 0 | &mut self, |
226 | 0 | nlist: &mut Threads, |
227 | 0 | matches: &mut [bool], |
228 | 0 | slots: &mut [Slot], |
229 | 0 | thread_caps: &mut [Option<usize>], |
230 | 0 | ip: usize, |
231 | 0 | at: InputAt, |
232 | 0 | at_next: InputAt, |
233 | 0 | ) -> bool { |
234 | 0 | use crate::prog::Inst::*; |
235 | 0 | match self.prog[ip] { |
236 | 0 | Match(match_slot) => { |
237 | 0 | if match_slot < matches.len() { |
238 | 0 | matches[match_slot] = true; |
239 | 0 | } |
240 | 0 | for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { |
241 | 0 | *slot = *val; |
242 | 0 | } |
243 | 0 | true |
244 | | } |
245 | 0 | Char(ref inst) => { |
246 | 0 | if inst.c == at.char() { |
247 | 0 | self.add(nlist, thread_caps, inst.goto, at_next); |
248 | 0 | } |
249 | 0 | false |
250 | | } |
251 | 0 | Ranges(ref inst) => { |
252 | 0 | if inst.matches(at.char()) { |
253 | 0 | self.add(nlist, thread_caps, inst.goto, at_next); |
254 | 0 | } |
255 | 0 | false |
256 | | } |
257 | 0 | Bytes(ref inst) => { |
258 | 0 | if let Some(b) = at.byte() { |
259 | 0 | if inst.matches(b) { |
260 | 0 | self.add(nlist, thread_caps, inst.goto, at_next); |
261 | 0 | } |
262 | 0 | } |
263 | 0 | false |
264 | | } |
265 | 0 | EmptyLook(_) | Save(_) | Split(_) => false, |
266 | | } |
267 | 0 | } Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::step Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::step |
268 | | |
269 | | /// Follows epsilon transitions and adds them for processing to nlist, |
270 | | /// starting at and including ip. |
271 | 0 | fn add( |
272 | 0 | &mut self, |
273 | 0 | nlist: &mut Threads, |
274 | 0 | thread_caps: &mut [Option<usize>], |
275 | 0 | ip: usize, |
276 | 0 | at: InputAt, |
277 | 0 | ) { |
278 | 0 | self.stack.push(FollowEpsilon::IP(ip)); |
279 | 0 | while let Some(frame) = self.stack.pop() { |
280 | 0 | match frame { |
281 | 0 | FollowEpsilon::IP(ip) => { |
282 | 0 | self.add_step(nlist, thread_caps, ip, at); |
283 | 0 | } |
284 | 0 | FollowEpsilon::Capture { slot, pos } => { |
285 | 0 | thread_caps[slot] = pos; |
286 | 0 | } |
287 | | } |
288 | | } |
289 | 0 | } Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::add Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::add |
290 | | |
291 | | /// A helper function for add that avoids excessive pushing to the stack. |
292 | 0 | fn add_step( |
293 | 0 | &mut self, |
294 | 0 | nlist: &mut Threads, |
295 | 0 | thread_caps: &mut [Option<usize>], |
296 | 0 | mut ip: usize, |
297 | 0 | at: InputAt, |
298 | 0 | ) { |
299 | | // Instead of pushing and popping to the stack, we mutate ip as we |
300 | | // traverse the set of states. We only push to the stack when we |
301 | | // absolutely need recursion (restoring captures or following a |
302 | | // branch). |
303 | | use crate::prog::Inst::*; |
304 | 0 | loop { |
305 | 0 | // Don't visit states we've already added. |
306 | 0 | if nlist.set.contains(ip) { |
307 | 0 | return; |
308 | 0 | } |
309 | 0 | nlist.set.insert(ip); |
310 | 0 | match self.prog[ip] { |
311 | 0 | EmptyLook(ref inst) => { |
312 | 0 | if self.input.is_empty_match(at, inst) { |
313 | 0 | ip = inst.goto; |
314 | 0 | } |
315 | | } |
316 | 0 | Save(ref inst) => { |
317 | 0 | if inst.slot < thread_caps.len() { |
318 | 0 | self.stack.push(FollowEpsilon::Capture { |
319 | 0 | slot: inst.slot, |
320 | 0 | pos: thread_caps[inst.slot], |
321 | 0 | }); |
322 | 0 | thread_caps[inst.slot] = Some(at.pos()); |
323 | 0 | } |
324 | 0 | ip = inst.goto; |
325 | | } |
326 | 0 | Split(ref inst) => { |
327 | 0 | self.stack.push(FollowEpsilon::IP(inst.goto2)); |
328 | 0 | ip = inst.goto1; |
329 | 0 | } |
330 | | Match(_) | Char(_) | Ranges(_) | Bytes(_) => { |
331 | 0 | let t = &mut nlist.caps(ip); |
332 | 0 | for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { |
333 | 0 | *slot = *val; |
334 | 0 | } |
335 | 0 | return; |
336 | | } |
337 | | } |
338 | | } |
339 | 0 | } Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::add_step Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::add_step |
340 | | } |
341 | | |
342 | | impl Threads { |
343 | 0 | fn new() -> Self { |
344 | 0 | Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } |
345 | 0 | } |
346 | | |
347 | 0 | fn resize(&mut self, num_insts: usize, ncaps: usize) { |
348 | 0 | if num_insts == self.set.capacity() { |
349 | 0 | return; |
350 | 0 | } |
351 | 0 | self.slots_per_thread = ncaps * 2; |
352 | 0 | self.set = SparseSet::new(num_insts); |
353 | 0 | self.caps = vec![None; self.slots_per_thread * num_insts]; |
354 | 0 | } |
355 | | |
356 | 0 | fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { |
357 | 0 | let i = pc * self.slots_per_thread; |
358 | 0 | &mut self.caps[i..i + self.slots_per_thread] |
359 | 0 | } |
360 | | } |