/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-1.5.5/src/pikevm.rs

Source (jump to first uncovered line)
// This module implements the Pike VM. That is, it guarantees linear time
// search of a regex on any text with memory use proportional to the size of
// the regex.
//
// It is equal in power to the backtracking engine in this crate, except the
// backtracking engine is typically faster on small regexes/texts at the
// expense of a bigger memory footprint.
//
// It can do more than the DFA can (specifically, record capture locations
// and execute Unicode word boundary assertions), but at a slower speed.
// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
// epsilon transitions. That is, the Pike VM engine can be in multiple states
// at once where as the DFA is only ever in one state at a time.
//
// Therefore, the Pike VM is generally treated as the fallback when the other
// matching engines either aren't feasible to run or are insufficient.

use std::mem;

use crate::exec::ProgramCache;
use crate::input::{Input, InputAt};
use crate::prog::{InstPtr, Program};
use crate::re_trait::Slot;
use crate::sparse::SparseSet;

/// An NFA simulation matching engine.
#[derive(Debug)]
pub struct Fsm<'r, I> {
    /// The sequence of opcodes (among other things) that is actually executed.
    ///
    /// The program may be byte oriented or Unicode codepoint oriented.
    prog: &'r Program,
    /// An explicit stack used for following epsilon transitions. (This is
    /// borrowed from the cache.)
    stack: &'r mut Vec<FollowEpsilon>,
    /// The input to search.
    input: I,
}

/// A cached allocation that can be reused on each execution.
#[derive(Clone, Debug)]
pub struct Cache {
    /// A pair of ordered sets for tracking NFA states.
    clist: Threads,
    nlist: Threads,
    /// An explicit stack used for following epsilon transitions.
    stack: Vec<FollowEpsilon>,
}

/// An ordered set of NFA states and their captures.
#[derive(Clone, Debug)]
struct Threads {
    /// An ordered set of opcodes (each opcode is an NFA state).
    set: SparseSet,
    /// Captures for every NFA state.
    ///
    /// It is stored in row-major order, where the columns are the capture
    /// slots and the rows are the states.
    caps: Vec<Slot>,
    /// The number of capture slots stored per thread. (Every capture has
    /// two slots.)
    slots_per_thread: usize,
}

/// A representation of an explicit stack frame when following epsilon
/// transitions. This is used to avoid recursion.
#[derive(Clone, Debug)]
enum FollowEpsilon {
    /// Follow transitions at the given instruction pointer.
    IP(InstPtr),
    /// Restore the capture slot with the given position in the input.
    Capture { slot: usize, pos: Slot },
}

impl Cache {
    /// Create a new allocation used by the NFA machine to record execution
    /// and captures.
    pub fn new(_prog: &Program) -> Self {
        Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
    }
}

impl<'r, I: Input> Fsm<'r, I> {
    /// Execute the NFA matching engine.
    ///
    /// If there's a match, `exec` returns `true` and populates the given
    /// captures accordingly.
    pub fn exec(
        prog: &'r Program,
        cache: &ProgramCache,
        matches: &mut [bool],
        slots: &mut [Slot],
        quit_after_match: bool,
        input: I,
        start: usize,
        end: usize,
    ) -> bool {
        let mut cache = cache.borrow_mut();
        let cache = &mut cache.pikevm;
        cache.clist.resize(prog.len(), prog.captures.len());
        cache.nlist.resize(prog.len(), prog.captures.len());
        let at = input.at(start);
        Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
            &mut cache.clist,
            &mut cache.nlist,
            matches,
            slots,
            quit_after_match,
            at,
            end,
        )
    }

    fn exec_(
        &mut self,
        mut clist: &mut Threads,
        mut nlist: &mut Threads,
        matches: &mut [bool],
        slots: &mut [Slot],
        quit_after_match: bool,
        mut at: InputAt,
        end: usize,
    ) -> bool {
        let mut matched = false;
        let mut all_matched = false;
        clist.set.clear();
        nlist.set.clear();
        'LOOP: loop {
            if clist.set.is_empty() {
                // Three ways to bail out when our current set of threads is
                // empty.
                //
                // 1. We have a match---so we're done exploring any possible
                //    alternatives. Time to quit. (We can't do this if we're
                //    looking for matches for multiple regexes, unless we know
                //    they all matched.)
                //
                // 2. If the expression starts with a '^' we can terminate as
                //    soon as the last thread dies.
                if (matched && matches.len() <= 1)
                    || all_matched
                    || (!at.is_start() && self.prog.is_anchored_start)
                {
                    break;
                }

                // 3. If there's a literal prefix for the program, try to
                //    jump ahead quickly. If it can't be found, then we can
                //    bail out early.
                if !self.prog.prefixes.is_empty() {
                    at = match self.input.prefix_at(&self.prog.prefixes, at) {
                        None => break,
                        Some(at) => at,
                    };
                }
            }

            // This simulates a preceding '.*?' for every regex by adding
            // a state starting at the current position in the input for the
            // beginning of the program only if we don't already have a match.
            if clist.set.is_empty()
                || (!self.prog.is_anchored_start && !all_matched)
            {
                self.add(&mut clist, slots, 0, at);
            }
            // The previous call to "add" actually inspects the position just
            // before the current character. For stepping through the machine,
            // we can to look at the current character, so we advance the
            // input.
            let at_next = self.input.at(at.next_pos());
            for i in 0..clist.set.len() {
                let ip = clist.set[i];
                if self.step(
                    &mut nlist,
                    matches,
                    slots,
                    clist.caps(ip),
                    ip,
                    at,
                    at_next,
                ) {
                    matched = true;
                    all_matched = all_matched || matches.iter().all(|&b| b);
                    if quit_after_match {
                        // If we only care if a match occurs (not its
                        // position), then we can quit right now.
                        break 'LOOP;
                    }
                    if self.prog.matches.len() == 1 {
                        // We don't need to check the rest of the threads
                        // in this set because we've matched something
                        // ("leftmost-first"). However, we still need to check
                        // threads in the next set to support things like
                        // greedy matching.
                        //
                        // This is only true on normal regexes. For regex sets,
                        // we need to mush on to observe other matches.
                        break;
                    }
                }
            }
            if at.pos() >= end {
                break;
            }
            at = at_next;
            mem::swap(clist, nlist);
            nlist.set.clear();
        }
        matched
    }

    /// Step through the input, one token (byte or codepoint) at a time.
    ///
    /// nlist is the set of states that will be processed on the next token
    /// in the input.
    ///
    /// caps is the set of captures passed by the caller of the NFA. They are
    /// written to only when a match state is visited.
    ///
    /// thread_caps is the set of captures set for the current NFA state, ip.
    ///
    /// at and at_next are the current and next positions in the input. at or
    /// at_next may be EOF.
    fn step(
        &mut self,
        nlist: &mut Threads,
        matches: &mut [bool],
        slots: &mut [Slot],
        thread_caps: &mut [Option<usize>],
        ip: usize,
        at: InputAt,
        at_next: InputAt,
    ) -> bool {
        use crate::prog::Inst::*;
        match self.prog[ip] {
            Match(match_slot) => {
                if match_slot < matches.len() {
                    matches[match_slot] = true;
                }
                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
                    *slot = *val;
                }
                true
            }
            Char(ref inst) => {
                if inst.c == at.char() {
                    self.add(nlist, thread_caps, inst.goto, at_next);
                }
                false
            }
            Ranges(ref inst) => {
                if inst.matches(at.char()) {
                    self.add(nlist, thread_caps, inst.goto, at_next);
                }
                false
            }
            Bytes(ref inst) => {
                if let Some(b) = at.byte() {
                    if inst.matches(b) {
                        self.add(nlist, thread_caps, inst.goto, at_next);
                    }
                }
                false
            }
            EmptyLook(_) | Save(_) | Split(_) => false,
        }
    }

    /// Follows epsilon transitions and adds them for processing to nlist,
    /// starting at and including ip.
    fn add(
        &mut self,
        nlist: &mut Threads,
        thread_caps: &mut [Option<usize>],
        ip: usize,
        at: InputAt,
    ) {
        self.stack.push(FollowEpsilon::IP(ip));
        while let Some(frame) = self.stack.pop() {
            match frame {
                FollowEpsilon::IP(ip) => {
                    self.add_step(nlist, thread_caps, ip, at);
                }
                FollowEpsilon::Capture { slot, pos } => {
                    thread_caps[slot] = pos;
                }
            }
        }
    }

    /// A helper function for add that avoids excessive pushing to the stack.
    fn add_step(
        &mut self,
        nlist: &mut Threads,
        thread_caps: &mut [Option<usize>],
        mut ip: usize,
        at: InputAt,
    ) {
        // Instead of pushing and popping to the stack, we mutate ip as we
        // traverse the set of states. We only push to the stack when we
        // absolutely need recursion (restoring captures or following a
        // branch).
        use crate::prog::Inst::*;
        loop {
            // Don't visit states we've already added.
            if nlist.set.contains(ip) {
                return;
            }
            nlist.set.insert(ip);
            match self.prog[ip] {
                EmptyLook(ref inst) => {
                    if self.input.is_empty_match(at, inst) {
                        ip = inst.goto;
                    }
                }
                Save(ref inst) => {
                    if inst.slot < thread_caps.len() {
                        self.stack.push(FollowEpsilon::Capture {
                            slot: inst.slot,
                            pos: thread_caps[inst.slot],
                        });
                        thread_caps[inst.slot] = Some(at.pos());
                    }
                    ip = inst.goto;
                }
                Split(ref inst) => {
                    self.stack.push(FollowEpsilon::IP(inst.goto2));
                    ip = inst.goto1;
                }
                Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
                    let t = &mut nlist.caps(ip);
                    for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
                        *slot = *val;
                    }
                    return;
                }
            }
        }
    }
}

impl Threads {
    fn new() -> Self {
        Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
    }

    fn resize(&mut self, num_insts: usize, ncaps: usize) {
        if num_insts == self.set.capacity() {
            return;
        }
        self.slots_per_thread = ncaps * 2;
        self.set = SparseSet::new(num_insts);
        self.caps = vec![None; self.slots_per_thread * num_insts];
    }

    fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
        let i = pc * self.slots_per_thread;
        &mut self.caps[i..i + self.slots_per_thread]
    }
}

Coverage Report

Created: 2023-04-25 07:07

Line	Count	Source (jump to first uncovered line)
1		// This module implements the Pike VM. That is, it guarantees linear time
2		// search of a regex on any text with memory use proportional to the size of
3		// the regex.
4		//
5		// It is equal in power to the backtracking engine in this crate, except the
6		// backtracking engine is typically faster on small regexes/texts at the
7		// expense of a bigger memory footprint.
8		//
9		// It can do more than the DFA can (specifically, record capture locations
10		// and execute Unicode word boundary assertions), but at a slower speed.
11		// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12		// epsilon transitions. That is, the Pike VM engine can be in multiple states
13		// at once where as the DFA is only ever in one state at a time.
14		//
15		// Therefore, the Pike VM is generally treated as the fallback when the other
16		// matching engines either aren't feasible to run or are insufficient.
17
18		use std::mem;
19
20		use crate::exec::ProgramCache;
21		use crate::input::{Input, InputAt};
22		use crate::prog::{InstPtr, Program};
23		use crate::re_trait::Slot;
24		use crate::sparse::SparseSet;
25
26		/// An NFA simulation matching engine.
27	0	#[derive(Debug)]
28		pub struct Fsm<'r, I> {
29		/// The sequence of opcodes (among other things) that is actually executed.
30		///
31		/// The program may be byte oriented or Unicode codepoint oriented.
32		prog: &'r Program,
33		/// An explicit stack used for following epsilon transitions. (This is
34		/// borrowed from the cache.)
35		stack: &'r mut Vec<FollowEpsilon>,
36		/// The input to search.
37		input: I,
38		}
39
40		/// A cached allocation that can be reused on each execution.
41	0	#[derive(Clone, Debug)]
42		pub struct Cache {
43		/// A pair of ordered sets for tracking NFA states.
44		clist: Threads,
45		nlist: Threads,
46		/// An explicit stack used for following epsilon transitions.
47		stack: Vec<FollowEpsilon>,
48		}
49
50		/// An ordered set of NFA states and their captures.
51	0	#[derive(Clone, Debug)]
52		struct Threads {
53		/// An ordered set of opcodes (each opcode is an NFA state).
54		set: SparseSet,
55		/// Captures for every NFA state.
56		///
57		/// It is stored in row-major order, where the columns are the capture
58		/// slots and the rows are the states.
59		caps: Vec<Slot>,
60		/// The number of capture slots stored per thread. (Every capture has
61		/// two slots.)
62		slots_per_thread: usize,
63		}
64
65		/// A representation of an explicit stack frame when following epsilon
66		/// transitions. This is used to avoid recursion.
67	0	#[derive(Clone, Debug)]
68		enum FollowEpsilon {
69		/// Follow transitions at the given instruction pointer.
70		IP(InstPtr),
71		/// Restore the capture slot with the given position in the input.
72		Capture { slot: usize, pos: Slot },
73		}
74
75		impl Cache {
76		/// Create a new allocation used by the NFA machine to record execution
77		/// and captures.
78	0	pub fn new(_prog: &Program) -> Self {
79	0	Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80	0	}
81		}
82
83		impl<'r, I: Input> Fsm<'r, I> {
84		/// Execute the NFA matching engine.
85		///
86		/// If there's a match, `exec` returns `true` and populates the given
87		/// captures accordingly.
88	0	pub fn exec(
89	0	prog: &'r Program,
90	0	cache: &ProgramCache,
91	0	matches: &mut [bool],
92	0	slots: &mut [Slot],
93	0	quit_after_match: bool,
94	0	input: I,
95	0	start: usize,
96	0	end: usize,
97	0	) -> bool {
98	0	let mut cache = cache.borrow_mut();
99	0	let cache = &mut cache.pikevm;
100	0	cache.clist.resize(prog.len(), prog.captures.len());
101	0	cache.nlist.resize(prog.len(), prog.captures.len());
102	0	let at = input.at(start);
103	0	Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
104	0	&mut cache.clist,
105	0	&mut cache.nlist,
106	0	matches,
107	0	slots,
108	0	quit_after_match,
109	0	at,
110	0	end,
111	0	)
112	0	} Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec
113
114	0	fn exec_(
115	0	&mut self,
116	0	mut clist: &mut Threads,
117	0	mut nlist: &mut Threads,
118	0	matches: &mut [bool],
119	0	slots: &mut [Slot],
120	0	quit_after_match: bool,
121	0	mut at: InputAt,
122	0	end: usize,
123	0	) -> bool {
124	0	let mut matched = false;
125	0	let mut all_matched = false;
126	0	clist.set.clear();
127	0	nlist.set.clear();
128		'LOOP: loop {
129	0	if clist.set.is_empty() {
130		// Three ways to bail out when our current set of threads is
131		// empty.
132		//
133		// 1. We have a match---so we're done exploring any possible
134		// alternatives. Time to quit. (We can't do this if we're
135		// looking for matches for multiple regexes, unless we know
136		// they all matched.)
137		//
138		// 2. If the expression starts with a '^' we can terminate as
139		// soon as the last thread dies.
140	0	if (matched && matches.len() <= 1)
141	0	\|\| all_matched
142	0	\|\| (!at.is_start() && self.prog.is_anchored_start)
143		{
144	0	break;
145	0	}
146	0
147	0	// 3. If there's a literal prefix for the program, try to
148	0	// jump ahead quickly. If it can't be found, then we can
149	0	// bail out early.
150	0	if !self.prog.prefixes.is_empty() {
151	0	at = match self.input.prefix_at(&self.prog.prefixes, at) {
152	0	None => break,
153	0	Some(at) => at,
154		};
155	0	}
156	0	}
157
158		// This simulates a preceding '.*?' for every regex by adding
159		// a state starting at the current position in the input for the
160		// beginning of the program only if we don't already have a match.
161	0	if clist.set.is_empty()
162	0	\|\| (!self.prog.is_anchored_start && !all_matched)
163	0	{
164	0	self.add(&mut clist, slots, 0, at);
165	0	}
166		// The previous call to "add" actually inspects the position just
167		// before the current character. For stepping through the machine,
168		// we can to look at the current character, so we advance the
169		// input.
170	0	let at_next = self.input.at(at.next_pos());
171	0	for i in 0..clist.set.len() {
172	0	let ip = clist.set[i];
173	0	if self.step(
174	0	&mut nlist,
175	0	matches,
176	0	slots,
177	0	clist.caps(ip),
178	0	ip,
179	0	at,
180	0	at_next,
181	0	) {
182	0	matched = true;
183	0	all_matched = all_matched \|\| matches.iter().all(\|&b\| b); Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec_::{closure#0} Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec_::{closure#0}
184	0	if quit_after_match {
185		// If we only care if a match occurs (not its
186		// position), then we can quit right now.
187	0	break 'LOOP;
188	0	}
189	0	if self.prog.matches.len() == 1 {
190		// We don't need to check the rest of the threads
191		// in this set because we've matched something
192		// ("leftmost-first"). However, we still need to check
193		// threads in the next set to support things like
194		// greedy matching.
195		//
196		// This is only true on normal regexes. For regex sets,
197		// we need to mush on to observe other matches.
198	0	break;
199	0	}
200	0	}
201		}
202	0	if at.pos() >= end {
203	0	break;
204	0	}
205	0	at = at_next;
206	0	mem::swap(clist, nlist);
207	0	nlist.set.clear();
208		}
209	0	matched
210	0	} Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::exec_ Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::exec_
211
212		/// Step through the input, one token (byte or codepoint) at a time.
213		///
214		/// nlist is the set of states that will be processed on the next token
215		/// in the input.
216		///
217		/// caps is the set of captures passed by the caller of the NFA. They are
218		/// written to only when a match state is visited.
219		///
220		/// thread_caps is the set of captures set for the current NFA state, ip.
221		///
222		/// at and at_next are the current and next positions in the input. at or
223		/// at_next may be EOF.
224	0	fn step(
225	0	&mut self,
226	0	nlist: &mut Threads,
227	0	matches: &mut [bool],
228	0	slots: &mut [Slot],
229	0	thread_caps: &mut [Option<usize>],
230	0	ip: usize,
231	0	at: InputAt,
232	0	at_next: InputAt,
233	0	) -> bool {
234	0	use crate::prog::Inst::*;
235	0	match self.prog[ip] {
236	0	Match(match_slot) => {
237	0	if match_slot < matches.len() {
238	0	matches[match_slot] = true;
239	0	}
240	0	for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241	0	slot = val;
242	0	}
243	0	true
244		}
245	0	Char(ref inst) => {
246	0	if inst.c == at.char() {
247	0	self.add(nlist, thread_caps, inst.goto, at_next);
248	0	}
249	0	false
250		}
251	0	Ranges(ref inst) => {
252	0	if inst.matches(at.char()) {
253	0	self.add(nlist, thread_caps, inst.goto, at_next);
254	0	}
255	0	false
256		}
257	0	Bytes(ref inst) => {
258	0	if let Some(b) = at.byte() {
259	0	if inst.matches(b) {
260	0	self.add(nlist, thread_caps, inst.goto, at_next);
261	0	}
262	0	}
263	0	false
264		}
265	0	EmptyLook(_) \| Save(_) \| Split(_) => false,
266		}
267	0	} Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::step Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::step
268
269		/// Follows epsilon transitions and adds them for processing to nlist,
270		/// starting at and including ip.
271	0	fn add(
272	0	&mut self,
273	0	nlist: &mut Threads,
274	0	thread_caps: &mut [Option<usize>],
275	0	ip: usize,
276	0	at: InputAt,
277	0	) {
278	0	self.stack.push(FollowEpsilon::IP(ip));
279	0	while let Some(frame) = self.stack.pop() {
280	0	match frame {
281	0	FollowEpsilon::IP(ip) => {
282	0	self.add_step(nlist, thread_caps, ip, at);
283	0	}
284	0	FollowEpsilon::Capture { slot, pos } => {
285	0	thread_caps[slot] = pos;
286	0	}
287		}
288		}
289	0	} Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::add Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::add
290
291		/// A helper function for add that avoids excessive pushing to the stack.
292	0	fn add_step(
293	0	&mut self,
294	0	nlist: &mut Threads,
295	0	thread_caps: &mut [Option<usize>],
296	0	mut ip: usize,
297	0	at: InputAt,
298	0	) {
299		// Instead of pushing and popping to the stack, we mutate ip as we
300		// traverse the set of states. We only push to the stack when we
301		// absolutely need recursion (restoring captures or following a
302		// branch).
303		use crate::prog::Inst::*;
304	0	loop {
305	0	// Don't visit states we've already added.
306	0	if nlist.set.contains(ip) {
307	0	return;
308	0	}
309	0	nlist.set.insert(ip);
310	0	match self.prog[ip] {
311	0	EmptyLook(ref inst) => {
312	0	if self.input.is_empty_match(at, inst) {
313	0	ip = inst.goto;
314	0	}
315		}
316	0	Save(ref inst) => {
317	0	if inst.slot < thread_caps.len() {
318	0	self.stack.push(FollowEpsilon::Capture {
319	0	slot: inst.slot,
320	0	pos: thread_caps[inst.slot],
321	0	});
322	0	thread_caps[inst.slot] = Some(at.pos());
323	0	}
324	0	ip = inst.goto;
325		}
326	0	Split(ref inst) => {
327	0	self.stack.push(FollowEpsilon::IP(inst.goto2));
328	0	ip = inst.goto1;
329	0	}
330		Match(_) \| Char(_) \| Ranges(_) \| Bytes(_) => {
331	0	let t = &mut nlist.caps(ip);
332	0	for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333	0	slot = val;
334	0	}
335	0	return;
336		}
337		}
338		}
339	0	} Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::CharInput>>::add_step Unexecuted instantiation: <regex::pikevm::Fsm<regex::input::ByteInput>>::add_step
340		}
341
342		impl Threads {
343	0	fn new() -> Self {
344	0	Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
345	0	}
346
347	0	fn resize(&mut self, num_insts: usize, ncaps: usize) {
348	0	if num_insts == self.set.capacity() {
349	0	return;
350	0	}
351	0	self.slots_per_thread = ncaps * 2;
352	0	self.set = SparseSet::new(num_insts);
353	0	self.caps = vec![None; self.slots_per_thread * num_insts];
354	0	}
355
356	0	fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357	0	let i = pc * self.slots_per_thread;
358	0	&mut self.caps[i..i + self.slots_per_thread]
359	0	}
360		}