/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.2.0/src/dfa/special.rs
use crate::{
    dfa::DEAD,
    util::{
        bytes::{self, DeserializeError, Endian, SerializeError},
        id::StateID,
    },
};

macro_rules! err {
    ($msg:expr) => {
        return Err(DeserializeError::generic($msg));
    };
}

// Special represents the identifiers in a DFA that correspond to "special"
// states. If a state is one or more of the following, then it is considered
// special:
//
// * dead - A non-matching state where all outgoing transitions lead back to
//   itself. There is only one of these, regardless of whether minimization
//   has run. The dead state always has an ID of 0, i.e., it is always the
//   first state in a DFA.
// * quit - A state that is entered whenever a byte is seen that should cause
//   a DFA to give up and stop searching. This results in a MatchError::Quit
//   error being returned at search time. The default configuration for a DFA
//   has no quit bytes, which means this state is unreachable by default,
//   although it is always present for reasons of implementation simplicity.
//   This state is only reachable when the caller configures the DFA to quit
//   on certain bytes. There is always exactly one of these states and it
//   is always the second state. (Its actual ID depends on the size of the
//   alphabet in dense DFAs, since state IDs are premultiplied in order to
//   allow them to be used directly as indices into the transition table.)
// * match - An accepting state, i.e., indicative of a match. There may be
//   zero or more of these states.
// * accelerated - A state where all of its outgoing transitions, except a
//   few, loop back to itself. These states are candidates for acceleration
//   via memchr during search. There may be zero or more of these states.
// * start - A non-matching state that indicates where the automaton should
//   start during a search. There is always at least one starting state and
//   all are guaranteed to be non-match states. (A start state cannot be a
//   match state because the DFAs in this crate delay all matches by one byte.
//   So every search that finds a match must move through one transition to
//   some other match state, even when searching an empty string.)
//
// These are not mutually exclusive categories. Namely, the following
// overlaps can occur:
//
// * {dead, start} - If a DFA can never lead to a match and it is minimized,
//   then it will typically compile to something where all starting IDs point
//   to the DFA's dead state.
// * {match, accelerated} - It is possible for a match state to have the
//   majority of its transitions loop back to itself, which means it's
//   possible for a match state to be accelerated.
// * {start, accelerated} - Similarly, it is possible for a start state to be
//   accelerated. Note that it is possible for an accelerated state to be
//   neither a match nor a start state. Also note that just because both match
//   and start states overlap with accelerated states does not mean that
//   match and start states overlap with each other. In fact, they are
//   guaranteed not to overlap.
//
// As a special mention, every DFA always has a dead and a quit state, even
// though from the perspective of the DFA, they are equivalent. (Indeed,
// minimization special-cases them to ensure they don't get merged.) The
// purpose of keeping them distinct is to use the quit state as a sentinel to
// distinguish between whether a search finished successfully without finding
// anything or whether it gave up before finishing.
//
// So the main problem we want to solve here is the *fast* detection of
// whether a state is special or not. And we also want to do this while
// storing as little extra data as possible. AND we want to be able to
// quickly determine which of the categories above a state falls into if it
// is special.
//
// We achieve this by essentially shuffling all special states to the
// beginning of a DFA. That is, all special states appear before every other
// non-special state. By representing special states this way, we can
// determine whether a state is special or not by a single comparison, where
// special.max is the identifier of the last special state in the DFA:
//
//   if current_state <= special.max:
//     ... do something with special state
//
// The only thing left to do is to determine what kind of special state it
// is, because what we do next depends on that. Since special states are
// typically rare, we can afford to do a bit more extra work, but we'd still
// like this to be as fast as possible. The trick we employ here is to
// continue shuffling states even within the special state range, such that
// one contiguous region corresponds to match states, another to start states
// and an overlapping range to accelerated states. At a high level, our
// special state detection might look like this (for leftmost searching,
// where we continue searching even after seeing a match):
//
//   byte = input[offset]
//   current_state = next_state(current_state, byte)
//   offset += 1
//   if current_state <= special.max:
//     if current_state == 0:
//       # We can never leave a dead state, so this always marks the
//       # end of our search.
//       return last_match
//     if current_state == special.quit_id:
//       # A quit state means we give up. If the DFA has no quit state,
//       # then special.quit_id == 0 == dead, which is handled by the
//       # conditional above.
//       return Err(MatchError::Quit { byte, offset: offset - 1 })
//     if special.min_match <= current_state <= special.max_match:
//       last_match = Some(offset)
//       if special.min_accel <= current_state <= special.max_accel:
//         offset = accelerate(input, offset)
//         last_match = Some(offset)
//     elif special.min_start <= current_state <= special.max_start:
//       offset = prefilter.find(input, offset)
//       if special.min_accel <= current_state <= special.max_accel:
//         offset = accelerate(input, offset)
//     elif special.min_accel <= current_state <= special.max_accel:
//       offset = accelerate(input, offset)
//
// There are some small details left out of the logic above. For example,
// in order to accelerate a state, we need to know which bytes to search for.
// This in turn implies some extra data we need to store in the DFA. To keep
// things compact, we would ideally only store
//
//   N = special.max_accel - special.min_accel + 1
//
// items. But state IDs are premultiplied, which means they are not
// contiguous. So in order to take a state ID and index an array of
// accelerated structures, we need to do:
//
//   i = (state_id - special.min_accel) / stride
//
// (N.B. 'stride' is always a power of 2, so the above can be implemented via
// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
// 2^x=stride.)
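//
// As a rough worked example (the numbers here are invented for illustration
// and not taken from any particular DFA): with a stride of 512 (so
// stride2 = 9), if special.min_accel is the premultiplied ID 1024 and we
// land on the accelerated state with premultiplied ID 2048, then its index
// into the array of accelerated structures is
//
//   i = (2048 - 1024) >> 9 = 2
//
// which is the same as (2048 - 1024) / 512.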
//
// Moreover, some of these specialty categories may be empty. For example,
// DFAs are not required to have any match states or any accelerated states.
// In that case, the lower and upper bounds are both set to 0 (the dead state
// ID) and the first `current_state == 0` check subsumes cases where the
// ranges are empty.
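//
// (To make that concrete: if a DFA has no accelerated states, then
// min_accel == max_accel == 0, so a check like
// 'special.min_accel <= current_state <= special.max_accel' can only be
// true when current_state == 0, and that case was already handled by the
// dead state check at the top.)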
//
// Loop unrolling, if applicable, has also been left out of the logic above.
//
// Graphically, the ranges look like this, where asterisks indicate ranges
// that can be empty. Each 'x' is a state.
//
//      quit
//  dead|
//     ||
//     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
//     | |             |   |   start    |                   |
//     | |-------------|   |------------|                   |
//     |     match*    |   |            |                   |
//     |               |   |            |                   |
//     |          |----------|          |                   |
//     |             accel*             |                   |
//     |                                |                   |
//     |                                |                   |
//     |--------------------------------|--------------------
//                  special                  non-special*
#[derive(Clone, Copy, Debug)]
pub struct Special {
    /// The identifier of the last special state in a DFA. A state is special
    /// if and only if its identifier is less than or equal to `max`.
    pub max: StateID,
    /// The identifier of the quit state in a DFA. (There is no analogous
    /// field for the dead state since the dead state's ID is always zero,
    /// regardless of state ID size.)
    pub quit_id: StateID,
    /// The identifier of the first match state.
    pub min_match: StateID,
    /// The identifier of the last match state.
    pub max_match: StateID,
    /// The identifier of the first accelerated state.
    pub min_accel: StateID,
    /// The identifier of the last accelerated state.
    pub max_accel: StateID,
    /// The identifier of the first start state.
    pub min_start: StateID,
    /// The identifier of the last start state.
    pub max_start: StateID,
}

impl Special {
    /// Creates a new set of special ranges for a DFA. All ranges are
    /// initially set to only contain the dead state. This is interpreted as
    /// an empty range.
    #[cfg(feature = "alloc")]
    pub fn new() -> Special {
        Special {
            max: DEAD,
            quit_id: DEAD,
            min_match: DEAD,
            max_match: DEAD,
            min_accel: DEAD,
            max_accel: DEAD,
            min_start: DEAD,
            max_start: DEAD,
        }
    }

    /// Remaps all of the special state identifiers using the function given.
    #[cfg(feature = "alloc")]
    pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
        Special {
            max: map(self.max),
            quit_id: map(self.quit_id),
            min_match: map(self.min_match),
            max_match: map(self.max_match),
            min_accel: map(self.min_accel),
            max_accel: map(self.max_accel),
            min_start: map(self.min_start),
            max_start: map(self.max_start),
        }
    }

    /// Deserialize the given bytes into special state ranges. If the slice
    /// given is not big enough, then this returns an error. Similarly, if
    /// any of the expected invariants around special state ranges aren't
    /// upheld, an error is returned. Note that this does not guarantee that
    /// the information returned is correct.
    ///
    /// Upon success, this returns the number of bytes read in addition to
    /// the special state IDs themselves.
    pub fn from_bytes(
        mut slice: &[u8],
    ) -> Result<(Special, usize), DeserializeError> {
        bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;

        let mut nread = 0;
        let mut read_id = |what| -> Result<StateID, DeserializeError> {
            let (id, nr) = bytes::try_read_state_id(slice, what)?;
            nread += nr;
            slice = &slice[StateID::SIZE..];
            Ok(id)
        };

        let max = read_id("special max id")?;
        let quit_id = read_id("special quit id")?;
        let min_match = read_id("special min match id")?;
        let max_match = read_id("special max match id")?;
        let min_accel = read_id("special min accel id")?;
        let max_accel = read_id("special max accel id")?;
        let min_start = read_id("special min start id")?;
        let max_start = read_id("special max start id")?;

        let special = Special {
            max,
            quit_id,
            min_match,
            max_match,
            min_accel,
            max_accel,
            min_start,
            max_start,
        };
        special.validate()?;
        assert_eq!(nread, special.write_to_len());
        Ok((special, nread))
    }

    /// Validate that the information describing special states satisfies
    /// all known invariants.
    pub fn validate(&self) -> Result<(), DeserializeError> {
        // Check that both ends of the range are DEAD or neither are.
        if self.min_match == DEAD && self.max_match != DEAD {
            err!("min_match is DEAD, but max_match is not");
        }
        if self.min_match != DEAD && self.max_match == DEAD {
            err!("max_match is DEAD, but min_match is not");
        }
        if self.min_accel == DEAD && self.max_accel != DEAD {
            err!("min_accel is DEAD, but max_accel is not");
        }
        if self.min_accel != DEAD && self.max_accel == DEAD {
            err!("max_accel is DEAD, but min_accel is not");
        }
        if self.min_start == DEAD && self.max_start != DEAD {
            err!("min_start is DEAD, but max_start is not");
        }
        if self.min_start != DEAD && self.max_start == DEAD {
            err!("max_start is DEAD, but min_start is not");
        }

        // Check that ranges are well formed.
        if self.min_match > self.max_match {
            err!("min_match should not be greater than max_match");
        }
        if self.min_accel > self.max_accel {
            err!("min_accel should not be greater than max_accel");
        }
        if self.min_start > self.max_start {
            err!("min_start should not be greater than max_start");
        }

        // Check that ranges are ordered with respect to one another.
        if self.matches() && self.quit_id >= self.min_match {
            err!("quit_id should not be greater than min_match");
        }
        if self.accels() && self.quit_id >= self.min_accel {
            err!("quit_id should not be greater than min_accel");
        }
        if self.starts() && self.quit_id >= self.min_start {
            err!("quit_id should not be greater than min_start");
        }
        if self.matches() && self.accels() && self.min_accel < self.min_match {
            err!("min_match should not be greater than min_accel");
        }
        if self.matches() && self.starts() && self.min_start < self.min_match {
            err!("min_match should not be greater than min_start");
        }
        if self.accels() && self.starts() && self.min_start < self.min_accel {
            err!("min_accel should not be greater than min_start");
        }

        // Check that max is at least as big as everything else.
        if self.max < self.quit_id {
            err!("quit_id should not be greater than max");
        }
        if self.max < self.max_match {
            err!("max_match should not be greater than max");
        }
        if self.max < self.max_accel {
            err!("max_accel should not be greater than max");
        }
        if self.max < self.max_start {
            err!("max_start should not be greater than max");
        }

        Ok(())
    }

    /// Validate that the special state information is compatible with the
    /// given state count.
    pub fn validate_state_count(
        &self,
        count: usize,
        stride2: usize,
    ) -> Result<(), DeserializeError> {
        // We assume that 'validate' has already passed, so we know that 'max'
        // is truly the max. So all we need to check is that the max state
        // ID is less than the state ID count. The max legal value here is
        // count-1, which occurs when there are no non-special states.
        if (self.max.as_usize() >> stride2) >= count {
            err!("max should not be greater than or equal to state count");
        }
        Ok(())
    }
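
    // An illustrative instance of the check above (invented numbers): with
    // stride2 = 9 (i.e., a stride of 512), a premultiplied `max` of 2560
    // refers to the state at index 2560 >> 9 = 5, which is only valid when
    // the DFA has at least 6 states, i.e., when `count >= 6`.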

    /// Write the IDs and ranges for special states to the given byte buffer.
    /// The buffer given must have enough room to store all data, otherwise
    /// this will return an error. The number of bytes written is returned
    /// on success. The number of bytes written is guaranteed to be a
    /// multiple of 8.
    pub fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        use crate::util::bytes::write_state_id as write;

        if dst.len() < self.write_to_len() {
            return Err(SerializeError::buffer_too_small("special state ids"));
        }

        let mut nwrite = 0;
        nwrite += write::<E>(self.max, &mut dst[nwrite..]);
        nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);

        assert_eq!(
            self.write_to_len(),
            nwrite,
            "expected to write certain number of bytes",
        );
        assert_eq!(
            nwrite % 8,
            0,
            "expected to write multiple of 8 bytes for special states",
        );
        Ok(nwrite)
    }
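
    // A minimal round-trip sketch (illustrative only, not part of this
    // crate; it assumes `NE` is the native-endian marker type exported by
    // `util::bytes`):
    //
    //   let mut buf = vec![0u8; special.write_to_len()];
    //   let nwrite =
    //       special.write_to::<crate::util::bytes::NE>(&mut buf).unwrap();
    //   let (read, nread) = Special::from_bytes(&buf).unwrap();
    //   assert_eq!(nwrite, nread);
    //   assert_eq!(special.max, read.max);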

    /// Returns the total number of bytes written by `write_to`.
    pub fn write_to_len(&self) -> usize {
        8 * StateID::SIZE
    }

    /// Sets the maximum special state ID based on the current values. This
    /// should be used once all possible state IDs are set.
    #[cfg(feature = "alloc")]
    pub fn set_max(&mut self) {
        use core::cmp::max;
        self.max = max(
            self.quit_id,
            max(self.max_match, max(self.max_accel, self.max_start)),
        );
    }

    /// Returns true if and only if the given state ID is a special state.
    #[inline]
    pub fn is_special_state(&self, id: StateID) -> bool {
        id <= self.max
    }

    /// Returns true if and only if the given state ID is a dead state.
    #[inline]
    pub fn is_dead_state(&self, id: StateID) -> bool {
        id == DEAD
    }

    /// Returns true if and only if the given state ID is a quit state.
    #[inline]
    pub fn is_quit_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.quit_id == id
    }

    /// Returns true if and only if the given state ID is a match state.
    #[inline]
    pub fn is_match_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
    }

    /// Returns true if and only if the given state ID is an accel state.
    #[inline]
    pub fn is_accel_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
    }

    /// Returns true if and only if the given state ID is a start state.
    #[inline]
    pub fn is_start_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
    }

    /// Returns the total number of match states for a dense table based DFA.
    #[inline]
    pub fn match_len(&self, stride: usize) -> usize {
        if self.matches() {
            (self.max_match.as_usize() - self.min_match.as_usize() + stride)
                / stride
        } else {
            0
        }
    }
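
    // A quick worked example (invented numbers): in a dense DFA with a
    // stride of 512, if min_match is the premultiplied ID 512 and max_match
    // is 1536, then the match states are 512, 1024 and 1536, and
    // (1536 - 512 + 512) / 512 = 3.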

    /// Returns true if and only if there is at least one match state.
    #[inline]
    pub fn matches(&self) -> bool {
        self.min_match != DEAD
    }

    /// Returns the total number of accel states.
    #[cfg(feature = "alloc")]
    pub fn accel_len(&self, stride: usize) -> usize {
        if self.accels() {
            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
                / stride
        } else {
            0
        }
    }

    /// Returns true if and only if there is at least one accel state.
    #[inline]
    pub fn accels(&self) -> bool {
        self.min_accel != DEAD
    }

    /// Returns true if and only if there is at least one start state.
    #[inline]
    pub fn starts(&self) -> bool {
        self.min_start != DEAD
    }
}