/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.2.0/src/dfa/sparse.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Types and routines specific to sparse DFAs. |
3 | | |
4 | | This module is the home of [`sparse::DFA`](DFA). |
5 | | |
6 | | Unlike the [`dense`](super::dense) module, this module does not contain a |
7 | | builder or configuration specific for sparse DFAs. Instead, the intended |
8 | | way to build a sparse DFA is either by using a default configuration with |
9 | | its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the |
10 | | construction of a dense DFA with [`dense::Builder`](super::dense::Builder) |
11 | | and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For |
12 | | example, this configures a sparse DFA to do an overlapping search: |
13 | | |
14 | | ``` |
15 | | use regex_automata::{ |
16 | | dfa::{Automaton, OverlappingState, dense}, |
17 | | HalfMatch, MatchKind, |
18 | | }; |
19 | | |
20 | | let dense_re = dense::Builder::new() |
21 | | .configure(dense::Config::new().match_kind(MatchKind::All)) |
22 | | .build(r"Samwise|Sam")?; |
23 | | let sparse_re = dense_re.to_sparse()?; |
24 | | |
25 | | // Setup our haystack and initial start state. |
26 | | let haystack = b"Samwise"; |
27 | | let mut state = OverlappingState::start(); |
28 | | |
29 | | // First, 'Sam' will match. |
30 | | let end1 = sparse_re.find_overlapping_fwd_at( |
31 | | None, None, haystack, 0, haystack.len(), &mut state, |
32 | | )?; |
33 | | assert_eq!(end1, Some(HalfMatch::must(0, 3))); |
34 | | |
35 | | // And now 'Samwise' will match. |
36 | | let end2 = sparse_re.find_overlapping_fwd_at( |
37 | | None, None, haystack, 3, haystack.len(), &mut state, |
38 | | )?; |
39 | | assert_eq!(end2, Some(HalfMatch::must(0, 7))); |
40 | | # Ok::<(), Box<dyn std::error::Error>>(()) |
41 | | ``` |
42 | | */ |
43 | | |
44 | | #[cfg(feature = "alloc")] |
45 | | use core::iter; |
46 | | use core::{ |
47 | | convert::{TryFrom, TryInto}, |
48 | | fmt, |
49 | | mem::size_of, |
50 | | }; |
51 | | |
52 | | #[cfg(feature = "alloc")] |
53 | | use alloc::{collections::BTreeSet, vec, vec::Vec}; |
54 | | |
55 | | #[cfg(feature = "alloc")] |
56 | | use crate::dfa::{dense, error::Error}; |
57 | | use crate::{ |
58 | | dfa::{ |
59 | | automaton::{fmt_state_indicator, Automaton}, |
60 | | special::Special, |
61 | | DEAD, |
62 | | }, |
63 | | util::{ |
64 | | alphabet::ByteClasses, |
65 | | bytes::{self, DeserializeError, Endian, SerializeError}, |
66 | | id::{PatternID, StateID}, |
67 | | start::Start, |
68 | | DebugByte, |
69 | | }, |
70 | | }; |
71 | | |
72 | | const LABEL: &str = "rust-regex-automata-dfa-sparse"; |
73 | | const VERSION: u32 = 2; |
74 | | |
75 | | /// A sparse deterministic finite automaton (DFA) with variable sized states. |
76 | | /// |
77 | | /// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses |
78 | | /// a more space efficient representation for its transitions. Consequently, |
79 | | /// sparse DFAs may use much less memory than dense DFAs, but this comes at a |
80 | | /// price. In particular, reading the more space efficient transitions takes |
81 | | /// more work, and consequently, searching using a sparse DFA is typically |
82 | | /// slower than a dense DFA. |
83 | | /// |
84 | | /// A sparse DFA can be built using the default configuration via the |
85 | | /// [`DFA::new`] constructor. Otherwise, one can configure various aspects |
86 | | /// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), |
87 | | /// and then convert a dense DFA to a sparse DFA using |
88 | | /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). |
89 | | /// |
90 | | /// In general, a sparse DFA supports all the same search operations as a dense |
91 | | /// DFA. |
92 | | /// |
93 | | /// Making the choice between a dense and sparse DFA depends on your specific |
94 | | /// work load. If you can sacrifice a bit of search time performance, then a |
95 | | /// sparse DFA might be the best choice. In particular, while sparse DFAs are |
96 | | /// probably always slower than dense DFAs, you may find that they are easily |
97 | | /// fast enough for your purposes! |
98 | | /// |
99 | | /// # Type parameters |
100 | | /// |
101 | | /// A `DFA` has one type parameter, `T`, which is used to represent the parts |
102 | | /// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`. |
103 | | /// |
104 | | /// # The `Automaton` trait |
105 | | /// |
106 | | /// This type implements the [`Automaton`] trait, which means it can be used |
107 | | /// for searching. For example: |
108 | | /// |
109 | | /// ``` |
110 | | /// use regex_automata::{ |
111 | | /// dfa::{Automaton, sparse::DFA}, |
112 | | /// HalfMatch, |
113 | | /// }; |
114 | | /// |
115 | | /// let dfa = DFA::new("foo[0-9]+")?; |
116 | | /// let expected = HalfMatch::must(0, 8); |
117 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
118 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
119 | | /// ``` |
120 | | #[derive(Clone)] |
121 | | pub struct DFA<T> { |
122 | | // When compared to a dense DFA, a sparse DFA *looks* a lot simpler |
123 | | // representation-wise. In reality, it is perhaps more complicated. Namely, |
124 | | // in a dense DFA, all information needs to be very cheaply accessible |
125 | | // using only state IDs. In a sparse DFA however, each state uses a |
126 | | // variable amount of space because each state encodes more information |
127 | | // than just its transitions. Each state also includes an accelerator if |
128 | | // one exists, along with the matching pattern IDs if the state is a match |
129 | | // state. |
130 | | // |
131 | | // That is, a lot of the complexity is pushed down into how each state |
132 | | // itself is represented. |
133 | | trans: Transitions<T>, |
134 | | starts: StartTable<T>, |
135 | | special: Special, |
136 | | } |
137 | | |
138 | | #[cfg(feature = "alloc")] |
139 | | impl DFA<Vec<u8>> { |
140 | | /// Parse the given regular expression using a default configuration and |
141 | | /// return the corresponding sparse DFA. |
142 | | /// |
143 | | /// If you want a non-default configuration, then use |
144 | | /// the [`dense::Builder`](crate::dfa::dense::Builder) |
145 | | /// to set your own configuration, and then call |
146 | | /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create |
147 | | /// a sparse DFA. |
148 | | /// |
149 | | /// # Example |
150 | | /// |
151 | | /// ``` |
152 | | /// use regex_automata::{ |
153 | | /// dfa::{Automaton, sparse}, |
154 | | /// HalfMatch, |
155 | | /// }; |
156 | | /// |
157 | | /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; |
158 | | /// |
159 | | /// let expected = HalfMatch::must(0, 11); |
160 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); |
161 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
162 | | /// ``` |
163 | | pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> { |
164 | | dense::Builder::new() |
165 | | .build(pattern) |
166 | | .and_then(|dense| dense.to_sparse()) |
167 | | } |
168 | | |
169 | | /// Parse the given regular expressions using a default configuration and |
170 | | /// return the corresponding multi-DFA. |
171 | | /// |
172 | | /// If you want a non-default configuration, then use |
173 | | /// the [`dense::Builder`](crate::dfa::dense::Builder) |
174 | | /// to set your own configuration, and then call |
175 | | /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create |
176 | | /// a sparse DFA. |
177 | | /// |
178 | | /// # Example |
179 | | /// |
180 | | /// ``` |
181 | | /// use regex_automata::{ |
182 | | /// dfa::{Automaton, sparse}, |
183 | | /// HalfMatch, |
184 | | /// }; |
185 | | /// |
186 | | /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; |
187 | | /// let expected = HalfMatch::must(1, 3); |
188 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); |
189 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
190 | | /// ``` |
191 | | pub fn new_many<P: AsRef<str>>( |
192 | | patterns: &[P], |
193 | | ) -> Result<DFA<Vec<u8>>, Error> { |
194 | | dense::Builder::new() |
195 | | .build_many(patterns) |
196 | | .and_then(|dense| dense.to_sparse()) |
197 | | } |
198 | | } |
199 | | |
200 | | #[cfg(feature = "alloc")] |
201 | | impl DFA<Vec<u8>> { |
202 | | /// Create a new DFA that matches every input. |
203 | | /// |
204 | | /// # Example |
205 | | /// |
206 | | /// ``` |
207 | | /// use regex_automata::{ |
208 | | /// dfa::{Automaton, sparse}, |
209 | | /// HalfMatch, |
210 | | /// }; |
211 | | /// |
212 | | /// let dfa = sparse::DFA::always_match()?; |
213 | | /// |
214 | | /// let expected = HalfMatch::must(0, 0); |
215 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); |
216 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); |
217 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
218 | | /// ``` |
219 | | pub fn always_match() -> Result<DFA<Vec<u8>>, Error> { |
220 | | dense::DFA::always_match()?.to_sparse() |
221 | | } |
222 | | |
223 | | /// Create a new sparse DFA that never matches any input. |
224 | | /// |
225 | | /// # Example |
226 | | /// |
227 | | /// ``` |
228 | | /// use regex_automata::dfa::{Automaton, sparse}; |
229 | | /// |
230 | | /// let dfa = sparse::DFA::never_match()?; |
231 | | /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); |
232 | | /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); |
233 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
234 | | /// ``` |
235 | | pub fn never_match() -> Result<DFA<Vec<u8>>, Error> { |
236 | | dense::DFA::never_match()?.to_sparse() |
237 | | } |
238 | | |
239 | | /// The implementation for constructing a sparse DFA from a dense DFA. |
240 | | pub(crate) fn from_dense<T: AsRef<[u32]>>( |
241 | | dfa: &dense::DFA<T>, |
242 | | ) -> Result<DFA<Vec<u8>>, Error> { |
243 | | // In order to build the transition table, we need to be able to write |
244 | | // state identifiers for each of the "next" transitions in each state. |
245 | | // Our state identifiers correspond to the byte offset in the |
246 | | // transition table at which the state is encoded. Therefore, we do not |
247 | | // actually know what the state identifiers are until we've allocated |
248 | | // exactly as much space as we need for each state. Thus, construction |
249 | | // of the transition table happens in two passes. |
250 | | // |
251 | | // In the first pass, we fill out the shell of each state, which |
252 | | // includes the transition count, the input byte ranges and zero-filled |
253 | | // space for the transitions and accelerators, if present. In this |
254 | | // first pass, we also build up a map from the state identifier index |
255 | | // of the dense DFA to the state identifier in this sparse DFA. |
256 | | // |
257 | | // In the second pass, we fill in the transitions based on the map |
258 | | // built in the first pass. |
259 | | |
260 | | // The capacity given here reflects a minimum. (Well, the true minimum |
261 | | // is likely even bigger, but hopefully this saves a few reallocs.) |
262 | | let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count()); |
263 | | // This maps state indices from the dense DFA to StateIDs in the sparse |
264 | | // DFA. We build out this map on the first pass, and then use it in the |
265 | | // second pass to back-fill our transitions. |
266 | | let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()]; |
267 | | for state in dfa.states() { |
268 | | let pos = sparse.len(); |
269 | | |
270 | | remap[dfa.to_index(state.id())] = |
271 | | StateID::new(pos).map_err(|_| Error::too_many_states())?; |
272 | | // zero-filled space for the transition count |
273 | | sparse.push(0); |
274 | | sparse.push(0); |
275 | | |
276 | | let mut transition_count = 0; |
277 | | for (unit1, unit2, _) in state.sparse_transitions() { |
278 | | match (unit1.as_u8(), unit2.as_u8()) { |
279 | | (Some(b1), Some(b2)) => { |
280 | | transition_count += 1; |
281 | | sparse.push(b1); |
282 | | sparse.push(b2); |
283 | | } |
284 | | (None, None) => {} |
285 | | (Some(_), None) | (None, Some(_)) => { |
286 | | // can never occur because sparse_transitions never |
287 | | // groups EOI with any other transition. |
288 | | unreachable!() |
289 | | } |
290 | | } |
291 | | } |
292 | | // Add dummy EOI transition. This is never actually read while |
293 | | // searching, but having space equivalent to the total number |
294 | | // of transitions is convenient. Otherwise, we'd need to track |
295 | | // a different number of transitions for the byte ranges as for |
296 | | // the 'next' states. |
297 | | // |
298 | | // N.B. The loop above is not guaranteed to yield the EOI |
299 | | // transition, since it may point to a DEAD state. By putting |
300 | | // it here, we always write the EOI transition, and thus |
301 | | // guarantee that our transition count is >0. Why do we always |
302 | | // need the EOI transition? Because in order to implement |
303 | | // Automaton::next_eoi_state, this lets us just ask for the last |
304 | | // transition. There are probably other/better ways to do this. |
305 | | transition_count += 1; |
306 | | sparse.push(0); |
307 | | sparse.push(0); |
308 | | |
309 | | // Check some assumptions about transition count. |
310 | | assert_ne!( |
311 | | transition_count, 0, |
312 | | "transition count should be non-zero", |
313 | | ); |
314 | | assert!( |
315 | | transition_count <= 257, |
316 | | "expected transition count {} to be <= 257", |
317 | | transition_count, |
318 | | ); |
319 | | |
320 | | // Fill in the transition count. |
321 | | // Since transition count is always <= 257, we use the most |
322 | | // significant bit to indicate whether this is a match state or |
323 | | // not. |
324 | | let ntrans = if dfa.is_match_state(state.id()) { |
325 | | transition_count | (1 << 15) |
326 | | } else { |
327 | | transition_count |
328 | | }; |
329 | | bytes::NE::write_u16(ntrans, &mut sparse[pos..]); |
330 | | |
331 | | // zero-fill the actual transitions. |
332 | | // Unwraps are OK since transition_count <= 257 and our minimum |
333 | | // support usize size is 16-bits. |
334 | | let zeros = usize::try_from(transition_count) |
335 | | .unwrap() |
336 | | .checked_mul(StateID::SIZE) |
337 | | .unwrap(); |
338 | | sparse.extend(iter::repeat(0).take(zeros)); |
339 | | |
340 | | // If this is a match state, write the pattern IDs matched by this |
341 | | // state. |
342 | | if dfa.is_match_state(state.id()) { |
343 | | let plen = dfa.match_pattern_len(state.id()); |
344 | | // Write the actual pattern IDs with a u32 length prefix. |
345 | | // First, zero-fill space. |
346 | | let mut pos = sparse.len(); |
347 | | // Unwraps are OK since it's guaranteed that plen <= |
348 | | // PatternID::LIMIT, which is in turn guaranteed to fit into a |
349 | | // u32. |
350 | | let zeros = size_of::<u32>() |
351 | | .checked_mul(plen) |
352 | | .unwrap() |
353 | | .checked_add(size_of::<u32>()) |
354 | | .unwrap(); |
355 | | sparse.extend(iter::repeat(0).take(zeros)); |
356 | | |
357 | | // Now write the length prefix. |
358 | | bytes::NE::write_u32( |
359 | | // Will never fail since u32::MAX is invalid pattern ID. |
360 | | // Thus, the number of pattern IDs is representable by a |
361 | | // u32. |
362 | | plen.try_into().expect("pattern ID count fits in u32"), |
363 | | &mut sparse[pos..], |
364 | | ); |
365 | | pos += size_of::<u32>(); |
366 | | |
367 | | // Now write the pattern IDs. |
368 | | for &pid in dfa.pattern_id_slice(state.id()) { |
369 | | pos += bytes::write_pattern_id::<bytes::NE>( |
370 | | pid, |
371 | | &mut sparse[pos..], |
372 | | ); |
373 | | } |
374 | | } |
375 | | |
376 | | // And now add the accelerator, if one exists. An accelerator is |
377 | | // at most 4 bytes and at least 1 byte. The first byte is the |
378 | | // length, N. N bytes follow the length. The set of bytes that |
379 | | // follow correspond (exhaustively) to the bytes that must be seen |
380 | | // to leave this state. |
381 | | let accel = dfa.accelerator(state.id()); |
382 | | sparse.push(accel.len().try_into().unwrap()); |
383 | | sparse.extend_from_slice(accel); |
384 | | } |
385 | | |
386 | | let mut new = DFA { |
387 | | trans: Transitions { |
388 | | sparse, |
389 | | classes: dfa.byte_classes().clone(), |
390 | | count: dfa.state_count(), |
391 | | patterns: dfa.pattern_count(), |
392 | | }, |
393 | | starts: StartTable::from_dense_dfa(dfa, &remap)?, |
394 | | special: dfa.special().remap(|id| remap[dfa.to_index(id)]), |
395 | | }; |
396 | | // And here's our second pass. Iterate over all of the dense states |
397 | | // again, and update the transitions in each of the states in the |
398 | | // sparse DFA. |
399 | | for old_state in dfa.states() { |
400 | | let new_id = remap[dfa.to_index(old_state.id())]; |
401 | | let mut new_state = new.trans.state_mut(new_id); |
402 | | let sparse = old_state.sparse_transitions(); |
403 | | for (i, (_, _, next)) in sparse.enumerate() { |
404 | | let next = remap[dfa.to_index(next)]; |
405 | | new_state.set_next_at(i, next); |
406 | | } |
407 | | } |
408 | | trace!( |
409 | | "created sparse DFA, memory usage: {} (dense memory usage: {})", |
410 | | new.memory_usage(), |
411 | | dfa.memory_usage(), |
412 | | ); |
413 | | Ok(new) |
414 | | } |
415 | | } |
416 | | |
417 | | impl<T: AsRef<[u8]>> DFA<T> { |
418 | | /// Cheaply return a borrowed version of this sparse DFA. Specifically, the |
419 | | /// DFA returned always uses `&[u8]` for its transitions. |
420 | 0 | pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { |
421 | 0 | DFA { |
422 | 0 | trans: self.trans.as_ref(), |
423 | 0 | starts: self.starts.as_ref(), |
424 | 0 | special: self.special, |
425 | 0 | } |
426 | 0 | } Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]>>::as_ref Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_>>::as_ref |
427 | | |
428 | | /// Return an owned version of this sparse DFA. Specifically, the DFA |
429 | | /// returned always uses `Vec<u8>` for its transitions. |
430 | | /// |
431 | | /// Effectively, this returns a sparse DFA whose transitions live on the |
432 | | /// heap. |
433 | | #[cfg(feature = "alloc")] |
434 | | pub fn to_owned(&self) -> DFA<Vec<u8>> { |
435 | | DFA { |
436 | | trans: self.trans.to_owned(), |
437 | | starts: self.starts.to_owned(), |
438 | | special: self.special, |
439 | | } |
440 | | } |
441 | | |
442 | | /// Returns the memory usage, in bytes, of this DFA. |
443 | | /// |
444 | | /// The memory usage is computed based on the number of bytes used to |
445 | | /// represent this DFA. |
446 | | /// |
447 | | /// This does **not** include the stack size used up by this DFA. To |
448 | | /// compute that, use `std::mem::size_of::<sparse::DFA>()`. |
449 | 0 | pub fn memory_usage(&self) -> usize { |
450 | 0 | self.trans.memory_usage() + self.starts.memory_usage() |
451 | 0 | } |
452 | | |
453 | | /// Returns true only if this DFA has starting states for each pattern. |
454 | | /// |
455 | | /// When a DFA has starting states for each pattern, then a search with the |
456 | | /// DFA can be configured to only look for anchored matches of a specific |
457 | | /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`] |
458 | | /// can accept a non-None `pattern_id` if and only if this method returns |
459 | | /// true. Otherwise, calling `find_earliest_fwd_at` will panic. |
460 | | /// |
461 | | /// Note that if the DFA is empty, this always returns false. |
462 | 0 | pub fn has_starts_for_each_pattern(&self) -> bool { |
463 | 0 | self.starts.patterns > 0 |
464 | 0 | } |
465 | | } |
466 | | |
467 | | /// Routines for converting a sparse DFA to other representations, such as raw |
468 | | /// bytes suitable for persistent storage. |
469 | | impl<T: AsRef<[u8]>> DFA<T> { |
470 | | /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian |
471 | | /// format. |
472 | | /// |
473 | | /// The written bytes are guaranteed to be deserialized correctly and |
474 | | /// without errors in a semver compatible release of this crate by a |
475 | | /// `DFA`'s deserialization APIs (assuming all other criteria for the |
476 | | /// deserialization APIs has been satisfied): |
477 | | /// |
478 | | /// * [`DFA::from_bytes`] |
479 | | /// * [`DFA::from_bytes_unchecked`] |
480 | | /// |
481 | | /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s |
482 | | /// serialization methods, this does not add any initial padding to the |
483 | | /// returned bytes. Padding isn't required for sparse DFAs since they have |
484 | | /// no alignment requirements. |
485 | | /// |
486 | | /// # Example |
487 | | /// |
488 | | /// This example shows how to serialize and deserialize a DFA: |
489 | | /// |
490 | | /// ``` |
491 | | /// use regex_automata::{ |
492 | | /// dfa::{Automaton, sparse::DFA}, |
493 | | /// HalfMatch, |
494 | | /// }; |
495 | | /// |
496 | | /// // Compile our original DFA. |
497 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
498 | | /// |
499 | | /// // N.B. We use native endianness here to make the example work, but |
500 | | /// // using to_bytes_little_endian would work on a little endian target. |
501 | | /// let buf = original_dfa.to_bytes_native_endian(); |
502 | | /// // Even if buf has initial padding, DFA::from_bytes will automatically |
503 | | /// // ignore it. |
504 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; |
505 | | /// |
506 | | /// let expected = HalfMatch::must(0, 8); |
507 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
508 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
509 | | /// ``` |
510 | | #[cfg(feature = "alloc")] |
511 | | pub fn to_bytes_little_endian(&self) -> Vec<u8> { |
512 | | self.to_bytes::<bytes::LE>() |
513 | | } |
514 | | |
515 | | /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian |
516 | | /// format. |
517 | | /// |
518 | | /// The written bytes are guaranteed to be deserialized correctly and |
519 | | /// without errors in a semver compatible release of this crate by a |
520 | | /// `DFA`'s deserialization APIs (assuming all other criteria for the |
521 | | /// deserialization APIs has been satisfied): |
522 | | /// |
523 | | /// * [`DFA::from_bytes`] |
524 | | /// * [`DFA::from_bytes_unchecked`] |
525 | | /// |
526 | | /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s |
527 | | /// serialization methods, this does not add any initial padding to the |
528 | | /// returned bytes. Padding isn't required for sparse DFAs since they have |
529 | | /// no alignment requirements. |
530 | | /// |
531 | | /// # Example |
532 | | /// |
533 | | /// This example shows how to serialize and deserialize a DFA: |
534 | | /// |
535 | | /// ``` |
536 | | /// use regex_automata::{ |
537 | | /// dfa::{Automaton, sparse::DFA}, |
538 | | /// HalfMatch, |
539 | | /// }; |
540 | | /// |
541 | | /// // Compile our original DFA. |
542 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
543 | | /// |
544 | | /// // N.B. We use native endianness here to make the example work, but |
545 | | /// // using to_bytes_big_endian would work on a big endian target. |
546 | | /// let buf = original_dfa.to_bytes_native_endian(); |
547 | | /// // Even if buf has initial padding, DFA::from_bytes will automatically |
548 | | /// // ignore it. |
549 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; |
550 | | /// |
551 | | /// let expected = HalfMatch::must(0, 8); |
552 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
553 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
554 | | /// ``` |
555 | | #[cfg(feature = "alloc")] |
556 | | pub fn to_bytes_big_endian(&self) -> Vec<u8> { |
557 | | self.to_bytes::<bytes::BE>() |
558 | | } |
559 | | |
560 | | /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian |
561 | | /// format. |
562 | | /// |
563 | | /// The written bytes are guaranteed to be deserialized correctly and |
564 | | /// without errors in a semver compatible release of this crate by a |
565 | | /// `DFA`'s deserialization APIs (assuming all other criteria for the |
566 | | /// deserialization APIs has been satisfied): |
567 | | /// |
568 | | /// * [`DFA::from_bytes`] |
569 | | /// * [`DFA::from_bytes_unchecked`] |
570 | | /// |
571 | | /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s |
572 | | /// serialization methods, this does not add any initial padding to the |
573 | | /// returned bytes. Padding isn't required for sparse DFAs since they have |
574 | | /// no alignment requirements. |
575 | | /// |
576 | | /// Generally speaking, native endian format should only be used when |
577 | | /// you know that the target you're compiling the DFA for matches the |
578 | | /// endianness of the target on which you're compiling DFA. For example, |
579 | | /// if serialization and deserialization happen in the same process or on |
580 | | /// the same machine. Otherwise, when serializing a DFA for use in a |
581 | | /// portable environment, you'll almost certainly want to serialize _both_ |
582 | | /// a little endian and a big endian version and then load the correct one |
583 | | /// based on the target's configuration. |
584 | | /// |
585 | | /// # Example |
586 | | /// |
587 | | /// This example shows how to serialize and deserialize a DFA: |
588 | | /// |
589 | | /// ``` |
590 | | /// use regex_automata::{ |
591 | | /// dfa::{Automaton, sparse::DFA}, |
592 | | /// HalfMatch, |
593 | | /// }; |
594 | | /// |
595 | | /// // Compile our original DFA. |
596 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
597 | | /// |
598 | | /// let buf = original_dfa.to_bytes_native_endian(); |
599 | | /// // Even if buf has initial padding, DFA::from_bytes will automatically |
600 | | /// // ignore it. |
601 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; |
602 | | /// |
603 | | /// let expected = HalfMatch::must(0, 8); |
604 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
605 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
606 | | /// ``` |
607 | | #[cfg(feature = "alloc")] |
608 | | pub fn to_bytes_native_endian(&self) -> Vec<u8> { |
609 | | self.to_bytes::<bytes::NE>() |
610 | | } |
611 | | |
612 | | /// The implementation of the public `to_bytes` serialization methods, |
613 | | /// which is generic over endianness. |
614 | | #[cfg(feature = "alloc")] |
615 | | fn to_bytes<E: Endian>(&self) -> Vec<u8> { |
616 | | let mut buf = vec![0; self.write_to_len()]; |
617 | | // This should always succeed since the only possible serialization |
618 | | // error is providing a buffer that's too small, but we've ensured that |
619 | | // `buf` is big enough here. |
620 | | self.write_to::<E>(&mut buf).unwrap(); |
621 | | buf |
622 | | } |
623 | | |
624 | | /// Serialize this DFA as raw bytes to the given slice, in little endian |
625 | | /// format. Upon success, the total number of bytes written to `dst` is |
626 | | /// returned. |
627 | | /// |
628 | | /// The written bytes are guaranteed to be deserialized correctly and |
629 | | /// without errors in a semver compatible release of this crate by a |
630 | | /// `DFA`'s deserialization APIs (assuming all other criteria for the |
631 | | /// deserialization APIs has been satisfied): |
632 | | /// |
633 | | /// * [`DFA::from_bytes`] |
634 | | /// * [`DFA::from_bytes_unchecked`] |
635 | | /// |
636 | | /// # Errors |
637 | | /// |
638 | | /// This returns an error if the given destination slice is not big enough |
639 | | /// to contain the full serialized DFA. If an error occurs, then nothing |
640 | | /// is written to `dst`. |
641 | | /// |
642 | | /// # Example |
643 | | /// |
644 | | /// This example shows how to serialize and deserialize a DFA without |
645 | | /// dynamic memory allocation. |
646 | | /// |
647 | | /// ``` |
648 | | /// use regex_automata::{ |
649 | | /// dfa::{Automaton, sparse::DFA}, |
650 | | /// HalfMatch, |
651 | | /// }; |
652 | | /// |
653 | | /// // Compile our original DFA. |
654 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
655 | | /// |
656 | | /// // Create a 4KB buffer on the stack to store our serialized DFA. |
657 | | /// let mut buf = [0u8; 4 * (1<<10)]; |
658 | | /// // N.B. We use native endianness here to make the example work, but |
659 | | /// // using write_to_little_endian would work on a little endian target. |
660 | | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |
661 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |
662 | | /// |
663 | | /// let expected = HalfMatch::must(0, 8); |
664 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
665 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
666 | | /// ``` |
667 | 0 | pub fn write_to_little_endian( |
668 | 0 | &self, |
669 | 0 | dst: &mut [u8], |
670 | 0 | ) -> Result<usize, SerializeError> { |
671 | 0 | self.write_to::<bytes::LE>(dst) |
672 | 0 | } |
673 | | |
674 | | /// Serialize this DFA as raw bytes to the given slice, in big endian |
675 | | /// format. Upon success, the total number of bytes written to `dst` is |
676 | | /// returned. |
677 | | /// |
678 | | /// The written bytes are guaranteed to be deserialized correctly and |
679 | | /// without errors in a semver compatible release of this crate by a |
680 | | /// `DFA`'s deserialization APIs (assuming all other criteria for the |
681 | | /// deserialization APIs has been satisfied): |
682 | | /// |
683 | | /// * [`DFA::from_bytes`] |
684 | | /// * [`DFA::from_bytes_unchecked`] |
685 | | /// |
686 | | /// # Errors |
687 | | /// |
688 | | /// This returns an error if the given destination slice is not big enough |
689 | | /// to contain the full serialized DFA. If an error occurs, then nothing |
690 | | /// is written to `dst`. |
691 | | /// |
692 | | /// # Example |
693 | | /// |
694 | | /// This example shows how to serialize and deserialize a DFA without |
695 | | /// dynamic memory allocation. |
696 | | /// |
697 | | /// ``` |
698 | | /// use regex_automata::{ |
699 | | /// dfa::{Automaton, sparse::DFA}, |
700 | | /// HalfMatch, |
701 | | /// }; |
702 | | /// |
703 | | /// // Compile our original DFA. |
704 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
705 | | /// |
706 | | /// // Create a 4KB buffer on the stack to store our serialized DFA. |
707 | | /// let mut buf = [0u8; 4 * (1<<10)]; |
708 | | /// // N.B. We use native endianness here to make the example work, but |
709 | | /// // using write_to_big_endian would work on a big endian target. |
710 | | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |
711 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |
712 | | /// |
713 | | /// let expected = HalfMatch::must(0, 8); |
714 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
715 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
716 | | /// ``` |
717 | 0 | pub fn write_to_big_endian( |
718 | 0 | &self, |
719 | 0 | dst: &mut [u8], |
720 | 0 | ) -> Result<usize, SerializeError> { |
721 | 0 | self.write_to::<bytes::BE>(dst) |
722 | 0 | } |
723 | | |
724 | | /// Serialize this DFA as raw bytes to the given slice, in native endian |
725 | | /// format. Upon success, the total number of bytes written to `dst` is |
726 | | /// returned. |
727 | | /// |
728 | | /// The written bytes are guaranteed to be deserialized correctly and |
729 | | /// without errors in a semver compatible release of this crate by a |
730 | | /// `DFA`'s deserialization APIs (assuming all other criteria for the |
731 | | /// deserialization APIs has been satisfied): |
732 | | /// |
733 | | /// * [`DFA::from_bytes`] |
734 | | /// * [`DFA::from_bytes_unchecked`] |
735 | | /// |
736 | | /// Generally speaking, native endian format should only be used when |
737 | | /// you know that the target you're compiling the DFA for matches the |
738 | | /// endianness of the target on which you're compiling DFA. For example, |
739 | | /// if serialization and deserialization happen in the same process or on |
740 | | /// the same machine. Otherwise, when serializing a DFA for use in a |
741 | | /// portable environment, you'll almost certainly want to serialize _both_ |
742 | | /// a little endian and a big endian version and then load the correct one |
743 | | /// based on the target's configuration. |
744 | | /// |
745 | | /// # Errors |
746 | | /// |
747 | | /// This returns an error if the given destination slice is not big enough |
748 | | /// to contain the full serialized DFA. If an error occurs, then nothing |
749 | | /// is written to `dst`. |
750 | | /// |
751 | | /// # Example |
752 | | /// |
753 | | /// This example shows how to serialize and deserialize a DFA without |
754 | | /// dynamic memory allocation. |
755 | | /// |
756 | | /// ``` |
757 | | /// use regex_automata::{ |
758 | | /// dfa::{Automaton, sparse::DFA}, |
759 | | /// HalfMatch, |
760 | | /// }; |
761 | | /// |
762 | | /// // Compile our original DFA. |
763 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
764 | | /// |
765 | | /// // Create a 4KB buffer on the stack to store our serialized DFA. |
766 | | /// let mut buf = [0u8; 4 * (1<<10)]; |
767 | | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |
768 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |
769 | | /// |
770 | | /// let expected = HalfMatch::must(0, 8); |
771 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
772 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
773 | | /// ``` |
    pub fn write_to_native_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endianness-generic serializer with the byte
        // order fixed to the native endianness of the compilation target.
        self.write_to::<bytes::NE>(dst)
    }
780 | | |
781 | | /// The implementation of the public `write_to` serialization methods, |
782 | | /// which is generic over endianness. |
783 | 0 | fn write_to<E: Endian>( |
784 | 0 | &self, |
785 | 0 | dst: &mut [u8], |
786 | 0 | ) -> Result<usize, SerializeError> { |
787 | 0 | let mut nw = 0; |
788 | 0 | nw += bytes::write_label(LABEL, &mut dst[nw..])?; |
789 | 0 | nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?; |
790 | 0 | nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?; |
791 | 0 | nw += { |
792 | 0 | // Currently unused, intended for future flexibility |
793 | 0 | E::write_u32(0, &mut dst[nw..]); |
794 | 0 | size_of::<u32>() |
795 | 0 | }; |
796 | 0 | nw += self.trans.write_to::<E>(&mut dst[nw..])?; |
797 | 0 | nw += self.starts.write_to::<E>(&mut dst[nw..])?; |
798 | 0 | nw += self.special.write_to::<E>(&mut dst[nw..])?; |
799 | 0 | Ok(nw) |
800 | 0 | } |
801 | | |
802 | | /// Return the total number of bytes required to serialize this DFA. |
803 | | /// |
804 | | /// This is useful for determining the size of the buffer required to pass |
805 | | /// to one of the serialization routines: |
806 | | /// |
807 | | /// * [`DFA::write_to_little_endian`] |
808 | | /// * [`DFA::write_to_big_endian`] |
809 | | /// * [`DFA::write_to_native_endian`] |
810 | | /// |
811 | | /// Passing a buffer smaller than the size returned by this method will |
812 | | /// result in a serialization error. |
813 | | /// |
814 | | /// # Example |
815 | | /// |
816 | | /// This example shows how to dynamically allocate enough room to serialize |
817 | | /// a sparse DFA. |
818 | | /// |
819 | | /// ``` |
820 | | /// use regex_automata::{ |
821 | | /// dfa::{Automaton, sparse::DFA}, |
822 | | /// HalfMatch, |
823 | | /// }; |
824 | | /// |
825 | | /// // Compile our original DFA. |
826 | | /// let original_dfa = DFA::new("foo[0-9]+")?; |
827 | | /// |
828 | | /// let mut buf = vec![0; original_dfa.write_to_len()]; |
829 | | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |
830 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |
831 | | /// |
832 | | /// let expected = HalfMatch::must(0, 8); |
833 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
834 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
835 | | /// ``` |
836 | 0 | pub fn write_to_len(&self) -> usize { |
837 | 0 | bytes::write_label_len(LABEL) |
838 | 0 | + bytes::write_endianness_check_len() |
839 | 0 | + bytes::write_version_len() |
840 | 0 | + size_of::<u32>() // unused, intended for future flexibility |
841 | 0 | + self.trans.write_to_len() |
842 | 0 | + self.starts.write_to_len() |
843 | 0 | + self.special.write_to_len() |
844 | 0 | } |
845 | | } |
846 | | |
847 | | impl<'a> DFA<&'a [u8]> { |
848 | | /// Safely deserialize a sparse DFA with a specific state identifier |
849 | | /// representation. Upon success, this returns both the deserialized DFA |
850 | | /// and the number of bytes read from the given slice. Namely, the contents |
851 | | /// of the slice beyond the DFA are not read. |
852 | | /// |
853 | | /// Deserializing a DFA using this routine will never allocate heap memory. |
854 | | /// For safety purposes, the DFA's transitions will be verified such that |
855 | | /// every transition points to a valid state. If this verification is too |
856 | | /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which |
857 | | /// will always execute in constant time. |
858 | | /// |
859 | | /// The bytes given must be generated by one of the serialization APIs |
860 | | /// of a `DFA` using a semver compatible release of this crate. Those |
861 | | /// include: |
862 | | /// |
863 | | /// * [`DFA::to_bytes_little_endian`] |
864 | | /// * [`DFA::to_bytes_big_endian`] |
865 | | /// * [`DFA::to_bytes_native_endian`] |
866 | | /// * [`DFA::write_to_little_endian`] |
867 | | /// * [`DFA::write_to_big_endian`] |
868 | | /// * [`DFA::write_to_native_endian`] |
869 | | /// |
870 | | /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The |
871 | | /// `write_to` methods do not allocate and write to an existing slice |
872 | | /// (which may be on the stack). Since deserialization always uses the |
873 | | /// native endianness of the target platform, the serialization API you use |
874 | | /// should match the endianness of the target platform. (It's often a good |
875 | | /// idea to generate serialized DFAs for both forms of endianness and then |
876 | | /// load the correct one based on endianness.) |
877 | | /// |
878 | | /// # Errors |
879 | | /// |
880 | | /// Generally speaking, it's easier to state the conditions in which an |
881 | | /// error is _not_ returned. All of the following must be true: |
882 | | /// |
883 | | /// * The bytes given must be produced by one of the serialization APIs |
884 | | /// on this DFA, as mentioned above. |
885 | | /// * The endianness of the target platform matches the endianness used to |
    /// serialize the provided DFA.
887 | | /// |
888 | | /// If any of the above are not true, then an error will be returned. |
889 | | /// |
890 | | /// Note that unlike deserializing a |
891 | | /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has |
892 | | /// no alignment requirements. That is, an alignment of `1` is valid. |
893 | | /// |
894 | | /// # Panics |
895 | | /// |
896 | | /// This routine will never panic for any input. |
897 | | /// |
898 | | /// # Example |
899 | | /// |
900 | | /// This example shows how to serialize a DFA to raw bytes, deserialize it |
901 | | /// and then use it for searching. |
902 | | /// |
903 | | /// ``` |
904 | | /// use regex_automata::{ |
905 | | /// dfa::{Automaton, sparse::DFA}, |
906 | | /// HalfMatch, |
907 | | /// }; |
908 | | /// |
909 | | /// let initial = DFA::new("foo[0-9]+")?; |
910 | | /// let bytes = initial.to_bytes_native_endian(); |
911 | | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0; |
912 | | /// |
913 | | /// let expected = HalfMatch::must(0, 8); |
914 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
915 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
916 | | /// ``` |
917 | | /// |
918 | | /// # Example: loading a DFA from static memory |
919 | | /// |
920 | | /// One use case this library supports is the ability to serialize a |
921 | | /// DFA to disk and then use `include_bytes!` to store it in a compiled |
922 | | /// Rust program. Those bytes can then be cheaply deserialized into a |
923 | | /// `DFA` structure at runtime and used for searching without having to |
924 | | /// re-compile the DFA (which can be quite costly). |
925 | | /// |
926 | | /// We can show this in two parts. The first part is serializing the DFA to |
927 | | /// a file: |
928 | | /// |
929 | | /// ```no_run |
930 | | /// use regex_automata::dfa::{Automaton, sparse::DFA}; |
931 | | /// |
932 | | /// let dfa = DFA::new("foo[0-9]+")?; |
933 | | /// |
934 | | /// // Write a big endian serialized version of this DFA to a file. |
935 | | /// let bytes = dfa.to_bytes_big_endian(); |
936 | | /// std::fs::write("foo.bigendian.dfa", &bytes)?; |
937 | | /// |
938 | | /// // Do it again, but this time for little endian. |
939 | | /// let bytes = dfa.to_bytes_little_endian(); |
940 | | /// std::fs::write("foo.littleendian.dfa", &bytes)?; |
941 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
942 | | /// ``` |
943 | | /// |
944 | | /// And now the second part is embedding the DFA into the compiled program |
945 | | /// and deserializing it at runtime on first use. We use conditional |
946 | | /// compilation to choose the correct endianness. As mentioned above, we |
947 | | /// do not need to employ any special tricks to ensure a proper alignment, |
948 | | /// since a sparse DFA has no alignment requirements. |
949 | | /// |
950 | | /// ```no_run |
951 | | /// use regex_automata::{ |
952 | | /// dfa::{Automaton, sparse}, |
953 | | /// HalfMatch, |
954 | | /// }; |
955 | | /// |
956 | | /// type DFA = sparse::DFA<&'static [u8]>; |
957 | | /// |
958 | | /// fn get_foo() -> &'static DFA { |
959 | | /// use std::cell::Cell; |
960 | | /// use std::mem::MaybeUninit; |
961 | | /// use std::sync::Once; |
962 | | /// |
963 | | /// # const _: &str = stringify! { |
964 | | /// #[cfg(target_endian = "big")] |
965 | | /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa"); |
966 | | /// #[cfg(target_endian = "little")] |
967 | | /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa"); |
968 | | /// # }; |
969 | | /// # static BYTES: &[u8] = b""; |
970 | | /// |
971 | | /// struct Lazy(Cell<MaybeUninit<DFA>>); |
972 | | /// // SAFETY: This is safe because DFA impls Sync. |
973 | | /// unsafe impl Sync for Lazy {} |
974 | | /// |
975 | | /// static INIT: Once = Once::new(); |
976 | | /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); |
977 | | /// |
978 | | /// INIT.call_once(|| { |
979 | | /// let (dfa, _) = DFA::from_bytes(BYTES) |
980 | | /// .expect("serialized DFA should be valid"); |
981 | | /// // SAFETY: This is guaranteed to only execute once, and all |
982 | | /// // we do with the pointer is write the DFA to it. |
983 | | /// unsafe { |
984 | | /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); |
985 | | /// } |
986 | | /// }); |
    /// // SAFETY: DFA is guaranteed to be initialized via INIT and is
988 | | /// // stored in static memory. |
989 | | /// unsafe { |
990 | | /// let dfa = (*DFA.0.as_ptr()).as_ptr(); |
991 | | /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) |
992 | | /// } |
993 | | /// } |
994 | | /// |
995 | | /// let dfa = get_foo(); |
996 | | /// let expected = HalfMatch::must(0, 8); |
997 | | /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); |
998 | | /// ``` |
999 | | /// |
1000 | | /// Alternatively, consider using |
1001 | | /// [`lazy_static`](https://crates.io/crates/lazy_static) |
1002 | | /// or |
1003 | | /// [`once_cell`](https://crates.io/crates/once_cell), |
1004 | | /// which will guarantee safety for you. |
1005 | 0 | pub fn from_bytes( |
1006 | 0 | slice: &'a [u8], |
1007 | 0 | ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { |
1008 | | // SAFETY: This is safe because we validate both the sparse transitions |
1009 | | // (by trying to decode every state) and start state ID list below. If |
1010 | | // either validation fails, then we return an error. |
1011 | 0 | let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; |
1012 | 0 | dfa.trans.validate()?; |
1013 | 0 | dfa.starts.validate(&dfa.trans)?; |
1014 | | // N.B. dfa.special doesn't have a way to do unchecked deserialization, |
1015 | | // so it has already been validated. |
1016 | 0 | Ok((dfa, nread)) |
1017 | 0 | } |
1018 | | |
1019 | | /// Deserialize a DFA with a specific state identifier representation in |
1020 | | /// constant time by omitting the verification of the validity of the |
1021 | | /// sparse transitions. |
1022 | | /// |
1023 | | /// This is just like [`DFA::from_bytes`], except it can potentially return |
1024 | | /// a DFA that exhibits undefined behavior if its transitions contains |
1025 | | /// invalid state identifiers. |
1026 | | /// |
1027 | | /// This routine is useful if you need to deserialize a DFA cheaply and |
1028 | | /// cannot afford the transition validation performed by `from_bytes`. |
1029 | | /// |
1030 | | /// # Safety |
1031 | | /// |
1032 | | /// This routine is unsafe because it permits callers to provide |
1033 | | /// arbitrary transitions with possibly incorrect state identifiers. While |
1034 | | /// the various serialization routines will never return an incorrect |
1035 | | /// DFA, there is no guarantee that the bytes provided here |
1036 | | /// are correct. While `from_bytes_unchecked` will still do several forms |
1037 | | /// of basic validation, this routine does not check that the transitions |
1038 | | /// themselves are correct. Given an incorrect transition table, it is |
1039 | | /// possible for the search routines to access out-of-bounds memory because |
1040 | | /// of explicit bounds check elision. |
1041 | | /// |
1042 | | /// # Example |
1043 | | /// |
1044 | | /// ``` |
1045 | | /// use regex_automata::{ |
1046 | | /// dfa::{Automaton, sparse::DFA}, |
1047 | | /// HalfMatch, |
1048 | | /// }; |
1049 | | /// |
1050 | | /// let initial = DFA::new("foo[0-9]+")?; |
1051 | | /// let bytes = initial.to_bytes_native_endian(); |
1052 | | /// // SAFETY: This is guaranteed to be safe since the bytes given come |
1053 | | /// // directly from a compatible serialization routine. |
1054 | | /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; |
1055 | | /// |
1056 | | /// let expected = HalfMatch::must(0, 8); |
1057 | | /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); |
1058 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1059 | | /// ``` |
    pub unsafe fn from_bytes_unchecked(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        // Running count of bytes consumed from the front of `slice`.
        let mut nr = 0;

        // Fixed header: label, endianness marker and format version. Each
        // read returns an error on a mismatch, so an incompatible or
        // foreign-endian serialization is rejected here.
        nr += bytes::read_label(&slice[nr..], LABEL)?;
        nr += bytes::read_endianness_check(&slice[nr..])?;
        nr += bytes::read_version(&slice[nr..], VERSION)?;

        // A reserved u32 written by serialization; its value is ignored,
        // but we still verify it is present and skip past it.
        let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
        nr += size_of::<u32>();

        // The three components, in the same order `write_to` emits them.
        let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (special, nread) = Special::from_bytes(&slice[nr..])?;
        nr += nread;
        // Basic sanity check: the maximum special state ID must index into
        // the sparse transition bytes. (Full transition validation is done
        // by `from_bytes`, not here.)
        if special.max.as_usize() >= trans.sparse().len() {
            return Err(DeserializeError::generic(
                "max should not be greater than or equal to sparse bytes",
            ));
        }

        Ok((DFA { trans, starts, special }, nr))
    }
1088 | | } |
1089 | | |
1090 | | impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> { |
1091 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1092 | 0 | writeln!(f, "sparse::DFA(")?; |
1093 | 0 | for state in self.trans.states() { |
1094 | 0 | fmt_state_indicator(f, self, state.id())?; |
1095 | 0 | writeln!(f, "{:06?}: {:?}", state.id(), state)?; |
1096 | | } |
1097 | 0 | writeln!(f, "")?; |
1098 | 0 | for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() { |
1099 | 0 | if i % self.starts.stride == 0 { |
1100 | 0 | match pid { |
1101 | 0 | None => writeln!(f, "START-GROUP(ALL)")?, |
1102 | 0 | Some(pid) => { |
1103 | 0 | writeln!(f, "START_GROUP(pattern: {:?})", pid)? |
1104 | | } |
1105 | | } |
1106 | 0 | } |
1107 | 0 | writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?; |
1108 | | } |
1109 | 0 | writeln!(f, "state count: {:?}", self.trans.count)?; |
1110 | 0 | writeln!(f, ")")?; |
1111 | 0 | Ok(()) |
1112 | 0 | } |
1113 | | } |
1114 | | |
// SAFETY: NOTE(review): `Automaton` is an unsafe trait; implementations
// presumably must guarantee the state IDs they produce are valid for this
// DFA, since searching elides bounds checks. Deserialization/construction
// validation is what appears to uphold that here — confirm against the
// trait's safety documentation.
unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
    // The is_* predicates below all delegate to the precomputed `special`
    // state classification.
    #[inline]
    fn is_special_state(&self, id: StateID) -> bool {
        self.special.is_special_state(id)
    }

    #[inline]
    fn is_dead_state(&self, id: StateID) -> bool {
        self.special.is_dead_state(id)
    }

    #[inline]
    fn is_quit_state(&self, id: StateID) -> bool {
        self.special.is_quit_state(id)
    }

    #[inline]
    fn is_match_state(&self, id: StateID) -> bool {
        self.special.is_match_state(id)
    }

    #[inline]
    fn is_start_state(&self, id: StateID) -> bool {
        self.special.is_start_state(id)
    }

    #[inline]
    fn is_accel_state(&self, id: StateID) -> bool {
        self.special.is_accel_state(id)
    }

    // This is marked as inline to help dramatically boost sparse searching,
    // which decodes each state it enters to follow the next transition.
    #[inline(always)]
    fn next_state(&self, current: StateID, input: u8) -> StateID {
        // Map the raw byte to its equivalence class before decoding the
        // current state's transition for it.
        let input = self.trans.classes.get(input);
        self.trans.state(current).next(input)
    }

    // Sparse DFAs offer no genuinely unchecked transition path; this simply
    // forwards to the checked `next_state`.
    #[inline]
    unsafe fn next_state_unchecked(
        &self,
        current: StateID,
        input: u8,
    ) -> StateID {
        self.next_state(current, input)
    }

    #[inline]
    fn next_eoi_state(&self, current: StateID) -> StateID {
        // The EOI transition is stored separately from the 256 byte
        // transitions; `next_eoi` reads it from the decoded state.
        self.trans.state(current).next_eoi()
    }

    #[inline]
    fn pattern_count(&self) -> usize {
        self.trans.patterns
    }

    #[inline]
    fn match_count(&self, id: StateID) -> usize {
        self.trans.state(id).pattern_count()
    }

    #[inline]
    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
        // This is an optimization for the very common case of a DFA with a
        // single pattern. This conditional avoids a somewhat more costly path
        // that finds the pattern ID from the state machine, which requires
        // a bit of slicing/pointer-chasing. This optimization tends to only
        // matter when matches are frequent.
        if self.trans.patterns == 1 {
            return PatternID::ZERO;
        }
        self.trans.state(id).pattern_id(match_index)
    }

    #[inline]
    fn start_state_forward(
        &self,
        pattern_id: Option<PatternID>,
        bytes: &[u8],
        start: usize,
        end: usize,
    ) -> StateID {
        // Pick the start configuration from the surrounding context of the
        // search span, then look up the corresponding start state.
        let index = Start::from_position_fwd(bytes, start, end);
        self.starts.start(index, pattern_id)
    }

    #[inline]
    fn start_state_reverse(
        &self,
        pattern_id: Option<PatternID>,
        bytes: &[u8],
        start: usize,
        end: usize,
    ) -> StateID {
        // Same as start_state_forward, but with context derived for a
        // reverse search.
        let index = Start::from_position_rev(bytes, start, end);
        self.starts.start(index, pattern_id)
    }

    #[inline]
    fn accelerator(&self, id: StateID) -> &[u8] {
        self.trans.state(id).accelerator()
    }
}
1220 | | |
1221 | | /// The transition table portion of a sparse DFA. |
1222 | | /// |
1223 | | /// The transition table is the core part of the DFA in that it describes how |
1224 | | /// to move from one state to another based on the input sequence observed. |
1225 | | /// |
1226 | | /// Unlike a typical dense table based DFA, states in a sparse transition |
1227 | | /// table have variable size. That is, states with more transitions use more |
1228 | | /// space than states with fewer transitions. This means that finding the next |
1229 | | /// transition takes more work than with a dense DFA, but also typically uses |
1230 | | /// much less space. |
#[derive(Clone)]
struct Transitions<T> {
    /// The raw encoding of each state in this DFA.
    ///
    /// Each state has the following information:
    ///
    /// * A set of transitions to subsequent states. Transitions to the dead
    /// state are omitted.
    /// * If the state can be accelerated, then any additional accelerator
    /// information.
    /// * If the state is a match state, then the state contains all pattern
    /// IDs that match when in that state.
    ///
    /// To decode a state, use Transitions::state.
    ///
    /// In practice, T is either Vec<u8> or &[u8].
    sparse: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition). Consequently, the number of equivalence classes
    /// corresponds to the number of transitions for each DFA state. Note
    /// though that the *space* used by each DFA state in the transition table
    /// may be larger. The total space used by each DFA state is known as the
    /// stride and is documented above.
    ///
    /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes which is the default. Equivalence
    /// classes should generally only be disabled when debugging, so that
    /// the transitions themselves aren't obscured. Disabling them has no
    /// other benefit, since the equivalence class map is always used while
    /// searching. In the vast majority of cases, the number of equivalence
    /// classes is substantially smaller than 257, particularly when large
    /// Unicode classes aren't used.
    ///
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
    /// in the current implementation, since equivalence classes generally tend
    /// to correspond to continuous ranges of bytes that map to the same
    /// transition. So in a sparse DFA, equivalence classes don't really lead
    /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but requires a bit of work since sparse
    /// DFAs are built from dense DFAs, which are in turn built on top of
    /// equivalence classes.
    classes: ByteClasses,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even the empty DFA. In particular,
    /// the dead state always has ID 0 and is correspondingly always the first
    /// state. The dead state is never a match state.
    count: usize,
    /// The total number of unique patterns represented by these match states.
    ///
    /// This is consulted by `Automaton::match_pattern` to short-circuit the
    /// common single-pattern case.
    patterns: usize,
}
1285 | | |
impl<'a> Transitions<&'a [u8]> {
    /// Deserialize a transition table from the beginning of `slice`,
    /// borrowing the raw state bytes rather than copying them. Returns the
    /// table and the number of bytes consumed.
    ///
    /// This does not validate the transitions themselves; see
    /// `Transitions::validate` for that.
    unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
        // Remember where we started so the total number of bytes consumed
        // can be computed from pointer arithmetic at the end.
        let slice_start = slice.as_ptr() as usize;

        let (state_count, nr) =
            bytes::try_read_u32_as_usize(&slice, "state count")?;
        slice = &slice[nr..];

        let (pattern_count, nr) =
            bytes::try_read_u32_as_usize(&slice, "pattern count")?;
        slice = &slice[nr..];

        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
        slice = &slice[nr..];

        // Length prefix for the raw sparse state bytes that follow.
        let (len, nr) =
            bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
        slice = &slice[nr..];

        // Ensure the declared length actually fits in the remaining input
        // before slicing it off.
        bytes::check_slice_len(slice, len, "sparse states byte length")?;
        let sparse = &slice[..len];
        slice = &slice[len..];

        let trans = Transitions {
            sparse,
            classes,
            count: state_count,
            patterns: pattern_count,
        };
        Ok((trans, slice.as_ptr() as usize - slice_start))
    }
}
1320 | | |
1321 | | impl<T: AsRef<[u8]>> Transitions<T> { |
1322 | | /// Writes a serialized form of this transition table to the buffer given. |
1323 | | /// If the buffer is too small, then an error is returned. To determine |
1324 | | /// how big the buffer must be, use `write_to_len`. |
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Check the full size up front so that on error nothing at all has
        // been written to `dst`.
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small(
                "sparse transition table",
            ));
        }
        // Truncate so the final copy_from_slice below writes exactly the
        // remaining bytes.
        dst = &mut dst[..nwrite];

        // write state count
        // NOTE(review): the unwrap assumes the counts fit in a u32;
        // presumably guaranteed at construction time — confirm.
        E::write_u32(u32::try_from(self.count).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write pattern count
        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write byte class map
        let n = self.classes.write_to(dst)?;
        dst = &mut dst[n..];

        // write number of bytes in sparse transitions
        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write actual transitions
        dst.copy_from_slice(self.sparse());
        Ok(nwrite)
    }
1357 | | |
1358 | | /// Returns the number of bytes the serialized form of this transition |
1359 | | /// table will use. |
1360 | 0 | fn write_to_len(&self) -> usize { |
1361 | 0 | size_of::<u32>() // state count |
1362 | 0 | + size_of::<u32>() // pattern count |
1363 | 0 | + self.classes.write_to_len() |
1364 | 0 | + size_of::<u32>() // sparse transitions length |
1365 | 0 | + self.sparse().len() |
1366 | 0 | } |
1367 | | |
1368 | | /// Validates that every state ID in this transition table is valid. |
1369 | | /// |
1370 | | /// That is, every state ID can be used to correctly index a state in this |
1371 | | /// table. |
1372 | 0 | fn validate(&self) -> Result<(), DeserializeError> { |
1373 | | // In order to validate everything, we not only need to make sure we |
1374 | | // can decode every state, but that every transition in every state |
1375 | | // points to a valid state. There are many duplicative transitions, so |
1376 | | // we record state IDs that we've verified so that we don't redo the |
1377 | | // decoding work. |
1378 | | // |
1379 | | // Except, when in no_std mode, we don't have dynamic memory allocation |
1380 | | // available to us, so we skip this optimization. It's not clear |
1381 | | // whether doing something more clever is worth it just yet. If you're |
1382 | | // profiling this code and need it to run faster, please file an issue. |
1383 | | // |
1384 | | // ---AG |
1385 | | struct Seen { |
1386 | | #[cfg(feature = "alloc")] |
1387 | | set: BTreeSet<StateID>, |
1388 | | #[cfg(not(feature = "alloc"))] |
1389 | | set: core::marker::PhantomData<StateID>, |
1390 | | } |
1391 | | |
1392 | | #[cfg(feature = "alloc")] |
1393 | | impl Seen { |
1394 | | fn new() -> Seen { |
1395 | | Seen { set: BTreeSet::new() } |
1396 | | } |
1397 | | fn insert(&mut self, id: StateID) { |
1398 | | self.set.insert(id); |
1399 | | } |
1400 | | fn contains(&self, id: &StateID) -> bool { |
1401 | | self.set.contains(id) |
1402 | | } |
1403 | | } |
1404 | | |
1405 | | #[cfg(not(feature = "alloc"))] |
1406 | | impl Seen { |
1407 | 0 | fn new() -> Seen { |
1408 | 0 | Seen { set: core::marker::PhantomData } |
1409 | 0 | } |
1410 | 0 | fn insert(&mut self, _id: StateID) {} |
1411 | 0 | fn contains(&self, _id: &StateID) -> bool { |
1412 | 0 | false |
1413 | 0 | } |
1414 | | } |
1415 | | |
1416 | 0 | let mut verified: Seen = Seen::new(); |
1417 | 0 | // We need to make sure that we decode the correct number of states. |
1418 | 0 | // Otherwise, an empty set of transitions would validate even if the |
1419 | 0 | // recorded state count is non-empty. |
1420 | 0 | let mut count = 0; |
1421 | 0 | // We can't use the self.states() iterator because it assumes the state |
1422 | 0 | // encodings are valid. It could panic if they aren't. |
1423 | 0 | let mut id = DEAD; |
1424 | 0 | while id.as_usize() < self.sparse().len() { |
1425 | 0 | let state = self.try_state(id)?; |
1426 | 0 | verified.insert(id); |
1427 | 0 | // The next ID should be the offset immediately following `state`. |
1428 | 0 | id = StateID::new(bytes::add( |
1429 | 0 | id.as_usize(), |
1430 | 0 | state.bytes_len(), |
1431 | 0 | "next state ID offset", |
1432 | 0 | )?) |
1433 | 0 | .map_err(|err| { |
1434 | 0 | DeserializeError::state_id_error(err, "next state ID offset") |
1435 | 0 | })?; |
1436 | 0 | count += 1; |
1437 | | |
1438 | | // Now check that all transitions in this state are correct. |
1439 | 0 | for i in 0..state.ntrans { |
1440 | 0 | let to = state.next_at(i); |
1441 | 0 | if verified.contains(&to) { |
1442 | 0 | continue; |
1443 | 0 | } |
1444 | 0 | let _ = self.try_state(to)?; |
1445 | 0 | verified.insert(id); |
1446 | | } |
1447 | | } |
1448 | 0 | if count != self.count { |
1449 | 0 | return Err(DeserializeError::generic( |
1450 | 0 | "mismatching sparse state count", |
1451 | 0 | )); |
1452 | 0 | } |
1453 | 0 | Ok(()) |
1454 | 0 | } |
1455 | | |
1456 | | /// Converts these transitions to a borrowed value. |
1457 | 0 | fn as_ref(&self) -> Transitions<&'_ [u8]> { |
1458 | 0 | Transitions { |
1459 | 0 | sparse: self.sparse(), |
1460 | 0 | classes: self.classes.clone(), |
1461 | 0 | count: self.count, |
1462 | 0 | patterns: self.patterns, |
1463 | 0 | } |
1464 | 0 | } Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<&[u8]>>::as_ref Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<_>>::as_ref |
1465 | | |
1466 | | /// Converts these transitions to an owned value. |
1467 | | #[cfg(feature = "alloc")] |
1468 | | fn to_owned(&self) -> Transitions<Vec<u8>> { |
1469 | | Transitions { |
1470 | | sparse: self.sparse().to_vec(), |
1471 | | classes: self.classes.clone(), |
1472 | | count: self.count, |
1473 | | patterns: self.patterns, |
1474 | | } |
1475 | | } |
1476 | | |
1477 | | /// Return a convenient representation of the given state. |
1478 | | /// |
1479 | | /// This panics if the state is invalid. |
1480 | | /// |
1481 | | /// This is marked as inline to help dramatically boost sparse searching, |
1482 | | /// which decodes each state it enters to follow the next transition. Other |
1483 | | /// functions involved are also inlined, which should hopefully eliminate |
1484 | | /// a lot of the extraneous decoding that is never needed just to follow |
1485 | | /// the next transition. |
1486 | | #[inline(always)] |
1487 | 0 | fn state(&self, id: StateID) -> State<'_> { |
1488 | 0 | let mut state = &self.sparse()[id.as_usize()..]; |
1489 | 0 | let mut ntrans = bytes::read_u16(&state) as usize; |
1490 | 0 | let is_match = (1 << 15) & ntrans != 0; |
1491 | 0 | ntrans &= !(1 << 15); |
1492 | 0 | state = &state[2..]; |
1493 | 0 |
|
1494 | 0 | let (input_ranges, state) = state.split_at(ntrans * 2); |
1495 | 0 | let (next, state) = state.split_at(ntrans * StateID::SIZE); |
1496 | 0 | let (pattern_ids, state) = if is_match { |
1497 | 0 | let npats = bytes::read_u32(&state) as usize; |
1498 | 0 | state[4..].split_at(npats * 4) |
1499 | | } else { |
1500 | 0 | (&[][..], state) |
1501 | | }; |
1502 | | |
1503 | 0 | let accel_len = state[0] as usize; |
1504 | 0 | let accel = &state[1..accel_len + 1]; |
1505 | 0 | State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } |
1506 | 0 | } Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<&[u8]>>::state Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<&[u8]>>::state Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<_>>::state |
1507 | | |
    /// Like `state`, but will return an error if the state encoding is
    /// invalid. This is useful for verifying states after deserialization,
    /// which is required for a safe deserialization API.
    ///
    /// Note that this only verifies that this state is decodable and that
    /// all of its data is consistent. It does not verify that its state ID
    /// transitions point to valid states themselves, nor does it verify that
    /// every pattern ID is valid.
    fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
        // NOTE(review): `>` (rather than `>=`) permits `id == len`, which
        // yields an empty slice below; that case is still rejected by the
        // u16 read that follows, so it is an error-message difference only.
        if id.as_usize() > self.sparse().len() {
            return Err(DeserializeError::generic("invalid sparse state ID"));
        }
        let mut state = &self.sparse()[id.as_usize()..];
        // Encoding format starts with a u16 that stores the total number of
        // transitions in this state. The high bit flags a match state.
        let (mut ntrans, _) =
            bytes::try_read_u16_as_usize(state, "state transition count")?;
        let is_match = ((1 << 15) & ntrans) != 0;
        ntrans &= !(1 << 15);
        state = &state[2..];
        // 256 byte classes plus the special EOI transition bound the count.
        if ntrans > 257 || ntrans == 0 {
            return Err(DeserializeError::generic("invalid transition count"));
        }

        // Each transition has two pieces: an inclusive range of bytes on which
        // it is defined, and the state ID that those bytes transition to. The
        // pairs come first, followed by a corresponding sequence of state IDs.
        let input_ranges_len = ntrans.checked_mul(2).unwrap();
        bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
        let (input_ranges, state) = state.split_at(input_ranges_len);
        // Every range should be of the form A-B, where A<=B.
        for pair in input_ranges.chunks(2) {
            let (start, end) = (pair[0], pair[1]);
            if start > end {
                return Err(DeserializeError::generic("invalid input range"));
            }
        }

        // And now extract the corresponding sequence of state IDs. We leave
        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
        // not have any alignment requirements.
        let next_len = ntrans
            .checked_mul(self.id_len())
            .expect("state size * #trans should always fit in a usize");
        bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
        let (next, state) = state.split_at(next_len);
        // We can at least verify that every state ID is in bounds.
        for idbytes in next.chunks(self.id_len()) {
            let (id, _) =
                bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
            bytes::check_slice_len(
                self.sparse(),
                id.as_usize(),
                "invalid sparse state ID",
            )?;
        }

        // If this is a match state, then read the pattern IDs for this state.
        // Pattern IDs is a u32-length prefixed sequence of native endian
        // encoded 32-bit integers.
        let (pattern_ids, state) = if is_match {
            let (npats, nr) =
                bytes::try_read_u32_as_usize(state, "pattern ID count")?;
            let state = &state[nr..];

            let pattern_ids_len =
                bytes::mul(npats, 4, "sparse pattern ID byte length")?;
            bytes::check_slice_len(
                state,
                pattern_ids_len,
                "sparse pattern IDs",
            )?;
            let (pattern_ids, state) = state.split_at(pattern_ids_len);
            // Verify each pattern ID decodes (bounds-checks the encoding,
            // not whether the pattern exists in the DFA).
            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
                bytes::read_pattern_id(
                    patbytes,
                    "sparse pattern ID in try_state",
                )?;
            }
            (pattern_ids, state)
        } else {
            (&[][..], state)
        };

        // Now read this state's accelerator info. The first byte is the length
        // of the accelerator, which is typically 0 (for no acceleration) but
        // is no bigger than 3. The length indicates the number of bytes that
        // follow, where each byte corresponds to a transition out of this
        // state.
        if state.is_empty() {
            return Err(DeserializeError::generic("no accelerator length"));
        }
        let (accel_len, state) = (state[0] as usize, &state[1..]);

        if accel_len > 3 {
            return Err(DeserializeError::generic(
                "sparse invalid accelerator length",
            ));
        }
        bytes::check_slice_len(
            state,
            accel_len,
            "sparse corrupt accelerator length",
        )?;
        let (accel, _) = (&state[..accel_len], &state[accel_len..]);

        Ok(State {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        })
    }
1624 | | |
1625 | | /// Return an iterator over all of the states in this DFA. |
1626 | | /// |
1627 | | /// The iterator returned yields tuples, where the first element is the |
1628 | | /// state ID and the second element is the state itself. |
1629 | 0 | fn states(&self) -> StateIter<'_, T> { |
1630 | 0 | StateIter { trans: self, id: DEAD.as_usize() } |
1631 | 0 | } |
1632 | | |
1633 | | /// Returns the sparse transitions as raw bytes. |
1634 | 0 | fn sparse(&self) -> &[u8] { |
1635 | 0 | self.sparse.as_ref() |
1636 | 0 | } |
1637 | | |
    /// Returns the number of bytes represented by a single state ID.
    ///
    /// Sparse DFAs store state IDs unaligned, so decoding steps through the
    /// raw `next` byte sequence in chunks of this size.
    fn id_len(&self) -> usize {
        StateID::SIZE
    }
1642 | | |
1643 | | /// Return the memory usage, in bytes, of these transitions. |
1644 | | /// |
1645 | | /// This does not include the size of a `Transitions` value itself. |
1646 | 0 | fn memory_usage(&self) -> usize { |
1647 | 0 | self.sparse().len() |
1648 | 0 | } |
1649 | | } |
1650 | | |
#[cfg(feature = "alloc")]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Return a convenient mutable representation of the given state.
    /// This panics if the state is invalid.
    ///
    /// The decoding here mirrors `state`, but hands back mutable views so
    /// callers can rewrite transition targets in place.
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
        let mut rest = &mut self.sparse_mut()[id.as_usize()..];
        // The leading u16 packs the transition count (low 15 bits) with the
        // match flag (high bit).
        let mut ntrans = bytes::read_u16(&rest) as usize;
        let is_match = ntrans & (1 << 15) != 0;
        ntrans &= !(1 << 15);
        rest = &mut rest[2..];

        let (input_ranges, rest) = rest.split_at_mut(ntrans * 2);
        let (next, rest) = rest.split_at_mut(ntrans * StateID::SIZE);
        // Match states carry a u32 pattern count followed by the pattern
        // IDs themselves.
        let (pattern_ids, rest) = if is_match {
            let npats = bytes::read_u32(&rest) as usize;
            rest[4..].split_at_mut(npats * 4)
        } else {
            (&mut [][..], rest)
        };

        let accel_len = rest[0] as usize;
        let accel = &mut rest[1..accel_len + 1];
        StateMut {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        }
    }

    /// Returns the sparse transitions as raw mutable bytes.
    fn sparse_mut(&mut self) -> &mut [u8] {
        self.sparse.as_mut()
    }
}
1689 | | |
/// The set of all possible starting states in a DFA.
///
/// See the eponymous type in the `dense` module for more details. This type
/// is very similar to `dense::StartTable`, except that its underlying
/// representation is `&[u8]` instead of a slice of state IDs. (The latter
/// would require sparse DFAs to be aligned, which is explicitly something
/// we do not require because we don't really need it.)
#[derive(Clone)]
struct StartTable<T> {
    /// The initial start state IDs as a contiguous table of native endian
    /// encoded integers, each `StateID::SIZE` bytes wide.
    ///
    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
    /// requirements.
    ///
    /// The first `stride` (currently always 4) entries always correspond to
    /// the start states for the entire DFA. After that, there are
    /// `stride * patterns` state IDs, where `patterns` may be zero in the
    /// case of a DFA with no patterns or in the case where the DFA was built
    /// without enabling starting states for each pattern.
    table: T,
    /// The number of starting state IDs per pattern.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This may be zero for non-empty DFAs when the DFA was built without
    /// start states for each pattern.
    patterns: usize,
}
1718 | | |
#[cfg(feature = "alloc")]
impl StartTable<Vec<u8>> {
    /// Create a zeroed start table with one group of whole-DFA start states
    /// plus one group per pattern.
    fn new(patterns: usize) -> StartTable<Vec<u8>> {
        let stride = Start::count();
        // This is OK since the only way we're here is if a dense DFA could be
        // constructed successfully, which uses the same space.
        let entries = stride
            .checked_mul(patterns)
            .unwrap()
            .checked_add(stride)
            .unwrap();
        let len = entries.checked_mul(StateID::SIZE).unwrap();
        StartTable { table: vec![0; len], stride, patterns }
    }

    /// Build a sparse start table from the given dense DFA's start states,
    /// translating each dense state ID through `remap` to its sparse
    /// equivalent.
    fn from_dense_dfa<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        remap: &[StateID],
    ) -> Result<StartTable<Vec<u8>>, Error> {
        // Unless the DFA has start states compiled for each pattern, then
        // as far as the starting state table is concerned, there are zero
        // patterns to account for. It will instead only store starting states
        // for the entire DFA.
        let start_pattern_count = if dfa.has_starts_for_each_pattern() {
            dfa.pattern_count()
        } else {
            0
        };
        let mut table = StartTable::new(start_pattern_count);
        for (old_start_id, sty, pid) in dfa.starts() {
            table.set_start(sty, pid, remap[dfa.to_index(old_start_id)]);
        }
        Ok(table)
    }
}
1756 | | |
impl<'a> StartTable<&'a [u8]> {
    /// Deserialize a start table from the given byte slice, returning the
    /// table along with the number of bytes read.
    ///
    /// Structural properties (stride, pattern count, table length) are
    /// checked here and produce errors when inconsistent.
    ///
    /// # Safety
    ///
    /// The start state IDs stored in the table are NOT verified here.
    /// NOTE(review): callers appear expected to run `StartTable::validate`
    /// before using the IDs with safe decoding routines — confirm against
    /// the DFA-level `from_bytes` caller (outside this view).
    unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
        // Remember where we started so we can report bytes consumed.
        let slice_start = slice.as_ptr() as usize;

        let (stride, nr) =
            bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
        slice = &slice[nr..];

        let (patterns, nr) = bytes::try_read_u32_as_usize(
            slice,
            "sparse start table patterns",
        )?;
        slice = &slice[nr..];

        // The stride must match the number of start configurations.
        if stride != Start::count() {
            return Err(DeserializeError::generic(
                "invalid sparse starting table stride",
            ));
        }
        if patterns > PatternID::LIMIT {
            return Err(DeserializeError::generic(
                "sparse invalid number of patterns",
            ));
        }
        let pattern_table_size =
            bytes::mul(stride, patterns, "sparse invalid pattern count")?;
        // Our start states always start with a single stride of start states
        // for the entire automaton which permit it to match any pattern. What
        // follows it are an optional set of start states for each pattern.
        let start_state_count = bytes::add(
            stride,
            pattern_table_size,
            "sparse invalid 'any' pattern starts size",
        )?;
        let table_bytes_len = bytes::mul(
            start_state_count,
            StateID::SIZE,
            "sparse pattern table bytes length",
        )?;
        bytes::check_slice_len(
            slice,
            table_bytes_len,
            "sparse start ID table",
        )?;
        let table_bytes = &slice[..table_bytes_len];
        slice = &slice[table_bytes_len..];

        let sl = StartTable { table: table_bytes, stride, patterns };
        Ok((sl, slice.as_ptr() as usize - slice_start))
    }
}
1810 | | |
1811 | | impl<T: AsRef<[u8]>> StartTable<T> { |
1812 | 0 | fn write_to<E: Endian>( |
1813 | 0 | &self, |
1814 | 0 | mut dst: &mut [u8], |
1815 | 0 | ) -> Result<usize, SerializeError> { |
1816 | 0 | let nwrite = self.write_to_len(); |
1817 | 0 | if dst.len() < nwrite { |
1818 | 0 | return Err(SerializeError::buffer_too_small( |
1819 | 0 | "sparse starting table ids", |
1820 | 0 | )); |
1821 | 0 | } |
1822 | 0 | dst = &mut dst[..nwrite]; |
1823 | 0 |
|
1824 | 0 | // write stride |
1825 | 0 | E::write_u32(u32::try_from(self.stride).unwrap(), dst); |
1826 | 0 | dst = &mut dst[size_of::<u32>()..]; |
1827 | 0 | // write pattern count |
1828 | 0 | E::write_u32(u32::try_from(self.patterns).unwrap(), dst); |
1829 | 0 | dst = &mut dst[size_of::<u32>()..]; |
1830 | 0 | // write start IDs |
1831 | 0 | dst.copy_from_slice(self.table()); |
1832 | 0 | Ok(nwrite) |
1833 | 0 | } |
1834 | | |
1835 | | /// Returns the number of bytes the serialized form of this transition |
1836 | | /// table will use. |
1837 | 0 | fn write_to_len(&self) -> usize { |
1838 | 0 | size_of::<u32>() // stride |
1839 | 0 | + size_of::<u32>() // # patterns |
1840 | 0 | + self.table().len() |
1841 | 0 | } |
1842 | | |
1843 | | /// Validates that every starting state ID in this table is valid. |
1844 | | /// |
1845 | | /// That is, every starting state ID can be used to correctly decode a |
1846 | | /// state in the DFA's sparse transitions. |
1847 | 0 | fn validate( |
1848 | 0 | &self, |
1849 | 0 | trans: &Transitions<T>, |
1850 | 0 | ) -> Result<(), DeserializeError> { |
1851 | 0 | for (id, _, _) in self.iter() { |
1852 | 0 | let _ = trans.try_state(id)?; |
1853 | | } |
1854 | 0 | Ok(()) |
1855 | 0 | } |
1856 | | |
1857 | | /// Converts this start list to a borrowed value. |
1858 | 0 | fn as_ref(&self) -> StartTable<&'_ [u8]> { |
1859 | 0 | StartTable { |
1860 | 0 | table: self.table(), |
1861 | 0 | stride: self.stride, |
1862 | 0 | patterns: self.patterns, |
1863 | 0 | } |
1864 | 0 | } Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<&[u8]>>::as_ref Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<_>>::as_ref |
1865 | | |
1866 | | /// Converts this start list to an owned value. |
1867 | | #[cfg(feature = "alloc")] |
1868 | | fn to_owned(&self) -> StartTable<Vec<u8>> { |
1869 | | StartTable { |
1870 | | table: self.table().to_vec(), |
1871 | | stride: self.stride, |
1872 | | patterns: self.patterns, |
1873 | | } |
1874 | | } |
1875 | | |
1876 | | /// Return the start state for the given index and pattern ID. If the |
1877 | | /// pattern ID is None, then the corresponding start state for the entire |
1878 | | /// DFA is returned. If the pattern ID is not None, then the corresponding |
1879 | | /// starting state for the given pattern is returned. If this start table |
1880 | | /// does not have individual starting states for each pattern, then this |
1881 | | /// panics. |
1882 | 0 | fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID { |
1883 | 0 | let start_index = index.as_usize(); |
1884 | 0 | let index = match pattern_id { |
1885 | 0 | None => start_index, |
1886 | 0 | Some(pid) => { |
1887 | 0 | let pid = pid.as_usize(); |
1888 | 0 | assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); |
1889 | 0 | self.stride |
1890 | 0 | .checked_mul(pid) |
1891 | 0 | .unwrap() |
1892 | 0 | .checked_add(self.stride) |
1893 | 0 | .unwrap() |
1894 | 0 | .checked_add(start_index) |
1895 | 0 | .unwrap() |
1896 | | } |
1897 | | }; |
1898 | 0 | let start = index * StateID::SIZE; |
1899 | 0 | // This OK since we're allowed to assume that the start table contains |
1900 | 0 | // valid StateIDs. |
1901 | 0 | bytes::read_state_id_unchecked(&self.table()[start..]).0 |
1902 | 0 | } Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<&[u8]>>::start Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<_>>::start |
1903 | | |
1904 | | /// Return an iterator over all start IDs in this table. |
1905 | 0 | fn iter(&self) -> StartStateIter<'_, T> { |
1906 | 0 | StartStateIter { st: self, i: 0 } |
1907 | 0 | } |
1908 | | |
1909 | | /// Returns the total number of start state IDs in this table. |
1910 | 0 | fn len(&self) -> usize { |
1911 | 0 | self.table().len() / StateID::SIZE |
1912 | 0 | } |
1913 | | |
1914 | | /// Returns the table as a raw slice of bytes. |
1915 | 0 | fn table(&self) -> &[u8] { |
1916 | 0 | self.table.as_ref() |
1917 | 0 | } |
1918 | | |
1919 | | /// Return the memory usage, in bytes, of this start list. |
1920 | | /// |
1921 | | /// This does not include the size of a `StartTable` value itself. |
1922 | 0 | fn memory_usage(&self) -> usize { |
1923 | 0 | self.table().len() |
1924 | 0 | } |
1925 | | } |
1926 | | |
#[cfg(feature = "alloc")]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Set the start state for the given index and pattern.
    ///
    /// If the pattern ID or state ID are not valid, then this will panic.
    fn set_start(
        &mut self,
        index: Start,
        pattern_id: Option<PatternID>,
        id: StateID,
    ) {
        let mut offset = index.as_usize();
        if let Some(pid) = pattern_id {
            let pid = pid.as_usize();
            assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
            // Skip the leading whole-DFA start group, then index into this
            // pattern's group. Mirrors the lookup in `StartTable::start`.
            offset = self
                .stride
                .checked_mul(pid)
                .unwrap()
                .checked_add(self.stride)
                .unwrap()
                .checked_add(offset)
                .unwrap();
        }
        let start = offset * StateID::SIZE;
        let end = start + StateID::SIZE;
        bytes::write_state_id::<bytes::NE>(
            id,
            &mut self.table.as_mut()[start..end],
        );
    }
}
1961 | | |
/// An iterator over all start state IDs in a sparse DFA.
struct StartStateIter<'a, T> {
    /// The start table being iterated over.
    st: &'a StartTable<T>,
    /// The current position: an index into the start state ID table.
    i: usize,
}
1967 | | |
1968 | | impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { |
1969 | | type Item = (StateID, Start, Option<PatternID>); |
1970 | | |
1971 | 0 | fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> { |
1972 | 0 | let i = self.i; |
1973 | 0 | if i >= self.st.len() { |
1974 | 0 | return None; |
1975 | 0 | } |
1976 | 0 | self.i += 1; |
1977 | 0 |
|
1978 | 0 | // This unwrap is okay since the stride of any DFA must always match |
1979 | 0 | // the number of start state types. |
1980 | 0 | let start_type = Start::from_usize(i % self.st.stride).unwrap(); |
1981 | 0 | let pid = if i < self.st.stride { |
1982 | | // This means we don't have start states for each pattern. |
1983 | 0 | None |
1984 | | } else { |
1985 | | // These unwraps are OK since we may assume our table and stride |
1986 | | // is correct. |
1987 | 0 | let pid = i |
1988 | 0 | .checked_sub(self.st.stride) |
1989 | 0 | .unwrap() |
1990 | 0 | .checked_div(self.st.stride) |
1991 | 0 | .unwrap(); |
1992 | 0 | Some(PatternID::new(pid).unwrap()) |
1993 | | }; |
1994 | 0 | let start = i * StateID::SIZE; |
1995 | 0 | let end = start + StateID::SIZE; |
1996 | 0 | let bytes = self.st.table()[start..end].try_into().unwrap(); |
1997 | 0 | // This is OK since we're allowed to assume that any IDs in this start |
1998 | 0 | // table are correct and valid for this DFA. |
1999 | 0 | let id = StateID::from_ne_bytes_unchecked(bytes); |
2000 | 0 | Some((id, start_type, pid)) |
2001 | 0 | } |
2002 | | } |
2003 | | |
impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Only the iterator's position is shown; the underlying table has
        // no useful textual form here.
        f.debug_struct("StartStateIter").field("i", &self.i).finish()
    }
}
2009 | | |
/// An iterator over all states in a sparse DFA.
///
/// This iterator yields each decoded `State`; a state's identifier is
/// available via `State::id`.
struct StateIter<'a, T> {
    /// The transitions being iterated over.
    trans: &'a Transitions<T>,
    /// The byte offset (into the sparse transition data) of the next state
    /// to yield.
    id: usize,
}
2018 | | |
2019 | | impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { |
2020 | | type Item = State<'a>; |
2021 | | |
2022 | 0 | fn next(&mut self) -> Option<State<'a>> { |
2023 | 0 | if self.id >= self.trans.sparse().len() { |
2024 | 0 | return None; |
2025 | 0 | } |
2026 | 0 | let state = self.trans.state(StateID::new_unchecked(self.id)); |
2027 | 0 | self.id = self.id + state.bytes_len(); |
2028 | 0 | Some(state) |
2029 | 0 | } |
2030 | | } |
2031 | | |
impl<'a, T> fmt::Debug for StateIter<'a, T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Only the current offset is shown; the transitions themselves are
        // omitted.
        f.debug_struct("StateIter").field("id", &self.id).finish()
    }
}
2037 | | |
/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
#[derive(Clone)]
struct State<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are
    /// `ntrans * StateID::SIZE` bytes in this slice.
    next: &'a [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a [u8],
}
2067 | | |
2068 | | impl<'a> State<'a> { |
2069 | | /// Searches for the next transition given an input byte. If no such |
2070 | | /// transition could be found, then a dead state is returned. |
2071 | | /// |
2072 | | /// This is marked as inline to help dramatically boost sparse searching, |
2073 | | /// which decodes each state it enters to follow the next transition. |
2074 | | #[inline(always)] |
2075 | 0 | fn next(&self, input: u8) -> StateID { |
2076 | | // This straight linear search was observed to be much better than |
2077 | | // binary search on ASCII haystacks, likely because a binary search |
2078 | | // visits the ASCII case last but a linear search sees it first. A |
2079 | | // binary search does do a little better on non-ASCII haystacks, but |
2080 | | // not by much. There might be a better trade off lurking here. |
2081 | 0 | for i in 0..(self.ntrans - 1) { |
2082 | 0 | let (start, end) = self.range(i); |
2083 | 0 | if start <= input && input <= end { |
2084 | 0 | return self.next_at(i); |
2085 | 0 | } |
2086 | | // We could bail early with an extra branch: if input < b1, then |
2087 | | // we know we'll never find a matching transition. Interestingly, |
2088 | | // this extra branch seems to not help performance, or will even |
2089 | | // hurt it. It's likely very dependent on the DFA itself and what |
2090 | | // is being searched. |
2091 | | } |
2092 | 0 | DEAD |
2093 | 0 | } |
2094 | | |
2095 | | /// Returns the next state ID for the special EOI transition. |
2096 | 0 | fn next_eoi(&self) -> StateID { |
2097 | 0 | self.next_at(self.ntrans - 1) |
2098 | 0 | } |
2099 | | |
2100 | | /// Returns the identifier for this state. |
2101 | 0 | fn id(&self) -> StateID { |
2102 | 0 | self.id |
2103 | 0 | } |
2104 | | |
2105 | | /// Returns the inclusive input byte range for the ith transition in this |
2106 | | /// state. |
2107 | 0 | fn range(&self, i: usize) -> (u8, u8) { |
2108 | 0 | (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) |
2109 | 0 | } |
2110 | | |
2111 | | /// Returns the next state for the ith transition in this state. |
2112 | 0 | fn next_at(&self, i: usize) -> StateID { |
2113 | 0 | let start = i * StateID::SIZE; |
2114 | 0 | let end = start + StateID::SIZE; |
2115 | 0 | let bytes = self.next[start..end].try_into().unwrap(); |
2116 | 0 | StateID::from_ne_bytes_unchecked(bytes) |
2117 | 0 | } |
2118 | | |
2119 | | /// Returns the pattern ID for the given match index. If the match index |
2120 | | /// is invalid, then this panics. |
2121 | 0 | fn pattern_id(&self, match_index: usize) -> PatternID { |
2122 | 0 | let start = match_index * PatternID::SIZE; |
2123 | 0 | bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 |
2124 | 0 | } |
2125 | | |
2126 | | /// Returns the total number of pattern IDs for this state. This is always |
2127 | | /// zero when `is_match` is false. |
2128 | 0 | fn pattern_count(&self) -> usize { |
2129 | 0 | assert_eq!(0, self.pattern_ids.len() % 4); |
2130 | 0 | self.pattern_ids.len() / 4 |
2131 | 0 | } |
2132 | | |
2133 | | /// Return the total number of bytes that this state consumes in its |
2134 | | /// encoded form. |
2135 | 0 | fn bytes_len(&self) -> usize { |
2136 | 0 | let mut len = 2 |
2137 | 0 | + (self.ntrans * 2) |
2138 | 0 | + (self.ntrans * StateID::SIZE) |
2139 | 0 | + (1 + self.accel.len()); |
2140 | 0 | if self.is_match { |
2141 | 0 | len += size_of::<u32>() + self.pattern_ids.len(); |
2142 | 0 | } |
2143 | 0 | len |
2144 | 0 | } |
2145 | | |
2146 | | /// Return an accelerator for this state. |
2147 | 0 | fn accelerator(&self) -> &'a [u8] { |
2148 | 0 | self.accel |
2149 | 0 | } |
2150 | | } |
2151 | | |
impl<'a> fmt::Debug for State<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut printed = false;
        // Print every live (non-dead) transition. The final transition is
        // the special EOI transition, handled separately below.
        for i in 0..(self.ntrans - 1) {
            let next = self.next_at(i);
            if next == DEAD {
                continue;
            }

            if printed {
                write!(f, ", ")?;
            }
            let (start, end) = self.range(i);
            if start == end {
                // A singleton range prints as just the one byte.
                write!(f, "{:?} => {:?}", DebugByte(start), next)?;
            } else {
                write!(
                    f,
                    "{:?}-{:?} => {:?}",
                    DebugByte(start),
                    DebugByte(end),
                    next,
                )?;
            }
            printed = true;
        }
        // The EOI transition is only shown when it leads somewhere.
        let eoi = self.next_at(self.ntrans - 1);
        if eoi != DEAD {
            if printed {
                write!(f, ", ")?;
            }
            write!(f, "EOI => {:?}", eoi)?;
        }
        Ok(())
    }
}
2188 | | |
/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
#[cfg(feature = "alloc")]
struct StateMut<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are
    /// `ntrans * StateID::SIZE` bytes in this slice.
    next: &'a mut [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a mut [u8],
}
2218 | | |
#[cfg(feature = "alloc")]
impl<'a> StateMut<'a> {
    /// Overwrites the transition at index `i` so that it points to `next`.
    ///
    /// The identifier is serialized in native endian format directly into
    /// this state's raw transition bytes.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        // Each transition occupies exactly `StateID::SIZE` bytes of `next`.
        let offset = i * StateID::SIZE;
        let dst = &mut self.next[offset..offset + StateID::SIZE];
        bytes::write_state_id::<bytes::NE>(next, dst);
    }
}
2228 | | |
#[cfg(feature = "alloc")]
impl<'a> fmt::Debug for StateMut<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // No point in duplicating the formatting logic here: reborrow our
        // mutable slices as shared slices and delegate to the Debug impl
        // for an immutable `State`.
        State {
            id: self.id,
            is_match: self.is_match,
            ntrans: self.ntrans,
            input_ranges: self.input_ranges,
            next: self.next,
            pattern_ids: self.pattern_ids,
            accel: self.accel,
        }
        .fmt(f)
    }
}
2244 | | |
/// Performs a binary search over a sparse state's transitions.
///
/// `ranges` is a flattened sequence of `(start, end)` pairs, where each pair
/// gives the inclusive span of input bytes for the corresponding transition.
/// If `needle` falls inside the ith pair, then `Some(i)` (the index of the
/// matching transition) is returned. Otherwise `None` is returned.
///
/// Note that this routine is not currently used since it was observed to
/// either decrease performance when searching ASCII, or did not provide
/// enough of a boost on non-ASCII haystacks to be worth it. However, we
/// leave it here for posterity in case we can find a way to use it.
///
/// In theory, we could use the standard library's search routine if we could
/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
/// guaranteed to be safe and is thus UB (since I don't think the in-memory
/// representation of `(u8, u8)` has been nailed down). One could define a
/// repr(C) type, but the casting doesn't seem justified.
#[allow(dead_code)]
#[inline(always)]
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
    debug_assert!(ranges.len() <= 512, "ranges should be short");

    // Search over pair indices, not raw byte offsets.
    let (mut lo, mut hi) = (0, ranges.len() / 2);
    while lo < hi {
        let i = lo + (hi - lo) / 2;
        let (lower, upper) = (ranges[i * 2], ranges[i * 2 + 1]);
        if needle > upper {
            lo = i + 1;
        } else if needle < lower {
            hi = i;
        } else {
            return Some(i);
        }
    }
    None
}