/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/hybrid/id.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /// A state identifier specifically tailored for lazy DFAs. |
2 | | /// |
3 | | /// A lazy state ID logically represents a pointer to a DFA state. In practice, |
4 | | /// by limiting the number of DFA states it can address, it reserves some |
5 | | /// bits of its representation to encode some additional information. That |
6 | | /// additional information is called a "tag." That tag is used to record |
7 | | /// whether the state it points to is an unknown, dead, quit, start or match |
8 | | /// state. |
9 | | /// |
10 | | /// When implementing a low level search routine with a lazy DFA, it is |
11 | | /// necessary to query the type of the current state to know what to do: |
12 | | /// |
13 | | /// * **Unknown** - The state has not yet been computed. The |
14 | | /// parameters used to get this state ID must be re-passed to |
15 | | /// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never |
16 | | /// return an unknown state ID. |
17 | | /// * **Dead** - A dead state only has transitions to itself. It indicates that |
18 | | /// the search cannot do anything else and should stop with whatever result it |
19 | | /// has. |
20 | | /// * **Quit** - A quit state indicates that the automaton could not answer |
21 | | /// whether a match exists or not. Correct search implementations must return a |
22 | | /// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit |
23 | | /// state. |
24 | | /// * **Start** - A start state is a state in which a search can begin. |
25 | | /// Lazy DFAs usually have more than one start state. Branching on |
26 | | /// this isn't required for correctness, but a common optimization is |
27 | | /// to run a prefilter when a search enters a start state. Note that |
28 | | /// start states are *not* tagged automatically, and one must enable the |
29 | | /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states) |
30 | | /// setting for start states to be tagged. The reason for this is |
31 | | /// that a DFA search loop is usually written to execute a prefilter once it |
32 | | /// enters a start state. But if there is no prefilter, this handling can be |
33 | | /// quite diastrous as the DFA may ping-pong between the special handling code |
34 | | /// and a possible optimized hot path for handling untagged states. When start |
35 | | /// states aren't specialized, then they are untagged and remain in the hot |
36 | | /// path. |
37 | | /// * **Match** - A match state indicates that a match has been found. |
38 | | /// Depending on the semantics of your search implementation, it may either |
39 | | /// continue until the end of the haystack or a dead state, or it might quit |
40 | | /// and return the match immediately. |
41 | | /// |
42 | | /// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate |
43 | | /// can be used to determine if a tag exists at all. This is useful to avoid |
44 | | /// branching on all of the above types for every byte searched. |
45 | | /// |
46 | | /// # Example |
47 | | /// |
48 | | /// This example shows how `LazyStateID` can be used to implement a correct |
49 | | /// search routine with minimal branching. In particular, this search routine |
50 | | /// implements "leftmost" matching, which means that it doesn't immediately |
51 | | /// stop once a match is found. Instead, it continues until it reaches a dead |
52 | | /// state. |
53 | | /// |
54 | | /// Notice also how a correct search implementation deals with |
55 | | /// [`CacheError`](crate::hybrid::CacheError)s returned by some of |
56 | | /// the lazy DFA routines. When a `CacheError` occurs, it returns |
57 | | /// [`MatchError::gave_up`](crate::MatchError::gave_up). |
58 | | /// |
59 | | /// ``` |
60 | | /// use regex_automata::{ |
61 | | /// hybrid::dfa::{Cache, DFA}, |
62 | | /// HalfMatch, MatchError, Input, |
63 | | /// }; |
64 | | /// |
65 | | /// fn find_leftmost_first( |
66 | | /// dfa: &DFA, |
67 | | /// cache: &mut Cache, |
68 | | /// haystack: &[u8], |
69 | | /// ) -> Result<Option<HalfMatch>, MatchError> { |
70 | | /// // The start state is determined by inspecting the position and the |
71 | | /// // initial bytes of the haystack. Note that start states can never |
72 | | /// // be match states (since DFAs in this crate delay matches by 1 |
73 | | /// // byte), so we don't need to check if the start state is a match. |
74 | | /// let mut sid = dfa.start_state_forward( |
75 | | /// cache, |
76 | | /// &Input::new(haystack), |
77 | | /// )?; |
78 | | /// let mut last_match = None; |
79 | | /// // Walk all the bytes in the haystack. We can quit early if we see |
80 | | /// // a dead or a quit state. The former means the automaton will |
81 | | /// // never transition to any other state. The latter means that the |
82 | | /// // automaton entered a condition in which its search failed. |
83 | | /// for (i, &b) in haystack.iter().enumerate() { |
84 | | /// sid = dfa |
85 | | /// .next_state(cache, sid, b) |
86 | | /// .map_err(|_| MatchError::gave_up(i))?; |
87 | | /// if sid.is_tagged() { |
88 | | /// if sid.is_match() { |
89 | | /// last_match = Some(HalfMatch::new( |
90 | | /// dfa.match_pattern(cache, sid, 0), |
91 | | /// i, |
92 | | /// )); |
93 | | /// } else if sid.is_dead() { |
94 | | /// return Ok(last_match); |
95 | | /// } else if sid.is_quit() { |
96 | | /// // It is possible to enter into a quit state after |
97 | | /// // observing a match has occurred. In that case, we |
98 | | /// // should return the match instead of an error. |
99 | | /// if last_match.is_some() { |
100 | | /// return Ok(last_match); |
101 | | /// } |
102 | | /// return Err(MatchError::quit(b, i)); |
103 | | /// } |
104 | | /// // Implementors may also want to check for start states and |
105 | | /// // handle them differently for performance reasons. But it is |
106 | | /// // not necessary for correctness. Note that in order to check |
107 | | /// // for start states, you'll need to enable the |
108 | | /// // 'specialize_start_states' config knob, otherwise start |
109 | | /// // states will not be tagged. |
110 | | /// } |
111 | | /// } |
112 | | /// // Matches are always delayed by 1 byte, so we must explicitly walk |
113 | | /// // the special "EOI" transition at the end of the search. |
114 | | /// sid = dfa |
115 | | /// .next_eoi_state(cache, sid) |
116 | | /// .map_err(|_| MatchError::gave_up(haystack.len()))?; |
117 | | /// if sid.is_match() { |
118 | | /// last_match = Some(HalfMatch::new( |
119 | | /// dfa.match_pattern(cache, sid, 0), |
120 | | /// haystack.len(), |
121 | | /// )); |
122 | | /// } |
123 | | /// Ok(last_match) |
124 | | /// } |
125 | | /// |
126 | | /// // We use a greedy '+' operator to show how the search doesn't just stop |
127 | | /// // once a match is detected. It continues extending the match. Using |
128 | | /// // '[a-z]+?' would also work as expected and stop the search early. |
129 | | /// // Greediness is built into the automaton. |
130 | | /// let dfa = DFA::new(r"[a-z]+")?; |
131 | | /// let mut cache = dfa.create_cache(); |
132 | | /// let haystack = "123 foobar 4567".as_bytes(); |
133 | | /// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); |
134 | | /// assert_eq!(mat.pattern().as_usize(), 0); |
135 | | /// assert_eq!(mat.offset(), 10); |
136 | | /// |
137 | | /// // Here's another example that tests our handling of the special |
138 | | /// // EOI transition. This will fail to find a match if we don't call |
139 | | /// // 'next_eoi_state' at the end of the search since the match isn't found |
140 | | /// // until the final byte in the haystack. |
141 | | /// let dfa = DFA::new(r"[0-9]{4}")?; |
142 | | /// let mut cache = dfa.create_cache(); |
143 | | /// let haystack = "123 foobar 4567".as_bytes(); |
144 | | /// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); |
145 | | /// assert_eq!(mat.pattern().as_usize(), 0); |
146 | | /// assert_eq!(mat.offset(), 15); |
147 | | /// |
148 | | /// // And note that our search implementation above automatically works |
149 | | /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects |
150 | | /// // the appropriate pattern ID for us. |
151 | | /// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; |
152 | | /// let mut cache = dfa.create_cache(); |
153 | | /// let haystack = "123 foobar 4567".as_bytes(); |
154 | | /// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); |
155 | | /// assert_eq!(mat.pattern().as_usize(), 1); |
156 | | /// assert_eq!(mat.offset(), 3); |
157 | | /// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap(); |
158 | | /// assert_eq!(mat.pattern().as_usize(), 0); |
159 | | /// assert_eq!(mat.offset(), 7); |
160 | | /// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap(); |
161 | | /// assert_eq!(mat.pattern().as_usize(), 1); |
162 | | /// assert_eq!(mat.offset(), 5); |
163 | | /// |
164 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
165 | | /// ``` |
166 | | #[derive( |
167 | | Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, |
168 | | )] |
169 | | pub struct LazyStateID(u32); |
170 | | |
171 | | impl LazyStateID { |
172 | | #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] |
173 | | const MAX_BIT: usize = 31; |
174 | | |
175 | | #[cfg(target_pointer_width = "16")] |
176 | | const MAX_BIT: usize = 15; |
177 | | |
178 | | const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT); |
179 | | const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1); |
180 | | const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2); |
181 | | const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3); |
182 | | const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4); |
183 | | const MAX: usize = LazyStateID::MASK_MATCH - 1; |
184 | | |
185 | | /// Create a new lazy state ID. |
186 | | /// |
187 | | /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns |
188 | | /// an error. |
189 | | #[inline] |
190 | 0 | pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> { |
191 | 0 | if id > LazyStateID::MAX { |
192 | 0 | let attempted = u64::try_from(id).unwrap(); |
193 | 0 | return Err(LazyStateIDError { attempted }); |
194 | 0 | } |
195 | 0 | Ok(LazyStateID::new_unchecked(id)) |
196 | 0 | } |
197 | | |
198 | | /// Create a new lazy state ID without checking whether the given value |
199 | | /// exceeds [`LazyStateID::MAX`]. |
200 | | /// |
201 | | /// While this is unchecked, providing an incorrect value must never |
202 | | /// sacrifice memory safety. |
203 | | #[inline] |
204 | 0 | const fn new_unchecked(id: usize) -> LazyStateID { |
205 | 0 | // FIXME: Use as_u32() once const functions in traits are stable. |
206 | 0 | LazyStateID(id as u32) |
207 | 0 | } |
208 | | |
209 | | /// Return this lazy state ID as an untagged `usize`. |
210 | | /// |
211 | | /// If this lazy state ID is tagged, then the usize returned is the state |
212 | | /// ID without the tag. If the ID was not tagged, then the usize returned |
213 | | /// is equivalent to the state ID. |
214 | | #[inline] |
215 | 0 | pub(crate) fn as_usize_untagged(&self) -> usize { |
216 | 0 | self.as_usize_unchecked() & LazyStateID::MAX |
217 | 0 | } |
218 | | |
219 | | /// Return this lazy state ID as its raw internal `usize` value, which may |
220 | | /// be tagged (and thus greater than LazyStateID::MAX). |
221 | | #[inline] |
222 | 0 | pub(crate) const fn as_usize_unchecked(&self) -> usize { |
223 | 0 | // FIXME: Use as_usize() once const functions in traits are stable. |
224 | 0 | self.0 as usize |
225 | 0 | } |
226 | | |
227 | | #[inline] |
228 | 0 | pub(crate) const fn to_unknown(&self) -> LazyStateID { |
229 | 0 | LazyStateID::new_unchecked( |
230 | 0 | self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN, |
231 | 0 | ) |
232 | 0 | } |
233 | | |
234 | | #[inline] |
235 | 0 | pub(crate) const fn to_dead(&self) -> LazyStateID { |
236 | 0 | LazyStateID::new_unchecked( |
237 | 0 | self.as_usize_unchecked() | LazyStateID::MASK_DEAD, |
238 | 0 | ) |
239 | 0 | } |
240 | | |
241 | | #[inline] |
242 | 0 | pub(crate) const fn to_quit(&self) -> LazyStateID { |
243 | 0 | LazyStateID::new_unchecked( |
244 | 0 | self.as_usize_unchecked() | LazyStateID::MASK_QUIT, |
245 | 0 | ) |
246 | 0 | } |
247 | | |
248 | | /// Return this lazy state ID as a state ID that is tagged as a start |
249 | | /// state. |
250 | | #[inline] |
251 | 0 | pub(crate) const fn to_start(&self) -> LazyStateID { |
252 | 0 | LazyStateID::new_unchecked( |
253 | 0 | self.as_usize_unchecked() | LazyStateID::MASK_START, |
254 | 0 | ) |
255 | 0 | } |
256 | | |
257 | | /// Return this lazy state ID as a lazy state ID that is tagged as a match |
258 | | /// state. |
259 | | #[inline] |
260 | 0 | pub(crate) const fn to_match(&self) -> LazyStateID { |
261 | 0 | LazyStateID::new_unchecked( |
262 | 0 | self.as_usize_unchecked() | LazyStateID::MASK_MATCH, |
263 | 0 | ) |
264 | 0 | } |
265 | | |
266 | | /// Return true if and only if this lazy state ID is tagged. |
267 | | /// |
268 | | /// When a lazy state ID is tagged, then one can conclude that it is one |
269 | | /// of a match, start, dead, quit or unknown state. |
270 | | #[inline] |
271 | 0 | pub const fn is_tagged(&self) -> bool { |
272 | 0 | self.as_usize_unchecked() > LazyStateID::MAX |
273 | 0 | } |
274 | | |
275 | | /// Return true if and only if this represents a lazy state ID that is |
276 | | /// "unknown." That is, the state has not yet been created. When a caller |
277 | | /// sees this state ID, it generally means that a state has to be computed |
278 | | /// in order to proceed. |
279 | | #[inline] |
280 | 0 | pub const fn is_unknown(&self) -> bool { |
281 | 0 | self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0 |
282 | 0 | } |
283 | | |
284 | | /// Return true if and only if this represents a dead state. A dead state |
285 | | /// is a state that can never transition to any other state except the |
286 | | /// dead state. When a dead state is seen, it generally indicates that a |
287 | | /// search should stop. |
288 | | #[inline] |
289 | 0 | pub const fn is_dead(&self) -> bool { |
290 | 0 | self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0 |
291 | 0 | } |
292 | | |
293 | | /// Return true if and only if this represents a quit state. A quit state |
294 | | /// is a state that is representationally equivalent to a dead state, |
295 | | /// except it indicates the automaton has reached a point at which it can |
296 | | /// no longer determine whether a match exists or not. In general, this |
297 | | /// indicates an error during search and the caller must either pass this |
298 | | /// error up or use a different search technique. |
299 | | #[inline] |
300 | 0 | pub const fn is_quit(&self) -> bool { |
301 | 0 | self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0 |
302 | 0 | } |
303 | | |
304 | | /// Return true if and only if this lazy state ID has been tagged as a |
305 | | /// start state. |
306 | | /// |
307 | | /// Note that if |
308 | | /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is |
309 | | /// disabled (which is the default), then this will always return false |
310 | | /// since start states won't be tagged. |
311 | | #[inline] |
312 | 0 | pub const fn is_start(&self) -> bool { |
313 | 0 | self.as_usize_unchecked() & LazyStateID::MASK_START > 0 |
314 | 0 | } |
315 | | |
316 | | /// Return true if and only if this lazy state ID has been tagged as a |
317 | | /// match state. |
318 | | #[inline] |
319 | 0 | pub const fn is_match(&self) -> bool { |
320 | 0 | self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0 |
321 | 0 | } |
322 | | } |
323 | | |
324 | | /// This error occurs when a lazy state ID could not be constructed. |
325 | | /// |
326 | | /// This occurs when given an integer exceeding the maximum lazy state ID |
327 | | /// value. |
328 | | /// |
329 | | /// When the `std` feature is enabled, this implements the `Error` trait. |
330 | | #[derive(Clone, Debug, Eq, PartialEq)] |
331 | | pub(crate) struct LazyStateIDError { |
332 | | attempted: u64, |
333 | | } |
334 | | |
335 | | impl LazyStateIDError { |
336 | | /// Returns the value that failed to constructed a lazy state ID. |
337 | 0 | pub(crate) fn attempted(&self) -> u64 { |
338 | 0 | self.attempted |
339 | 0 | } |
340 | | } |
341 | | |
342 | | #[cfg(feature = "std")] |
343 | | impl std::error::Error for LazyStateIDError {} |
344 | | |
345 | | impl core::fmt::Display for LazyStateIDError { |
346 | 0 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
347 | 0 | write!( |
348 | 0 | f, |
349 | 0 | "failed to create LazyStateID from {:?}, which exceeds {:?}", |
350 | 0 | self.attempted(), |
351 | 0 | LazyStateID::MAX, |
352 | 0 | ) |
353 | 0 | } |
354 | | } |