/rust/registry/src/index.crates.io-1949cf8c6b5b557f/regex-automata-0.4.13/src/hybrid/dfa.rs
Line | Count | Source |
1 | | /*! |
2 | | Types and routines specific to lazy DFAs. |
3 | | |
4 | | This module is the home of [`hybrid::dfa::DFA`](DFA). |
5 | | |
6 | | This module also contains a [`hybrid::dfa::Builder`](Builder) and a |
7 | | [`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA. |
8 | | */ |
9 | | |
10 | | use core::{iter, mem::size_of}; |
11 | | |
12 | | use alloc::vec::Vec; |
13 | | |
14 | | use crate::{ |
15 | | hybrid::{ |
16 | | error::{BuildError, CacheError, StartError}, |
17 | | id::{LazyStateID, LazyStateIDError}, |
18 | | search, |
19 | | }, |
20 | | nfa::thompson, |
21 | | util::{ |
22 | | alphabet::{self, ByteClasses, ByteSet}, |
23 | | determinize::{self, State, StateBuilderEmpty, StateBuilderNFA}, |
24 | | empty, |
25 | | prefilter::Prefilter, |
26 | | primitives::{PatternID, StateID as NFAStateID}, |
27 | | search::{ |
28 | | Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, |
29 | | }, |
30 | | sparse_set::SparseSets, |
31 | | start::{self, Start, StartByteMap}, |
32 | | }, |
33 | | }; |
34 | | |
35 | | /// The minimum number of states that a lazy DFA's cache size must support. |
36 | | /// |
37 | | /// This is checked at time of construction to ensure that at least some small |
38 | | /// number of states can fit in the given capacity allotment. If we can't fit |
39 | | /// at least this number of states, then the thinking is that it's pretty |
40 | | /// senseless to use the lazy DFA. More to the point, parts of the code do |
41 | | /// assume that the cache can fit at least some small number of states. |
42 | | const MIN_STATES: usize = SENTINEL_STATES + 2; |
43 | | |
44 | | /// The number of "sentinel" states that get added to every lazy DFA. |
45 | | /// |
46 | | /// These are special states indicating status conditions of a search: unknown, |
47 | | /// dead and quit. These states in particular also use zero NFA states, so |
48 | | /// their memory usage is quite small. This is relevant for computing the |
49 | | /// minimum memory needed for a lazy DFA cache. |
50 | | const SENTINEL_STATES: usize = 3; |
51 | | |
52 | | /// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching. |
53 | | /// |
54 | | /// A lazy DFA is a DFA that builds itself at search time. It otherwise has |
55 | | /// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA). |
56 | | /// Indeed, both support precisely the same regex features with precisely the |
57 | | /// same semantics. |
58 | | /// |
59 | | /// Whereas a `dense::DFA` must be completely built to handle any input before |
60 | | /// it may be used for search, a lazy DFA starts off effectively empty. During |
61 | | /// a search, a lazy DFA will build itself depending on whether it has already |
62 | | /// computed the next transition or not. If it has, then it looks a lot like |
63 | | /// a `dense::DFA` internally: it does a very fast table based access to find |
64 | | /// the next transition. Otherwise, if the state hasn't been computed, then it |
65 | | /// does determinization _for that specific transition_ to compute the next DFA |
66 | | /// state. |
67 | | /// |
68 | | /// The main selling point of a lazy DFA is that, in practice, it has |
69 | | /// the performance profile of a `dense::DFA` without the weakness of it |
70 | | /// taking worst case exponential time to build. Indeed, for each byte of |
71 | | /// input, the lazy DFA will construct at most one new DFA state. Thus, a |
72 | | /// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~ |
73 | | /// pattern.len()` and `n ~ haystack.len()`). |
74 | | /// |
75 | | /// The main downsides of a lazy DFA are: |
76 | | /// |
77 | | /// 1. It requires mutable "cache" space during search. This is where the |
78 | | /// transition table, among other things, is stored. |
79 | | /// 2. In pathological cases (e.g., if the cache is too small), it will run |
80 | | /// out of room and either require a bigger cache capacity or will repeatedly |
81 | | /// clear the cache and thus repeatedly regenerate DFA states. Overall, this |
82 | | /// will tend to be slower than a typical NFA simulation. |
83 | | /// |
84 | | /// # Capabilities |
85 | | /// |
86 | | /// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following |
87 | | /// operations: |
88 | | /// |
89 | | /// 1. Detection of a match. |
90 | | /// 2. Location of the end of a match. |
91 | | /// 3. In the case of a lazy DFA with multiple patterns, which pattern matched |
92 | | /// is reported as well. |
93 | | /// |
94 | | /// A notable absence from the above list of capabilities is the location of |
95 | | /// the *start* of a match. In order to provide both the start and end of |
96 | | /// a match, *two* lazy DFAs are required. This functionality is provided by a |
97 | | /// [`Regex`](crate::hybrid::regex::Regex). |
98 | | /// |
99 | | /// # Example |
100 | | /// |
101 | | /// This shows how to build a lazy DFA with the default configuration and |
102 | | /// execute a search. Notice how, in contrast to a `dense::DFA`, we must create |
103 | | /// a cache and pass it to our search routine. |
104 | | /// |
105 | | /// ``` |
106 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
107 | | /// |
108 | | /// let dfa = DFA::new("foo[0-9]+")?; |
109 | | /// let mut cache = dfa.create_cache(); |
110 | | /// |
111 | | /// let expected = Some(HalfMatch::must(0, 8)); |
112 | | /// assert_eq!(expected, dfa.try_search_fwd( |
113 | | /// &mut cache, &Input::new("foo12345"))?, |
114 | | /// ); |
115 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
116 | | /// ``` |
117 | | #[derive(Clone, Debug)] |
118 | | pub struct DFA { |
119 | | config: Config, |
120 | | nfa: thompson::NFA, |
121 | | stride2: usize, |
122 | | start_map: StartByteMap, |
123 | | classes: ByteClasses, |
124 | | quitset: ByteSet, |
125 | | cache_capacity: usize, |
126 | | } |
127 | | |
128 | | impl DFA { |
129 | | /// Parse the given regular expression using a default configuration and |
130 | | /// return the corresponding lazy DFA. |
131 | | /// |
132 | | /// If you want a non-default configuration, then use the [`Builder`] to |
133 | | /// set your own configuration. |
134 | | /// |
135 | | /// # Example |
136 | | /// |
137 | | /// ``` |
138 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
139 | | /// |
140 | | /// let dfa = DFA::new("foo[0-9]+bar")?; |
141 | | /// let mut cache = dfa.create_cache(); |
142 | | /// |
143 | | /// let expected = HalfMatch::must(0, 11); |
144 | | /// assert_eq!( |
145 | | /// Some(expected), |
146 | | /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, |
147 | | /// ); |
148 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
149 | | /// ``` |
150 | | #[cfg(feature = "syntax")] |
151 | 0 | pub fn new(pattern: &str) -> Result<DFA, BuildError> { |
152 | 0 | DFA::builder().build(pattern) |
153 | 0 | } |
154 | | |
155 | | /// Parse the given regular expressions using a default configuration and |
156 | | /// return the corresponding lazy multi-DFA. |
157 | | /// |
158 | | /// If you want a non-default configuration, then use the [`Builder`] to |
159 | | /// set your own configuration. |
160 | | /// |
161 | | /// # Example |
162 | | /// |
163 | | /// ``` |
164 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
165 | | /// |
166 | | /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?; |
167 | | /// let mut cache = dfa.create_cache(); |
168 | | /// |
169 | | /// let expected = HalfMatch::must(1, 3); |
170 | | /// assert_eq!( |
171 | | /// Some(expected), |
172 | | /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, |
173 | | /// ); |
174 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
175 | | /// ``` |
176 | | #[cfg(feature = "syntax")] |
177 | 0 | pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { |
178 | 0 | DFA::builder().build_many(patterns) |
179 | 0 | } |
180 | | |
181 | | /// Create a new lazy DFA that matches every input. |
182 | | /// |
183 | | /// # Example |
184 | | /// |
185 | | /// ``` |
186 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
187 | | /// |
188 | | /// let dfa = DFA::always_match()?; |
189 | | /// let mut cache = dfa.create_cache(); |
190 | | /// |
191 | | /// let expected = HalfMatch::must(0, 0); |
192 | | /// assert_eq!(Some(expected), dfa.try_search_fwd( |
193 | | /// &mut cache, &Input::new(""))?, |
194 | | /// ); |
195 | | /// assert_eq!(Some(expected), dfa.try_search_fwd( |
196 | | /// &mut cache, &Input::new("foo"))?, |
197 | | /// ); |
198 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
199 | | /// ``` |
200 | 0 | pub fn always_match() -> Result<DFA, BuildError> { |
201 | 0 | let nfa = thompson::NFA::always_match(); |
202 | 0 | Builder::new().build_from_nfa(nfa) |
203 | 0 | } |
204 | | |
205 | | /// Create a new lazy DFA that never matches any input. |
206 | | /// |
207 | | /// # Example |
208 | | /// |
209 | | /// ``` |
210 | | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
211 | | /// |
212 | | /// let dfa = DFA::never_match()?; |
213 | | /// let mut cache = dfa.create_cache(); |
214 | | /// |
215 | | /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?); |
216 | | /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?); |
217 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
218 | | /// ``` |
219 | 0 | pub fn never_match() -> Result<DFA, BuildError> { |
220 | 0 | let nfa = thompson::NFA::never_match(); |
221 | 0 | Builder::new().build_from_nfa(nfa) |
222 | 0 | } |
223 | | |
224 | | /// Return a default configuration for a `DFA`. |
225 | | /// |
226 | | /// This is a convenience routine to avoid needing to import the [`Config`] |
227 | | /// type when customizing the construction of a lazy DFA. |
228 | | /// |
229 | | /// # Example |
230 | | /// |
231 | | /// This example shows how to build a lazy DFA that heuristically supports |
232 | | /// Unicode word boundaries. |
233 | | /// |
234 | | /// ``` |
235 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
236 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input}; |
237 | | /// |
238 | | /// let re = DFA::builder() |
239 | | /// .configure(DFA::config().unicode_word_boundary(true)) |
240 | | /// .build(r"\b\w+\b")?; |
241 | | /// let mut cache = re.create_cache(); |
242 | | /// |
243 | | /// // Since our haystack is all ASCII, the DFA search sees this and knows |
244 | | /// // it is legal to interpret Unicode word boundaries as ASCII word |
245 | | /// // boundaries. |
246 | | /// let input = Input::new("!!foo!!"); |
247 | | /// let expected = HalfMatch::must(0, 5); |
248 | | /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?); |
249 | | /// |
250 | | /// // But if our haystack contains non-ASCII, then the search will fail |
251 | | /// // with an error. |
252 | | /// let input = Input::new("!!βββ!!"); |
253 | | /// let expected = MatchError::quit(b'\xCE', 2); |
254 | | /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input)); |
255 | | /// |
256 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
257 | | /// ``` |
258 | 0 | pub fn config() -> Config { |
259 | 0 | Config::new() |
260 | 0 | } |
261 | | |
262 | | /// Return a builder for configuring the construction of a `Regex`. |
263 | | /// |
264 | | /// This is a convenience routine to avoid needing to import the |
265 | | /// [`Builder`] type in common cases. |
266 | | /// |
267 | | /// # Example |
268 | | /// |
269 | | /// This example shows how to use the builder to disable UTF-8 mode |
270 | | /// everywhere for lazy DFAs. |
271 | | /// |
272 | | /// ``` |
273 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
274 | | /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input}; |
275 | | /// |
276 | | /// let re = DFA::builder() |
277 | | /// .syntax(syntax::Config::new().utf8(false)) |
278 | | /// .build(r"foo(?-u:[^b])ar.*")?; |
279 | | /// let mut cache = re.create_cache(); |
280 | | /// |
281 | | /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"); |
282 | | /// let expected = Some(HalfMatch::must(0, 9)); |
283 | | /// let got = re.try_search_fwd(&mut cache, &input)?; |
284 | | /// assert_eq!(expected, got); |
285 | | /// |
286 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
287 | | /// ``` |
288 | 0 | pub fn builder() -> Builder { |
289 | 0 | Builder::new() |
290 | 0 | } |
291 | | |
292 | | /// Create a new cache for this lazy DFA. |
293 | | /// |
294 | | /// The cache returned should only be used for searches for this |
295 | | /// lazy DFA. If you want to reuse the cache for another DFA, then |
296 | | /// you must call [`Cache::reset`] with that DFA (or, equivalently, |
297 | | /// [`DFA::reset_cache`]). |
298 | 0 | pub fn create_cache(&self) -> Cache { |
299 | 0 | Cache::new(self) |
300 | 0 | } |
301 | | |
302 | | /// Reset the given cache such that it can be used for searching with |
303 | | /// this lazy DFA (and only this DFA). |
304 | | /// |
305 | | /// A cache reset permits reusing memory already allocated in this cache |
306 | | /// with a different lazy DFA. |
307 | | /// |
308 | | /// Resetting a cache sets its "clear count" to 0. This is relevant if the |
309 | | /// lazy DFA has been configured to "give up" after it has cleared the |
310 | | /// cache a certain number of times. |
311 | | /// |
312 | | /// Any lazy state ID generated by the cache prior to resetting it is |
313 | | /// invalid after the reset. |
314 | | /// |
315 | | /// # Example |
316 | | /// |
317 | | /// This shows how to re-purpose a cache for use with a different DFA. |
318 | | /// |
319 | | /// ``` |
320 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
321 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
322 | | /// |
323 | | /// let dfa1 = DFA::new(r"\w")?; |
324 | | /// let dfa2 = DFA::new(r"\W")?; |
325 | | /// |
326 | | /// let mut cache = dfa1.create_cache(); |
327 | | /// assert_eq!( |
328 | | /// Some(HalfMatch::must(0, 2)), |
329 | | /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?, |
330 | | /// ); |
331 | | /// |
332 | | /// // Using 'cache' with dfa2 is not allowed. It may result in panics or |
333 | | /// // incorrect results. In order to re-purpose the cache, we must reset |
334 | | /// // it with the DFA we'd like to use it with. |
335 | | /// // |
336 | | /// // Similarly, after this reset, using the cache with 'dfa1' is also not |
337 | | /// // allowed. |
338 | | /// dfa2.reset_cache(&mut cache); |
339 | | /// assert_eq!( |
340 | | /// Some(HalfMatch::must(0, 3)), |
341 | | /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?, |
342 | | /// ); |
343 | | /// |
344 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
345 | | /// ``` |
346 | 0 | pub fn reset_cache(&self, cache: &mut Cache) { |
347 | 0 | Lazy::new(self, cache).reset_cache() |
348 | 0 | } |
349 | | |
350 | | /// Returns the total number of patterns compiled into this lazy DFA. |
351 | | /// |
352 | | /// In the case of a DFA that contains no patterns, this returns `0`. |
353 | | /// |
354 | | /// # Example |
355 | | /// |
356 | | /// This example shows the pattern length for a DFA that never matches: |
357 | | /// |
358 | | /// ``` |
359 | | /// use regex_automata::hybrid::dfa::DFA; |
360 | | /// |
361 | | /// let dfa = DFA::never_match()?; |
362 | | /// assert_eq!(dfa.pattern_len(), 0); |
363 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
364 | | /// ``` |
365 | | /// |
366 | | /// And another example for a DFA that matches at every position: |
367 | | /// |
368 | | /// ``` |
369 | | /// use regex_automata::hybrid::dfa::DFA; |
370 | | /// |
371 | | /// let dfa = DFA::always_match()?; |
372 | | /// assert_eq!(dfa.pattern_len(), 1); |
373 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
374 | | /// ``` |
375 | | /// |
376 | | /// And finally, a DFA that was constructed from multiple patterns: |
377 | | /// |
378 | | /// ``` |
379 | | /// use regex_automata::hybrid::dfa::DFA; |
380 | | /// |
381 | | /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; |
382 | | /// assert_eq!(dfa.pattern_len(), 3); |
383 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
384 | | /// ``` |
385 | 0 | pub fn pattern_len(&self) -> usize { |
386 | 0 | self.nfa.pattern_len() |
387 | 0 | } |
388 | | |
389 | | /// Returns the equivalence classes that make up the alphabet for this DFA. |
390 | | /// |
391 | | /// Unless [`Config::byte_classes`] was disabled, it is possible that |
392 | | /// multiple distinct bytes are grouped into the same equivalence class |
393 | | /// if it is impossible for them to discriminate between a match and a |
394 | | /// non-match. This has the effect of reducing the overall alphabet size |
395 | | /// and in turn potentially substantially reducing the size of the DFA's |
396 | | /// transition table. |
397 | | /// |
398 | | /// The downside of using equivalence classes like this is that every state |
399 | | /// transition will automatically use this map to convert an arbitrary |
400 | | /// byte to its corresponding equivalence class. In practice this has a |
401 | | /// negligible impact on performance. |
402 | 0 | pub fn byte_classes(&self) -> &ByteClasses { |
403 | 0 | &self.classes |
404 | 0 | } |
405 | | |
406 | | /// Returns this lazy DFA's configuration. |
407 | 0 | pub fn get_config(&self) -> &Config { |
408 | 0 | &self.config |
409 | 0 | } |
410 | | |
411 | | /// Returns a reference to the underlying NFA. |
412 | 0 | pub fn get_nfa(&self) -> &thompson::NFA { |
413 | 0 | &self.nfa |
414 | 0 | } |
415 | | |
416 | | /// Returns the stride, as a base-2 exponent, required for these |
417 | | /// equivalence classes. |
418 | | /// |
419 | | /// The stride is always the smallest power of 2 that is greater than or |
420 | | /// equal to the alphabet length. This is done so that converting between |
421 | | /// state IDs and indices can be done with shifts alone, which is much |
422 | | /// faster than integer division. |
423 | 0 | fn stride2(&self) -> usize { |
424 | 0 | self.stride2 |
425 | 0 | } |
426 | | |
427 | | /// Returns the total stride for every state in this lazy DFA. This |
428 | | /// corresponds to the total number of transitions used by each state in |
429 | | /// this DFA's transition table. |
430 | 0 | fn stride(&self) -> usize { |
431 | 0 | 1 << self.stride2() |
432 | 0 | } |
433 | | |
434 | | /// Returns the memory usage, in bytes, of this lazy DFA. |
435 | | /// |
436 | | /// This does **not** include the stack size used up by this lazy DFA. To |
437 | | /// compute that, use `std::mem::size_of::<DFA>()`. This also does not |
438 | | /// include the size of the `Cache` used. |
439 | | /// |
440 | | /// This also does not include any heap memory used by the NFA inside of |
441 | | /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and |
442 | | /// thus not owned by this hybrid NFA/DFA. More practically, several regex |
443 | | /// engines in this crate embed an NFA, and reporting the NFA's memory |
444 | | /// usage in all of them would likely result in reporting higher heap |
445 | | /// memory than is actually used. |
446 | 0 | pub fn memory_usage(&self) -> usize { |
447 | | // The only thing that uses heap memory in a DFA is the NFA. But the |
448 | | // NFA has shared ownership, so reporting its memory as part of the |
449 | | // hybrid DFA is likely to lead to double-counting the NFA memory |
450 | | // somehow. In particular, this DFA does not really own an NFA, so |
451 | | // including it in the DFA's memory usage doesn't seem semantically |
452 | | // correct. |
453 | 0 | 0 |
454 | 0 | } |
455 | | } |
456 | | |
457 | | impl DFA { |
458 | | /// Executes a forward search and returns the end position of the leftmost |
459 | | /// match that is found. If no match exists, then `None` is returned. |
460 | | /// |
461 | | /// In particular, this method continues searching even after it enters |
462 | | /// a match state. The search only terminates once it has reached the |
463 | | /// end of the input or when it has entered a dead or quit state. Upon |
464 | | /// termination, the position of the last byte seen while still in a match |
465 | | /// state is returned. |
466 | | /// |
467 | | /// # Errors |
468 | | /// |
469 | | /// This routine errors if the search could not complete. This can occur |
470 | | /// in a number of circumstances: |
471 | | /// |
472 | | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
473 | | /// For example, setting quit bytes or enabling heuristic support for |
474 | | /// Unicode word boundaries. The default configuration does not enable any |
475 | | /// option that could result in the lazy DFA quitting. |
476 | | /// * The configuration of the lazy DFA may also permit it to "give up" |
477 | | /// on a search if it makes ineffective use of its transition table |
478 | | /// cache. The default configuration does not enable this by default, |
479 | | /// although it is typically a good idea to. |
480 | | /// * When the provided `Input` configuration is not supported. For |
481 | | /// example, by providing an unsupported anchor mode. |
482 | | /// |
483 | | /// When a search returns an error, callers cannot know whether a match |
484 | | /// exists or not. |
485 | | /// |
486 | | /// # Example |
487 | | /// |
488 | | /// This example shows how to run a basic search. |
489 | | /// |
490 | | /// ``` |
491 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
492 | | /// |
493 | | /// let dfa = DFA::new("foo[0-9]+")?; |
494 | | /// let mut cache = dfa.create_cache(); |
495 | | /// let expected = HalfMatch::must(0, 8); |
496 | | /// assert_eq!(Some(expected), dfa.try_search_fwd( |
497 | | /// &mut cache, &Input::new("foo12345"))?, |
498 | | /// ); |
499 | | /// |
500 | | /// // Even though a match is found after reading the first byte (`a`), |
501 | | /// // the leftmost first match semantics demand that we find the earliest |
502 | | /// // match that prefers earlier parts of the pattern over later parts. |
503 | | /// let dfa = DFA::new("abc|a")?; |
504 | | /// let mut cache = dfa.create_cache(); |
505 | | /// let expected = HalfMatch::must(0, 3); |
506 | | /// assert_eq!(Some(expected), dfa.try_search_fwd( |
507 | | /// &mut cache, &Input::new("abc"))?, |
508 | | /// ); |
509 | | /// |
510 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
511 | | /// ``` |
512 | | /// |
513 | | /// # Example: specific pattern search |
514 | | /// |
515 | | /// This example shows how to build a lazy multi-DFA that permits searching |
516 | | /// for specific patterns. |
517 | | /// |
518 | | /// ``` |
519 | | /// use regex_automata::{ |
520 | | /// hybrid::dfa::DFA, |
521 | | /// Anchored, HalfMatch, PatternID, Input, |
522 | | /// }; |
523 | | /// |
524 | | /// let dfa = DFA::builder() |
525 | | /// .configure(DFA::config().starts_for_each_pattern(true)) |
526 | | /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; |
527 | | /// let mut cache = dfa.create_cache(); |
528 | | /// let haystack = "foo123"; |
529 | | /// |
530 | | /// // Since we are using the default leftmost-first match and both |
531 | | /// // patterns match at the same starting position, only the first pattern |
532 | | /// // will be returned in this case when doing a search for any of the |
533 | | /// // patterns. |
534 | | /// let expected = Some(HalfMatch::must(0, 6)); |
535 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
536 | | /// assert_eq!(expected, got); |
537 | | /// |
538 | | /// // But if we want to check whether some other pattern matches, then we |
539 | | /// // can provide its pattern ID. |
540 | | /// let expected = Some(HalfMatch::must(1, 6)); |
541 | | /// let input = Input::new(haystack) |
542 | | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
543 | | /// let got = dfa.try_search_fwd(&mut cache, &input)?; |
544 | | /// assert_eq!(expected, got); |
545 | | /// |
546 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
547 | | /// ``` |
548 | | /// |
549 | | /// # Example: specifying the bounds of a search |
550 | | /// |
551 | | /// This example shows how providing the bounds of a search can produce |
552 | | /// different results than simply sub-slicing the haystack. |
553 | | /// |
554 | | /// ``` |
555 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
556 | | /// |
557 | | /// // N.B. We disable Unicode here so that we use a simple ASCII word |
558 | | /// // boundary. Alternatively, we could enable heuristic support for |
559 | | /// // Unicode word boundaries since our haystack is pure ASCII. |
560 | | /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; |
561 | | /// let mut cache = dfa.create_cache(); |
562 | | /// let haystack = "foo123bar"; |
563 | | /// |
564 | | /// // Since we sub-slice the haystack, the search doesn't know about the |
565 | | /// // larger context and assumes that `123` is surrounded by word |
566 | | /// // boundaries. And of course, the match position is reported relative |
567 | | /// // to the sub-slice as well, which means we get `3` instead of `6`. |
568 | | /// let expected = Some(HalfMatch::must(0, 3)); |
569 | | /// let got = dfa.try_search_fwd( |
570 | | /// &mut cache, |
571 | | /// &Input::new(&haystack[3..6]), |
572 | | /// )?; |
573 | | /// assert_eq!(expected, got); |
574 | | /// |
575 | | /// // But if we provide the bounds of the search within the context of the |
576 | | /// // entire haystack, then the search can take the surrounding context |
577 | | /// // into account. (And if we did find a match, it would be reported |
578 | | /// // as a valid offset into `haystack` instead of its sub-slice.) |
579 | | /// let expected = None; |
580 | | /// let got = dfa.try_search_fwd( |
581 | | /// &mut cache, |
582 | | /// &Input::new(haystack).range(3..6), |
583 | | /// )?; |
584 | | /// assert_eq!(expected, got); |
585 | | /// |
586 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
587 | | /// ``` |
588 | | #[inline] |
589 | 0 | pub fn try_search_fwd( |
590 | 0 | &self, |
591 | 0 | cache: &mut Cache, |
592 | 0 | input: &Input<'_>, |
593 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
594 | 0 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
595 | 0 | let hm = match search::find_fwd(self, cache, input)? { |
596 | 0 | None => return Ok(None), |
597 | 0 | Some(hm) if !utf8empty => return Ok(Some(hm)), |
598 | 0 | Some(hm) => hm, |
599 | | }; |
600 | | // We get to this point when we know our DFA can match the empty string |
601 | | // AND when UTF-8 mode is enabled. In this case, we skip any matches |
602 | | // whose offset splits a codepoint. Such a match is necessarily a |
603 | | // zero-width match, because UTF-8 mode requires the underlying NFA |
604 | | // to be built such that all non-empty matches span valid UTF-8. |
605 | | // Therefore, any match that ends in the middle of a codepoint cannot |
606 | | // be part of a span of valid UTF-8 and thus must be an empty match. |
607 | | // In such cases, we skip it, so as not to report matches that split a |
608 | | // codepoint. |
609 | | // |
610 | | // Note that this is not a checked assumption. Callers *can* provide an |
611 | | // NFA with UTF-8 mode enabled but produces non-empty matches that span |
612 | | // invalid UTF-8. But doing so is documented to result in unspecified |
613 | | // behavior. |
614 | 0 | empty::skip_splits_fwd(input, hm, hm.offset(), |input| { |
615 | 0 | let got = search::find_fwd(self, cache, input)?; |
616 | 0 | Ok(got.map(|hm| (hm, hm.offset()))) |
617 | 0 | }) |
618 | 0 | } |
619 | | |
620 | | /// Executes a reverse search and returns the start of the position of the |
621 | | /// leftmost match that is found. If no match exists, then `None` is |
622 | | /// returned. |
623 | | /// |
624 | | /// # Errors |
625 | | /// |
626 | | /// This routine errors if the search could not complete. This can occur |
627 | | /// in a number of circumstances: |
628 | | /// |
629 | | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
630 | | /// For example, setting quit bytes or enabling heuristic support for |
631 | | /// Unicode word boundaries. The default configuration does not enable any |
632 | | /// option that could result in the lazy DFA quitting. |
633 | | /// * The configuration of the lazy DFA may also permit it to "give up" |
634 | | /// on a search if it makes ineffective use of its transition table |
635 | | /// cache. The default configuration does not enable this by default, |
636 | | /// although it is typically a good idea to. |
637 | | /// * When the provided `Input` configuration is not supported. For |
638 | | /// example, by providing an unsupported anchor mode. |
639 | | /// |
640 | | /// When a search returns an error, callers cannot know whether a match |
641 | | /// exists or not. |
642 | | /// |
643 | | /// # Example |
644 | | /// |
645 | | /// This routine is principally useful when used in |
646 | | /// conjunction with the |
647 | | /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) |
648 | | /// configuration. In general, it's unlikely to be correct to use both |
649 | | /// `try_search_fwd` and `try_search_rev` with the same DFA since any |
650 | | /// particular DFA will only support searching in one direction with |
651 | | /// respect to the pattern. |
652 | | /// |
653 | | /// ``` |
654 | | /// use regex_automata::{ |
655 | | /// nfa::thompson, |
656 | | /// hybrid::dfa::DFA, |
657 | | /// HalfMatch, Input, |
658 | | /// }; |
659 | | /// |
660 | | /// let dfa = DFA::builder() |
661 | | /// .thompson(thompson::Config::new().reverse(true)) |
662 | | /// .build("foo[0-9]+")?; |
663 | | /// let mut cache = dfa.create_cache(); |
664 | | /// let expected = HalfMatch::must(0, 0); |
665 | | /// assert_eq!( |
666 | | /// Some(expected), |
667 | | /// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?, |
668 | | /// ); |
669 | | /// |
670 | | /// // Even though a match is found after reading the last byte (`c`), |
671 | | /// // the leftmost first match semantics demand that we find the earliest |
672 | | /// // match that prefers earlier parts of the pattern over latter parts. |
673 | | /// let dfa = DFA::builder() |
674 | | /// .thompson(thompson::Config::new().reverse(true)) |
675 | | /// .build("abc|c")?; |
676 | | /// let mut cache = dfa.create_cache(); |
677 | | /// let expected = HalfMatch::must(0, 0); |
678 | | /// assert_eq!(Some(expected), dfa.try_search_rev( |
679 | | /// &mut cache, &Input::new("abc"))?, |
680 | | /// ); |
681 | | /// |
682 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
683 | | /// ``` |
684 | | /// |
685 | | /// # Example: UTF-8 mode |
686 | | /// |
687 | | /// This example demonstrates that UTF-8 mode applies to reverse |
688 | | /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all |
689 | | /// matches reported must correspond to valid UTF-8 spans. This includes |
690 | | /// prohibiting zero-width matches that split a codepoint. |
691 | | /// |
692 | | /// UTF-8 mode is enabled by default. Notice below how the only zero-width |
693 | | /// matches reported are those at UTF-8 boundaries: |
694 | | /// |
695 | | /// ``` |
696 | | /// use regex_automata::{ |
697 | | /// hybrid::dfa::DFA, |
698 | | /// nfa::thompson, |
699 | | /// HalfMatch, Input, MatchKind, |
700 | | /// }; |
701 | | /// |
702 | | /// let dfa = DFA::builder() |
703 | | /// .thompson(thompson::Config::new().reverse(true)) |
704 | | /// .build(r"")?; |
705 | | /// let mut cache = dfa.create_cache(); |
706 | | /// |
707 | | /// // Run the reverse DFA to collect all matches. |
708 | | /// let mut input = Input::new("☃"); |
709 | | /// let mut matches = vec![]; |
710 | | /// loop { |
711 | | /// match dfa.try_search_rev(&mut cache, &input)? { |
712 | | /// None => break, |
713 | | /// Some(hm) => { |
714 | | /// matches.push(hm); |
715 | | /// if hm.offset() == 0 || input.end() == 0 { |
716 | | /// break; |
717 | | /// } else if hm.offset() < input.end() { |
718 | | /// input.set_end(hm.offset()); |
719 | | /// } else { |
720 | | /// // This is only necessary to handle zero-width |
721 | | /// // matches, which of course occur in this example. |
722 | | /// // Without this, the search would never advance |
723 | | /// // backwards beyond the initial match. |
724 | | /// input.set_end(input.end() - 1); |
725 | | /// } |
726 | | /// } |
727 | | /// } |
728 | | /// } |
729 | | /// |
730 | | /// // No matches split a codepoint. |
731 | | /// let expected = vec![ |
732 | | /// HalfMatch::must(0, 3), |
733 | | /// HalfMatch::must(0, 0), |
734 | | /// ]; |
735 | | /// assert_eq!(expected, matches); |
736 | | /// |
737 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
738 | | /// ``` |
739 | | /// |
740 | | /// Now let's look at the same example, but with UTF-8 mode on the |
741 | | /// underlying NFA disabled: |
742 | | /// |
743 | | /// ``` |
744 | | /// use regex_automata::{ |
745 | | /// hybrid::dfa::DFA, |
746 | | /// nfa::thompson, |
747 | | /// HalfMatch, Input, MatchKind, |
748 | | /// }; |
749 | | /// |
750 | | /// let dfa = DFA::builder() |
751 | | /// .thompson(thompson::Config::new().reverse(true).utf8(false)) |
752 | | /// .build(r"")?; |
753 | | /// let mut cache = dfa.create_cache(); |
754 | | /// |
755 | | /// // Run the reverse DFA to collect all matches. |
756 | | /// let mut input = Input::new("☃"); |
757 | | /// let mut matches = vec![]; |
758 | | /// loop { |
759 | | /// match dfa.try_search_rev(&mut cache, &input)? { |
760 | | /// None => break, |
761 | | /// Some(hm) => { |
762 | | /// matches.push(hm); |
763 | | /// if hm.offset() == 0 || input.end() == 0 { |
764 | | /// break; |
765 | | /// } else if hm.offset() < input.end() { |
766 | | /// input.set_end(hm.offset()); |
767 | | /// } else { |
768 | | /// // This is only necessary to handle zero-width |
769 | | /// // matches, which of course occur in this example. |
770 | | /// // Without this, the search would never advance |
771 | | /// // backwards beyond the initial match. |
772 | | /// input.set_end(input.end() - 1); |
773 | | /// } |
774 | | /// } |
775 | | /// } |
776 | | /// } |
777 | | /// |
778 | | /// // No matches split a codepoint. |
779 | | /// let expected = vec![ |
780 | | /// HalfMatch::must(0, 3), |
781 | | /// HalfMatch::must(0, 2), |
782 | | /// HalfMatch::must(0, 1), |
783 | | /// HalfMatch::must(0, 0), |
784 | | /// ]; |
785 | | /// assert_eq!(expected, matches); |
786 | | /// |
787 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
788 | | /// ``` |
789 | | #[inline] |
790 | 0 | pub fn try_search_rev( |
791 | 0 | &self, |
792 | 0 | cache: &mut Cache, |
793 | 0 | input: &Input<'_>, |
794 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
795 | 0 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
796 | 0 | let hm = match search::find_rev(self, cache, input)? { |
797 | 0 | None => return Ok(None), |
798 | 0 | Some(hm) if !utf8empty => return Ok(Some(hm)), |
799 | 0 | Some(hm) => hm, |
800 | | }; |
801 | 0 | empty::skip_splits_rev(input, hm, hm.offset(), |input| { |
802 | 0 | let got = search::find_rev(self, cache, input)?; |
803 | 0 | Ok(got.map(|hm| (hm, hm.offset()))) |
804 | 0 | }) |
805 | 0 | } |
806 | | |
807 | | /// Executes an overlapping forward search and returns the end position of |
808 | | /// matches as they are found. If no match exists, then `None` is returned. |
809 | | /// |
810 | | /// This routine is principally only useful when searching for multiple |
811 | | /// patterns on inputs where multiple patterns may match the same regions |
812 | | /// of text. In particular, callers must preserve the automaton's search |
813 | | /// state from prior calls so that the implementation knows where the last |
814 | | /// match occurred. |
815 | | /// |
816 | | /// When using this routine to implement an iterator of overlapping |
817 | | /// matches, the `start` of the search should remain invariant throughout |
818 | | /// iteration. The `OverlappingState` given to the search will keep track |
819 | | /// of the current position of the search. (This is because multiple |
820 | | /// matches may be reported at the same position, so only the search |
821 | | /// implementation itself knows when to advance the position.) |
822 | | /// |
823 | | /// If for some reason you want the search to forget about its previous |
824 | | /// state and restart the search at a particular position, then setting the |
825 | | /// state to [`OverlappingState::start`] will accomplish that. |
826 | | /// |
827 | | /// # Errors |
828 | | /// |
829 | | /// This routine errors if the search could not complete. This can occur |
830 | | /// in a number of circumstances: |
831 | | /// |
832 | | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
833 | | /// For example, setting quit bytes or enabling heuristic support for |
834 | | /// Unicode word boundaries. The default configuration does not enable any |
835 | | /// option that could result in the lazy DFA quitting. |
836 | | /// * The configuration of the lazy DFA may also permit it to "give up" |
837 | | /// on a search if it makes ineffective use of its transition table |
838 | | /// cache. The default configuration does not enable this by default, |
839 | | /// although it is typically a good idea to. |
840 | | /// * When the provided `Input` configuration is not supported. For |
841 | | /// example, by providing an unsupported anchor mode. |
842 | | /// |
843 | | /// When a search returns an error, callers cannot know whether a match |
844 | | /// exists or not. |
845 | | /// |
846 | | /// # Example |
847 | | /// |
848 | | /// This example shows how to run a basic overlapping search. Notice |
849 | | /// that we build the automaton with a `MatchKind::All` configuration. |
850 | | /// Overlapping searches are unlikely to work as one would expect when |
851 | | /// using the default `MatchKind::LeftmostFirst` match semantics, since |
852 | | /// leftmost-first matching is fundamentally incompatible with overlapping |
853 | | /// searches. Namely, overlapping searches need to report matches as they |
854 | | /// are seen, where as leftmost-first searches will continue searching even |
855 | | /// after a match has been observed in order to find the conventional end |
856 | | /// position of the match. More concretely, leftmost-first searches use |
857 | | /// dead states to terminate a search after a specific match can no longer |
858 | | /// be extended. Overlapping searches instead do the opposite by continuing |
859 | | /// the search to find totally new matches (potentially of other patterns). |
860 | | /// |
861 | | /// ``` |
862 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
863 | | /// use regex_automata::{ |
864 | | /// hybrid::dfa::{DFA, OverlappingState}, |
865 | | /// HalfMatch, Input, MatchKind, |
866 | | /// }; |
867 | | /// |
868 | | /// let dfa = DFA::builder() |
869 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
870 | | /// .build_many(&[r"\w+$", r"\S+$"])?; |
871 | | /// let mut cache = dfa.create_cache(); |
872 | | /// |
873 | | /// let haystack = "@foo"; |
874 | | /// let mut state = OverlappingState::start(); |
875 | | /// |
876 | | /// let expected = Some(HalfMatch::must(1, 4)); |
877 | | /// dfa.try_search_overlapping_fwd( |
878 | | /// &mut cache, &Input::new(haystack), &mut state, |
879 | | /// )?; |
880 | | /// assert_eq!(expected, state.get_match()); |
881 | | /// |
882 | | /// // The first pattern also matches at the same position, so re-running |
883 | | /// // the search will yield another match. Notice also that the first |
884 | | /// // pattern is returned after the second. This is because the second |
885 | | /// // pattern begins its match before the first, is therefore an earlier |
886 | | /// // match and is thus reported first. |
887 | | /// let expected = Some(HalfMatch::must(0, 4)); |
888 | | /// dfa.try_search_overlapping_fwd( |
889 | | /// &mut cache, &Input::new(haystack), &mut state, |
890 | | /// )?; |
891 | | /// assert_eq!(expected, state.get_match()); |
892 | | /// |
893 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
894 | | /// ``` |
895 | | #[inline] |
896 | 0 | pub fn try_search_overlapping_fwd( |
897 | 0 | &self, |
898 | 0 | cache: &mut Cache, |
899 | 0 | input: &Input<'_>, |
900 | 0 | state: &mut OverlappingState, |
901 | 0 | ) -> Result<(), MatchError> { |
902 | 0 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
903 | 0 | search::find_overlapping_fwd(self, cache, input, state)?; |
904 | 0 | match state.get_match() { |
905 | 0 | None => Ok(()), |
906 | 0 | Some(_) if !utf8empty => Ok(()), |
907 | 0 | Some(_) => skip_empty_utf8_splits_overlapping( |
908 | 0 | input, |
909 | 0 | state, |
910 | 0 | |input, state| { |
911 | 0 | search::find_overlapping_fwd(self, cache, input, state) |
912 | 0 | }, |
913 | | ), |
914 | | } |
915 | 0 | } |
916 | | |
917 | | /// Executes a reverse overlapping search and returns the start of the |
918 | | /// position of the leftmost match that is found. If no match exists, then |
919 | | /// `None` is returned. |
920 | | /// |
921 | | /// When using this routine to implement an iterator of overlapping |
922 | | /// matches, the `start` of the search should remain invariant throughout |
923 | | /// iteration. The `OverlappingState` given to the search will keep track |
924 | | /// of the current position of the search. (This is because multiple |
925 | | /// matches may be reported at the same position, so only the search |
926 | | /// implementation itself knows when to advance the position.) |
927 | | /// |
928 | | /// If for some reason you want the search to forget about its previous |
929 | | /// state and restart the search at a particular position, then setting the |
930 | | /// state to [`OverlappingState::start`] will accomplish that. |
931 | | /// |
932 | | /// # Errors |
933 | | /// |
934 | | /// This routine errors if the search could not complete. This can occur |
935 | | /// in a number of circumstances: |
936 | | /// |
937 | | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
938 | | /// For example, setting quit bytes or enabling heuristic support for |
939 | | /// Unicode word boundaries. The default configuration does not enable any |
940 | | /// option that could result in the lazy DFA quitting. |
941 | | /// * The configuration of the lazy DFA may also permit it to "give up" |
942 | | /// on a search if it makes ineffective use of its transition table |
943 | | /// cache. The default configuration does not enable this by default, |
944 | | /// although it is typically a good idea to. |
945 | | /// * When the provided `Input` configuration is not supported. For |
946 | | /// example, by providing an unsupported anchor mode. |
947 | | /// |
948 | | /// When a search returns an error, callers cannot know whether a match |
949 | | /// exists or not. |
950 | | /// |
951 | | /// # Example: UTF-8 mode |
952 | | /// |
953 | | /// This examples demonstrates that UTF-8 mode applies to reverse |
954 | | /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all |
955 | | /// matches reported must correspond to valid UTF-8 spans. This includes |
956 | | /// prohibiting zero-width matches that split a codepoint. |
957 | | /// |
958 | | /// UTF-8 mode is enabled by default. Notice below how the only zero-width |
959 | | /// matches reported are those at UTF-8 boundaries: |
960 | | /// |
961 | | /// ``` |
962 | | /// use regex_automata::{ |
963 | | /// hybrid::dfa::{DFA, OverlappingState}, |
964 | | /// nfa::thompson, |
965 | | /// HalfMatch, Input, MatchKind, |
966 | | /// }; |
967 | | /// |
968 | | /// let dfa = DFA::builder() |
969 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
970 | | /// .thompson(thompson::Config::new().reverse(true)) |
971 | | /// .build_many(&[r"", r"☃"])?; |
972 | | /// let mut cache = dfa.create_cache(); |
973 | | /// |
974 | | /// // Run the reverse DFA to collect all matches. |
975 | | /// let input = Input::new("☃"); |
976 | | /// let mut state = OverlappingState::start(); |
977 | | /// let mut matches = vec![]; |
978 | | /// loop { |
979 | | /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; |
980 | | /// match state.get_match() { |
981 | | /// None => break, |
982 | | /// Some(hm) => matches.push(hm), |
983 | | /// } |
984 | | /// } |
985 | | /// |
986 | | /// // No matches split a codepoint. |
987 | | /// let expected = vec![ |
988 | | /// HalfMatch::must(0, 3), |
989 | | /// HalfMatch::must(1, 0), |
990 | | /// HalfMatch::must(0, 0), |
991 | | /// ]; |
992 | | /// assert_eq!(expected, matches); |
993 | | /// |
994 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
995 | | /// ``` |
996 | | /// |
997 | | /// Now let's look at the same example, but with UTF-8 mode on the |
998 | | /// underlying NFA disabled: |
999 | | /// |
1000 | | /// ``` |
1001 | | /// use regex_automata::{ |
1002 | | /// hybrid::dfa::{DFA, OverlappingState}, |
1003 | | /// nfa::thompson, |
1004 | | /// HalfMatch, Input, MatchKind, |
1005 | | /// }; |
1006 | | /// |
1007 | | /// let dfa = DFA::builder() |
1008 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1009 | | /// .thompson(thompson::Config::new().reverse(true).utf8(false)) |
1010 | | /// .build_many(&[r"", r"☃"])?; |
1011 | | /// let mut cache = dfa.create_cache(); |
1012 | | /// |
1013 | | /// // Run the reverse DFA to collect all matches. |
1014 | | /// let input = Input::new("☃"); |
1015 | | /// let mut state = OverlappingState::start(); |
1016 | | /// let mut matches = vec![]; |
1017 | | /// loop { |
1018 | | /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; |
1019 | | /// match state.get_match() { |
1020 | | /// None => break, |
1021 | | /// Some(hm) => matches.push(hm), |
1022 | | /// } |
1023 | | /// } |
1024 | | /// |
1025 | | /// // Now *all* positions match, even within a codepoint, |
1026 | | /// // because we lifted the requirement that matches |
1027 | | /// // correspond to valid UTF-8 spans. |
1028 | | /// let expected = vec![ |
1029 | | /// HalfMatch::must(0, 3), |
1030 | | /// HalfMatch::must(0, 2), |
1031 | | /// HalfMatch::must(0, 1), |
1032 | | /// HalfMatch::must(1, 0), |
1033 | | /// HalfMatch::must(0, 0), |
1034 | | /// ]; |
1035 | | /// assert_eq!(expected, matches); |
1036 | | /// |
1037 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1038 | | /// ``` |
1039 | | #[inline] |
1040 | 0 | pub fn try_search_overlapping_rev( |
1041 | 0 | &self, |
1042 | 0 | cache: &mut Cache, |
1043 | 0 | input: &Input<'_>, |
1044 | 0 | state: &mut OverlappingState, |
1045 | 0 | ) -> Result<(), MatchError> { |
1046 | 0 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
1047 | 0 | search::find_overlapping_rev(self, cache, input, state)?; |
1048 | 0 | match state.get_match() { |
1049 | 0 | None => Ok(()), |
1050 | 0 | Some(_) if !utf8empty => Ok(()), |
1051 | 0 | Some(_) => skip_empty_utf8_splits_overlapping( |
1052 | 0 | input, |
1053 | 0 | state, |
1054 | 0 | |input, state| { |
1055 | 0 | search::find_overlapping_rev(self, cache, input, state) |
1056 | 0 | }, |
1057 | | ), |
1058 | | } |
1059 | 0 | } |
1060 | | |
1061 | | /// Writes the set of patterns that match anywhere in the given search |
1062 | | /// configuration to `patset`. If multiple patterns match at the same |
1063 | | /// position and the underlying DFA supports overlapping matches, then all |
1064 | | /// matching patterns are written to the given set. |
1065 | | /// |
1066 | | /// Unless all of the patterns in this DFA are anchored, then generally |
1067 | | /// speaking, this will visit every byte in the haystack. |
1068 | | /// |
1069 | | /// This search routine *does not* clear the pattern set. This gives some |
1070 | | /// flexibility to the caller (e.g., running multiple searches with the |
1071 | | /// same pattern set), but does make the API bug-prone if you're reusing |
1072 | | /// the same pattern set for multiple searches but intended them to be |
1073 | | /// independent. |
1074 | | /// |
1075 | | /// If a pattern ID matched but the given `PatternSet` does not have |
1076 | | /// sufficient capacity to store it, then it is not inserted and silently |
1077 | | /// dropped. |
1078 | | /// |
1079 | | /// # Errors |
1080 | | /// |
1081 | | /// This routine errors if the search could not complete. This can occur |
1082 | | /// in a number of circumstances: |
1083 | | /// |
1084 | | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
1085 | | /// For example, setting quit bytes or enabling heuristic support for |
1086 | | /// Unicode word boundaries. The default configuration does not enable any |
1087 | | /// option that could result in the lazy DFA quitting. |
1088 | | /// * The configuration of the lazy DFA may also permit it to "give up" |
1089 | | /// on a search if it makes ineffective use of its transition table |
1090 | | /// cache. The default configuration does not enable this by default, |
1091 | | /// although it is typically a good idea to. |
1092 | | /// * When the provided `Input` configuration is not supported. For |
1093 | | /// example, by providing an unsupported anchor mode. |
1094 | | /// |
1095 | | /// When a search returns an error, callers cannot know whether a match |
1096 | | /// exists or not. |
1097 | | /// |
1098 | | /// # Example |
1099 | | /// |
1100 | | /// This example shows how to find all matching patterns in a haystack, |
1101 | | /// even when some patterns match at the same position as other patterns. |
1102 | | /// |
1103 | | /// ``` |
1104 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1105 | | /// use regex_automata::{ |
1106 | | /// hybrid::dfa::DFA, |
1107 | | /// Input, MatchKind, PatternSet, |
1108 | | /// }; |
1109 | | /// |
1110 | | /// let patterns = &[ |
1111 | | /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", |
1112 | | /// ]; |
1113 | | /// let dfa = DFA::builder() |
1114 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1115 | | /// .build_many(patterns)?; |
1116 | | /// let mut cache = dfa.create_cache(); |
1117 | | /// |
1118 | | /// let input = Input::new("foobar"); |
1119 | | /// let mut patset = PatternSet::new(dfa.pattern_len()); |
1120 | | /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?; |
1121 | | /// let expected = vec![0, 2, 3, 4, 6]; |
1122 | | /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); |
1123 | | /// assert_eq!(expected, got); |
1124 | | /// |
1125 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1126 | | /// ``` |
1127 | | #[inline] |
1128 | 0 | pub fn try_which_overlapping_matches( |
1129 | 0 | &self, |
1130 | 0 | cache: &mut Cache, |
1131 | 0 | input: &Input<'_>, |
1132 | 0 | patset: &mut PatternSet, |
1133 | 0 | ) -> Result<(), MatchError> { |
1134 | 0 | let mut state = OverlappingState::start(); |
1135 | 0 | while let Some(m) = { |
1136 | 0 | self.try_search_overlapping_fwd(cache, input, &mut state)?; |
1137 | 0 | state.get_match() |
1138 | | } { |
1139 | 0 | let _ = patset.try_insert(m.pattern()); |
1140 | | // There's nothing left to find, so we can stop. Or the caller |
1141 | | // asked us to. |
1142 | 0 | if patset.is_full() || input.get_earliest() { |
1143 | 0 | break; |
1144 | 0 | } |
1145 | | } |
1146 | 0 | Ok(()) |
1147 | 0 | } |
1148 | | } |
1149 | | |
1150 | | impl DFA { |
1151 | | /// Transitions from the current state to the next state, given the next |
1152 | | /// byte of input. |
1153 | | /// |
1154 | | /// The given cache is used to either reuse pre-computed state |
1155 | | /// transitions, or to store this newly computed transition for future |
1156 | | /// reuse. Thus, this routine guarantees that it will never return a state |
1157 | | /// ID that has an "unknown" tag. |
1158 | | /// |
1159 | | /// # State identifier validity |
1160 | | /// |
1161 | | /// The only valid value for `current` is the lazy state ID returned |
1162 | | /// by the most recent call to `next_state`, `next_state_untagged`, |
1163 | | /// `next_state_untagged_unchecked`, `start_state_forward` or |
1164 | | /// `state_state_reverse` for the given `cache`. Any state ID returned from |
1165 | | /// prior calls to these routines (with the same `cache`) is considered |
1166 | | /// invalid (even if it gives an appearance of working). State IDs returned |
1167 | | /// from _any_ prior call for different `cache` values are also always |
1168 | | /// invalid. |
1169 | | /// |
1170 | | /// The returned ID is always a valid ID when `current` refers to a valid |
1171 | | /// ID. Moreover, this routine is defined for all possible values of |
1172 | | /// `input`. |
1173 | | /// |
1174 | | /// These validity rules are not checked, even in debug mode. Callers are |
1175 | | /// required to uphold these rules themselves. |
1176 | | /// |
1177 | | /// Violating these state ID validity rules will not sacrifice memory |
1178 | | /// safety, but _may_ produce an incorrect result or a panic. |
1179 | | /// |
1180 | | /// # Panics |
1181 | | /// |
1182 | | /// If the given ID does not refer to a valid state, then this routine |
1183 | | /// may panic but it also may not panic and instead return an invalid or |
1184 | | /// incorrect ID. |
1185 | | /// |
1186 | | /// # Example |
1187 | | /// |
1188 | | /// This shows a simplistic example for walking a lazy DFA for a given |
1189 | | /// haystack by using the `next_state` method. |
1190 | | /// |
1191 | | /// ``` |
1192 | | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
1193 | | /// |
1194 | | /// let dfa = DFA::new(r"[a-z]+r")?; |
1195 | | /// let mut cache = dfa.create_cache(); |
1196 | | /// let haystack = "bar".as_bytes(); |
1197 | | /// |
1198 | | /// // The start state is determined by inspecting the position and the |
1199 | | /// // initial bytes of the haystack. |
1200 | | /// let mut sid = dfa.start_state_forward( |
1201 | | /// &mut cache, &Input::new(haystack), |
1202 | | /// )?; |
1203 | | /// // Walk all the bytes in the haystack. |
1204 | | /// for &b in haystack { |
1205 | | /// sid = dfa.next_state(&mut cache, sid, b)?; |
1206 | | /// } |
1207 | | /// // Matches are always delayed by 1 byte, so we must explicitly walk the |
1208 | | /// // special "EOI" transition at the end of the search. |
1209 | | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
1210 | | /// assert!(sid.is_match()); |
1211 | | /// |
1212 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1213 | | /// ``` |
1214 | | #[inline] |
1215 | 0 | pub fn next_state( |
1216 | 0 | &self, |
1217 | 0 | cache: &mut Cache, |
1218 | 0 | current: LazyStateID, |
1219 | 0 | input: u8, |
1220 | 0 | ) -> Result<LazyStateID, CacheError> { |
1221 | 0 | let class = usize::from(self.classes.get(input)); |
1222 | 0 | let offset = current.as_usize_untagged() + class; |
1223 | 0 | let sid = cache.trans[offset]; |
1224 | 0 | if !sid.is_unknown() { |
1225 | 0 | return Ok(sid); |
1226 | 0 | } |
1227 | 0 | let unit = alphabet::Unit::u8(input); |
1228 | 0 | Lazy::new(self, cache).cache_next_state(current, unit) |
1229 | 0 | } |
1230 | | |
1231 | | /// Transitions from the current state to the next state, given the next |
1232 | | /// byte of input and a state ID that is not tagged. |
1233 | | /// |
1234 | | /// The only reason to use this routine is performance. In particular, the |
1235 | | /// `next_state` method needs to do some additional checks, among them is |
1236 | | /// to account for identifiers to states that are not yet computed. In |
1237 | | /// such a case, the transition is computed on the fly. However, if it is |
1238 | | /// known that the `current` state ID is untagged, then these checks can be |
1239 | | /// omitted. |
1240 | | /// |
1241 | | /// Since this routine does not compute states on the fly, it does not |
1242 | | /// modify the cache and thus cannot return an error. Consequently, `cache` |
1243 | | /// does not need to be mutable and it is possible for this routine to |
1244 | | /// return a state ID corresponding to the special "unknown" state. In |
1245 | | /// this case, it is the caller's responsibility to use the prior state |
1246 | | /// ID and `input` with `next_state` in order to force the computation of |
1247 | | /// the unknown transition. Otherwise, trying to use the "unknown" state |
1248 | | /// ID will just result in transitioning back to itself, and thus never |
1249 | | /// terminating. (This is technically a special exemption to the state ID |
1250 | | /// validity rules, but is permissible since this routine is guaranteed to |
1251 | | /// never mutate the given `cache`, and thus the identifier is guaranteed |
1252 | | /// to remain valid.) |
1253 | | /// |
1254 | | /// See [`LazyStateID`] for more details on what it means for a state ID |
1255 | | /// to be tagged. Also, see |
1256 | | /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked) |
1257 | | /// for this same idea, but with bounds checks forcefully elided. |
1258 | | /// |
1259 | | /// # State identifier validity |
1260 | | /// |
1261 | | /// The only valid value for `current` is an **untagged** lazy |
1262 | | /// state ID returned by the most recent call to `next_state`, |
1263 | | /// `next_state_untagged`, `next_state_untagged_unchecked`, |
1264 | | /// `start_state_forward` or `state_state_reverse` for the given `cache`. |
1265 | | /// Any state ID returned from prior calls to these routines (with the |
1266 | | /// same `cache`) is considered invalid (even if it gives an appearance |
1267 | | /// of working). State IDs returned from _any_ prior call for different |
1268 | | /// `cache` values are also always invalid. |
1269 | | /// |
1270 | | /// The returned ID is always a valid ID when `current` refers to a valid |
1271 | | /// ID, although it may be tagged. Moreover, this routine is defined for |
1272 | | /// all possible values of `input`. |
1273 | | /// |
1274 | | /// Not all validity rules are checked, even in debug mode. Callers are |
1275 | | /// required to uphold these rules themselves. |
1276 | | /// |
1277 | | /// Violating these state ID validity rules will not sacrifice memory |
1278 | | /// safety, but _may_ produce an incorrect result or a panic. |
1279 | | /// |
1280 | | /// # Panics |
1281 | | /// |
1282 | | /// If the given ID does not refer to a valid state, then this routine |
1283 | | /// may panic but it also may not panic and instead return an invalid or |
1284 | | /// incorrect ID. |
1285 | | /// |
1286 | | /// # Example |
1287 | | /// |
1288 | | /// This shows a simplistic example for walking a lazy DFA for a given |
1289 | | /// haystack by using the `next_state_untagged` method where possible. |
1290 | | /// |
1291 | | /// ``` |
1292 | | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
1293 | | /// |
1294 | | /// let dfa = DFA::new(r"[a-z]+r")?; |
1295 | | /// let mut cache = dfa.create_cache(); |
1296 | | /// let haystack = "bar".as_bytes(); |
1297 | | /// |
1298 | | /// // The start state is determined by inspecting the position and the |
1299 | | /// // initial bytes of the haystack. |
1300 | | /// let mut sid = dfa.start_state_forward( |
1301 | | /// &mut cache, &Input::new(haystack), |
1302 | | /// )?; |
1303 | | /// // Walk all the bytes in the haystack. |
1304 | | /// let mut at = 0; |
1305 | | /// while at < haystack.len() { |
1306 | | /// if sid.is_tagged() { |
1307 | | /// sid = dfa.next_state(&mut cache, sid, haystack[at])?; |
1308 | | /// } else { |
1309 | | /// let mut prev_sid = sid; |
1310 | | /// // We attempt to chew through as much as we can while moving |
1311 | | /// // through untagged state IDs. Thus, the transition function |
1312 | | /// // does less work on average per byte. (Unrolling this loop |
1313 | | /// // may help even more.) |
1314 | | /// while at < haystack.len() { |
1315 | | /// prev_sid = sid; |
1316 | | /// sid = dfa.next_state_untagged( |
1317 | | /// &mut cache, sid, haystack[at], |
1318 | | /// ); |
1319 | | /// at += 1; |
1320 | | /// if sid.is_tagged() { |
1321 | | /// break; |
1322 | | /// } |
1323 | | /// } |
1324 | | /// // We must ensure that we never proceed to the next iteration |
1325 | | /// // with an unknown state ID. If we don't account for this |
1326 | | /// // case, then search isn't guaranteed to terminate since all |
1327 | | /// // transitions on unknown states loop back to itself. |
1328 | | /// if sid.is_unknown() { |
1329 | | /// sid = dfa.next_state( |
1330 | | /// &mut cache, prev_sid, haystack[at - 1], |
1331 | | /// )?; |
1332 | | /// } |
1333 | | /// } |
1334 | | /// } |
1335 | | /// // Matches are always delayed by 1 byte, so we must explicitly walk the |
1336 | | /// // special "EOI" transition at the end of the search. |
1337 | | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
1338 | | /// assert!(sid.is_match()); |
1339 | | /// |
1340 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1341 | | /// ``` |
1342 | | #[inline] |
1343 | 0 | pub fn next_state_untagged( |
1344 | 0 | &self, |
1345 | 0 | cache: &Cache, |
1346 | 0 | current: LazyStateID, |
1347 | 0 | input: u8, |
1348 | 0 | ) -> LazyStateID { |
1349 | 0 | debug_assert!(!current.is_tagged()); |
1350 | 0 | let class = usize::from(self.classes.get(input)); |
1351 | 0 | let offset = current.as_usize_unchecked() + class; |
1352 | 0 | cache.trans[offset] |
1353 | 0 | } |
1354 | | |
1355 | | /// Transitions from the current state to the next state, eliding bounds |
1356 | | /// checks, given the next byte of input and a state ID that is not tagged. |
1357 | | /// |
1358 | | /// The only reason to use this routine is performance. In particular, the |
1359 | | /// `next_state` method needs to do some additional checks, among them is |
1360 | | /// to account for identifiers to states that are not yet computed. In |
1361 | | /// such a case, the transition is computed on the fly. However, if it is |
1362 | | /// known that the `current` state ID is untagged, then these checks can be |
1363 | | /// omitted. |
1364 | | /// |
1365 | | /// Since this routine does not compute states on the fly, it does not |
1366 | | /// modify the cache and thus cannot return an error. Consequently, `cache` |
1367 | | /// does not need to be mutable and it is possible for this routine to |
1368 | | /// return a state ID corresponding to the special "unknown" state. In |
1369 | | /// this case, it is the caller's responsibility to use the prior state |
1370 | | /// ID and `input` with `next_state` in order to force the computation of |
1371 | | /// the unknown transition. Otherwise, trying to use the "unknown" state |
1372 | | /// ID will just result in transitioning back to itself, and thus never |
1373 | | /// terminating. (This is technically a special exemption to the state ID |
1374 | | /// validity rules, but is permissible since this routine is guaranteed to |
1375 | | /// never mutate the given `cache`, and thus the identifier is guaranteed |
1376 | | /// to remain valid.) |
1377 | | /// |
1378 | | /// See [`LazyStateID`] for more details on what it means for a state ID |
1379 | | /// to be tagged. Also, see |
1380 | | /// [`next_state_untagged`](DFA::next_state_untagged) |
1381 | | /// for this same idea, but with memory safety guaranteed by retaining |
1382 | | /// bounds checks. |
1383 | | /// |
1384 | | /// # State identifier validity |
1385 | | /// |
1386 | | /// The only valid value for `current` is an **untagged** lazy |
1387 | | /// state ID returned by the most recent call to `next_state`, |
1388 | | /// `next_state_untagged`, `next_state_untagged_unchecked`, |
1389 | | /// `start_state_forward` or `state_state_reverse` for the given `cache`. |
1390 | | /// Any state ID returned from prior calls to these routines (with the |
1391 | | /// same `cache`) is considered invalid (even if it gives an appearance |
1392 | | /// of working). State IDs returned from _any_ prior call for different |
1393 | | /// `cache` values are also always invalid. |
1394 | | /// |
1395 | | /// The returned ID is always a valid ID when `current` refers to a valid |
1396 | | /// ID, although it may be tagged. Moreover, this routine is defined for |
1397 | | /// all possible values of `input`. |
1398 | | /// |
1399 | | /// Not all validity rules are checked, even in debug mode. Callers are |
1400 | | /// required to uphold these rules themselves. |
1401 | | /// |
1402 | | /// Violating these state ID validity rules will not sacrifice memory |
1403 | | /// safety, but _may_ produce an incorrect result or a panic. |
1404 | | /// |
1405 | | /// # Safety |
1406 | | /// |
1407 | | /// Callers of this method must guarantee that `current` refers to a valid |
1408 | | /// state ID according to the rules described above. If `current` is not a |
1409 | | /// valid state ID for this automaton, then calling this routine may result |
1410 | | /// in undefined behavior. |
1411 | | /// |
1412 | | /// If `current` is valid, then the ID returned is valid for all possible |
1413 | | /// values of `input`. |
    #[inline]
    pub unsafe fn next_state_untagged_unchecked(
        &self,
        cache: &Cache,
        current: LazyStateID,
        input: u8,
    ) -> LazyStateID {
        // Not a guarantee: callers must uphold untagged-ness themselves (see
        // the "Safety" section in the doc comment). This only catches misuse
        // in debug builds.
        debug_assert!(!current.is_tagged());
        // Map the raw input byte to its equivalence class, which is the
        // column offset into this state's transition row.
        let class = usize::from(self.classes.get(input));
        let offset = current.as_usize_unchecked() + class;
        // SAFETY: the caller guarantees `current` is a valid *untagged* state
        // ID for this DFA and this `cache`, which implies the premultiplied
        // `current` plus any class index is in bounds for `cache.trans`.
        *cache.trans.get_unchecked(offset)
    }
1426 | | |
1427 | | /// Transitions from the current state to the next state for the special |
1428 | | /// EOI symbol. |
1429 | | /// |
1430 | | /// The given cache is used to either reuse pre-computed state |
1431 | | /// transitions, or to store this newly computed transition for future |
1432 | | /// reuse. Thus, this routine guarantees that it will never return a state |
1433 | | /// ID that has an "unknown" tag. |
1434 | | /// |
1435 | | /// This routine must be called at the end of every search in a correct |
1436 | | /// implementation of search. Namely, lazy DFAs in this crate delay matches |
1437 | | /// by one byte in order to support look-around operators. Thus, after |
1438 | | /// reaching the end of a haystack, a search implementation must follow one |
1439 | | /// last EOI transition. |
1440 | | /// |
1441 | | /// It is best to think of EOI as an additional symbol in the alphabet of a |
1442 | | /// DFA that is distinct from every other symbol. That is, the alphabet of |
1443 | | /// lazy DFAs in this crate has a logical size of 257 instead of 256, where |
1444 | | /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the |
1445 | | /// physical alphabet size may be smaller because of alphabet compression |
1446 | | /// via equivalence classes, but EOI is always represented somehow in the |
1447 | | /// alphabet.) |
1448 | | /// |
1449 | | /// # State identifier validity |
1450 | | /// |
1451 | | /// The only valid value for `current` is the lazy state ID returned |
1452 | | /// by the most recent call to `next_state`, `next_state_untagged`, |
1453 | | /// `next_state_untagged_unchecked`, `start_state_forward` or |
    /// `start_state_reverse` for the given `cache`. Any state ID returned from
1455 | | /// prior calls to these routines (with the same `cache`) is considered |
1456 | | /// invalid (even if it gives an appearance of working). State IDs returned |
1457 | | /// from _any_ prior call for different `cache` values are also always |
1458 | | /// invalid. |
1459 | | /// |
1460 | | /// The returned ID is always a valid ID when `current` refers to a valid |
1461 | | /// ID. |
1462 | | /// |
1463 | | /// These validity rules are not checked, even in debug mode. Callers are |
1464 | | /// required to uphold these rules themselves. |
1465 | | /// |
1466 | | /// Violating these state ID validity rules will not sacrifice memory |
1467 | | /// safety, but _may_ produce an incorrect result or a panic. |
1468 | | /// |
1469 | | /// # Panics |
1470 | | /// |
1471 | | /// If the given ID does not refer to a valid state, then this routine |
1472 | | /// may panic but it also may not panic and instead return an invalid or |
1473 | | /// incorrect ID. |
1474 | | /// |
1475 | | /// # Example |
1476 | | /// |
1477 | | /// This shows a simplistic example for walking a DFA for a given haystack, |
1478 | | /// and then finishing the search with the final EOI transition. |
1479 | | /// |
1480 | | /// ``` |
1481 | | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
1482 | | /// |
1483 | | /// let dfa = DFA::new(r"[a-z]+r")?; |
1484 | | /// let mut cache = dfa.create_cache(); |
1485 | | /// let haystack = "bar".as_bytes(); |
1486 | | /// |
1487 | | /// // The start state is determined by inspecting the position and the |
1488 | | /// // initial bytes of the haystack. |
1489 | | /// let mut sid = dfa.start_state_forward( |
1490 | | /// &mut cache, &Input::new(haystack), |
1491 | | /// )?; |
1492 | | /// // Walk all the bytes in the haystack. |
1493 | | /// for &b in haystack { |
1494 | | /// sid = dfa.next_state(&mut cache, sid, b)?; |
1495 | | /// } |
1496 | | /// // Matches are always delayed by 1 byte, so we must explicitly walk |
1497 | | /// // the special "EOI" transition at the end of the search. Without this |
1498 | | /// // final transition, the assert below will fail since the DFA will not |
1499 | | /// // have entered a match state yet! |
1500 | | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
1501 | | /// assert!(sid.is_match()); |
1502 | | /// |
1503 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1504 | | /// ``` |
1505 | | #[inline] |
1506 | 0 | pub fn next_eoi_state( |
1507 | 0 | &self, |
1508 | 0 | cache: &mut Cache, |
1509 | 0 | current: LazyStateID, |
1510 | 0 | ) -> Result<LazyStateID, CacheError> { |
1511 | 0 | let eoi = self.classes.eoi().as_usize(); |
1512 | 0 | let offset = current.as_usize_untagged() + eoi; |
1513 | 0 | let sid = cache.trans[offset]; |
1514 | 0 | if !sid.is_unknown() { |
1515 | 0 | return Ok(sid); |
1516 | 0 | } |
1517 | 0 | let unit = self.classes.eoi(); |
1518 | 0 | Lazy::new(self, cache).cache_next_state(current, unit) |
1519 | 0 | } |
1520 | | |
1521 | | /// Return the ID of the start state for this lazy DFA for the given |
1522 | | /// starting configuration. |
1523 | | /// |
1524 | | /// Unlike typical DFA implementations, the start state for DFAs in this |
1525 | | /// crate is dependent on a few different factors: |
1526 | | /// |
1527 | | /// * The [`Anchored`] mode of the search. Unanchored, anchored and |
1528 | | /// anchored searches for a specific [`PatternID`] all use different start |
1529 | | /// states. |
1530 | | /// * Whether a "look-behind" byte exists. For example, the `^` anchor |
1531 | | /// matches if and only if there is no look-behind byte. |
1532 | | /// * The specific value of that look-behind byte. For example, a `(?m:^)` |
1533 | | /// assertion only matches when there is either no look-behind byte, or |
1534 | | /// when the look-behind byte is a line terminator. |
1535 | | /// |
1536 | | /// The [starting configuration](start::Config) provides the above |
1537 | | /// information. |
1538 | | /// |
1539 | | /// This routine can be used for either forward or reverse searches. |
1540 | | /// Although, as a convenience, if you have an [`Input`], then it |
1541 | | /// may be more succinct to use [`DFA::start_state_forward`] or |
1542 | | /// [`DFA::start_state_reverse`]. Note, for example, that the convenience |
1543 | | /// routines return a [`MatchError`] on failure where as this routine |
1544 | | /// returns a [`StartError`]. |
1545 | | /// |
1546 | | /// # Errors |
1547 | | /// |
1548 | | /// This may return a [`StartError`] if the search needs to give up when |
1549 | | /// determining the start state (for example, if it sees a "quit" byte |
1550 | | /// or if the cache has become inefficient). This can also return an |
1551 | | /// error if the given configuration contains an unsupported [`Anchored`] |
1552 | | /// configuration. |
1553 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1554 | 0 | pub fn start_state( |
1555 | 0 | &self, |
1556 | 0 | cache: &mut Cache, |
1557 | 0 | config: &start::Config, |
1558 | 0 | ) -> Result<LazyStateID, StartError> { |
1559 | 0 | let lazy = LazyRef::new(self, cache); |
1560 | 0 | let anchored = config.get_anchored(); |
1561 | 0 | let start = match config.get_look_behind() { |
1562 | 0 | None => Start::Text, |
1563 | 0 | Some(byte) => { |
1564 | 0 | if !self.quitset.is_empty() && self.quitset.contains(byte) { |
1565 | 0 | return Err(StartError::quit(byte)); |
1566 | 0 | } |
1567 | 0 | self.start_map.get(byte) |
1568 | | } |
1569 | | }; |
1570 | 0 | let start_id = lazy.get_cached_start_id(anchored, start)?; |
1571 | 0 | if !start_id.is_unknown() { |
1572 | 0 | return Ok(start_id); |
1573 | 0 | } |
1574 | 0 | Lazy::new(self, cache).cache_start_group(anchored, start) |
1575 | 0 | } |
1576 | | |
1577 | | /// Return the ID of the start state for this lazy DFA when executing a |
1578 | | /// forward search. |
1579 | | /// |
1580 | | /// This is a convenience routine for calling [`DFA::start_state`] that |
1581 | | /// converts the given [`Input`] to a [start configuration](start::Config). |
1582 | | /// Additionally, if an error occurs, it is converted from a [`StartError`] |
1583 | | /// to a [`MatchError`] using the offset information in the given |
1584 | | /// [`Input`]. |
1585 | | /// |
1586 | | /// # Errors |
1587 | | /// |
1588 | | /// This may return a [`MatchError`] if the search needs to give up when |
1589 | | /// determining the start state (for example, if it sees a "quit" byte or |
1590 | | /// if the cache has become inefficient). This can also return an error if |
1591 | | /// the given `Input` contains an unsupported [`Anchored`] configuration. |
1592 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1593 | 0 | pub fn start_state_forward( |
1594 | 0 | &self, |
1595 | 0 | cache: &mut Cache, |
1596 | 0 | input: &Input<'_>, |
1597 | 0 | ) -> Result<LazyStateID, MatchError> { |
1598 | 0 | let config = start::Config::from_input_forward(input); |
1599 | 0 | self.start_state(cache, &config).map_err(|err| match err { |
1600 | 0 | StartError::Cache { .. } => MatchError::gave_up(input.start()), |
1601 | 0 | StartError::Quit { byte } => { |
1602 | 0 | let offset = input |
1603 | 0 | .start() |
1604 | 0 | .checked_sub(1) |
1605 | 0 | .expect("no quit in start without look-behind"); |
1606 | 0 | MatchError::quit(byte, offset) |
1607 | | } |
1608 | 0 | StartError::UnsupportedAnchored { mode } => { |
1609 | 0 | MatchError::unsupported_anchored(mode) |
1610 | | } |
1611 | 0 | }) |
1612 | 0 | } |
1613 | | |
1614 | | /// Return the ID of the start state for this lazy DFA when executing a |
1615 | | /// reverse search. |
1616 | | /// |
1617 | | /// This is a convenience routine for calling [`DFA::start_state`] that |
1618 | | /// converts the given [`Input`] to a [start configuration](start::Config). |
1619 | | /// Additionally, if an error occurs, it is converted from a [`StartError`] |
1620 | | /// to a [`MatchError`] using the offset information in the given |
1621 | | /// [`Input`]. |
1622 | | /// |
1623 | | /// # Errors |
1624 | | /// |
1625 | | /// This may return a [`MatchError`] if the search needs to give up when |
1626 | | /// determining the start state (for example, if it sees a "quit" byte or |
1627 | | /// if the cache has become inefficient). This can also return an error if |
1628 | | /// the given `Input` contains an unsupported [`Anchored`] configuration. |
1629 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
1630 | 0 | pub fn start_state_reverse( |
1631 | 0 | &self, |
1632 | 0 | cache: &mut Cache, |
1633 | 0 | input: &Input<'_>, |
1634 | 0 | ) -> Result<LazyStateID, MatchError> { |
1635 | 0 | let config = start::Config::from_input_reverse(input); |
1636 | 0 | self.start_state(cache, &config).map_err(|err| match err { |
1637 | 0 | StartError::Cache { .. } => MatchError::gave_up(input.end()), |
1638 | 0 | StartError::Quit { byte } => { |
1639 | 0 | let offset = input.end(); |
1640 | 0 | MatchError::quit(byte, offset) |
1641 | | } |
1642 | 0 | StartError::UnsupportedAnchored { mode } => { |
1643 | 0 | MatchError::unsupported_anchored(mode) |
1644 | | } |
1645 | 0 | }) |
1646 | 0 | } |
1647 | | |
1648 | | /// Returns the total number of patterns that match in this state. |
1649 | | /// |
1650 | | /// If the lazy DFA was compiled with one pattern, then this must |
1651 | | /// necessarily always return `1` for all match states. |
1652 | | /// |
1653 | | /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with |
1654 | | /// indices up to (but not including) the length returned by this routine |
1655 | | /// without panicking. |
1656 | | /// |
1657 | | /// # Panics |
1658 | | /// |
1659 | | /// If the given state is not a match state, then this may either panic |
1660 | | /// or return an incorrect result. |
1661 | | /// |
1662 | | /// # Example |
1663 | | /// |
1664 | | /// This example shows a simple instance of implementing overlapping |
1665 | | /// matches. In particular, it shows not only how to determine how many |
1666 | | /// patterns have matched in a particular state, but also how to access |
1667 | | /// which specific patterns have matched. |
1668 | | /// |
1669 | | /// Notice that we must use [`MatchKind::All`] when building the DFA. If we |
1670 | | /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be |
1671 | | /// constructed in a way that supports overlapping matches. (It would only |
1672 | | /// report a single pattern that matches at any particular point in time.) |
1673 | | /// |
1674 | | /// Another thing to take note of is the patterns used and the order in |
1675 | | /// which the pattern IDs are reported. In the example below, pattern `3` |
1676 | | /// is yielded first. Why? Because it corresponds to the match that |
1677 | | /// appears first. Namely, the `@` symbol is part of `\S+` but not part |
1678 | | /// of any of the other patterns. Since the `\S+` pattern has a match that |
1679 | | /// starts to the left of any other pattern, its ID is returned before any |
1680 | | /// other. |
1681 | | /// |
1682 | | /// ``` |
1683 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1684 | | /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind}; |
1685 | | /// |
1686 | | /// let dfa = DFA::builder() |
1687 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1688 | | /// .build_many(&[ |
1689 | | /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+", |
1690 | | /// ])?; |
1691 | | /// let mut cache = dfa.create_cache(); |
1692 | | /// let haystack = "@bar".as_bytes(); |
1693 | | /// |
1694 | | /// // The start state is determined by inspecting the position and the |
1695 | | /// // initial bytes of the haystack. |
1696 | | /// let mut sid = dfa.start_state_forward( |
1697 | | /// &mut cache, &Input::new(haystack), |
1698 | | /// )?; |
1699 | | /// // Walk all the bytes in the haystack. |
1700 | | /// for &b in haystack { |
1701 | | /// sid = dfa.next_state(&mut cache, sid, b)?; |
1702 | | /// } |
1703 | | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
1704 | | /// |
1705 | | /// assert!(sid.is_match()); |
1706 | | /// assert_eq!(dfa.match_len(&mut cache, sid), 3); |
1707 | | /// // The following calls are guaranteed to not panic since `match_len` |
1708 | | /// // returned `3` above. |
1709 | | /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3); |
1710 | | /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0); |
1711 | | /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1); |
1712 | | /// |
1713 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1714 | | /// ``` |
1715 | | #[inline] |
1716 | 0 | pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize { |
1717 | 0 | assert!(id.is_match()); |
1718 | 0 | LazyRef::new(self, cache).get_cached_state(id).match_len() |
1719 | 0 | } |
1720 | | |
1721 | | /// Returns the pattern ID corresponding to the given match index in the |
1722 | | /// given state. |
1723 | | /// |
1724 | | /// See [`DFA::match_len`] for an example of how to use this method |
1725 | | /// correctly. Note that if you know your lazy DFA is configured with a |
1726 | | /// single pattern, then this routine is never necessary since it will |
1727 | | /// always return a pattern ID of `0` for an index of `0` when `id` |
1728 | | /// corresponds to a match state. |
1729 | | /// |
1730 | | /// Typically, this routine is used when implementing an overlapping |
1731 | | /// search, as the example for `DFA::match_len` does. |
1732 | | /// |
1733 | | /// # Panics |
1734 | | /// |
1735 | | /// If the state ID is not a match state or if the match index is out |
1736 | | /// of bounds for the given state, then this routine may either panic |
1737 | | /// or produce an incorrect result. If the state ID is correct and the |
1738 | | /// match index is correct, then this routine always produces a valid |
1739 | | /// `PatternID`. |
1740 | | #[inline] |
1741 | 0 | pub fn match_pattern( |
1742 | 0 | &self, |
1743 | 0 | cache: &Cache, |
1744 | 0 | id: LazyStateID, |
1745 | 0 | match_index: usize, |
1746 | 0 | ) -> PatternID { |
1747 | | // This is an optimization for the very common case of a DFA with a |
1748 | | // single pattern. This conditional avoids a somewhat more costly path |
1749 | | // that finds the pattern ID from the corresponding `State`, which |
1750 | | // requires a bit of slicing/pointer-chasing. This optimization tends |
1751 | | // to only matter when matches are frequent. |
1752 | 0 | if self.pattern_len() == 1 { |
1753 | 0 | return PatternID::ZERO; |
1754 | 0 | } |
1755 | 0 | LazyRef::new(self, cache) |
1756 | 0 | .get_cached_state(id) |
1757 | 0 | .match_pattern(match_index) |
1758 | 0 | } |
1759 | | } |
1760 | | |
1761 | | /// A cache represents a partially computed DFA. |
1762 | | /// |
1763 | | /// A cache is the key component that differentiates a classical DFA and a |
1764 | | /// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a |
1765 | | /// complete transition table that can handle all possible inputs, a hybrid |
1766 | | /// NFA/DFA starts with an empty transition table and builds only the parts |
1767 | | /// required during search. The parts that are built are stored in a cache. For |
1768 | | /// this reason, a cache is a required parameter for nearly every operation on |
1769 | | /// a [`DFA`]. |
1770 | | /// |
1771 | | /// Caches can be created from their corresponding DFA via |
1772 | | /// [`DFA::create_cache`]. A cache can only be used with either the DFA that |
1773 | | /// created it, or the DFA that was most recently used to reset it with |
1774 | | /// [`Cache::reset`]. Using a cache with any other DFA may result in panics |
1775 | | /// or incorrect results. |
#[derive(Clone, Debug)]
pub struct Cache {
    // N.B. If you're looking to understand how determinization works, it
    // is probably simpler to first grok src/dfa/determinize.rs, since that
    // doesn't have the "laziness" component.
    /// The transition table.
    ///
    /// Given a `current` LazyStateID and an `input` byte, the next state can
    /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
    /// that no multiplication is used. That's because state identifiers are
    /// "premultiplied."
    ///
    /// Note that the next state may be the "unknown" state. In this case, the
    /// next state is not known and determinization for `current` on `input`
    /// must be performed.
    trans: Vec<LazyStateID>,
    /// The starting states for this DFA.
    ///
    /// These are computed lazily. Initially, these are all set to "unknown"
    /// lazy state IDs.
    ///
    /// When 'starts_for_each_pattern' is disabled (the default), then the size
    /// of this is constrained to the possible starting configurations based
    /// on the search parameters. (At time of writing, that's 4.) However,
    /// when starting states for each pattern is enabled, then there are N
    /// additional groups of starting states, where each group reflects the
    /// different possible configurations and N is the number of patterns.
    starts: Vec<LazyStateID>,
    /// A sequence of NFA/DFA powerset states that have been computed for this
    /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
    /// tagged LazyStateID can be used to index this sequence by converting it
    /// to its untagged form.)
    states: Vec<State>,
    /// A map from states to their corresponding IDs. This map may be accessed
    /// via the raw byte representation of a state, which means that a `State`
    /// does not need to be allocated to determine whether it already exists
    /// in this map. Indeed, the existence of such a state is what determines
    /// whether we allocate a new `State` or not.
    ///
    /// The higher level idea here is that we do just enough determinization
    /// for a state to check whether we've already computed it. If we have,
    /// then we can save a little (albeit not much) work. The real savings is
    /// in memory usage. If we never checked for trivially duplicate states,
    /// then our memory usage would explode to unreasonable levels.
    states_to_id: StateMap,
    /// Sparse sets used to track which NFA states have been visited during
    /// various traversals.
    sparses: SparseSets,
    /// Scratch space for traversing the NFA graph. (We use space on the heap
    /// instead of the call stack.)
    stack: Vec<NFAStateID>,
    /// Scratch space for building a NFA/DFA powerset state. This is used to
    /// help amortize allocation since not every powerset state generated is
    /// added to the cache. In particular, if it already exists in the cache,
    /// then there is no need to allocate a new `State` for it.
    scratch_state_builder: StateBuilderEmpty,
    /// A simple abstraction for handling the saving of at most a single state
    /// across a cache clearing. This is required for correctness. Namely, if
    /// adding a new state after clearing the cache fails, then the caller
    /// must retain the ability to continue using the state ID given. The
    /// state corresponding to the state ID is what we preserve across cache
    /// clearings.
    state_saver: StateSaver,
    /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
    /// track this as new states are added since states use a variable amount
    /// of heap. Tracking this as we add states makes it possible to compute
    /// the total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// The number of times the cache has been cleared. When a minimum cache
    /// clear count is set, then the cache will return an error instead of
    /// clearing the cache if the count has been exceeded.
    clear_count: usize,
    /// The total number of bytes searched since the last time this cache was
    /// cleared, not including the current search.
    ///
    /// This can be added to the length of the current search to get the true
    /// total number of bytes searched.
    ///
    /// This is generally only non-zero when the
    /// `Cache::search_{start,update,finish}` APIs are used to track search
    /// progress.
    bytes_searched: usize,
    /// The progress of the current search.
    ///
    /// This is only non-`None` when callers utilize the `Cache::search_start`,
    /// `Cache::search_update` and `Cache::search_finish` APIs.
    ///
    /// The purpose of recording search progress is to be able to make a
    /// determination about the efficiency of the cache. Namely, by keeping
    /// track of the number of bytes searched, the lazy DFA can use it in
    /// conjunction with the `Config::minimum_bytes_per_state` knob to help
    /// determine whether it should give up searching (see
    /// `Cache::search_total_len`).
    progress: Option<SearchProgress>,
}
1868 | | |
1869 | | impl Cache { |
1870 | | /// Create a new cache for the given lazy DFA. |
1871 | | /// |
1872 | | /// The cache returned should only be used for searches for the given DFA. |
1873 | | /// If you want to reuse the cache for another DFA, then you must call |
1874 | | /// [`Cache::reset`] with that DFA. |
1875 | 0 | pub fn new(dfa: &DFA) -> Cache { |
1876 | 0 | let mut cache = Cache { |
1877 | 0 | trans: alloc::vec![], |
1878 | 0 | starts: alloc::vec![], |
1879 | 0 | states: alloc::vec![], |
1880 | 0 | states_to_id: StateMap::new(), |
1881 | 0 | sparses: SparseSets::new(dfa.get_nfa().states().len()), |
1882 | 0 | stack: alloc::vec![], |
1883 | 0 | scratch_state_builder: StateBuilderEmpty::new(), |
1884 | 0 | state_saver: StateSaver::none(), |
1885 | 0 | memory_usage_state: 0, |
1886 | 0 | clear_count: 0, |
1887 | 0 | bytes_searched: 0, |
1888 | 0 | progress: None, |
1889 | 0 | }; |
1890 | | debug!("pre-init lazy DFA cache size: {}", cache.memory_usage()); |
1891 | 0 | Lazy { dfa, cache: &mut cache }.init_cache(); |
1892 | | debug!("post-init lazy DFA cache size: {}", cache.memory_usage()); |
1893 | 0 | cache |
1894 | 0 | } |
1895 | | |
1896 | | /// Reset this cache such that it can be used for searching with the given |
1897 | | /// lazy DFA (and only that DFA). |
1898 | | /// |
1899 | | /// A cache reset permits reusing memory already allocated in this cache |
1900 | | /// with a different lazy DFA. |
1901 | | /// |
1902 | | /// Resetting a cache sets its "clear count" to 0. This is relevant if the |
1903 | | /// lazy DFA has been configured to "give up" after it has cleared the |
1904 | | /// cache a certain number of times. |
1905 | | /// |
1906 | | /// Any lazy state ID generated by the cache prior to resetting it is |
1907 | | /// invalid after the reset. |
1908 | | /// |
1909 | | /// # Example |
1910 | | /// |
1911 | | /// This shows how to re-purpose a cache for use with a different DFA. |
1912 | | /// |
1913 | | /// ``` |
1914 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1915 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
1916 | | /// |
1917 | | /// let dfa1 = DFA::new(r"\w")?; |
1918 | | /// let dfa2 = DFA::new(r"\W")?; |
1919 | | /// |
1920 | | /// let mut cache = dfa1.create_cache(); |
1921 | | /// assert_eq!( |
1922 | | /// Some(HalfMatch::must(0, 2)), |
1923 | | /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?, |
1924 | | /// ); |
1925 | | /// |
1926 | | /// // Using 'cache' with dfa2 is not allowed. It may result in panics or |
1927 | | /// // incorrect results. In order to re-purpose the cache, we must reset |
1928 | | /// // it with the DFA we'd like to use it with. |
1929 | | /// // |
1930 | | /// // Similarly, after this reset, using the cache with 'dfa1' is also not |
1931 | | /// // allowed. |
1932 | | /// cache.reset(&dfa2); |
1933 | | /// assert_eq!( |
1934 | | /// Some(HalfMatch::must(0, 3)), |
1935 | | /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?, |
1936 | | /// ); |
1937 | | /// |
1938 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1939 | | /// ``` |
1940 | 0 | pub fn reset(&mut self, dfa: &DFA) { |
1941 | 0 | Lazy::new(dfa, self).reset_cache() |
1942 | 0 | } |
1943 | | |
1944 | | /// Initializes a new search starting at the given position. |
1945 | | /// |
1946 | | /// If a previous search was unfinished, then it is finished automatically |
1947 | | /// and a new search is begun. |
1948 | | /// |
1949 | | /// Note that keeping track of search progress is _not necessary_ |
1950 | | /// for correct implementations of search using a lazy DFA. Keeping |
1951 | | /// track of search progress is only necessary if you want the |
1952 | | /// [`Config::minimum_bytes_per_state`] configuration knob to work. |
1953 | | #[inline] |
1954 | 0 | pub fn search_start(&mut self, at: usize) { |
1955 | | // If a previous search wasn't marked as finished, then finish it |
1956 | | // now automatically. |
1957 | 0 | if let Some(p) = self.progress.take() { |
1958 | 0 | self.bytes_searched += p.len(); |
1959 | 0 | } |
1960 | 0 | self.progress = Some(SearchProgress { start: at, at }); |
1961 | 0 | } |
1962 | | |
1963 | | /// Updates the current search to indicate that it has search to the |
1964 | | /// current position. |
1965 | | /// |
1966 | | /// No special care needs to be taken for reverse searches. Namely, the |
1967 | | /// position given may be _less than_ the starting position of the search. |
1968 | | /// |
1969 | | /// # Panics |
1970 | | /// |
1971 | | /// This panics if no search has been started by [`Cache::search_start`]. |
1972 | | #[inline] |
1973 | 0 | pub fn search_update(&mut self, at: usize) { |
1974 | 0 | let p = |
1975 | 0 | self.progress.as_mut().expect("no in-progress search to update"); |
1976 | 0 | p.at = at; |
1977 | 0 | } |
1978 | | |
1979 | | /// Indicates that a search has finished at the given position. |
1980 | | /// |
1981 | | /// # Panics |
1982 | | /// |
1983 | | /// This panics if no search has been started by [`Cache::search_start`]. |
1984 | | #[inline] |
1985 | 0 | pub fn search_finish(&mut self, at: usize) { |
1986 | 0 | let mut p = |
1987 | 0 | self.progress.take().expect("no in-progress search to finish"); |
1988 | 0 | p.at = at; |
1989 | 0 | self.bytes_searched += p.len(); |
1990 | 0 | } |
1991 | | |
1992 | | /// Returns the total number of bytes that have been searched since this |
1993 | | /// cache was last cleared. |
1994 | | /// |
1995 | | /// This is useful for determining the efficiency of the cache. For |
1996 | | /// example, the lazy DFA uses this value in conjunction with the |
1997 | | /// [`Config::minimum_bytes_per_state`] knob to help determine whether it |
1998 | | /// should quit searching. |
1999 | | /// |
2000 | | /// This always returns `0` if search progress isn't being tracked. Note |
2001 | | /// that the lazy DFA search routines in this crate always track search |
2002 | | /// progress. |
2003 | 0 | pub fn search_total_len(&self) -> usize { |
2004 | 0 | self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len()) |
2005 | 0 | } |
2006 | | |
2007 | | /// Returns the total number of times this cache has been cleared since it |
2008 | | /// was either created or last reset. |
2009 | | /// |
2010 | | /// This is useful for informational purposes or if you want to change |
2011 | | /// search strategies based on the number of times the cache has been |
2012 | | /// cleared. |
2013 | 0 | pub fn clear_count(&self) -> usize { |
2014 | 0 | self.clear_count |
2015 | 0 | } |
2016 | | |
2017 | | /// Returns the heap memory usage, in bytes, of this cache. |
2018 | | /// |
2019 | | /// This does **not** include the stack size used up by this cache. To |
2020 | | /// compute that, use `std::mem::size_of::<Cache>()`. |
2021 | 0 | pub fn memory_usage(&self) -> usize { |
2022 | | const ID_SIZE: usize = size_of::<LazyStateID>(); |
2023 | | const STATE_SIZE: usize = size_of::<State>(); |
2024 | | |
2025 | | // NOTE: If you make changes to the below, then |
2026 | | // 'minimum_cache_capacity' should be updated correspondingly. |
2027 | | |
2028 | 0 | self.trans.len() * ID_SIZE |
2029 | 0 | + self.starts.len() * ID_SIZE |
2030 | 0 | + self.states.len() * STATE_SIZE |
2031 | 0 | // Maps likely use more memory than this, but it's probably close. |
2032 | 0 | + self.states_to_id.len() * (STATE_SIZE + ID_SIZE) |
2033 | 0 | + self.sparses.memory_usage() |
2034 | 0 | + self.stack.capacity() * ID_SIZE |
2035 | 0 | + self.scratch_state_builder.capacity() |
2036 | 0 | // Heap memory used by 'State' in both 'states' and 'states_to_id'. |
2037 | 0 | + self.memory_usage_state |
2038 | 0 | } |
2039 | | } |
2040 | | |
/// Keeps track of the progress of the current search.
///
/// This is updated via the `Cache::search_{start,update,finish}` APIs to
/// record how many bytes have been searched. This permits computing a
/// heuristic that represents the efficiency of a cache, and thus helps inform
/// whether the lazy DFA should give up or not.
#[derive(Clone, Debug)]
struct SearchProgress {
    // The offset at which the current search (or accounting window) started.
    start: usize,
    // The offset the search has reached so far. For a reverse search this
    // may be less than 'start'.
    at: usize,
}
2052 | | |
2053 | | impl SearchProgress { |
2054 | | /// Returns the length, in bytes, of this search so far. |
2055 | | /// |
2056 | | /// This automatically handles the case of a reverse search, where `at` |
2057 | | /// is likely to be less than `start`. |
2058 | 0 | fn len(&self) -> usize { |
2059 | 0 | if self.start <= self.at { |
2060 | 0 | self.at - self.start |
2061 | | } else { |
2062 | 0 | self.start - self.at |
2063 | | } |
2064 | 0 | } |
2065 | | } |
2066 | | |
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, LazyStateID>;
// Without std we only have 'alloc', so fall back to an ordered map.
#[cfg(not(feature = "std"))]
type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
2077 | | |
/// A type that groups methods that require the base NFA/DFA and writable
/// access to the cache.
#[derive(Debug)]
struct Lazy<'i, 'c> {
    // The underlying lazy DFA (read-only).
    dfa: &'i DFA,
    // The cache that new states and transitions are written into.
    cache: &'c mut Cache,
}
2085 | | |
2086 | | impl<'i, 'c> Lazy<'i, 'c> { |
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
    ///
    /// The wrapper exists purely to group the methods that need mutable
    /// cache access alongside the (immutable) DFA.
    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
        Lazy { dfa, cache }
    }
2091 | | |
    /// Return an immutable view by downgrading a writable cache to a read-only
    /// cache.
    ///
    /// This is used to call read-only helpers (sentinel ID accessors,
    /// capacity checks) from methods holding '&mut self'.
    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
        LazyRef::new(self.dfa, self.cache)
    }
2097 | | |
    /// Compute, cache and return the next DFA state for 'current' on 'unit',
    /// and record the transition so that subsequent lookups avoid
    /// re-determinization.
    ///
    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
    /// like 'next_state' and 'next_eoi_state' that are called in critical
    /// areas. The idea is to let the optimizer focus on the other areas of
    /// those methods as the hot path.
    ///
    /// Here's an example that justifies 'inline(never)'
    ///
    /// ```ignore
    /// regex-cli find match hybrid \
    ///     --cache-capacity 100000000 \
    ///     -p '\pL{100}' \
    ///     all-codepoints-utf8-100x
    /// ```
    ///
    /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
    /// codepoint, in sequence, repeated 100 times.
    ///
    /// With 'inline(never)' hyperfine reports 1.1s per run. With
    /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
    #[cold]
    #[inline(never)]
    fn cache_next_state(
        &mut self,
        mut current: LazyStateID,
        unit: alphabet::Unit,
    ) -> Result<LazyStateID, CacheError> {
        let stride2 = self.dfa.stride2();
        let empty_builder = self.get_state_builder();
        // Run one step of determinization: build the DFA state reachable
        // from 'current' on 'unit'.
        let builder = determinize::next(
            self.dfa.get_nfa(),
            self.dfa.get_config().get_match_kind(),
            &mut self.cache.sparses,
            &mut self.cache.stack,
            &self.cache.states[current.as_usize_untagged() >> stride2],
            unit,
            empty_builder,
        );
        // This is subtle, but if we *might* clear the cache, then we should
        // try to save the current state so that we can re-map its ID after
        // cache clearing. We *might* clear the cache when either the new
        // state can't fit in the cache or when the number of transitions has
        // reached the maximum. Even if either of these conditions is true,
        // the cache might not be cleared if we can reuse an existing state.
        // But we don't know that at this point. Moreover, we don't save the
        // current state every time because it is costly.
        //
        // TODO: We should try to find a way to make this less subtle and error
        // prone. ---AG
        let save_state = !self.as_ref().state_builder_fits_in_cache(&builder)
            || self.cache.trans.len() >= LazyStateID::MAX;
        if save_state {
            self.save_state(current);
        }
        let next = self.add_builder_state(builder, |sid| sid)?;
        if save_state {
            // The cache may have been cleared above, so re-map 'current' to
            // the ID it was given after the clear.
            current = self.saved_state_id();
        }
        // This is the payoff. The next time 'next_state' is called with this
        // state and alphabet unit, it will find this transition and avoid
        // having to re-determinize this transition.
        self.set_transition(current, unit, next);
        Ok(next)
    }
2161 | | |
2162 | | /// Compute and cache the starting state for the given pattern ID (if |
2163 | | /// present) and the starting configuration. |
2164 | | /// |
2165 | | /// This panics if a pattern ID is given and the DFA isn't configured to |
2166 | | /// build anchored start states for each pattern. |
2167 | | /// |
2168 | | /// This will never return an unknown lazy state ID. |
2169 | | /// |
2170 | | /// If caching this state would otherwise result in a cache that has been |
2171 | | /// cleared too many times, then an error is returned. |
2172 | | #[cold] |
2173 | | #[inline(never)] |
2174 | 0 | fn cache_start_group( |
2175 | 0 | &mut self, |
2176 | 0 | anchored: Anchored, |
2177 | 0 | start: Start, |
2178 | 0 | ) -> Result<LazyStateID, StartError> { |
2179 | 0 | let nfa_start_id = match anchored { |
2180 | 0 | Anchored::No => self.dfa.get_nfa().start_unanchored(), |
2181 | 0 | Anchored::Yes => self.dfa.get_nfa().start_anchored(), |
2182 | 0 | Anchored::Pattern(pid) => { |
2183 | 0 | if !self.dfa.get_config().get_starts_for_each_pattern() { |
2184 | 0 | return Err(StartError::unsupported_anchored(anchored)); |
2185 | 0 | } |
2186 | 0 | match self.dfa.get_nfa().start_pattern(pid) { |
2187 | 0 | None => return Ok(self.as_ref().dead_id()), |
2188 | 0 | Some(sid) => sid, |
2189 | | } |
2190 | | } |
2191 | | }; |
2192 | | |
2193 | 0 | let id = self |
2194 | 0 | .cache_start_one(nfa_start_id, start) |
2195 | 0 | .map_err(StartError::cache)?; |
2196 | 0 | self.set_start_state(anchored, start, id); |
2197 | 0 | Ok(id) |
2198 | 0 | } |
2199 | | |
    /// Compute and cache the starting state for the given NFA state ID and the
    /// starting configuration. The NFA state ID might be one of the following:
    ///
    /// 1) An unanchored start state to match any pattern.
    /// 2) An anchored start state to match any pattern.
    /// 3) An anchored start state for a particular pattern.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn cache_start_one(
        &mut self,
        nfa_start_id: NFAStateID,
        start: Start,
    ) -> Result<LazyStateID, CacheError> {
        // Seed the builder with the look-behind assertions implied by the
        // starting configuration.
        let mut builder_matches = self.get_state_builder().into_matches();
        determinize::set_lookbehind_from_start(
            self.dfa.get_nfa(),
            &start,
            &mut builder_matches,
        );
        // Compute the epsilon closure of the NFA start state under those
        // assertions.
        self.cache.sparses.set1.clear();
        determinize::epsilon_closure(
            self.dfa.get_nfa(),
            nfa_start_id,
            builder_matches.look_have(),
            &mut self.cache.stack,
            &mut self.cache.sparses.set1,
        );
        // Record the closure's NFA states into the DFA state being built.
        let mut builder = builder_matches.into_nfa();
        determinize::add_nfa_states(
            &self.dfa.get_nfa(),
            &self.cache.sparses.set1,
            &mut builder,
        );
        // Tag the resulting ID as a start state only when start state
        // specialization is enabled.
        let tag_starts = self.dfa.get_config().get_specialize_start_states();
        self.add_builder_state(builder, |id| {
            if tag_starts {
                id.to_start()
            } else {
                id
            }
        })
    }
2245 | | |
2246 | | /// Either add the given builder state to this cache, or return an ID to an |
2247 | | /// equivalent state already in this cache. |
2248 | | /// |
2249 | | /// In the case where no equivalent state exists, the idmap function given |
2250 | | /// may be used to transform the identifier allocated. This is useful if |
2251 | | /// the caller needs to tag the ID with additional information. |
2252 | | /// |
2253 | | /// This will never return an unknown lazy state ID. |
2254 | | /// |
2255 | | /// If caching this state would otherwise result in a cache that has been |
2256 | | /// cleared too many times, then an error is returned. |
2257 | 0 | fn add_builder_state( |
2258 | 0 | &mut self, |
2259 | 0 | builder: StateBuilderNFA, |
2260 | 0 | idmap: impl Fn(LazyStateID) -> LazyStateID, |
2261 | 0 | ) -> Result<LazyStateID, CacheError> { |
2262 | 0 | if let Some(&cached_id) = |
2263 | 0 | self.cache.states_to_id.get(builder.as_bytes()) |
2264 | | { |
2265 | | // Since we have a cached state, put the constructed state's |
2266 | | // memory back into our scratch space, so that it can be reused. |
2267 | 0 | self.put_state_builder(builder); |
2268 | 0 | return Ok(cached_id); |
2269 | 0 | } |
2270 | 0 | let result = self.add_state(builder.to_state(), idmap); |
2271 | 0 | self.put_state_builder(builder); |
2272 | 0 | result |
2273 | 0 | } Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_builder_state::<<regex_automata::hybrid::dfa::Lazy>::cache_start_one::{closure#0}>Unexecuted instantiation: <regex_automata::hybrid::dfa::Lazy>::add_builder_state::<<regex_automata::hybrid::dfa::Lazy>::cache_next_state::{closure#0}> |
2274 | | |
    /// Allocate a new state ID and add the given state to this cache.
    ///
    /// The idmap function given may be used to transform the identifier
    /// allocated. This is useful if the caller needs to tag the ID with
    /// additional information.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn add_state(
        &mut self,
        state: State,
        idmap: impl Fn(LazyStateID) -> LazyStateID,
    ) -> Result<LazyStateID, CacheError> {
        if !self.as_ref().state_fits_in_cache(&state) {
            // Clearing can itself fail (too many clears or poor efficiency),
            // in which case that error is bubbled up to the caller.
            self.try_clear_cache()?;
        }
        // It's important for this to come second, since the above may clear
        // the cache. If we clear the cache after ID generation, then the ID
        // is likely bunk since it would have been generated based on a larger
        // transition table.
        let mut id = idmap(self.next_state_id()?);
        if state.is_match() {
            id = id.to_match();
        }
        // Add room in the transition table. Since this is a fresh state, all
        // of its transitions are unknown.
        self.cache.trans.extend(
            iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
        );
        // When we add a sentinel state, we never want to set any quit
        // transitions. Technically, this is harmless, since sentinel states
        // have all of their transitions set to loop back to themselves. But
        // when creating sentinel states before the quit sentinel state,
        // this will try to call 'set_transition' on a state ID that doesn't
        // actually exist yet, which isn't allowed. So we just skip doing so
        // entirely.
        if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
            let quit_id = self.as_ref().quit_id();
            for b in self.dfa.quitset.iter() {
                self.set_transition(id, alphabet::Unit::u8(b), quit_id);
            }
        }
        self.cache.memory_usage_state += state.memory_usage();
        self.cache.states.push(state.clone());
        self.cache.states_to_id.insert(state, id);
        Ok(id)
    }
2324 | | |
2325 | | /// Allocate a new state ID. |
2326 | | /// |
2327 | | /// This will never return an unknown lazy state ID. |
2328 | | /// |
2329 | | /// If caching this state would otherwise result in a cache that has been |
2330 | | /// cleared too many times, then an error is returned. |
2331 | 0 | fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> { |
2332 | 0 | let sid = match LazyStateID::new(self.cache.trans.len()) { |
2333 | 0 | Ok(sid) => sid, |
2334 | | Err(_) => { |
2335 | 0 | self.try_clear_cache()?; |
2336 | | // This has to pass since we check that ID capacity at |
2337 | | // construction time can fit at least MIN_STATES states. |
2338 | 0 | LazyStateID::new(self.cache.trans.len()).unwrap() |
2339 | | } |
2340 | | }; |
2341 | 0 | Ok(sid) |
2342 | 0 | } |
2343 | | |
    /// Attempt to clear the cache used by this lazy DFA.
    ///
    /// If clearing the cache exceeds the minimum number of required cache
    /// clearings, then this will return a cache error. In this case,
    /// callers should bubble this up as the cache can't be used until it is
    /// reset. Implementations of search should convert this error into a
    /// [`MatchError::gave_up`].
    ///
    /// If 'self.state_saver' is set to save a state, then this state is
    /// persisted through cache clearing. Otherwise, the cache is returned to
    /// its state after initialization with two exceptions: its clear count
    /// is incremented and some of its memory likely has additional capacity.
    /// That is, clearing a cache does _not_ release memory.
    ///
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
    /// it is invalid after the reset.
    fn try_clear_cache(&mut self) -> Result<(), CacheError> {
        let c = self.dfa.get_config();
        if let Some(min_count) = c.get_minimum_cache_clear_count() {
            if self.cache.clear_count >= min_count {
                // The clear-count limit has been hit. Whether we give up now
                // depends on the (optional) bytes-per-state efficiency
                // heuristic below.
                if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
                    let len = self.cache.search_total_len();
                    let min_bytes =
                        min_bytes_per.saturating_mul(self.cache.states.len());
                    // If we've searched 0 bytes then probably something has
                    // gone wrong and the lazy DFA search implementation isn't
                    // correctly updating the search progress state.
                    if len == 0 {
                        trace!(
                            "number of bytes searched is 0, but \
                             a minimum bytes per state searched ({}) is \
                             enabled, maybe Cache::search_update \
                             is not being used?",
                            min_bytes_per,
                        );
                    }
                    if len < min_bytes {
                        trace!(
                            "lazy DFA cache has been cleared {} times, \
                             which exceeds the limit of {}, \
                             AND its bytes searched per state is less \
                             than the configured minimum of {}, \
                             therefore lazy DFA is giving up \
                             (bytes searched since cache clear = {}, \
                             number of states = {})",
                            self.cache.clear_count,
                            min_count,
                            min_bytes_per,
                            len,
                            self.cache.states.len(),
                        );
                        return Err(CacheError::bad_efficiency());
                    } else {
                        trace!(
                            "lazy DFA cache has been cleared {} times, \
                             which exceeds the limit of {}, \
                             AND its bytes searched per state is greater \
                             than the configured minimum of {}, \
                             therefore lazy DFA is continuing! \
                             (bytes searched since cache clear = {}, \
                             number of states = {})",
                            self.cache.clear_count,
                            min_count,
                            min_bytes_per,
                            len,
                            self.cache.states.len(),
                        );
                    }
                } else {
                    trace!(
                        "lazy DFA cache has been cleared {} times, \
                         which exceeds the limit of {}, \
                         since there is no configured bytes per state \
                         minimum, lazy DFA is giving up",
                        self.cache.clear_count,
                        min_count,
                    );
                    return Err(CacheError::too_many_cache_clears());
                }
            }
        }
        // Either no limit is configured, we're still under it, or the
        // efficiency heuristic says it's worth continuing: do the clear.
        self.clear_cache();
        Ok(())
    }
2428 | | |
    /// Clears _and_ resets the cache. Resetting the cache means that no
    /// states are persisted and the clear count is reset to 0. No heap memory
    /// is released.
    ///
    /// Note that the caller may reset a cache with a different DFA than what
    /// it was created from. In which case, the cache can now be used with the
    /// new DFA (and not the old DFA).
    fn reset_cache(&mut self) {
        // Drop any pending save request *before* clearing, so that
        // 'clear_cache' does not attempt to re-add a saved state.
        self.cache.state_saver = StateSaver::none();
        self.clear_cache();
        // If a new DFA is used, it might have a different number of NFA
        // states, so we need to make sure our sparse sets have the appropriate
        // size.
        self.cache.sparses.resize(self.dfa.get_nfa().states().len());
        self.cache.clear_count = 0;
        self.cache.progress = None;
    }
2446 | | |
    /// Clear the cache used by this lazy DFA.
    ///
    /// If 'self.state_saver' is set to save a state, then this state is
    /// persisted through cache clearing. Otherwise, the cache is returned to
    /// its state after initialization with two exceptions: its clear count
    /// is incremented and some of its memory likely has additional capacity.
    /// That is, clearing a cache does _not_ release memory.
    ///
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
    /// it is invalid after the reset.
    fn clear_cache(&mut self) {
        self.cache.trans.clear();
        self.cache.starts.clear();
        self.cache.states.clear();
        self.cache.states_to_id.clear();
        self.cache.memory_usage_state = 0;
        self.cache.clear_count += 1;
        self.cache.bytes_searched = 0;
        // Restart the bytes-searched accounting window at the current
        // position of any in-progress search.
        if let Some(ref mut progress) = self.cache.progress {
            progress.start = progress.at;
        }
        trace!(
            "lazy DFA cache has been cleared (count: {})",
            self.cache.clear_count
        );
        self.init_cache();
        // If the state we want to save is one of the sentinel
        // (unknown/dead/quit) states, then 'init_cache' adds those back, and
        // their identifier values remains invariant. So there's no need to add
        // it again. (And indeed, doing so would be incorrect!)
        if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
            // If the state is one of the special sentinel states, then it is
            // automatically added by cache initialization and its ID always
            // remains the same. With that said, this should never occur since
            // the sentinel states are all loop states back to themselves. So
            // we should never be in a position where we're attempting to save
            // a sentinel state since we never compute transitions out of a
            // sentinel state.
            assert!(
                !self.as_ref().is_sentinel(old_id),
                "cannot save sentinel state"
            );
            let new_id = self
                .add_state(state, |id| {
                    if old_id.is_start() {
                        // We don't need to consult the
                        // 'specialize_start_states' config knob here, because
                        // if it's disabled, old_id.is_start() will never
                        // return true.
                        id.to_start()
                    } else {
                        id
                    }
                })
                // The unwrap here is OK because lazy DFA creation ensures that
                // we have room in the cache to add MIN_STATES states. Since
                // 'init_cache' above adds 3, this adds a 4th.
                .expect("adding one state after cache clear must work");
            self.cache.state_saver = StateSaver::Saved(new_id);
        }
    }
2508 | | |
    /// Initialize this cache from emptiness to a place where it can be used
    /// for search.
    ///
    /// This is called both at cache creation time and after the cache has been
    /// cleared.
    ///
    /// Primarily, this adds the three sentinel states and allocates some
    /// initial memory.
    fn init_cache(&mut self) {
        // Why multiply by 2 here? Because we make room for both the unanchored
        // and anchored start states. Unanchored is first and then anchored.
        let mut starts_len = Start::len().checked_mul(2).unwrap();
        // ... but if we also want start states for every pattern, we make room
        // for that too.
        if self.dfa.get_config().get_starts_for_each_pattern() {
            starts_len += Start::len() * self.dfa.pattern_len();
        }
        // Every start slot begins as "unknown", i.e., not yet computed.
        self.cache
            .starts
            .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
        // This is the set of NFA states that corresponds to each of our three
        // sentinel states: the empty set.
        let dead = State::dead();
        // This sets up some states that we use as sentinels that are present
        // in every DFA. While it would be technically possible to implement
        // this DFA without explicitly putting these states in the transition
        // table, this is convenient to do to make `next_state` correct for all
        // valid state IDs without needing explicit conditionals to special
        // case these sentinel states.
        //
        // All three of these states are "dead" states. That is, all of
        // them transition only to themselves. So once you enter one of
        // these states, it's impossible to leave them. Thus, any correct
        // search routine must explicitly check for these state types. (Sans
        // `unknown`, since that is only used internally to represent missing
        // states.)
        let unk_id =
            self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
        let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
        let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
        // The creation order above fixes the sentinel IDs; verify they match
        // the canonical accessors.
        assert_eq!(unk_id, self.as_ref().unknown_id());
        assert_eq!(dead_id, self.as_ref().dead_id());
        assert_eq!(quit_id, self.as_ref().quit_id());
        // The idea here is that if you start in an unknown/dead/quit state and
        // try to transition on them, then you should end up where you started.
        self.set_all_transitions(unk_id, unk_id);
        self.set_all_transitions(dead_id, dead_id);
        self.set_all_transitions(quit_id, quit_id);
        // All of these states are technically equivalent from the FSM
        // perspective, so putting all three of them in the cache isn't
        // possible. (They are distinct merely because we use their
        // identifiers as sentinels to mean something, as indicated by the
        // names.) Moreover, we wouldn't want to do that. Unknown and quit
        // states are special in that they are artificial constructions
        // this implementation. But dead states are a natural part of
        // determinization. When you reach a point in the NFA where you cannot
        // go anywhere else, a dead state will naturally arise and we MUST
        // reuse the canonical dead state that we've created here. Why? Because
        // it is the state ID that tells the search routine whether a state is
        // dead or not, and thus, whether to stop the search. Having a bunch of
        // distinct dead states would be quite wasteful!
        self.cache.states_to_id.insert(dead, dead_id);
    }
2572 | | |
    /// Save the state corresponding to the ID given such that the state
    /// persists through a cache clearing.
    ///
    /// While the state may persist, the ID may not. In order to discover the
    /// new state ID, one must call 'saved_state_id' after a cache clearing.
    fn save_state(&mut self, id: LazyStateID) {
        // Cloning is required since clearing the cache drops the canonical
        // copy of this state from 'cache.states'.
        let state = self.as_ref().get_cached_state(id).clone();
        self.cache.state_saver = StateSaver::ToSave { id, state };
    }
2582 | | |
    /// Returns the updated lazy state ID for a state that was persisted
    /// through a cache clearing.
    ///
    /// It is only correct to call this routine when both a state has been
    /// saved and the cache has just been cleared. Otherwise, this panics.
    fn saved_state_id(&mut self) -> LazyStateID {
        // NOTE(review): 'take_saved' appears to consume the saved ID, so each
        // save/clear cycle can yield the remapped ID at most once.
        self.cache
            .state_saver
            .take_saved()
            .expect("state saver does not have saved state ID")
    }
2594 | | |
    /// Set all transitions on the state 'from' to 'to'.
    ///
    /// Transitions are stored per byte class, so it suffices to set one
    /// transition for each equivalence class representative.
    fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
        for unit in self.dfa.classes.representatives(..) {
            self.set_transition(from, unit, to);
        }
    }
2601 | | |
    /// Set the transition on 'from' for 'unit' to 'to'.
    ///
    /// This panics if either 'from' or 'to' is invalid.
    ///
    /// All unit values are OK.
    fn set_transition(
        &mut self,
        from: LazyStateID,
        unit: alphabet::Unit,
        to: LazyStateID,
    ) {
        assert!(self.as_ref().is_valid(from), "invalid 'from' id: {from:?}");
        assert!(self.as_ref().is_valid(to), "invalid 'to' id: {to:?}");
        // The untagged ID doubles as the state's row offset in the transition
        // table, while the byte class of 'unit' selects the column.
        let offset =
            from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
        self.cache.trans[offset] = to;
    }
2619 | | |
2620 | | /// Set the start ID for the given pattern ID (if given) and starting |
2621 | | /// configuration to the ID given. |
2622 | | /// |
2623 | | /// This panics if 'id' is not valid or if a pattern ID is given and |
2624 | | /// 'starts_for_each_pattern' is not enabled. |
2625 | 0 | fn set_start_state( |
2626 | 0 | &mut self, |
2627 | 0 | anchored: Anchored, |
2628 | 0 | start: Start, |
2629 | 0 | id: LazyStateID, |
2630 | 0 | ) { |
2631 | 0 | assert!(self.as_ref().is_valid(id)); |
2632 | 0 | let start_index = start.as_usize(); |
2633 | 0 | let index = match anchored { |
2634 | 0 | Anchored::No => start_index, |
2635 | 0 | Anchored::Yes => Start::len() + start_index, |
2636 | 0 | Anchored::Pattern(pid) => { |
2637 | 0 | assert!( |
2638 | 0 | self.dfa.get_config().get_starts_for_each_pattern(), |
2639 | 0 | "attempted to search for a specific pattern \ |
2640 | 0 | without enabling starts_for_each_pattern", |
2641 | | ); |
2642 | 0 | let pid = pid.as_usize(); |
2643 | 0 | (2 * Start::len()) + (Start::len() * pid) + start_index |
2644 | | } |
2645 | | }; |
2646 | 0 | self.cache.starts[index] = id; |
2647 | 0 | } |
2648 | | |
    /// Returns a state builder from this DFA that might have existing
    /// capacity. This helps avoid allocs in cases where a state is built that
    /// turns out to already be cached.
    ///
    /// Callers must put the state builder back with 'put_state_builder',
    /// otherwise the allocation reuse won't work.
    fn get_state_builder(&mut self) -> StateBuilderEmpty {
        // Swap in a fresh empty builder; the scratch builder (with any
        // capacity it has accumulated) is handed to the caller.
        core::mem::replace(
            &mut self.cache.scratch_state_builder,
            StateBuilderEmpty::new(),
        )
    }
2661 | | |
    /// Puts the given state builder back into this DFA for reuse.
    ///
    /// Note that building a 'State' from a builder always creates a new alloc,
    /// so callers should always put the builder back.
    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
        // 'clear' returns the builder to its empty form so its capacity can
        // be reused by the next 'get_state_builder' call.
        let _ = core::mem::replace(
            &mut self.cache.scratch_state_builder,
            builder.clear(),
        );
    }
2672 | | } |
2673 | | |
/// A type that groups methods that require the base NFA/DFA and read-only
/// access to the cache.
#[derive(Debug)]
struct LazyRef<'i, 'c> {
    // The underlying lazy DFA (read-only).
    dfa: &'i DFA,
    // Shared, read-only view of the cache.
    cache: &'c Cache,
}
2681 | | |
2682 | | impl<'i, 'c> LazyRef<'i, 'c> { |
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
    ///
    /// Unlike 'Lazy', this only holds shared access to the cache, and so it
    /// cannot add states or transitions.
    fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
        LazyRef { dfa, cache }
    }
2687 | | |
2688 | | /// Return the ID of the start state for the given configuration. |
2689 | | /// |
2690 | | /// If the start state has not yet been computed, then this returns an |
2691 | | /// unknown lazy state ID. |
2692 | | #[cfg_attr(feature = "perf-inline", inline(always))] |
2693 | 0 | fn get_cached_start_id( |
2694 | 0 | &self, |
2695 | 0 | anchored: Anchored, |
2696 | 0 | start: Start, |
2697 | 0 | ) -> Result<LazyStateID, StartError> { |
2698 | 0 | let start_index = start.as_usize(); |
2699 | 0 | let index = match anchored { |
2700 | 0 | Anchored::No => start_index, |
2701 | 0 | Anchored::Yes => Start::len() + start_index, |
2702 | 0 | Anchored::Pattern(pid) => { |
2703 | 0 | if !self.dfa.get_config().get_starts_for_each_pattern() { |
2704 | 0 | return Err(StartError::unsupported_anchored(anchored)); |
2705 | 0 | } |
2706 | 0 | if pid.as_usize() >= self.dfa.pattern_len() { |
2707 | 0 | return Ok(self.dead_id()); |
2708 | 0 | } |
2709 | 0 | (2 * Start::len()) |
2710 | 0 | + (Start::len() * pid.as_usize()) |
2711 | 0 | + start_index |
2712 | | } |
2713 | | }; |
2714 | 0 | Ok(self.cache.starts[index]) |
2715 | 0 | } |
2716 | | |
2717 | | /// Return the cached NFA/DFA powerset state for the given ID. |
2718 | | /// |
2719 | | /// This panics if the given ID does not address a valid state. |
2720 | 0 | fn get_cached_state(&self, sid: LazyStateID) -> &State { |
2721 | 0 | let index = sid.as_usize_untagged() >> self.dfa.stride2(); |
2722 | 0 | &self.cache.states[index] |
2723 | 0 | } |
2724 | | |
2725 | | /// Returns true if and only if the given ID corresponds to a "sentinel" |
2726 | | /// state. |
2727 | | /// |
2728 | | /// A sentinel state is a state that signifies a special condition of |
2729 | | /// search, and where every transition maps back to itself. See LazyStateID |
2730 | | /// for more details. Note that start and match states are _not_ sentinels |
2731 | | /// since they may otherwise be real states with non-trivial transitions. |
2732 | | /// The purposes of sentinel states is purely to indicate something. Their |
2733 | | /// transitions are not meant to be followed. |
2734 | 0 | fn is_sentinel(&self, id: LazyStateID) -> bool { |
2735 | 0 | id == self.unknown_id() || id == self.dead_id() || id == self.quit_id() |
2736 | 0 | } |
2737 | | |
2738 | | /// Returns the ID of the unknown state for this lazy DFA. |
2739 | 0 | fn unknown_id(&self) -> LazyStateID { |
2740 | | // This unwrap is OK since 0 is always a valid state ID. |
2741 | 0 | LazyStateID::new(0).unwrap().to_unknown() |
2742 | 0 | } |
2743 | | |
2744 | | /// Returns the ID of the dead state for this lazy DFA. |
2745 | 0 | fn dead_id(&self) -> LazyStateID { |
2746 | | // This unwrap is OK since the maximum value here is 1 * 512 = 512, |
2747 | | // which is <= 2047 (the maximum state ID on 16-bit systems). Where |
2748 | | // 512 is the worst case for our equivalence classes (every byte is a |
2749 | | // distinct class). |
2750 | 0 | LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead() |
2751 | 0 | } |
2752 | | |
2753 | | /// Returns the ID of the quit state for this lazy DFA. |
2754 | 0 | fn quit_id(&self) -> LazyStateID { |
2755 | | // This unwrap is OK since the maximum value here is 2 * 512 = 1024, |
2756 | | // which is <= 2047 (the maximum state ID on 16-bit systems). Where |
2757 | | // 512 is the worst case for our equivalence classes (every byte is a |
2758 | | // distinct class). |
2759 | 0 | LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit() |
2760 | 0 | } |
2761 | | |
2762 | | /// Returns true if and only if the given ID is valid. |
2763 | | /// |
2764 | | /// An ID is valid if it is both a valid index into the transition table |
2765 | | /// and is a multiple of the DFA's stride. |
2766 | 0 | fn is_valid(&self, id: LazyStateID) -> bool { |
2767 | 0 | let id = id.as_usize_untagged(); |
2768 | 0 | id < self.cache.trans.len() && id % self.dfa.stride() == 0 |
2769 | 0 | } |
2770 | | |
2771 | | /// Returns true if adding the state given would fit in this cache. |
2772 | 0 | fn state_fits_in_cache(&self, state: &State) -> bool { |
2773 | 0 | let needed = self.cache.memory_usage() |
2774 | 0 | + self.memory_usage_for_one_more_state(state.memory_usage()); |
2775 | | trace!( |
2776 | | "lazy DFA cache capacity state check: {:?} ?<=? {:?}", |
2777 | | needed, |
2778 | | self.dfa.cache_capacity |
2779 | | ); |
2780 | 0 | needed <= self.dfa.cache_capacity |
2781 | 0 | } |
2782 | | |
2783 | | /// Returns true if adding the state to be built by the given builder would |
2784 | | /// fit in this cache. |
2785 | 0 | fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool { |
2786 | 0 | let needed = self.cache.memory_usage() |
2787 | 0 | + self.memory_usage_for_one_more_state(state.as_bytes().len()); |
2788 | | trace!( |
2789 | | "lazy DFA cache capacity state builder check: {:?} ?<=? {:?}", |
2790 | | needed, |
2791 | | self.dfa.cache_capacity |
2792 | | ); |
2793 | 0 | needed <= self.dfa.cache_capacity |
2794 | 0 | } |
2795 | | |
2796 | | /// Returns the additional memory usage, in bytes, required to add one more |
2797 | | /// state to this cache. The given size should be the heap size, in bytes, |
2798 | | /// that would be used by the new state being added. |
2799 | 0 | fn memory_usage_for_one_more_state( |
2800 | 0 | &self, |
2801 | 0 | state_heap_size: usize, |
2802 | 0 | ) -> usize { |
2803 | | const ID_SIZE: usize = size_of::<LazyStateID>(); |
2804 | | const STATE_SIZE: usize = size_of::<State>(); |
2805 | | |
2806 | 0 | self.dfa.stride() * ID_SIZE // additional space needed in trans table |
2807 | 0 | + STATE_SIZE // space in cache.states |
2808 | 0 | + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id |
2809 | 0 | + state_heap_size // heap memory used by state itself |
2810 | 0 | } |
2811 | | } |
2812 | | |
/// A simple type that encapsulates the saving of a state ID through a cache
/// clearing.
///
/// A state ID can be marked for saving with ToSave, while a state ID can be
/// saved itself with Saved.
#[derive(Clone, Debug)]
enum StateSaver {
    /// An empty state saver. In this case, no states (other than the special
    /// sentinel states) are preserved after clearing the cache.
    None,
    /// An ID of a state (and the state itself) that should be preserved after
    /// the lazy DFA's cache has been cleared. After clearing, the updated ID
    /// is stored in 'Saved' since it may have changed.
    ToSave { id: LazyStateID, state: State },
    /// An ID of a state that has been persisted through a lazy DFA
    /// cache clearing. The ID recorded here corresponds to an ID that was
    /// once marked as ToSave. The IDs are likely not equivalent even though
    /// the states they point to are.
    Saved(LazyStateID),
}
2833 | | |
2834 | | impl StateSaver { |
2835 | | /// Create an empty state saver. |
2836 | 0 | fn none() -> StateSaver { |
2837 | 0 | StateSaver::None |
2838 | 0 | } |
2839 | | |
2840 | | /// Replace this state saver with an empty saver, and if this saver is a |
2841 | | /// request to save a state, return that request. |
2842 | 0 | fn take_to_save(&mut self) -> Option<(LazyStateID, State)> { |
2843 | 0 | match core::mem::replace(self, StateSaver::None) { |
2844 | 0 | StateSaver::None | StateSaver::Saved(_) => None, |
2845 | 0 | StateSaver::ToSave { id, state } => Some((id, state)), |
2846 | | } |
2847 | 0 | } |
2848 | | |
2849 | | /// Replace this state saver with an empty saver, and if this saver is a |
2850 | | /// saved state (or a request to save a state), return that state's ID. |
2851 | | /// |
2852 | | /// The idea here is that a request to save a state isn't necessarily |
2853 | | /// honored because it might not be needed. e.g., Some higher level code |
2854 | | /// might request a state to be saved on the off chance that the cache gets |
2855 | | /// cleared when a new state is added at a lower level. But if that new |
2856 | | /// state is never added, then the cache is never cleared and the state and |
2857 | | /// its ID remain unchanged. |
2858 | 0 | fn take_saved(&mut self) -> Option<LazyStateID> { |
2859 | 0 | match core::mem::replace(self, StateSaver::None) { |
2860 | 0 | StateSaver::None => None, |
2861 | 0 | StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id), |
2862 | | } |
2863 | 0 | } |
2864 | | } |
2865 | | |
2866 | | /// The configuration used for building a lazy DFA. |
2867 | | /// |
2868 | | /// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The |
2869 | | /// advantage of the former is that it often lets you avoid importing the |
2870 | | /// `Config` type directly. |
2871 | | /// |
2872 | | /// A lazy DFA configuration is a simple data object that is typically used |
2873 | | /// with [`Builder::configure`]. |
2874 | | /// |
2875 | | /// The default configuration guarantees that a search will never return a |
2876 | | /// "gave up" or "quit" error, although it is possible for a search to fail |
2877 | | /// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by |
2878 | | /// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. |
#[derive(Clone, Debug, Default)]
pub struct Config {
    // As with other configuration types in this crate, we put all our knobs
    // in options so that we can distinguish between "default" and "not set."
    // This makes it possible to easily combine multiple configurations
    // without default values overwriting explicitly specified values. See the
    // 'overwrite' method.
    //
    // For docs on the fields below, see the corresponding method setters.
    //
    // Match semantics; 'LeftmostFirst' when unset.
    match_kind: Option<MatchKind>,
    // Optional prefilter applied when entering a start state; none when
    // unset.
    pre: Option<Option<Prefilter>>,
    // Whether to add an anchored start state for each pattern; off when
    // unset.
    starts_for_each_pattern: Option<bool>,
    // Whether to shrink the alphabet via byte equivalence classes; on when
    // unset.
    byte_classes: Option<bool>,
    // Heuristic Unicode word boundary support; off when unset.
    unicode_word_boundary: Option<bool>,
    // Bytes that cause a search to stop with a "quit" error when observed.
    quitset: Option<ByteSet>,
    // Start state specialization; when not set explicitly, it is implied by
    // whether a prefilter is configured (see 'prefilter').
    specialize_start_states: Option<bool>,
    // Cache sizing and clearing knobs (setters defined further below).
    cache_capacity: Option<usize>,
    skip_cache_capacity_check: Option<bool>,
    minimum_cache_clear_count: Option<Option<usize>>,
    minimum_bytes_per_state: Option<Option<usize>>,
}
2900 | | |
2901 | | impl Config { |
2902 | | /// Return a new default lazy DFA builder configuration. |
2903 | 0 | pub fn new() -> Config { |
2904 | 0 | Config::default() |
2905 | 0 | } |
2906 | | |
2907 | | /// Set the desired match semantics. |
2908 | | /// |
2909 | | /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
2910 | | /// match semantics of Perl-like regex engines. That is, when multiple |
2911 | | /// patterns would match at the same leftmost position, the pattern that |
2912 | | /// appears first in the concrete syntax is chosen. |
2913 | | /// |
2914 | | /// Currently, the only other kind of match semantics supported is |
2915 | | /// [`MatchKind::All`]. This corresponds to classical DFA construction |
2916 | | /// where all possible matches are added to the lazy DFA. |
2917 | | /// |
2918 | | /// Typically, `All` is used when one wants to execute an overlapping |
2919 | | /// search and `LeftmostFirst` otherwise. In particular, it rarely makes |
2920 | | /// sense to use `All` with the various "leftmost" find routines, since the |
2921 | | /// leftmost routines depend on the `LeftmostFirst` automata construction |
2922 | | /// strategy. Specifically, `LeftmostFirst` adds dead states to the |
2923 | | /// lazy DFA as a way to terminate the search and report a match. |
2924 | | /// `LeftmostFirst` also supports non-greedy matches using this strategy |
2925 | | /// where as `All` does not. |
2926 | | /// |
2927 | | /// # Example: overlapping search |
2928 | | /// |
2929 | | /// This example shows the typical use of `MatchKind::All`, which is to |
2930 | | /// report overlapping matches. |
2931 | | /// |
2932 | | /// ``` |
2933 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
2934 | | /// use regex_automata::{ |
2935 | | /// hybrid::dfa::{DFA, OverlappingState}, |
2936 | | /// HalfMatch, Input, MatchKind, |
2937 | | /// }; |
2938 | | /// |
2939 | | /// let dfa = DFA::builder() |
2940 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
2941 | | /// .build_many(&[r"\w+$", r"\S+$"])?; |
2942 | | /// let mut cache = dfa.create_cache(); |
2943 | | /// let haystack = "@foo"; |
2944 | | /// let mut state = OverlappingState::start(); |
2945 | | /// |
2946 | | /// let expected = Some(HalfMatch::must(1, 4)); |
2947 | | /// dfa.try_search_overlapping_fwd( |
2948 | | /// &mut cache, &Input::new(haystack), &mut state, |
2949 | | /// )?; |
2950 | | /// assert_eq!(expected, state.get_match()); |
2951 | | /// |
2952 | | /// // The first pattern also matches at the same position, so re-running |
2953 | | /// // the search will yield another match. Notice also that the first |
2954 | | /// // pattern is returned after the second. This is because the second |
2955 | | /// // pattern begins its match before the first, is therefore an earlier |
2956 | | /// // match and is thus reported first. |
2957 | | /// let expected = Some(HalfMatch::must(0, 4)); |
2958 | | /// dfa.try_search_overlapping_fwd( |
2959 | | /// &mut cache, &Input::new(haystack), &mut state, |
2960 | | /// )?; |
2961 | | /// assert_eq!(expected, state.get_match()); |
2962 | | /// |
2963 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2964 | | /// ``` |
2965 | | /// |
2966 | | /// # Example: reverse automaton to find start of match |
2967 | | /// |
2968 | | /// Another example for using `MatchKind::All` is for constructing a |
2969 | | /// reverse automaton to find the start of a match. `All` semantics are |
2970 | | /// used for this in order to find the longest possible match, which |
2971 | | /// corresponds to the leftmost starting position. |
2972 | | /// |
2973 | | /// Note that if you need the starting position then |
2974 | | /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this |
2975 | | /// for you, so it's usually not necessary to do this yourself. |
2976 | | /// |
2977 | | /// ``` |
2978 | | /// use regex_automata::{ |
2979 | | /// hybrid::dfa::DFA, |
2980 | | /// nfa::thompson::NFA, |
2981 | | /// Anchored, HalfMatch, Input, MatchKind, |
2982 | | /// }; |
2983 | | /// |
2984 | | /// let input = Input::new("123foobar456"); |
2985 | | /// let pattern = r"[a-z]+r"; |
2986 | | /// |
2987 | | /// let dfa_fwd = DFA::new(pattern)?; |
2988 | | /// let dfa_rev = DFA::builder() |
2989 | | /// .thompson(NFA::config().reverse(true)) |
2990 | | /// .configure(DFA::config().match_kind(MatchKind::All)) |
2991 | | /// .build(pattern)?; |
2992 | | /// let mut cache_fwd = dfa_fwd.create_cache(); |
2993 | | /// let mut cache_rev = dfa_rev.create_cache(); |
2994 | | /// |
2995 | | /// let expected_fwd = HalfMatch::must(0, 9); |
2996 | | /// let expected_rev = HalfMatch::must(0, 3); |
2997 | | /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap(); |
2998 | | /// // Here we don't specify the pattern to search for since there's only |
2999 | | /// // one pattern and we're doing a leftmost search. But if this were an |
3000 | | /// // overlapping search, you'd need to specify the pattern that matched |
3001 | | /// // in the forward direction. (Otherwise, you might wind up finding the |
3002 | | /// // starting position of a match of some other pattern.) That in turn |
3003 | | /// // requires building the reverse automaton with starts_for_each_pattern |
3004 | | /// // enabled. |
3005 | | /// let input = input |
3006 | | /// .clone() |
3007 | | /// .range(..got_fwd.offset()) |
3008 | | /// .anchored(Anchored::Yes); |
3009 | | /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap(); |
3010 | | /// assert_eq!(expected_fwd, got_fwd); |
3011 | | /// assert_eq!(expected_rev, got_rev); |
3012 | | /// |
3013 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3014 | | /// ``` |
3015 | 0 | pub fn match_kind(mut self, kind: MatchKind) -> Config { |
3016 | 0 | self.match_kind = Some(kind); |
3017 | 0 | self |
3018 | 0 | } |
3019 | | |
3020 | | /// Set a prefilter to be used whenever a start state is entered. |
3021 | | /// |
3022 | | /// A [`Prefilter`] in this context is meant to accelerate searches by |
3023 | | /// looking for literal prefixes that every match for the corresponding |
3024 | | /// pattern (or patterns) must start with. Once a prefilter produces a |
3025 | | /// match, the underlying search routine continues on to try and confirm |
3026 | | /// the match. |
3027 | | /// |
3028 | | /// Be warned that setting a prefilter does not guarantee that the search |
3029 | | /// will be faster. While it's usually a good bet, if the prefilter |
3030 | | /// produces a lot of false positive candidates (i.e., positions matched |
3031 | | /// by the prefilter but not by the regex), then the overall result can |
3032 | | /// be slower than if you had just executed the regex engine without any |
3033 | | /// prefilters. |
3034 | | /// |
3035 | | /// Note that unless [`Config::specialize_start_states`] has been |
3036 | | /// explicitly set, then setting this will also enable (when `pre` is |
3037 | | /// `Some`) or disable (when `pre` is `None`) start state specialization. |
3038 | | /// This occurs because without start state specialization, a prefilter |
3039 | | /// is likely to be less effective. And without a prefilter, start state |
3040 | | /// specialization is usually pointless. |
3041 | | /// |
3042 | | /// By default no prefilter is set. |
3043 | | /// |
3044 | | /// # Example |
3045 | | /// |
3046 | | /// ``` |
3047 | | /// use regex_automata::{ |
3048 | | /// hybrid::dfa::DFA, |
3049 | | /// util::prefilter::Prefilter, |
3050 | | /// Input, HalfMatch, MatchKind, |
3051 | | /// }; |
3052 | | /// |
3053 | | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); |
3054 | | /// let re = DFA::builder() |
3055 | | /// .configure(DFA::config().prefilter(pre)) |
3056 | | /// .build(r"(foo|bar)[a-z]+")?; |
3057 | | /// let mut cache = re.create_cache(); |
3058 | | /// let input = Input::new("foo1 barfox bar"); |
3059 | | /// assert_eq!( |
3060 | | /// Some(HalfMatch::must(0, 11)), |
3061 | | /// re.try_search_fwd(&mut cache, &input)?, |
3062 | | /// ); |
3063 | | /// |
3064 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3065 | | /// ``` |
3066 | | /// |
3067 | | /// Be warned though that an incorrect prefilter can lead to incorrect |
3068 | | /// results! |
3069 | | /// |
3070 | | /// ``` |
3071 | | /// use regex_automata::{ |
3072 | | /// hybrid::dfa::DFA, |
3073 | | /// util::prefilter::Prefilter, |
3074 | | /// Input, HalfMatch, MatchKind, |
3075 | | /// }; |
3076 | | /// |
3077 | | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); |
3078 | | /// let re = DFA::builder() |
3079 | | /// .configure(DFA::config().prefilter(pre)) |
3080 | | /// .build(r"(foo|bar)[a-z]+")?; |
3081 | | /// let mut cache = re.create_cache(); |
3082 | | /// let input = Input::new("foo1 barfox bar"); |
3083 | | /// assert_eq!( |
3084 | | /// // No match reported even though there clearly is one! |
3085 | | /// None, |
3086 | | /// re.try_search_fwd(&mut cache, &input)?, |
3087 | | /// ); |
3088 | | /// |
3089 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3090 | | /// ``` |
3091 | 0 | pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { |
3092 | 0 | self.pre = Some(pre); |
3093 | 0 | if self.specialize_start_states.is_none() { |
3094 | 0 | self.specialize_start_states = |
3095 | 0 | Some(self.get_prefilter().is_some()); |
3096 | 0 | } |
3097 | 0 | self |
3098 | 0 | } |
3099 | | |
3100 | | /// Whether to compile a separate start state for each pattern in the |
3101 | | /// lazy DFA. |
3102 | | /// |
3103 | | /// When enabled, a separate **anchored** start state is added for each |
3104 | | /// pattern in the lazy DFA. When this start state is used, then the DFA |
3105 | | /// will only search for matches for the pattern specified, even if there |
3106 | | /// are other patterns in the DFA. |
3107 | | /// |
3108 | | /// The main downside of this option is that it can potentially increase |
3109 | | /// the size of the DFA and/or increase the time it takes to build the |
3110 | | /// DFA at search time. However, since this is configuration for a lazy |
3111 | | /// DFA, these states aren't actually built unless they're used. Enabling |
3112 | | /// this isn't necessarily free, however, as it may result in higher cache |
3113 | | /// usage. |
3114 | | /// |
3115 | | /// There are a few reasons one might want to enable this (it's disabled |
3116 | | /// by default): |
3117 | | /// |
3118 | | /// 1. When looking for the start of an overlapping match (using a reverse |
3119 | | /// DFA), doing it correctly requires starting the reverse search using the |
3120 | | /// starting state of the pattern that matched in the forward direction. |
3121 | | /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it |
3122 | | /// will automatically enable this option when building the reverse DFA |
3123 | | /// internally. |
3124 | | /// 2. When you want to use a DFA with multiple patterns to both search |
3125 | | /// for matches of any pattern or to search for anchored matches of one |
3126 | | /// particular pattern while using the same DFA. (Otherwise, you would need |
3127 | | /// to compile a new DFA for each pattern.) |
3128 | | /// |
3129 | | /// By default this is disabled. |
3130 | | /// |
3131 | | /// # Example |
3132 | | /// |
3133 | | /// This example shows how to use this option to permit the same lazy DFA |
3134 | | /// to run both general searches for any pattern and anchored searches for |
3135 | | /// a specific pattern. |
3136 | | /// |
3137 | | /// ``` |
3138 | | /// use regex_automata::{ |
3139 | | /// hybrid::dfa::DFA, |
3140 | | /// Anchored, HalfMatch, Input, PatternID, |
3141 | | /// }; |
3142 | | /// |
3143 | | /// let dfa = DFA::builder() |
3144 | | /// .configure(DFA::config().starts_for_each_pattern(true)) |
3145 | | /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?; |
3146 | | /// let mut cache = dfa.create_cache(); |
3147 | | /// let haystack = "bar foo123"; |
3148 | | /// |
3149 | | /// // Here's a normal unanchored search that looks for any pattern. |
3150 | | /// let expected = HalfMatch::must(0, 10); |
3151 | | /// let input = Input::new(haystack); |
3152 | | /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); |
3153 | | /// // We can also do a normal anchored search for any pattern. Since it's |
3154 | | /// // an anchored search, we position the start of the search where we |
3155 | | /// // know the match will begin. |
3156 | | /// let expected = HalfMatch::must(0, 10); |
3157 | | /// let input = Input::new(haystack).range(4..); |
3158 | | /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); |
3159 | | /// // Since we compiled anchored start states for each pattern, we can |
3160 | | /// // also look for matches of other patterns explicitly, even if a |
3161 | | /// // different pattern would have normally matched. |
3162 | | /// let expected = HalfMatch::must(1, 10); |
3163 | | /// let input = Input::new(haystack) |
3164 | | /// .range(4..) |
3165 | | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
3166 | | /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); |
3167 | | /// |
3168 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3169 | | /// ``` |
3170 | 0 | pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { |
3171 | 0 | self.starts_for_each_pattern = Some(yes); |
3172 | 0 | self |
3173 | 0 | } |
3174 | | |
3175 | | /// Whether to attempt to shrink the size of the lazy DFA's alphabet or |
3176 | | /// not. |
3177 | | /// |
3178 | | /// This option is enabled by default and should never be disabled unless |
3179 | | /// one is debugging the lazy DFA. |
3180 | | /// |
3181 | | /// When enabled, the lazy DFA will use a map from all possible bytes |
3182 | | /// to their corresponding equivalence class. Each equivalence class |
3183 | | /// represents a set of bytes that does not discriminate between a match |
3184 | | /// and a non-match in the DFA. For example, the pattern `[ab]+` has at |
3185 | | /// least two equivalence classes: a set containing `a` and `b` and a set |
3186 | | /// containing every byte except for `a` and `b`. `a` and `b` are in the |
3187 | | /// same equivalence classes because they never discriminate between a |
3188 | | /// match and a non-match. |
3189 | | /// |
3190 | | /// The advantage of this map is that the size of the transition table |
3191 | | /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)` |
3192 | | /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of |
3193 | | /// equivalence classes (rounded up to the nearest power of 2). As a |
3194 | | /// result, total space usage can decrease substantially. Moreover, since a |
3195 | | /// smaller alphabet is used, DFA compilation during search becomes faster |
3196 | | /// as well since it will potentially be able to reuse a single transition |
3197 | | /// for multiple bytes. |
3198 | | /// |
3199 | | /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling |
3200 | | /// this does not yield any speed advantages. Namely, even when this is |
3201 | | /// disabled, a byte class map is still used while searching. The only |
3202 | | /// difference is that every byte will be forced into its own distinct |
3203 | | /// equivalence class. This is useful for debugging the actual generated |
3204 | | /// transitions because it lets one see the transitions defined on actual |
3205 | | /// bytes instead of the equivalence classes. |
3206 | 0 | pub fn byte_classes(mut self, yes: bool) -> Config { |
3207 | 0 | self.byte_classes = Some(yes); |
3208 | 0 | self |
3209 | 0 | } |
3210 | | |
3211 | | /// Heuristically enable Unicode word boundaries. |
3212 | | /// |
3213 | | /// When set, this will attempt to implement Unicode word boundaries as if |
3214 | | /// they were ASCII word boundaries. This only works when the search input |
3215 | | /// is ASCII only. If a non-ASCII byte is observed while searching, then a |
3216 | | /// [`MatchError::quit`] error is returned. |
3217 | | /// |
3218 | | /// A possible alternative to enabling this option is to simply use an |
3219 | | /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this |
3220 | | /// option is if you absolutely need Unicode support. This option lets one |
3221 | | /// use a fast search implementation (a DFA) for some potentially very |
3222 | | /// common cases, while providing the option to fall back to some other |
3223 | | /// regex engine to handle the general case when an error is returned. |
3224 | | /// |
3225 | | /// If the pattern provided has no Unicode word boundary in it, then this |
3226 | | /// option has no effect. (That is, quitting on a non-ASCII byte only |
3227 | | /// occurs when this option is enabled _and_ a Unicode word boundary is |
3228 | | /// present in the pattern.) |
3229 | | /// |
3230 | | /// This is almost equivalent to setting all non-ASCII bytes to be quit |
3231 | | /// bytes. The only difference is that this will cause non-ASCII bytes to |
3232 | | /// be quit bytes _only_ when a Unicode word boundary is present in the |
3233 | | /// pattern. |
3234 | | /// |
3235 | | /// When enabling this option, callers _must_ be prepared to |
3236 | | /// handle a [`MatchError`] error during search. When using a |
3237 | | /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the |
3238 | | /// `try_` suite of methods. Alternatively, if callers can guarantee that |
3239 | | /// their input is ASCII only, then a [`MatchError::quit`] error will never |
3240 | | /// be returned while searching. |
3241 | | /// |
3242 | | /// This is disabled by default. |
3243 | | /// |
3244 | | /// # Example |
3245 | | /// |
3246 | | /// This example shows how to heuristically enable Unicode word boundaries |
3247 | | /// in a pattern. It also shows what happens when a search comes across a |
3248 | | /// non-ASCII byte. |
3249 | | /// |
3250 | | /// ``` |
3251 | | /// use regex_automata::{ |
3252 | | /// hybrid::dfa::DFA, |
3253 | | /// HalfMatch, Input, MatchError, |
3254 | | /// }; |
3255 | | /// |
3256 | | /// let dfa = DFA::builder() |
3257 | | /// .configure(DFA::config().unicode_word_boundary(true)) |
3258 | | /// .build(r"\b[0-9]+\b")?; |
3259 | | /// let mut cache = dfa.create_cache(); |
3260 | | /// |
3261 | | /// // The match occurs before the search ever observes the snowman |
3262 | | /// // character, so no error occurs. |
3263 | | /// let haystack = "foo 123 ☃"; |
3264 | | /// let expected = Some(HalfMatch::must(0, 7)); |
3265 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
3266 | | /// assert_eq!(expected, got); |
3267 | | /// |
3268 | | /// // Notice that this search fails, even though the snowman character |
3269 | | /// // occurs after the ending match offset. This is because search |
3270 | | /// // routines read one byte past the end of the search to account for |
3271 | | /// // look-around, and indeed, this is required here to determine whether |
3272 | | /// // the trailing \b matches. |
3273 | | /// let haystack = "foo 123 ☃"; |
3274 | | /// let expected = MatchError::quit(0xE2, 8); |
3275 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack)); |
3276 | | /// assert_eq!(Err(expected), got); |
3277 | | /// |
3278 | | /// // Another example is executing a search where the span of the haystack |
3279 | | /// // we specify is all ASCII, but there is non-ASCII just before it. This |
3280 | | /// // correctly also reports an error. |
3281 | | /// let input = Input::new("β123").range(2..); |
3282 | | /// let expected = MatchError::quit(0xB2, 1); |
3283 | | /// let got = dfa.try_search_fwd(&mut cache, &input); |
3284 | | /// assert_eq!(Err(expected), got); |
3285 | | /// |
3286 | | /// // And similarly for the trailing word boundary. |
3287 | | /// let input = Input::new("123β").range(..3); |
3288 | | /// let expected = MatchError::quit(0xCE, 3); |
3289 | | /// let got = dfa.try_search_fwd(&mut cache, &input); |
3290 | | /// assert_eq!(Err(expected), got); |
3291 | | /// |
3292 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3293 | | /// ``` |
3294 | 0 | pub fn unicode_word_boundary(mut self, yes: bool) -> Config { |
3295 | | // We have a separate option for this instead of just setting the |
3296 | | // appropriate quit bytes here because we don't want to set quit bytes |
3297 | | // for every regex. We only want to set them when the regex contains a |
3298 | | // Unicode word boundary. |
3299 | 0 | self.unicode_word_boundary = Some(yes); |
3300 | 0 | self |
3301 | 0 | } |
3302 | | |
3303 | | /// Add a "quit" byte to the lazy DFA. |
3304 | | /// |
3305 | | /// When a quit byte is seen during search time, then search will return a |
3306 | | /// [`MatchError::quit`] error indicating the offset at which the search |
3307 | | /// stopped. |
3308 | | /// |
3309 | | /// A quit byte will always overrule any other aspects of a regex. For |
3310 | | /// example, if the `x` byte is added as a quit byte and the regex `\w` is |
3311 | | /// used, then observing `x` will cause the search to quit immediately |
3312 | | /// despite the fact that `x` is in the `\w` class. |
3313 | | /// |
3314 | | /// This mechanism is primarily useful for heuristically enabling certain |
3315 | | /// features like Unicode word boundaries in a DFA. Namely, if the input |
3316 | | /// to search is ASCII, then a Unicode word boundary can be implemented |
3317 | | /// via an ASCII word boundary with no change in semantics. Thus, a DFA |
3318 | | /// can attempt to match a Unicode word boundary but give up as soon as it |
3319 | | /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes |
3320 | | /// to be quit bytes, then Unicode word boundaries will be permitted when |
3321 | | /// building lazy DFAs. Of course, callers should enable |
3322 | | /// [`Config::unicode_word_boundary`] if they want this behavior instead. |
3323 | | /// (The advantage being that non-ASCII quit bytes will only be added if a |
3324 | | /// Unicode word boundary is in the pattern.) |
3325 | | /// |
3326 | | /// When enabling this option, callers _must_ be prepared to |
3327 | | /// handle a [`MatchError`] error during search. When using a |
3328 | | /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the |
3329 | | /// `try_` suite of methods. |
3330 | | /// |
3331 | | /// By default, there are no quit bytes set. |
3332 | | /// |
3333 | | /// # Panics |
3334 | | /// |
3335 | | /// This panics if heuristic Unicode word boundaries are enabled and any |
3336 | | /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling |
3337 | | /// Unicode word boundaries requires setting every non-ASCII byte to a quit |
3338 | | /// byte. So if the caller attempts to undo any of that, then this will |
3339 | | /// panic. |
3340 | | /// |
3341 | | /// # Example |
3342 | | /// |
3343 | | /// This example shows how to cause a search to terminate if it sees a |
3344 | | /// `\n` byte. This could be useful if, for example, you wanted to prevent |
3345 | | /// a user supplied pattern from matching across a line boundary. |
3346 | | /// |
3347 | | /// ``` |
3348 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
3349 | | /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; |
3350 | | /// |
3351 | | /// let dfa = DFA::builder() |
3352 | | /// .configure(DFA::config().quit(b'\n', true)) |
3353 | | /// .build(r"foo\p{any}+bar")?; |
3354 | | /// let mut cache = dfa.create_cache(); |
3355 | | /// |
3356 | | /// let haystack = "foo\nbar"; |
3357 | | /// // Normally this would produce a match, since \p{any} contains '\n'. |
3358 | | /// // But since we instructed the automaton to enter a quit state if a |
3359 | | /// // '\n' is observed, this produces a match error instead. |
3360 | | /// let expected = MatchError::quit(b'\n', 3); |
3361 | | /// let got = dfa.try_search_fwd( |
3362 | | /// &mut cache, |
3363 | | /// &Input::new(haystack), |
3364 | | /// ).unwrap_err(); |
3365 | | /// assert_eq!(expected, got); |
3366 | | /// |
3367 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3368 | | /// ``` |
3369 | 0 | pub fn quit(mut self, byte: u8, yes: bool) -> Config { |
3370 | 0 | if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { |
3371 | 0 | panic!( |
3372 | 0 | "cannot set non-ASCII byte to be non-quit when \ |
3373 | 0 | Unicode word boundaries are enabled" |
3374 | | ); |
3375 | 0 | } |
3376 | 0 | if self.quitset.is_none() { |
3377 | 0 | self.quitset = Some(ByteSet::empty()); |
3378 | 0 | } |
3379 | 0 | if yes { |
3380 | 0 | self.quitset.as_mut().unwrap().add(byte); |
3381 | 0 | } else { |
3382 | 0 | self.quitset.as_mut().unwrap().remove(byte); |
3383 | 0 | } |
3384 | 0 | self |
3385 | 0 | } |
3386 | | |
3387 | | /// Enable specializing start states in the lazy DFA. |
3388 | | /// |
3389 | | /// When start states are specialized, an implementor of a search routine |
3390 | | /// using a lazy DFA can tell when the search has entered a starting state. |
3391 | | /// When start states aren't specialized, then it is impossible to know |
3392 | | /// whether the search has entered a start state. |
3393 | | /// |
3394 | | /// Ideally, this option wouldn't need to exist and we could always |
3395 | | /// specialize start states. The problem is that start states can be quite |
3396 | | /// active. This in turn means that an efficient search routine is likely |
3397 | | /// to ping-pong between a heavily optimized hot loop that handles most |
3398 | | /// states and to a less optimized specialized handling of start states. |
3399 | | /// This causes branches to get heavily mispredicted and overall can |
3400 | | /// materially decrease throughput. Therefore, specializing start states |
3401 | | /// should only be enabled when it is needed. |
3402 | | /// |
3403 | | /// Knowing whether a search is in a start state is typically useful when a |
3404 | | /// prefilter is active for the search. A prefilter is typically only run |
3405 | | /// when in a start state and a prefilter can greatly accelerate a search. |
3406 | | /// Therefore, the possible cost of specializing start states is worth it |
3407 | | /// in this case. Otherwise, if you have no prefilter, there is likely no |
3408 | | /// reason to specialize start states. |
3409 | | /// |
3410 | | /// This is disabled by default, but note that it is automatically |
3411 | | /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless |
3412 | | /// `specialize_start_states` has already been set, [`Config::prefilter`] |
3413 | | /// will automatically enable or disable it based on whether a prefilter |
3414 | | /// is present or not, respectively. This is done because a prefilter's |
3415 | | /// effectiveness is rooted in being executed whenever the DFA is in a |
3416 | | /// start state, and that's only possible to do when they are specialized. |
3417 | | /// |
3418 | | /// Note that it is plausibly reasonable to _disable_ this option |
3419 | | /// explicitly while _enabling_ a prefilter. In that case, a prefilter |
3420 | | /// will still be run at the beginning of a search, but never again. This |
3421 | | /// in theory could strike a good balance if you're in a situation where a |
3422 | | /// prefilter is likely to produce many false positive candidates. |
3423 | | /// |
3424 | | /// # Example |
3425 | | /// |
3426 | | /// This example shows how to enable start state specialization and then |
3427 | | /// shows how to check whether a state is a start state or not. |
3428 | | /// |
3429 | | /// ``` |
3430 | | /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; |
3431 | | /// |
3432 | | /// let dfa = DFA::builder() |
3433 | | /// .configure(DFA::config().specialize_start_states(true)) |
3434 | | /// .build(r"[a-z]+")?; |
3435 | | /// let mut cache = dfa.create_cache(); |
3436 | | /// |
3437 | | /// let haystack = "123 foobar 4567".as_bytes(); |
3438 | | /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; |
3439 | | /// // The ID returned by 'start_state_forward' will always be tagged as |
3440 | | /// // a start state when start state specialization is enabled. |
3441 | | /// assert!(sid.is_tagged()); |
3442 | | /// assert!(sid.is_start()); |
3443 | | /// |
3444 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3445 | | /// ``` |
3446 | | /// |
3447 | | /// Compare the above with the default lazy DFA configuration where |
3448 | | /// start states are _not_ specialized. In this case, the start state |
3449 | | /// is not tagged and `sid.is_start()` returns false. |
3450 | | /// |
3451 | | /// ``` |
3452 | | /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; |
3453 | | /// |
3454 | | /// let dfa = DFA::new(r"[a-z]+")?; |
3455 | | /// let mut cache = dfa.create_cache(); |
3456 | | /// |
3457 | | /// let haystack = "123 foobar 4567".as_bytes(); |
3458 | | /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; |
3459 | | /// // Start states are not tagged in the default configuration! |
3460 | | /// assert!(!sid.is_tagged()); |
3461 | | /// assert!(!sid.is_start()); |
3462 | | /// |
3463 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3464 | | /// ``` |
3465 | 0 | pub fn specialize_start_states(mut self, yes: bool) -> Config { |
3466 | 0 | self.specialize_start_states = Some(yes); |
3467 | 0 | self |
3468 | 0 | } |
3469 | | |
3470 | | /// Sets the maximum amount of heap memory, in bytes, to allocate to the |
3471 | | /// cache for use during a lazy DFA search. If the lazy DFA would otherwise |
3472 | | /// use more heap memory, then, depending on other configuration knobs, |
3473 | | /// either stop the search and return an error or clear the cache and |
3474 | | /// continue the search. |
3475 | | /// |
3476 | | /// The default cache capacity is some "reasonable" number that will |
3477 | | /// accommodate most regular expressions. You may find that if you need |
3478 | | /// to build a large DFA then it may be necessary to increase the cache |
3479 | | /// capacity. |
3480 | | /// |
3481 | | /// Note that while building a lazy DFA will do a "minimum" check to ensure |
3482 | | /// the capacity is big enough, this is more or less about correctness. |
3483 | | /// If the cache is bigger than the minimum but still "too small," then the |
3484 | | /// lazy DFA could wind up spending a lot of time clearing the cache and |
3485 | | /// recomputing transitions, thus negating the performance benefits of a |
3486 | | /// lazy DFA. Thus, setting the cache capacity is mostly an experimental |
3487 | | /// endeavor. For most common patterns, however, the default should be |
3488 | | /// sufficient. |
3489 | | /// |
3490 | | /// For more details on how the lazy DFA's cache is used, see the |
3491 | | /// documentation for [`Cache`]. |
3492 | | /// |
3493 | | /// # Example |
3494 | | /// |
3495 | | /// This example shows what happens if the configured cache capacity is |
3496 | | /// too small. In such cases, one can override the cache capacity to make |
3497 | | /// it bigger. Alternatively, one might want to use less memory by setting |
3498 | | /// a smaller cache capacity. |
3499 | | /// |
3500 | | /// ``` |
3501 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
3502 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
3503 | | /// |
3504 | | /// let pattern = r"\p{L}{1000}"; |
3505 | | /// |
3506 | | /// // The default cache capacity is likely too small to deal with regexes |
3507 | | /// // that are very large. Large repetitions of large Unicode character |
3508 | | /// // classes are a common way to make very large regexes. |
3509 | | /// let _ = DFA::new(pattern).unwrap_err(); |
3510 | | /// // Bump up the capacity to something bigger. |
3511 | | /// let dfa = DFA::builder() |
3512 | | /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB |
3513 | | /// .build(pattern)?; |
3514 | | /// let mut cache = dfa.create_cache(); |
3515 | | /// |
3516 | | /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); |
3517 | | /// let expected = Some(HalfMatch::must(0, 2000)); |
3518 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; |
3519 | | /// assert_eq!(expected, got); |
3520 | | /// |
3521 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3522 | | /// ``` |
3523 | 0 | pub fn cache_capacity(mut self, bytes: usize) -> Config { |
3524 | 0 | self.cache_capacity = Some(bytes); |
3525 | 0 | self |
3526 | 0 | } |
3527 | | |
3528 | | /// Configures construction of a lazy DFA to use the minimum cache capacity |
3529 | | /// if the configured capacity is otherwise too small for the provided NFA. |
3530 | | /// |
3531 | | /// This is useful if you never want lazy DFA construction to fail because |
3532 | | /// of a capacity that is too small. |
3533 | | /// |
3534 | | /// In general, this option is typically not a good idea. In particular, |
3535 | | /// while a minimum cache capacity does permit the lazy DFA to function |
3536 | | /// where it otherwise couldn't, it's plausible that it may not function |
3537 | | /// well if it's constantly running out of room. In that case, the speed |
3538 | | /// advantages of the lazy DFA may be negated. On the other hand, the |
3539 | | /// "minimum" cache capacity computed may not be completely accurate and |
3540 | | /// could actually be bigger than what is really necessary. Therefore, it |
3541 | | /// is plausible that using the minimum cache capacity could still result |
3542 | | /// in very good performance. |
3543 | | /// |
3544 | | /// This is disabled by default. |
3545 | | /// |
3546 | | /// # Example |
3547 | | /// |
3548 | | /// This example shows what happens if the configured cache capacity is |
3549 | | /// too small. In such cases, one could override the capacity explicitly. |
3550 | | /// An alternative, demonstrated here, let's us force construction to use |
3551 | | /// the minimum cache capacity if the configured capacity is otherwise |
3552 | | /// too small. |
3553 | | /// |
3554 | | /// ``` |
3555 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
3556 | | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
3557 | | /// |
3558 | | /// let pattern = r"\p{L}{1000}"; |
3559 | | /// |
3560 | | /// // The default cache capacity is likely too small to deal with regexes |
3561 | | /// // that are very large. Large repetitions of large Unicode character |
3562 | | /// // classes are a common way to make very large regexes. |
3563 | | /// let _ = DFA::new(pattern).unwrap_err(); |
3564 | | /// // Configure construction such it automatically selects the minimum |
3565 | | /// // cache capacity if it would otherwise be too small. |
3566 | | /// let dfa = DFA::builder() |
3567 | | /// .configure(DFA::config().skip_cache_capacity_check(true)) |
3568 | | /// .build(pattern)?; |
3569 | | /// let mut cache = dfa.create_cache(); |
3570 | | /// |
3571 | | /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); |
3572 | | /// let expected = Some(HalfMatch::must(0, 2000)); |
3573 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; |
3574 | | /// assert_eq!(expected, got); |
3575 | | /// |
3576 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3577 | | /// ``` |
3578 | 0 | pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config { |
3579 | 0 | self.skip_cache_capacity_check = Some(yes); |
3580 | 0 | self |
3581 | 0 | } |
3582 | | |
3583 | | /// Configure a lazy DFA search to quit after a certain number of cache |
3584 | | /// clearings. |
3585 | | /// |
3586 | | /// When a minimum is set, then a lazy DFA search will *possibly* "give |
3587 | | /// up" after the minimum number of cache clearings has occurred. This is |
3588 | | /// typically useful in scenarios where callers want to detect whether the |
3589 | | /// lazy DFA search is "efficient" or not. If the cache is cleared too many |
3590 | | /// times, this is a good indicator that it is not efficient, and thus, the |
3591 | | /// caller may wish to use some other regex engine. |
3592 | | /// |
3593 | | /// Note that the number of times a cache is cleared is a property of |
3594 | | /// the cache itself. Thus, if a cache is used in a subsequent search |
3595 | | /// with a similarly configured lazy DFA, then it could cause the |
3596 | | /// search to "give up" if the cache needed to be cleared, depending |
3597 | | /// on its internal count and configured minimum. The cache clear |
3598 | | /// count can only be reset to `0` via [`DFA::reset_cache`] (or |
3599 | | /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if |
3600 | | /// you're using the `Regex` API). |
3601 | | /// |
3602 | | /// By default, no minimum is configured. Thus, a lazy DFA search will |
3603 | | /// never give up due to cache clearings. If you do set this option, you |
3604 | | /// might consider also setting [`Config::minimum_bytes_per_state`] in |
3605 | | /// order for the lazy DFA to take efficiency into account before giving |
3606 | | /// up. |
3607 | | /// |
3608 | | /// # Example |
3609 | | /// |
3610 | | /// This example uses a somewhat pathological configuration to demonstrate |
3611 | | /// the _possible_ behavior of cache clearing and how it might result |
3612 | | /// in a search that returns an error. |
3613 | | /// |
3614 | | /// It is important to note that the precise mechanics of how and when |
3615 | | /// a cache gets cleared is an implementation detail. |
3616 | | /// |
3617 | | /// ``` |
3618 | | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
3619 | | /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind}; |
3620 | | /// |
3621 | | /// // This is a carefully chosen regex. The idea is to pick one |
3622 | | /// // that requires some decent number of states (hence the bounded |
3623 | | /// // repetition). But we specifically choose to create a class with an |
3624 | | /// // ASCII letter and a non-ASCII letter so that we can check that no new |
3625 | | /// // states are created once the cache is full. Namely, if we fill up the |
3626 | | /// // cache on a haystack of 'a's, then in order to match one 'β', a new |
3627 | | /// // state will need to be created since a 'β' is encoded with multiple |
3628 | | /// // bytes. Since there's no room for this state, the search should quit |
3629 | | /// // at the very first position. |
3630 | | /// let pattern = r"[aβ]{100}"; |
3631 | | /// let dfa = DFA::builder() |
3632 | | /// .configure( |
3633 | | /// // Configure it so that we have the minimum cache capacity |
3634 | | /// // possible. And that if any clearings occur, the search quits. |
3635 | | /// DFA::config() |
3636 | | /// .skip_cache_capacity_check(true) |
3637 | | /// .cache_capacity(0) |
3638 | | /// .minimum_cache_clear_count(Some(0)), |
3639 | | /// ) |
3640 | | /// .build(pattern)?; |
3641 | | /// let mut cache = dfa.create_cache(); |
3642 | | /// |
3643 | | /// // Our search will give up before reaching the end! |
3644 | | /// let haystack = "a".repeat(101).into_bytes(); |
3645 | | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
3646 | | /// assert!(matches!( |
3647 | | /// *result.unwrap_err().kind(), |
3648 | | /// MatchErrorKind::GaveUp { .. }, |
3649 | | /// )); |
3650 | | /// |
3651 | | /// // Now that we know the cache is full, if we search a haystack that we |
3652 | | /// // know will require creating at least one new state, it should not |
3653 | | /// // be able to make much progress. |
3654 | | /// let haystack = "β".repeat(101).into_bytes(); |
3655 | | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
3656 | | /// assert!(matches!( |
3657 | | /// *result.unwrap_err().kind(), |
3658 | | /// MatchErrorKind::GaveUp { .. }, |
3659 | | /// )); |
3660 | | /// |
3661 | | /// // If we reset the cache, then we should be able to create more states |
3662 | | /// // and make more progress with searching for betas. |
3663 | | /// cache.reset(&dfa); |
3664 | | /// let haystack = "β".repeat(101).into_bytes(); |
3665 | | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
3666 | | /// assert!(matches!( |
3667 | | /// *result.unwrap_err().kind(), |
3668 | | /// MatchErrorKind::GaveUp { .. }, |
3669 | | /// )); |
3670 | | /// |
3671 | | /// // ... switching back to ASCII still makes progress since it just needs |
3672 | | /// // to set transitions on existing states! |
3673 | | /// let haystack = "a".repeat(101).into_bytes(); |
3674 | | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
3675 | | /// assert!(matches!( |
3676 | | /// *result.unwrap_err().kind(), |
3677 | | /// MatchErrorKind::GaveUp { .. }, |
3678 | | /// )); |
3679 | | /// |
3680 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3681 | | /// ``` |
3682 | 0 | pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config { |
3683 | 0 | self.minimum_cache_clear_count = Some(min); |
3684 | 0 | self |
3685 | 0 | } |
3686 | | |
3687 | | /// Configure a lazy DFA search to quit only when its efficiency drops |
3688 | | /// below the given minimum. |
3689 | | /// |
3690 | | /// The efficiency of the cache is determined by the number of DFA states |
3691 | | /// compiled per byte of haystack searched. For example, if the efficiency |
3692 | | /// is 2, then it means the lazy DFA is creating a new DFA state after |
3693 | | /// searching approximately 2 bytes in a haystack. Generally speaking, 2 |
3694 | | /// is quite bad and it's likely that even a slower regex engine like the |
3695 | | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster. |
3696 | | /// |
3697 | | /// This has no effect if [`Config::minimum_cache_clear_count`] is not set. |
3698 | | /// Namely, this option only kicks in when the cache has been cleared more |
3699 | | /// than the minimum number. If no minimum is set, then the cache is simply |
3700 | | /// cleared whenever it fills up and it is impossible for the lazy DFA to |
3701 | | /// quit due to ineffective use of the cache. |
3702 | | /// |
3703 | | /// In general, if one is setting [`Config::minimum_cache_clear_count`], |
3704 | | /// then one should probably also set this knob as well. The reason is |
3705 | | /// that the absolute number of times the cache is cleared is generally |
3706 | | /// not a great predictor of efficiency. For example, if a new DFA state |
3707 | | /// is created for every 1,000 bytes searched, then it wouldn't be hard |
3708 | | /// for the cache to get cleared more than `N` times and then cause the |
3709 | | /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite |
3710 | | /// good from a performance perspective, and it's likely that the lazy |
3711 | | /// DFA should continue searching, even if it requires clearing the cache |
3712 | | /// occasionally. |
3713 | | /// |
3714 | | /// Finally, note that if you're implementing your own lazy DFA search |
3715 | | /// routine and also want this efficiency check to work correctly, then |
3716 | | /// you'll need to use the following routines to record search progress: |
3717 | | /// |
3718 | | /// * Call [`Cache::search_start`] at the beginning of every search. |
3719 | | /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is |
3720 | | /// called. |
3721 | | /// * Call [`Cache::search_finish`] before completing a search. (It is |
3722 | | /// not strictly necessary to call this when an error is returned, as |
3723 | | /// `Cache::search_start` will automatically finish the previous search |
3724 | | /// for you. But calling it where possible before returning helps improve |
3725 | | /// the accuracy of how many bytes have actually been searched.) |
3726 | 0 | pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config { |
3727 | 0 | self.minimum_bytes_per_state = Some(min); |
3728 | 0 | self |
3729 | 0 | } |
3730 | | |
3731 | | /// Returns the match semantics set in this configuration. |
3732 | 0 | pub fn get_match_kind(&self) -> MatchKind { |
3733 | 0 | self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
3734 | 0 | } |
3735 | | |
3736 | | /// Returns the prefilter set in this configuration, if one at all. |
3737 | 0 | pub fn get_prefilter(&self) -> Option<&Prefilter> { |
3738 | 0 | self.pre.as_ref().unwrap_or(&None).as_ref() |
3739 | 0 | } |
3740 | | |
3741 | | /// Returns whether this configuration has enabled anchored starting states |
3742 | | /// for every pattern in the DFA. |
3743 | 0 | pub fn get_starts_for_each_pattern(&self) -> bool { |
3744 | 0 | self.starts_for_each_pattern.unwrap_or(false) |
3745 | 0 | } |
3746 | | |
3747 | | /// Returns whether this configuration has enabled byte classes or not. |
3748 | | /// This is typically a debugging oriented option, as disabling it confers |
3749 | | /// no speed benefit. |
3750 | 0 | pub fn get_byte_classes(&self) -> bool { |
3751 | 0 | self.byte_classes.unwrap_or(true) |
3752 | 0 | } |
3753 | | |
3754 | | /// Returns whether this configuration has enabled heuristic Unicode word |
3755 | | /// boundary support. When enabled, it is possible for a search to return |
3756 | | /// an error. |
3757 | 0 | pub fn get_unicode_word_boundary(&self) -> bool { |
3758 | 0 | self.unicode_word_boundary.unwrap_or(false) |
3759 | 0 | } |
3760 | | |
3761 | | /// Returns whether this configuration will instruct the lazy DFA to enter |
3762 | | /// a quit state whenever the given byte is seen during a search. When at |
3763 | | /// least one byte has this enabled, it is possible for a search to return |
3764 | | /// an error. |
3765 | 0 | pub fn get_quit(&self, byte: u8) -> bool { |
3766 | 0 | self.quitset.map_or(false, |q| q.contains(byte)) |
3767 | 0 | } |
3768 | | |
3769 | | /// Returns whether this configuration will instruct the lazy DFA to |
3770 | | /// "specialize" start states. When enabled, the lazy DFA will tag start |
3771 | | /// states so that search routines using the lazy DFA can detect when |
3772 | | /// it's in a start state and do some kind of optimization (like run a |
3773 | | /// prefilter). |
3774 | 0 | pub fn get_specialize_start_states(&self) -> bool { |
3775 | 0 | self.specialize_start_states.unwrap_or(false) |
3776 | 0 | } |
3777 | | |
3778 | | /// Returns the cache capacity set on this configuration. |
3779 | 0 | pub fn get_cache_capacity(&self) -> usize { |
3780 | 0 | self.cache_capacity.unwrap_or(2 * (1 << 20)) |
3781 | 0 | } |
3782 | | |
3783 | | /// Returns whether the cache capacity check should be skipped. |
3784 | 0 | pub fn get_skip_cache_capacity_check(&self) -> bool { |
3785 | 0 | self.skip_cache_capacity_check.unwrap_or(false) |
3786 | 0 | } |
3787 | | |
3788 | | /// Returns, if set, the minimum number of times the cache must be cleared |
3789 | | /// before a lazy DFA search can give up. When no minimum is set, then a |
3790 | | /// search will never quit and will always clear the cache whenever it |
3791 | | /// fills up. |
3792 | 0 | pub fn get_minimum_cache_clear_count(&self) -> Option<usize> { |
3793 | 0 | self.minimum_cache_clear_count.unwrap_or(None) |
3794 | 0 | } |
3795 | | |
3796 | | /// Returns, if set, the minimum number of bytes per state that need to be |
3797 | | /// processed in order for the lazy DFA to keep going. If the minimum falls |
3798 | | /// below this number (and the cache has been cleared a minimum number of |
3799 | | /// times), then the lazy DFA will return a "gave up" error. |
3800 | 0 | pub fn get_minimum_bytes_per_state(&self) -> Option<usize> { |
3801 | 0 | self.minimum_bytes_per_state.unwrap_or(None) |
3802 | 0 | } |
3803 | | |
3804 | | /// Returns the minimum lazy DFA cache capacity required for the given NFA. |
3805 | | /// |
3806 | | /// The cache capacity required for a particular NFA may change without |
3807 | | /// notice. Callers should not rely on it being stable. |
3808 | | /// |
3809 | | /// This is useful for informational purposes, but can also be useful for |
3810 | | /// other reasons. For example, if one wants to check the minimum cache |
3811 | | /// capacity themselves or if one wants to set the capacity based on the |
3812 | | /// minimum. |
3813 | | /// |
3814 | | /// This may return an error if this configuration does not support all of |
3815 | | /// the instructions used in the given NFA. For example, if the NFA has a |
3816 | | /// Unicode word boundary but this configuration does not enable heuristic |
3817 | | /// support for Unicode word boundaries. |
3818 | 0 | pub fn get_minimum_cache_capacity( |
3819 | 0 | &self, |
3820 | 0 | nfa: &thompson::NFA, |
3821 | 0 | ) -> Result<usize, BuildError> { |
3822 | 0 | let quitset = self.quit_set_from_nfa(nfa)?; |
3823 | 0 | let classes = self.byte_classes_from_nfa(nfa, &quitset); |
3824 | 0 | let starts = self.get_starts_for_each_pattern(); |
3825 | 0 | Ok(minimum_cache_capacity(nfa, &classes, starts)) |
3826 | 0 | } |
3827 | | |
3828 | | /// Returns the byte class map used during search from the given NFA. |
3829 | | /// |
3830 | | /// If byte classes are disabled on this configuration, then a map is |
3831 | | /// returned that puts each byte in its own equivalent class. |
3832 | 0 | fn byte_classes_from_nfa( |
3833 | 0 | &self, |
3834 | 0 | nfa: &thompson::NFA, |
3835 | 0 | quit: &ByteSet, |
3836 | 0 | ) -> ByteClasses { |
3837 | 0 | if !self.get_byte_classes() { |
3838 | | // The lazy DFA will always use the equivalence class map, but |
3839 | | // enabling this option is useful for debugging. Namely, this will |
3840 | | // cause all transitions to be defined over their actual bytes |
3841 | | // instead of an opaque equivalence class identifier. The former is |
3842 | | // much easier to grok as a human. |
3843 | 0 | ByteClasses::singletons() |
3844 | | } else { |
3845 | 0 | let mut set = nfa.byte_class_set().clone(); |
3846 | | // It is important to distinguish any "quit" bytes from all other |
3847 | | // bytes. Otherwise, a non-quit byte may end up in the same class |
3848 | | // as a quit byte, and thus cause the DFA stop when it shouldn't. |
3849 | | // |
3850 | | // Test case: |
3851 | | // |
3852 | | // regex-cli find match hybrid --unicode-word-boundary \ |
3853 | | // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log |
3854 | 0 | if !quit.is_empty() { |
3855 | 0 | set.add_set(&quit); |
3856 | 0 | } |
3857 | 0 | set.byte_classes() |
3858 | | } |
3859 | 0 | } |
3860 | | |
3861 | | /// Return the quit set for this configuration and the given NFA. |
3862 | | /// |
3863 | | /// This may return an error if the NFA is incompatible with this |
3864 | | /// configuration's quit set. For example, if the NFA has a Unicode word |
3865 | | /// boundary and the quit set doesn't include non-ASCII bytes. |
3866 | 0 | fn quit_set_from_nfa( |
3867 | 0 | &self, |
3868 | 0 | nfa: &thompson::NFA, |
3869 | 0 | ) -> Result<ByteSet, BuildError> { |
3870 | 0 | let mut quit = self.quitset.unwrap_or(ByteSet::empty()); |
3871 | 0 | if nfa.look_set_any().contains_word_unicode() { |
3872 | 0 | if self.get_unicode_word_boundary() { |
3873 | 0 | for b in 0x80..=0xFF { |
3874 | 0 | quit.add(b); |
3875 | 0 | } |
3876 | | } else { |
3877 | | // If heuristic support for Unicode word boundaries wasn't |
3878 | | // enabled, then we can still check if our quit set is correct. |
3879 | | // If the caller set their quit bytes in a way that causes the |
3880 | | // DFA to quit on at least all non-ASCII bytes, then that's all |
3881 | | // we need for heuristic support to work. |
3882 | 0 | if !quit.contains_range(0x80, 0xFF) { |
3883 | 0 | return Err( |
3884 | 0 | BuildError::unsupported_dfa_word_boundary_unicode(), |
3885 | 0 | ); |
3886 | 0 | } |
3887 | | } |
3888 | 0 | } |
3889 | 0 | Ok(quit) |
3890 | 0 | } |
3891 | | |
3892 | | /// Overwrite the default configuration such that the options in `o` are |
3893 | | /// always used. If an option in `o` is not set, then the corresponding |
3894 | | /// option in `self` is used. If it's not set in `self` either, then it |
3895 | | /// remains not set. |
3896 | 0 | fn overwrite(&self, o: Config) -> Config { |
3897 | | Config { |
3898 | 0 | match_kind: o.match_kind.or(self.match_kind), |
3899 | 0 | pre: o.pre.or_else(|| self.pre.clone()), |
3900 | 0 | starts_for_each_pattern: o |
3901 | 0 | .starts_for_each_pattern |
3902 | 0 | .or(self.starts_for_each_pattern), |
3903 | 0 | byte_classes: o.byte_classes.or(self.byte_classes), |
3904 | 0 | unicode_word_boundary: o |
3905 | 0 | .unicode_word_boundary |
3906 | 0 | .or(self.unicode_word_boundary), |
3907 | 0 | quitset: o.quitset.or(self.quitset), |
3908 | 0 | specialize_start_states: o |
3909 | 0 | .specialize_start_states |
3910 | 0 | .or(self.specialize_start_states), |
3911 | 0 | cache_capacity: o.cache_capacity.or(self.cache_capacity), |
3912 | 0 | skip_cache_capacity_check: o |
3913 | 0 | .skip_cache_capacity_check |
3914 | 0 | .or(self.skip_cache_capacity_check), |
3915 | 0 | minimum_cache_clear_count: o |
3916 | 0 | .minimum_cache_clear_count |
3917 | 0 | .or(self.minimum_cache_clear_count), |
3918 | 0 | minimum_bytes_per_state: o |
3919 | 0 | .minimum_bytes_per_state |
3920 | 0 | .or(self.minimum_bytes_per_state), |
3921 | | } |
3922 | 0 | } |
3923 | | } |
3924 | | |
3925 | | /// A builder for constructing a lazy deterministic finite automaton from |
3926 | | /// regular expressions. |
3927 | | /// |
3928 | | /// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The |
3929 | | /// advantage of the former is that it often lets you avoid importing the |
3930 | | /// `Builder` type directly. |
3931 | | /// |
3932 | | /// This builder provides two main things: |
3933 | | /// |
3934 | | /// 1. It provides a few different `build` routines for actually constructing |
3935 | | /// a DFA from different kinds of inputs. The most convenient is |
3936 | | /// [`Builder::build`], which builds a DFA directly from a pattern string. The |
3937 | | /// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight |
3938 | | /// from an NFA. |
3939 | | /// 2. The builder permits configuring a number of things. |
3940 | | /// [`Builder::configure`] is used with [`Config`] to configure aspects of |
3941 | | /// the DFA and the construction process itself. [`Builder::syntax`] and |
3942 | | /// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA |
3943 | | /// construction, respectively. The syntax and thompson configurations only |
3944 | | /// apply when building from a pattern string. |
3945 | | /// |
3946 | | /// This builder always constructs a *single* lazy DFA. As such, this builder |
3947 | | /// can only be used to construct regexes that either detect the presence |
3948 | | /// of a match or find the end location of a match. A single DFA cannot |
3949 | | /// produce both the start and end of a match. For that information, use a |
3950 | | /// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured |
3951 | | /// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason |
3952 | | /// to use a DFA directly is if the end location of a match is enough for your |
3953 | | /// use case. Namely, a `Regex` will construct two lazy DFAs instead of one, |
3954 | | /// since a second reverse DFA is needed to find the start of a match. |
3955 | | /// |
3956 | | /// # Example |
3957 | | /// |
3958 | | /// This example shows how to build a lazy DFA that uses a tiny cache capacity |
3959 | | /// and completely disables Unicode. That is: |
3960 | | /// |
3961 | | /// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` |
3962 | | /// and `\b` are ASCII-only while `.` matches any byte except for `\n` |
3963 | | /// (instead of any UTF-8 encoding of a Unicode scalar value except for |
3964 | | /// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. |
3965 | | /// * The pattern itself is permitted to match invalid UTF-8. For example, |
3966 | | /// things like `[^a]` that match any byte except for `a` are permitted. |
3967 | | /// |
3968 | | /// ``` |
3969 | | /// use regex_automata::{ |
3970 | | /// hybrid::dfa::DFA, |
3971 | | /// nfa::thompson, |
3972 | | /// util::syntax, |
3973 | | /// HalfMatch, Input, |
3974 | | /// }; |
3975 | | /// |
3976 | | /// let dfa = DFA::builder() |
3977 | | /// .configure(DFA::config().cache_capacity(5_000)) |
3978 | | /// .thompson(thompson::Config::new().utf8(false)) |
3979 | | /// .syntax(syntax::Config::new().unicode(false).utf8(false)) |
3980 | | /// .build(r"foo[^b]ar.*")?; |
3981 | | /// let mut cache = dfa.create_cache(); |
3982 | | /// |
3983 | | /// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; |
3984 | | /// let expected = Some(HalfMatch::must(0, 10)); |
3985 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
3986 | | /// assert_eq!(expected, got); |
3987 | | /// |
3988 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
3989 | | /// ``` |
#[derive(Clone, Debug)]
pub struct Builder {
    // The lazy DFA configuration, applied when building the DFA.
    config: Config,
    // The Thompson NFA compiler (regex parsing + NFA construction). Only
    // needed by the pattern-string `build` routines, and thus only present
    // when the `syntax` feature is enabled.
    #[cfg(feature = "syntax")]
    thompson: thompson::Compiler,
}
3996 | | |
3997 | | impl Builder { |
3998 | | /// Create a new lazy DFA builder with the default configuration. |
3999 | 0 | pub fn new() -> Builder { |
4000 | 0 | Builder { |
4001 | 0 | config: Config::default(), |
4002 | 0 | #[cfg(feature = "syntax")] |
4003 | 0 | thompson: thompson::Compiler::new(), |
4004 | 0 | } |
4005 | 0 | } |
4006 | | |
4007 | | /// Build a lazy DFA from the given pattern. |
4008 | | /// |
4009 | | /// If there was a problem parsing or compiling the pattern, then an error |
4010 | | /// is returned. |
4011 | | #[cfg(feature = "syntax")] |
4012 | 0 | pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { |
4013 | 0 | self.build_many(&[pattern]) |
4014 | 0 | } |
4015 | | |
4016 | | /// Build a lazy DFA from the given patterns. |
4017 | | /// |
4018 | | /// When matches are returned, the pattern ID corresponds to the index of |
4019 | | /// the pattern in the slice given. |
4020 | | #[cfg(feature = "syntax")] |
4021 | 0 | pub fn build_many<P: AsRef<str>>( |
4022 | 0 | &self, |
4023 | 0 | patterns: &[P], |
4024 | 0 | ) -> Result<DFA, BuildError> { |
4025 | 0 | let nfa = self |
4026 | 0 | .thompson |
4027 | 0 | .clone() |
4028 | 0 | // We can always forcefully disable captures because DFAs do not |
4029 | 0 | // support them. |
4030 | 0 | .configure( |
4031 | 0 | thompson::Config::new() |
4032 | 0 | .which_captures(thompson::WhichCaptures::None), |
4033 | 0 | ) |
4034 | 0 | .build_many(patterns) |
4035 | 0 | .map_err(BuildError::nfa)?; |
4036 | 0 | self.build_from_nfa(nfa) |
4037 | 0 | } |
4038 | | |
4039 | | /// Build a DFA from the given NFA. |
4040 | | /// |
4041 | | /// Note that this requires owning a `thompson::NFA`. While this may force |
4042 | | /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs |
4043 | | /// are defined internally to support shared ownership such that cloning is |
4044 | | /// very cheap. |
4045 | | /// |
4046 | | /// # Example |
4047 | | /// |
4048 | | /// This example shows how to build a lazy DFA if you already have an NFA |
4049 | | /// in hand. |
4050 | | /// |
4051 | | /// ``` |
4052 | | /// use regex_automata::{ |
4053 | | /// hybrid::dfa::DFA, |
4054 | | /// nfa::thompson, |
4055 | | /// HalfMatch, Input, |
4056 | | /// }; |
4057 | | /// |
4058 | | /// let haystack = "foo123bar"; |
4059 | | /// |
4060 | | /// // This shows how to set non-default options for building an NFA. |
4061 | | /// let nfa = thompson::Compiler::new() |
4062 | | /// .configure(thompson::Config::new().shrink(true)) |
4063 | | /// .build(r"[0-9]+")?; |
4064 | | /// let dfa = DFA::builder().build_from_nfa(nfa)?; |
4065 | | /// let mut cache = dfa.create_cache(); |
4066 | | /// let expected = Some(HalfMatch::must(0, 6)); |
4067 | | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
4068 | | /// assert_eq!(expected, got); |
4069 | | /// |
4070 | | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
4071 | | /// ``` |
4072 | 0 | pub fn build_from_nfa( |
4073 | 0 | &self, |
4074 | 0 | nfa: thompson::NFA, |
4075 | 0 | ) -> Result<DFA, BuildError> { |
4076 | 0 | let quitset = self.config.quit_set_from_nfa(&nfa)?; |
4077 | 0 | let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); |
4078 | | // Check that we can fit at least a few states into our cache, |
4079 | | // otherwise it's pretty senseless to use the lazy DFA. This does have |
4080 | | // a possible failure mode though. This assumes the maximum size of a |
4081 | | // state in powerset space (so, the total number of NFA states), which |
4082 | | // may never actually materialize, and could be quite a bit larger |
4083 | | // than the actual biggest state. If this turns out to be a problem, |
4084 | | // we could expose a knob that disables this check. But if so, we have |
4085 | | // to be careful not to panic in other areas of the code (the cache |
4086 | | // clearing and init code) that tend to assume some minimum useful |
4087 | | // cache capacity. |
4088 | 0 | let min_cache = minimum_cache_capacity( |
4089 | 0 | &nfa, |
4090 | 0 | &classes, |
4091 | 0 | self.config.get_starts_for_each_pattern(), |
4092 | | ); |
4093 | 0 | let mut cache_capacity = self.config.get_cache_capacity(); |
4094 | 0 | if cache_capacity < min_cache { |
4095 | | // When the caller has asked us to skip the cache capacity check, |
4096 | | // then we simply force the cache capacity to its minimum amount |
4097 | | // and mush on. |
4098 | 0 | if self.config.get_skip_cache_capacity_check() { |
4099 | 0 | debug!( |
4100 | 0 | "given capacity ({cache_capacity}) is too small, \ |
4101 | 0 | since skip_cache_capacity_check is enabled, \ |
4102 | 0 | setting cache capacity to minimum ({min_cache})", |
4103 | 0 | ); |
4104 | 0 | cache_capacity = min_cache; |
4105 | 0 | } else { |
4106 | 0 | return Err(BuildError::insufficient_cache_capacity( |
4107 | 0 | min_cache, |
4108 | 0 | cache_capacity, |
4109 | 0 | )); |
4110 | | } |
4111 | 0 | } |
4112 | | // We also need to check that we can fit at least some small number |
4113 | | // of states in our state ID space. This is unlikely to trigger in |
4114 | | // >=32-bit systems, but 16-bit systems have a pretty small state ID |
4115 | | // space since a number of bits are used up as sentinels. |
4116 | 0 | if let Err(err) = minimum_lazy_state_id(&classes) { |
4117 | 0 | return Err(BuildError::insufficient_state_id_capacity(err)); |
4118 | 0 | } |
4119 | 0 | let stride2 = classes.stride2(); |
4120 | 0 | let start_map = StartByteMap::new(nfa.look_matcher()); |
4121 | 0 | Ok(DFA { |
4122 | 0 | config: self.config.clone(), |
4123 | 0 | nfa, |
4124 | 0 | stride2, |
4125 | 0 | start_map, |
4126 | 0 | classes, |
4127 | 0 | quitset, |
4128 | 0 | cache_capacity, |
4129 | 0 | }) |
4130 | 0 | } |
4131 | | |
4132 | | /// Apply the given lazy DFA configuration options to this builder. |
4133 | 0 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
4134 | 0 | self.config = self.config.overwrite(config); |
4135 | 0 | self |
4136 | 0 | } |
4137 | | |
4138 | | /// Set the syntax configuration for this builder using |
4139 | | /// [`syntax::Config`](crate::util::syntax::Config). |
4140 | | /// |
4141 | | /// This permits setting things like case insensitivity, Unicode and multi |
4142 | | /// line mode. |
4143 | | /// |
4144 | | /// These settings only apply when constructing a lazy DFA directly from a |
4145 | | /// pattern. |
4146 | | #[cfg(feature = "syntax")] |
4147 | 0 | pub fn syntax( |
4148 | 0 | &mut self, |
4149 | 0 | config: crate::util::syntax::Config, |
4150 | 0 | ) -> &mut Builder { |
4151 | 0 | self.thompson.syntax(config); |
4152 | 0 | self |
4153 | 0 | } |
4154 | | |
4155 | | /// Set the Thompson NFA configuration for this builder using |
4156 | | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
4157 | | /// |
4158 | | /// This permits setting things like whether the DFA should match the regex |
4159 | | /// in reverse or if additional time should be spent shrinking the size of |
4160 | | /// the NFA. |
4161 | | /// |
4162 | | /// These settings only apply when constructing a DFA directly from a |
4163 | | /// pattern. |
4164 | | #[cfg(feature = "syntax")] |
4165 | 0 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
4166 | 0 | self.thompson.configure(config); |
4167 | 0 | self |
4168 | 0 | } |
4169 | | } |
4170 | | |
4171 | | /// Represents the current state of an overlapping search. |
4172 | | /// |
4173 | | /// This is used for overlapping searches since they need to know something |
4174 | | /// about the previous search. For example, when multiple patterns match at the |
4175 | | /// same position, this state tracks the last reported pattern so that the next |
4176 | | /// search knows whether to report another matching pattern or continue with |
4177 | | /// the search at the next position. Additionally, it also tracks which state |
4178 | | /// the last search call terminated in. |
4179 | | /// |
4180 | | /// This type provides little introspection capabilities. The only thing a |
4181 | | /// caller can do is construct it and pass it around to permit search routines |
4182 | | /// to use it to track state, and also ask whether a match has been found. |
4183 | | /// |
4184 | | /// Callers should always provide a fresh state constructed via |
4185 | | /// [`OverlappingState::start`] when starting a new search. Reusing state from |
4186 | | /// a previous search may result in incorrect results. |
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OverlappingState {
    /// The match reported by the most recent overlapping search to use this
    /// state.
    ///
    /// If a search does not find any matches, then it is expected to clear
    /// this value. (Callers observe it via `OverlappingState::get_match`.)
    pub(crate) mat: Option<HalfMatch>,
    /// The state ID of the state at which the search was in when the call
    /// terminated. When this is a match state, `last_match` must be set to a
    /// non-None value.
    ///
    /// A `None` value indicates the start state of the corresponding
    /// automaton. We cannot use the actual ID, since any one automaton may
    /// have many start states, and which one is in use depends on several
    /// search-time factors.
    pub(crate) id: Option<LazyStateID>,
    /// The position of the search.
    ///
    /// When `id` is None (i.e., we are starting a search), this is set to
    /// the beginning of the search as given by the caller regardless of its
    /// current value. Subsequent calls to an overlapping search pick up at
    /// this offset.
    pub(crate) at: usize,
    /// The index into the matching patterns of the next match to report if the
    /// current state is a match state. Note that this may be 1 greater than
    /// the total number of matches to report for the current match state. (In
    /// which case, no more matches should be reported at the current position
    /// and the search should advance to the next position.)
    pub(crate) next_match_index: Option<usize>,
    /// This is set to true when a reverse overlapping search has entered its
    /// EOI transitions.
    ///
    /// This isn't used in a forward search because it knows to stop once the
    /// position exceeds the end of the search range. In a reverse search,
    /// since we use unsigned offsets, we don't "know" once we've gone past
    /// `0`. So the only way to detect it is with this extra flag. The reverse
    /// overlapping search knows to terminate specifically after it has
    /// reported all matches after following the EOI transition.
    pub(crate) rev_eoi: bool,
}
4228 | | |
4229 | | impl OverlappingState { |
4230 | | /// Create a new overlapping state that begins at the start state of any |
4231 | | /// automaton. |
4232 | 0 | pub fn start() -> OverlappingState { |
4233 | 0 | OverlappingState { |
4234 | 0 | mat: None, |
4235 | 0 | id: None, |
4236 | 0 | at: 0, |
4237 | 0 | next_match_index: None, |
4238 | 0 | rev_eoi: false, |
4239 | 0 | } |
4240 | 0 | } |
4241 | | |
4242 | | /// Return the match result of the most recent search to execute with this |
4243 | | /// state. |
4244 | | /// |
4245 | | /// A searches will clear this result automatically, such that if no |
4246 | | /// match is found, this will correctly report `None`. |
4247 | 0 | pub fn get_match(&self) -> Option<HalfMatch> { |
4248 | 0 | self.mat |
4249 | 0 | } |
4250 | | } |
4251 | | |
4252 | | /// Runs the given overlapping `search` function (forwards or backwards) until |
4253 | | /// a match is found whose offset does not split a codepoint. |
4254 | | /// |
4255 | | /// This is *not* always correct to call. It should only be called when the |
4256 | | /// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width |
4257 | | /// matches. Calling this when both of those things aren't true might result |
4258 | | /// in legitimate matches getting skipped. |
4259 | | #[cold] |
4260 | | #[inline(never)] |
4261 | 0 | fn skip_empty_utf8_splits_overlapping<F>( |
4262 | 0 | input: &Input<'_>, |
4263 | 0 | state: &mut OverlappingState, |
4264 | 0 | mut search: F, |
4265 | 0 | ) -> Result<(), MatchError> |
4266 | 0 | where |
4267 | 0 | F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, |
4268 | | { |
4269 | | // Note that this routine works for forwards and reverse searches |
4270 | | // even though there's no code here to handle those cases. That's |
4271 | | // because overlapping searches drive themselves to completion via |
4272 | | // `OverlappingState`. So all we have to do is push it until no matches are |
4273 | | // found. |
4274 | | |
4275 | 0 | let mut hm = match state.get_match() { |
4276 | 0 | None => return Ok(()), |
4277 | 0 | Some(hm) => hm, |
4278 | | }; |
4279 | 0 | if input.get_anchored().is_anchored() { |
4280 | 0 | if !input.is_char_boundary(hm.offset()) { |
4281 | 0 | state.mat = None; |
4282 | 0 | } |
4283 | 0 | return Ok(()); |
4284 | 0 | } |
4285 | 0 | while !input.is_char_boundary(hm.offset()) { |
4286 | 0 | search(input, state)?; |
4287 | 0 | hm = match state.get_match() { |
4288 | 0 | None => return Ok(()), |
4289 | 0 | Some(hm) => hm, |
4290 | | }; |
4291 | | } |
4292 | 0 | Ok(()) |
4293 | 0 | } |
4294 | | |
4295 | | /// Based on the minimum number of states required for a useful lazy DFA cache, |
4296 | | /// this returns the minimum lazy state ID that must be representable. |
4297 | | /// |
4298 | | /// It's not likely for this to have any impact 32-bit systems (or higher), but |
4299 | | /// on 16-bit systems, the lazy state ID space is quite constrained and thus |
4300 | | /// may be insufficient if our MIN_STATES value is (for some reason) too high. |
4301 | 0 | fn minimum_lazy_state_id( |
4302 | 0 | classes: &ByteClasses, |
4303 | 0 | ) -> Result<LazyStateID, LazyStateIDError> { |
4304 | 0 | let stride = 1 << classes.stride2(); |
4305 | 0 | let min_state_index = MIN_STATES.checked_sub(1).unwrap(); |
4306 | 0 | LazyStateID::new(min_state_index * stride) |
4307 | 0 | } |
4308 | | |
4309 | | /// Based on the minimum number of states required for a useful lazy DFA cache, |
4310 | | /// this returns a heuristic minimum number of bytes of heap space required. |
4311 | | /// |
4312 | | /// This is a "heuristic" because the minimum it returns is likely bigger than |
4313 | | /// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses |
4314 | | /// the maximum number of NFA states (all of them). This is likely bigger |
4315 | | /// than what is required in practice. Computing the true minimum effectively |
4316 | | /// requires determinization, which is probably too much work to do for a |
4317 | | /// simple check like this. |
4318 | | /// |
4319 | | /// One of the issues with this approach IMO is that it requires that this |
4320 | | /// be in sync with the calculation above for computing how much heap memory |
4321 | | /// the DFA cache uses. If we get it wrong, it's possible for example for the |
4322 | | /// minimum to be smaller than the computed heap memory, and thus, it may be |
4323 | | /// the case that we can't add the required minimum number of states. That in |
4324 | | /// turn will make lazy DFA panic because we assume that we can add at least a |
4325 | | /// minimum number of states. |
4326 | | /// |
4327 | | /// Another approach would be to always allow the minimum number of states to |
4328 | | /// be added to the lazy DFA cache, even if it exceeds the configured cache |
4329 | | /// limit. This does mean that the limit isn't really a limit in all cases, |
4330 | | /// which is unfortunate. But it does at least guarantee that the lazy DFA can |
4331 | | /// always make progress, even if it is slow. (This approach is very similar to |
4332 | | /// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't |
4333 | | /// rely on cache size calculation. Instead, it would just always permit a |
4334 | | /// minimum number of states to be added.) |
4335 | 0 | fn minimum_cache_capacity( |
4336 | 0 | nfa: &thompson::NFA, |
4337 | 0 | classes: &ByteClasses, |
4338 | 0 | starts_for_each_pattern: bool, |
4339 | 0 | ) -> usize { |
4340 | | const ID_SIZE: usize = size_of::<LazyStateID>(); |
4341 | | const STATE_SIZE: usize = size_of::<State>(); |
4342 | | |
4343 | 0 | let stride = 1 << classes.stride2(); |
4344 | 0 | let states_len = nfa.states().len(); |
4345 | 0 | let sparses = 2 * states_len * NFAStateID::SIZE; |
4346 | 0 | let trans = MIN_STATES * stride * ID_SIZE; |
4347 | | |
4348 | 0 | let mut starts = Start::len() * ID_SIZE; |
4349 | 0 | if starts_for_each_pattern { |
4350 | 0 | starts += (Start::len() * nfa.pattern_len()) * ID_SIZE; |
4351 | 0 | } |
4352 | | |
4353 | | // The min number of states HAS to be at least 4: we have 3 sentinel states |
4354 | | // and then we need space for one more when we save a state after clearing |
4355 | | // the cache. We also need space for one more, otherwise we get stuck in a |
4356 | | // loop where we try to add a 5th state, which gets rejected, which clears |
4357 | | // the cache, which adds back a saved state (4th total state) which then |
4358 | | // tries to add the 5th state again. |
4359 | 0 | assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5"); |
4360 | | // The minimum number of non-sentinel states. We consider this separately |
4361 | | // because sentinel states are much smaller in that they contain no NFA |
4362 | | // states. Given our aggressive calculation here, it's worth being more |
4363 | | // precise with the number of states we need. |
4364 | 0 | let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap(); |
4365 | | |
4366 | | // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of |
4367 | | // patterns, followed by 32-bit encodings of patterns and then delta |
4368 | | // varint encodings of NFA state IDs. We use the worst case (which isn't |
4369 | | // technically possible) of 5 bytes for each NFA state ID. |
4370 | | // |
4371 | | // HOWEVER, three of the states needed by a lazy DFA are just the sentinel |
4372 | | // unknown, dead and quit states. Those states have a known size and it is |
4373 | | // small. |
4374 | 0 | let dead_state_size = State::dead().memory_usage(); |
4375 | 0 | let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5); |
4376 | 0 | let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size)) |
4377 | 0 | + (non_sentinel * (STATE_SIZE + max_state_size)); |
4378 | | // NOTE: We don't double count heap memory used by State for this map since |
4379 | | // we use reference counting to avoid doubling memory usage. (This tends to |
4380 | | // be where most memory is allocated in the cache.) |
4381 | 0 | let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE); |
4382 | 0 | let stack = states_len * NFAStateID::SIZE; |
4383 | 0 | let scratch_state_builder = max_state_size; |
4384 | | |
4385 | 0 | trans |
4386 | 0 | + starts |
4387 | 0 | + states |
4388 | 0 | + states_to_sid |
4389 | 0 | + sparses |
4390 | 0 | + stack |
4391 | 0 | + scratch_state_builder |
4392 | 0 | } |
4393 | | |
#[cfg(all(test, feature = "syntax"))]
mod tests {
    use super::*;

    // Tests that we handle heuristic Unicode word boundary support in reverse
    // DFAs in the specific case of contextual searches.
    //
    // I wrote this test when I discovered a bug in how heuristic word
    // boundaries were handled. Namely, that the starting state selection
    // didn't consider the DFA's quit byte set when looking at the byte
    // immediately before the start of the search (or immediately after the
    // end of the search in the case of a reverse search). As a result, it was
    // possible for '\bfoo\b' to match 'β123' because the trailing \xB2 byte
    // in the 'β' codepoint would be treated as a non-word character. But of
    // course, this search should trigger the DFA to quit, since there is a
    // non-ASCII byte in consideration.
    //
    // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
    // if it wasn't empty. The forward case is tested in the doc test for the
    // Config::unicode_word_boundary API. We test the reverse case here, which
    // is sufficiently niche that it doesn't really belong in a doc test.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap();
        let mut cache = dfa.create_cache();

        // 'β' is encoded as \xCE\xB2, so starting the range at offset 2
        // places the trailing \xB2 byte immediately before the search. The
        // reverse search must quit on that byte rather than treat it as a
        // non-word character.
        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_rev(&mut cache, &input);
        assert_eq!(Err(expected), got);

        // Symmetric case: ending the range at offset 3 places the leading
        // \xCE byte immediately after the search.
        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_rev(&mut cache, &input);
        assert_eq!(Err(expected), got);
    }
}