/src/regex/regex-automata/src/util/determinize/mod.rs
Line | Count | Source |
1 | | /*! |
2 | | This module contains types and routines for implementing determinization. |
3 | | |
4 | | In this crate, there are at least two places where we implement |
5 | | determinization: fully ahead-of-time compiled DFAs in the `dfa` module and |
6 | | lazily compiled DFAs in the `hybrid` module. The stuff in this module |
7 | | corresponds to the things that are in common between these implementations. |
8 | | |
9 | | There are three broad things that our implementations of determinization have |
10 | | in common, as defined by this module: |
11 | | |
12 | | * The classification of start states. That is, whether we're dealing with |
13 | | word boundaries, line boundaries, etc., is all the same. This also includes |
14 | | the look-behind assertions that are satisfied by each starting state |
15 | | classification. |
16 | | * The representation of DFA states as sets of NFA states, including |
17 | | convenience types for building these DFA states that are amenable to reusing |
18 | | allocations. |
19 | | * Routines for the "classical" parts of determinization: computing the |
20 | | epsilon closure, tracking match states (with corresponding pattern IDs, since |
21 | | we support multi-pattern finite automata) and, of course, computing the |
22 | | transition function between states for units of input. |
23 | | |
24 | | I did consider a couple of alternatives to this particular form of code reuse: |
25 | | |
26 | | 1. Don't do any code reuse. The problem here is that we *really* want both |
27 | | forms of determinization to do exactly identical things when it comes to |
28 | | their handling of NFA states. While our tests generally ensure this, the code |
29 | | is tricky and large enough where not reusing code is a pretty big bummer. |
30 | | |
31 | | 2. Implement all of determinization once and make it generic over fully |
32 | | compiled DFAs and lazily compiled DFAs. While I didn't actually try this |
33 | | approach, my instinct is that it would be more complex than is needed here. |
34 | | And the interface required would be pretty hairy. Instead, I think splitting |
35 | | it into logical sub-components works better. |
36 | | */ |
37 | | |
38 | | use alloc::vec::Vec; |
39 | | |
40 | | pub(crate) use self::state::{ |
41 | | State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, |
42 | | }; |
43 | | |
44 | | use crate::{ |
45 | | nfa::thompson, |
46 | | util::{ |
47 | | alphabet, |
48 | | look::{Look, LookSet}, |
49 | | primitives::StateID, |
50 | | search::MatchKind, |
51 | | sparse_set::{SparseSet, SparseSets}, |
52 | | start::Start, |
53 | | utf8, |
54 | | }, |
55 | | }; |
56 | | |
57 | | mod state; |
58 | | |
/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single unit of input. The set of reachable
/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
/// also includes any look-behind assertions satisfied by `unit`, in addition
/// to whether it is a match state. For multi-pattern DFAs, the builder will
/// also include the pattern IDs that match (in the order seen).
///
/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
/// must have capacity equivalent to `nfa.len()`.
///
/// `match_kind` should correspond to the match semantics implemented by the
/// DFA being built. Generally speaking, for leftmost-first match semantics,
/// states that appear after the first NFA match state will not be included in
/// the `StateBuilderNFA` returned since they are impossible to visit.
///
/// `sparses` is used as scratch space for NFA traversal. Other than their
/// capacity requirements (detailed above), there are no requirements on what's
/// contained within them (if anything). Similarly, what's inside of them once
/// this routine returns is unspecified.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
///
/// `state` corresponds to the current DFA state on which one wants to compute
/// the transition for the input `unit`.
///
/// `empty_builder` corresponds to the builder allocation to use to produce a
/// complete `StateBuilderNFA` state. If the state is not needed (or is already
/// cached), then it can be cleared and reused without needing to create a new
/// `State`. The `StateBuilderNFA` state returned is final and ready to be
/// turned into a `State` if necessary.
pub(crate) fn next(
    nfa: &thompson::NFA,
    match_kind: MatchKind,
    sparses: &mut SparseSets,
    stack: &mut Vec<StateID>,
    state: &State,
    unit: alphabet::Unit,
    empty_builder: StateBuilderEmpty,
) -> StateBuilderNFA {
    sparses.clear();

    // Whether the NFA is matched in reverse or not. We use this in some
    // conditional logic for dealing with the exceptionally annoying CRLF-aware
    // line anchors.
    let rev = nfa.is_reverse();
    // The look-around matcher that our NFA is configured with. We don't
    // actually use it to match look-around assertions, but we do need its
    // configuration (e.g., the line terminator byte) for constructing states
    // consistent with how it matches.
    let lookm = nfa.look_matcher();

    // Put the NFA state IDs into a sparse set in case we need to
    // re-compute their epsilon closure.
    //
    // Doing this state shuffling is technically not necessary unless some
    // kind of look-around is used in the DFA. Some ad hoc experiments
    // suggested that avoiding this didn't lead to much of an improvement,
    // but perhaps more rigorous experimentation should be done. And in
    // particular, avoiding this check requires some light refactoring of
    // the code below.
    state.iter_nfa_state_ids(|nfa_id| {
        sparses.set1.insert(nfa_id);
    });

    // Compute look-ahead assertions originating from the current state. Based
    // on the input unit we're transitioning over, some additional set of
    // assertions may be true. Thus, we re-compute this state's epsilon closure
    // (but only if necessary). Notably, when we build a DFA state initially,
    // we don't enable any look-ahead assertions because we don't know whether
    // they're true or not at that point.
    if !state.look_need().is_empty() {
        // Add look-ahead assertions that are now true based on the current
        // input unit.
        let mut look_have = state.look_have();
        match unit.as_u8() {
            Some(b'\r') => {
                // In the forward direction, a \r always ends a CRLF-aware
                // line. In reverse, it only does so when we haven't already
                // seen half of a \r\n pair.
                if !rev || !state.is_half_crlf() {
                    look_have = look_have.insert(Look::EndCRLF);
                }
            }
            Some(b'\n') => {
                // Mirror image of the \r case above for the reverse NFA.
                if rev || !state.is_half_crlf() {
                    look_have = look_have.insert(Look::EndCRLF);
                }
            }
            Some(_) => {}
            None => {
                // The EOI sentinel satisfies every flavor of "end" anchor.
                look_have = look_have
                    .insert(Look::End)
                    .insert(Look::EndLF)
                    .insert(Look::EndCRLF);
            }
        }
        if unit.is_byte(lookm.get_line_terminator()) {
            look_have = look_have.insert(Look::EndLF);
        }
        // A CRLF-aware ^ is satisfied when we've seen half of a \r\n pair,
        // unless the current unit completes that pair (direction-dependent).
        if state.is_half_crlf()
            && ((rev && !unit.is_byte(b'\r'))
                || (!rev && !unit.is_byte(b'\n')))
        {
            look_have = look_have.insert(Look::StartCRLF);
        }
        // Word boundary assertions: a boundary exists precisely when the
        // word-ness of the look-behind byte differs from that of `unit`.
        if state.is_from_word() == unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordAsciiNegate)
                .insert(Look::WordUnicodeNegate);
        } else {
            look_have =
                look_have.insert(Look::WordAscii).insert(Look::WordUnicode);
        }
        if !unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordEndHalfAscii)
                .insert(Look::WordEndHalfUnicode);
        }
        if state.is_from_word() && !unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordEndAscii)
                .insert(Look::WordEndUnicode);
        } else if !state.is_from_word() && unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordStartAscii)
                .insert(Look::WordStartUnicode);
        }
        // If we have new assertions satisfied that are among the set of
        // assertions that exist in this state (that is, just because we added
        // an EndLF assertion above doesn't mean there is an EndLF conditional
        // epsilon transition in this state), then we re-compute this state's
        // epsilon closure using the updated set of assertions.
        //
        // Note that since our DFA states omit unconditional epsilon
        // transitions, this check is necessary for correctness. If we re-did
        // the epsilon closure below needlessly, it could change based on the
        // fact that we omitted epsilon states originally.
        if !look_have
            .subtract(state.look_have())
            .intersect(state.look_need())
            .is_empty()
        {
            for nfa_id in sparses.set1.iter() {
                epsilon_closure(
                    nfa,
                    nfa_id,
                    look_have,
                    stack,
                    &mut sparses.set2,
                );
            }
            sparses.swap();
            sparses.set2.clear();
        }
    }

    // Convert our empty builder into one that can record assertions and match
    // pattern IDs.
    let mut builder = empty_builder.into_matches();
    // Set whether the StartLF look-behind assertion is true for this
    // transition or not. The look-behind assertion for ASCII word boundaries
    // is handled below.
    if nfa.look_set_any().contains_anchor_line()
        && unit.is_byte(lookm.get_line_terminator())
    {
        // Why only handle StartLF here and not Start? That's because Start
        // can only impact the starting state, which is special cased in
        // start state handling.
        builder.set_look_have(|have| have.insert(Look::StartLF));
    }
    // We also need to add StartCRLF to our assertions too, if we can. This
    // is unfortunately a bit more complicated, because it depends on the
    // direction of the search. In the forward direction, ^ matches after a
    // \n, but in the reverse direction, ^ only matches after a \r. (This is
    // further complicated by the fact that reversing a regex means changing a
    // ^ to a $ and vice versa.)
    if nfa.look_set_any().contains_anchor_crlf()
        && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n')))
    {
        builder.set_look_have(|have| have.insert(Look::StartCRLF));
    }
    // And also for the start-half word boundary assertions. As long as the
    // look-behind byte is not a word char, then the assertions are satisfied.
    if nfa.look_set_any().contains_word() && !unit.is_word_byte() {
        builder.set_look_have(|have| {
            have.insert(Look::WordStartHalfAscii)
                .insert(Look::WordStartHalfUnicode)
        });
    }
    for nfa_id in sparses.set1.iter() {
        match *nfa.state(nfa_id) {
            thompson::State::Union { .. }
            | thompson::State::BinaryUnion { .. }
            | thompson::State::Fail
            | thompson::State::Look { .. }
            | thompson::State::Capture { .. } => {}
            thompson::State::Match { pattern_id } => {
                // Notice here that we are calling the NEW state a match
                // state if the OLD state we are transitioning from
                // contains an NFA match state. This is precisely how we
                // delay all matches by one byte and also what therefore
                // guarantees that starting states cannot be match states.
                //
                // If we didn't delay matches by one byte, then whether
                // a DFA is a matching state or not would be determined
                // by whether one of its own constituent NFA states
                // was a match state. (And that would be done in
                // 'add_nfa_states'.)
                //
                // Also, 'add_match_pattern_id' requires that callers never
                // pass duplicative pattern IDs. We do in fact uphold that
                // guarantee here, but it's subtle. In particular, a Thompson
                // NFA guarantees that each pattern has exactly one match
                // state. Moreover, since we're iterating over the NFA state
                // IDs in a set, we are guaranteed not to have any duplicative
                // match states. Thus, it is impossible to add the same pattern
                // ID more than once.
                //
                // N.B. We delay matches by 1 byte as a way to hack 1-byte
                // look-around into DFA searches. This lets us support ^, $
                // and ASCII-only \b. The delay is also why we need a special
                // "end-of-input" (EOI) sentinel and why we need to follow the
                // EOI sentinel at the end of every search. This final EOI
                // transition is necessary to report matches found at the end
                // of a haystack.
                builder.add_match_pattern_id(pattern_id);
                if !match_kind.continue_past_first_match() {
                    break;
                }
            }
            thompson::State::ByteRange { ref trans } => {
                if trans.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        trans.next,
                        builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
            thompson::State::Sparse(ref sparse) => {
                if let Some(next) = sparse.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        next,
                        builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
            thompson::State::Dense(ref dense) => {
                if let Some(next) = dense.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        next,
                        builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
        }
    }
    // We only set the word byte if there's a word boundary look-around
    // anywhere in this regex. Otherwise, there's no point in bloating the
    // number of states if we don't have one.
    //
    // We also only set it when the state has a non-zero number of NFA states.
    // Otherwise, we could wind up with states that *should* be DEAD states
    // but are otherwise distinct from DEAD states because of this look-behind
    // assertion being set. While this can't technically impact correctness *in
    // theory*, it can create pathological DFAs that consume input until EOI or
    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
    // but a (serious) perf problem. Hitting a quit byte, however, could be a
    // correctness problem since it could cause search routines to report an
    // error instead of a detected match once the quit state is entered. (The
    // search routine could be made to be a bit smarter by reporting a match
    // if one was detected once it enters a quit state (and indeed, the search
    // routines in this crate do just that), but it seems better to prevent
    // these things by construction if possible.)
    if !sparses.set2.is_empty() {
        if nfa.look_set_any().contains_word() && unit.is_word_byte() {
            builder.set_is_from_word();
        }
        if nfa.look_set_any().contains_anchor_crlf()
            && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r')))
        {
            builder.set_is_half_crlf();
        }
    }
    let mut builder_nfa = builder.into_nfa();
    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
    builder_nfa
}
354 | | |
355 | | /// Compute the epsilon closure for the given NFA state. The epsilon closure |
356 | | /// consists of all NFA state IDs, including `start_nfa_id`, that can be |
357 | | /// reached from `start_nfa_id` without consuming any input. These state IDs |
358 | | /// are written to `set` in the order they are visited, but only if they are |
359 | | /// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA |
360 | | /// given. |
361 | | /// |
362 | | /// `look_have` consists of the satisfied assertions at the current |
363 | | /// position. For conditional look-around epsilon transitions, these are |
364 | | /// only followed if they are satisfied by `look_have`. |
365 | | /// |
366 | | /// `stack` must have length 0. It is used as scratch space for depth first |
367 | | /// traversal. After returning, it is guaranteed that `stack` will have length |
368 | | /// 0. |
369 | 1.02G | pub(crate) fn epsilon_closure( |
370 | 1.02G | nfa: &thompson::NFA, |
371 | 1.02G | start_nfa_id: StateID, |
372 | 1.02G | look_have: LookSet, |
373 | 1.02G | stack: &mut Vec<StateID>, |
374 | 1.02G | set: &mut SparseSet, |
375 | 1.02G | ) { |
376 | 1.02G | assert!(stack.is_empty()); |
377 | | // If this isn't an epsilon state, then the epsilon closure is always just |
378 | | // itself, so there's no need to spin up the machinery below to handle it. |
379 | 1.02G | if !nfa.state(start_nfa_id).is_epsilon() { |
380 | 516M | set.insert(start_nfa_id); |
381 | 516M | return; |
382 | 509M | } |
383 | | |
384 | 509M | stack.push(start_nfa_id); |
385 | 2.47G | while let Some(mut id) = stack.pop() { |
386 | | // In many cases, we can avoid stack operations when an NFA state only |
387 | | // adds one new state to visit. In that case, we just set our ID to |
388 | | // that state and mush on. We only use the stack when an NFA state |
389 | | // introduces multiple new states to visit. |
390 | | loop { |
391 | | // Insert this NFA state, and if it's already in the set and thus |
392 | | // already visited, then we can move on to the next one. |
393 | 2.66G | if !set.insert(id) { |
394 | 353M | break; |
395 | 2.31G | } |
396 | 2.31G | match *nfa.state(id) { |
397 | | thompson::State::ByteRange { .. } |
398 | | | thompson::State::Sparse { .. } |
399 | | | thompson::State::Dense { .. } |
400 | | | thompson::State::Fail |
401 | 1.21G | | thompson::State::Match { .. } => break, |
402 | 471M | thompson::State::Look { look, next } => { |
403 | 471M | if !look_have.contains(look) { |
404 | 393M | break; |
405 | 77.9M | } |
406 | 77.9M | id = next; |
407 | | } |
408 | 152M | thompson::State::Union { ref alternates } => { |
409 | 152M | id = match alternates.get(0) { |
410 | 0 | None => break, |
411 | 152M | Some(&id) => id, |
412 | | }; |
413 | | // We need to process our alternates in order to preserve |
414 | | // match preferences, so put the earliest alternates closer |
415 | | // to the top of the stack. |
416 | 152M | stack.extend(alternates[1..].iter().rev()); |
417 | | } |
418 | 216M | thompson::State::BinaryUnion { alt1, alt2 } => { |
419 | 216M | id = alt1; |
420 | 216M | stack.push(alt2); |
421 | 216M | } |
422 | 258M | thompson::State::Capture { next, .. } => { |
423 | 258M | id = next; |
424 | 258M | } |
425 | | } |
426 | | } |
427 | | } |
428 | 1.02G | } |
429 | | |
430 | | /// Add the NFA state IDs in the given `set` to the given DFA builder state. |
431 | | /// The order in which states are added corresponds to the order in which they |
432 | | /// were added to `set`. |
433 | | /// |
434 | | /// The DFA builder state given should already have its complete set of match |
435 | | /// pattern IDs added (if any) and any look-behind assertions (StartLF, Start |
436 | | /// and whether this state is being generated for a transition over a word byte |
437 | | /// when applicable) that are true immediately prior to transitioning into this |
438 | | /// state (via `builder.look_have()`). The match pattern IDs should correspond |
439 | | /// to matches that occurred on the previous transition, since all matches are |
440 | | /// delayed by one byte. The things that should _not_ be set are look-ahead |
441 | | /// assertions (EndLF, End and whether the next byte is a word byte or not). |
442 | | /// The builder state should also not have anything in `look_need` set, as this |
443 | | /// routine will compute that for you. |
444 | | /// |
445 | | /// The given NFA should be able to resolve all identifiers in `set` to a |
446 | | /// particular NFA state. Additionally, `set` must have capacity equivalent |
447 | | /// to `nfa.len()`. |
448 | 16.6M | pub(crate) fn add_nfa_states( |
449 | 16.6M | nfa: &thompson::NFA, |
450 | 16.6M | set: &SparseSet, |
451 | 16.6M | builder: &mut StateBuilderNFA, |
452 | 16.6M | ) { |
453 | 2.38G | for nfa_id in set.iter() { |
454 | 2.38G | match *nfa.state(nfa_id) { |
455 | 1.12G | thompson::State::ByteRange { .. } => { |
456 | 1.12G | builder.add_nfa_state_id(nfa_id); |
457 | 1.12G | } |
458 | 380M | thompson::State::Sparse { .. } => { |
459 | 380M | builder.add_nfa_state_id(nfa_id); |
460 | 380M | } |
461 | 0 | thompson::State::Dense { .. } => { |
462 | 0 | builder.add_nfa_state_id(nfa_id); |
463 | 0 | } |
464 | 364M | thompson::State::Look { look, .. } => { |
465 | 364M | builder.add_nfa_state_id(nfa_id); |
466 | 364M | builder.set_look_need(|need| need.insert(look)); |
467 | | } |
468 | | thompson::State::Union { .. } |
469 | 310M | | thompson::State::BinaryUnion { .. } => { |
470 | 310M | // Pure epsilon transitions don't need to be tracked as part |
471 | 310M | // of the DFA state. Tracking them is actually superfluous; |
472 | 310M | // they won't cause any harm other than making determinization |
473 | 310M | // slower. |
474 | 310M | // |
475 | 310M | // Why aren't these needed? Well, in an NFA, epsilon |
476 | 310M | // transitions are really just jumping points to other states. |
477 | 310M | // So once you hit an epsilon transition, the same set of |
478 | 310M | // resulting states always appears. Therefore, putting them in |
479 | 310M | // a DFA's set of ordered NFA states is strictly redundant. |
480 | 310M | // |
481 | 310M | // Look-around states are also epsilon transitions, but |
482 | 310M | // they are *conditional*. So their presence could be |
483 | 310M | // discriminatory, and thus, they are tracked above. |
484 | 310M | // |
485 | 310M | // But wait... why are epsilon states in our `set` in the first |
486 | 310M | // place? Why not just leave them out? They're in our `set` |
487 | 310M | // because it was generated by computing an epsilon closure, |
488 | 310M | // and we want to keep track of all states we visited to avoid |
489 | 310M | // re-visiting them. In exchange, we have to do this second |
490 | 310M | // iteration over our collected states to finalize our DFA |
491 | 310M | // state. In theory, we could avoid this second iteration if |
492 | 310M | // we maintained two sets during epsilon closure: the set of |
493 | 310M | // visited states (to avoid cycles) and the set of states that |
494 | 310M | // will actually be used to construct the next DFA state. |
495 | 310M | // |
496 | 310M | // Note that this optimization requires that we re-compute the |
497 | 310M | // epsilon closure to account for look-ahead in 'next' *only |
498 | 310M | // when necessary*. Namely, only when the set of look-around |
499 | 310M | // assertions changes and only when those changes are within |
500 | 310M | // the set of assertions that are needed in order to step |
501 | 310M | // through the closure correctly. Otherwise, if we re-do the |
502 | 310M | // epsilon closure needlessly, it could change based on the |
503 | 310M | // fact that we are omitting epsilon states here. |
504 | 310M | // |
505 | 310M | // ----- |
506 | 310M | // |
507 | 310M | // Welp, scratch the above. It turns out that recording these |
508 | 310M | // is in fact necessary to seemingly handle one particularly |
509 | 310M | // annoying case: when a conditional epsilon transition is |
510 | 310M | // put inside of a repetition operator. One specific case I |
511 | 310M | // ran into was the regex `(?:\b|%)+` on the haystack `z%`. |
512 | 310M | // The correct leftmost first matches are: [0, 0] and [1, 1]. |
513 | 310M | // But the DFA was reporting [0, 0] and [1, 2]. To understand |
514 | 310M | // why this happens, consider the NFA for the aforementioned |
515 | 310M | // regex: |
516 | 310M | // |
517 | 310M | // >000000: binary-union(4, 1) |
518 | 310M | // 000001: \x00-\xFF => 0 |
519 | 310M | // 000002: WordAscii => 5 |
520 | 310M | // 000003: % => 5 |
521 | 310M | // ^000004: binary-union(2, 3) |
522 | 310M | // 000005: binary-union(4, 6) |
523 | 310M | // 000006: MATCH(0) |
524 | 310M | // |
525 | 310M | // The problem here is that one of the DFA start states is |
526 | 310M | // going to consist of the NFA states [2, 3] by computing the |
527 | 310M | // epsilon closure of state 4. State 4 isn't included because |
528 | 310M | // we previously were not keeping track of union states. But |
529 | 310M | // only a subset of transitions out of this state will be able |
530 | 310M | // to follow WordAscii, and in those cases, the epsilon closure |
531 | 310M | // is redone. The only problem is that computing the epsilon |
532 | 310M | // closure from [2, 3] is different than computing the epsilon |
533 | 310M | // closure from [4]. In the former case, assuming the WordAscii |
534 | 310M | // assertion is satisfied, you get: [2, 3, 6]. In the latter |
535 | 310M | // case, you get: [2, 6, 3]. Notice that '6' is the match state |
536 | 310M | // and appears AFTER '3' in the former case. This leads to a |
537 | 310M | // preferential but incorrect match of '%' before returning |
538 | 310M | // a match. In the latter case, the match is preferred over |
539 | 310M | // continuing to accept the '%'. |
540 | 310M | // |
541 | 310M | // It almost feels like we might be able to fix the NFA states |
542 | 310M | // to avoid this, or to at least only keep track of union |
543 | 310M | // states where this actually matters, since in the vast |
544 | 310M | // majority of cases, this doesn't matter. |
545 | 310M | // |
546 | 310M | // Another alternative would be to define a new HIR property |
547 | 310M | // called "assertion is repeated anywhere" and compute it |
548 | 310M | // inductively over the entire pattern. If it happens anywhere, |
549 | 310M | // which is probably pretty rare, then we record union states. |
550 | 310M | // Otherwise we don't. |
551 | 310M | builder.add_nfa_state_id(nfa_id); |
552 | 310M | } |
553 | | // Capture states we definitely do not need to record, since they |
554 | | // are unconditional epsilon transitions with no branching. |
555 | 204M | thompson::State::Capture { .. } => {} |
556 | | // It's not totally clear whether we need to record fail states or |
557 | | // not, but we do so out of an abundance of caution. Since they are |
558 | | // quite rare in practice, there isn't much cost to recording them. |
559 | 611k | thompson::State::Fail => { |
560 | 611k | builder.add_nfa_state_id(nfa_id); |
561 | 611k | } |
562 | 1.34M | thompson::State::Match { .. } => { |
563 | 1.34M | // Normally, the NFA match state doesn't actually need to |
564 | 1.34M | // be inside the DFA state. But since we delay matches by |
565 | 1.34M | // one byte, the matching DFA state corresponds to states |
566 | 1.34M | // that transition from the one we're building here. And |
567 | 1.34M | // the way we detect those cases is by looking for an NFA |
568 | 1.34M | // match state. See 'next' for how this is handled. |
569 | 1.34M | builder.add_nfa_state_id(nfa_id); |
570 | 1.34M | } |
571 | | } |
572 | | } |
573 | | // If we know this state contains no look-around assertions, then |
574 | | // there's no reason to track which look-around assertions were |
575 | | // satisfied when this state was created. |
576 | 16.6M | if builder.look_need().is_empty() { |
577 | 15.0M | builder.set_look_have(|_| LookSet::empty()); |
578 | 1.64M | } |
579 | 16.6M | } |
580 | | |
581 | | /// Sets the appropriate look-behind assertions on the given state based on |
582 | | /// this starting configuration. |
583 | 387k | pub(crate) fn set_lookbehind_from_start( |
584 | 387k | nfa: &thompson::NFA, |
585 | 387k | start: &Start, |
586 | 387k | builder: &mut StateBuilderMatches, |
587 | 387k | ) { |
588 | 387k | let rev = nfa.is_reverse(); |
589 | 387k | let lineterm = nfa.look_matcher().get_line_terminator(); |
590 | 387k | let lookset = nfa.look_set_any(); |
591 | 387k | match *start { |
592 | | Start::NonWordByte => { |
593 | 208k | if lookset.contains_word() { |
594 | 81.8k | builder.set_look_have(|have| { |
595 | 81.8k | have.insert(Look::WordStartHalfAscii) |
596 | 81.8k | .insert(Look::WordStartHalfUnicode) |
597 | 81.8k | }); |
598 | 126k | } |
599 | | } |
600 | | Start::WordByte => { |
601 | 48.4k | if lookset.contains_word() { |
602 | 48.0k | builder.set_is_from_word(); |
603 | 48.0k | } |
604 | | } |
605 | | Start::Text => { |
606 | 45.7k | if lookset.contains_anchor_haystack() { |
607 | 25.6k | builder.set_look_have(|have| have.insert(Look::Start)); |
608 | 20.0k | } |
609 | 45.7k | if lookset.contains_anchor_line() { |
610 | 11.4k | builder.set_look_have(|have| { |
611 | 11.4k | have.insert(Look::StartLF).insert(Look::StartCRLF) |
612 | 11.4k | }); |
613 | 34.2k | } |
614 | 45.7k | if lookset.contains_word() { |
615 | 24.8k | builder.set_look_have(|have| { |
616 | 24.8k | have.insert(Look::WordStartHalfAscii) |
617 | 24.8k | .insert(Look::WordStartHalfUnicode) |
618 | 24.8k | }); |
619 | 20.8k | } |
620 | | } |
621 | | Start::LineLF => { |
622 | 28.6k | if rev { |
623 | 12.2k | if lookset.contains_anchor_crlf() { |
624 | 1.90k | builder.set_is_half_crlf(); |
625 | 10.3k | } |
626 | 12.2k | if lookset.contains_anchor_line() { |
627 | 3.50k | builder.set_look_have(|have| have.insert(Look::StartLF)); |
628 | 8.72k | } |
629 | | } else { |
630 | 16.4k | if lookset.contains_anchor_line() { |
631 | 5.81k | builder.set_look_have(|have| have.insert(Look::StartCRLF)); |
632 | 10.6k | } |
633 | | } |
634 | 28.6k | if lookset.contains_anchor_line() && lineterm == b'\n' { |
635 | 9.31k | builder.set_look_have(|have| have.insert(Look::StartLF)); |
636 | 19.3k | } |
637 | 28.6k | if lookset.contains_word() { |
638 | 16.2k | builder.set_look_have(|have| { |
639 | 16.2k | have.insert(Look::WordStartHalfAscii) |
640 | 16.2k | .insert(Look::WordStartHalfUnicode) |
641 | 16.2k | }); |
642 | 12.4k | } |
643 | | } |
644 | | Start::LineCR => { |
645 | 28.5k | if lookset.contains_anchor_crlf() { |
646 | 4.81k | if rev { |
647 | 1.91k | builder.set_look_have(|have| have.insert(Look::StartCRLF)); |
648 | 2.90k | } else { |
649 | 2.90k | builder.set_is_half_crlf(); |
650 | 2.90k | } |
651 | 23.7k | } |
652 | 28.5k | if lookset.contains_anchor_line() && lineterm == b'\r' { |
653 | 0 | builder.set_look_have(|have| have.insert(Look::StartLF)); |
654 | 28.5k | } |
655 | 28.5k | if lookset.contains_word() { |
656 | 16.1k | builder.set_look_have(|have| { |
657 | 16.1k | have.insert(Look::WordStartHalfAscii) |
658 | 16.1k | .insert(Look::WordStartHalfUnicode) |
659 | 16.1k | }); |
660 | 12.3k | } |
661 | | } |
662 | | Start::CustomLineTerminator => { |
663 | 28.4k | if lookset.contains_anchor_line() { |
664 | 9.24k | builder.set_look_have(|have| have.insert(Look::StartLF)); |
665 | 19.1k | } |
666 | | // This is a bit of a tricky case, but if the line terminator was |
667 | | // set to a word byte, then we also need to behave as if the start |
668 | | // configuration is Start::WordByte. That is, we need to mark our |
669 | | // state as having come from a word byte. |
670 | 28.4k | if lookset.contains_word() { |
671 | 16.0k | if utf8::is_word_byte(lineterm) { |
672 | 0 | builder.set_is_from_word(); |
673 | 0 | } else { |
674 | 16.0k | builder.set_look_have(|have| { |
675 | 16.0k | have.insert(Look::WordStartHalfAscii) |
676 | 16.0k | .insert(Look::WordStartHalfUnicode) |
677 | 16.0k | }); |
678 | | } |
679 | 12.3k | } |
680 | | } |
681 | | } |
682 | 387k | } |