/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.2.0/src/dfa/search.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use crate::{ |
2 | | dfa::{ |
3 | | accel, |
4 | | automaton::{Automaton, OverlappingState, StateMatch}, |
5 | | }, |
6 | | util::{ |
7 | | id::{PatternID, StateID}, |
8 | | matchtypes::HalfMatch, |
9 | | prefilter, MATCH_OFFSET, |
10 | | }, |
11 | | MatchError, |
12 | | }; |
13 | | |
14 | | #[inline(never)] |
15 | 0 | pub fn find_earliest_fwd<A: Automaton + ?Sized>( |
16 | 0 | pre: Option<&mut prefilter::Scanner>, |
17 | 0 | dfa: &A, |
18 | 0 | pattern_id: Option<PatternID>, |
19 | 0 | bytes: &[u8], |
20 | 0 | start: usize, |
21 | 0 | end: usize, |
22 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
23 | 0 | // Searching with a pattern ID is always anchored, so we should never use |
24 | 0 | // a prefilter. |
25 | 0 | if pre.is_some() && pattern_id.is_none() { |
26 | 0 | find_fwd(pre, true, dfa, pattern_id, bytes, start, end) |
27 | | } else { |
28 | 0 | find_fwd(None, true, dfa, pattern_id, bytes, start, end) |
29 | | } |
30 | 0 | } |
31 | | |
32 | | #[inline(never)] |
33 | 0 | pub fn find_leftmost_fwd<A: Automaton + ?Sized>( |
34 | 0 | pre: Option<&mut prefilter::Scanner>, |
35 | 0 | dfa: &A, |
36 | 0 | pattern_id: Option<PatternID>, |
37 | 0 | bytes: &[u8], |
38 | 0 | start: usize, |
39 | 0 | end: usize, |
40 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
41 | 0 | // Searching with a pattern ID is always anchored, so we should never use |
42 | 0 | // a prefilter. |
43 | 0 | if pre.is_some() && pattern_id.is_none() { |
44 | 0 | find_fwd(pre, false, dfa, pattern_id, bytes, start, end) |
45 | | } else { |
46 | 0 | find_fwd(None, false, dfa, pattern_id, bytes, start, end) |
47 | | } |
48 | 0 | } |
49 | | |
50 | | /// This is marked as `inline(always)` specifically because it supports |
51 | | /// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters |
52 | | /// getting inlined eliminate some critical branches. To avoid bloating binary |
53 | | /// size, we only call this function in a fixed number of places. |
54 | | #[inline(always)] |
55 | 0 | fn find_fwd<A: Automaton + ?Sized>( |
56 | 0 | mut pre: Option<&mut prefilter::Scanner>, |
57 | 0 | earliest: bool, |
58 | 0 | dfa: &A, |
59 | 0 | pattern_id: Option<PatternID>, |
60 | 0 | haystack: &[u8], |
61 | 0 | start: usize, |
62 | 0 | end: usize, |
63 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
64 | 0 | assert!(start <= end); |
65 | 0 | assert!(start <= haystack.len()); |
66 | 0 | assert!(end <= haystack.len()); |
67 | | |
68 | | // Why do this? This lets 'bytes[at]' work without bounds checks below. |
69 | | // It seems the assert on 'end <= haystack.len()' above is otherwise |
70 | | // not enough. Why not just make 'bytes' scoped this way anyway? Well, |
71 | | // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' |
72 | | // for resolving look-ahead. |
73 | 0 | let bytes = &haystack[..end]; |
74 | | |
75 | 0 | let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?; |
76 | 0 | let mut last_match = None; |
77 | 0 | let mut at = start; |
78 | 0 | if let Some(ref mut pre) = pre { |
79 | | // If a prefilter doesn't report false positives, then we don't need to |
80 | | // touch the DFA at all. However, since all matches include the pattern |
81 | | // ID, and the prefilter infrastructure doesn't report pattern IDs, we |
82 | | // limit this optimization to cases where there is exactly one pattern. |
83 | | // In that case, any match must be the 0th pattern. |
84 | 0 | if dfa.pattern_count() == 1 && !pre.reports_false_positives() { |
85 | 0 | return Ok(pre.next_candidate(bytes, at).into_option().map( |
86 | 0 | |offset| HalfMatch { pattern: PatternID::ZERO, offset }, |
87 | 0 | )); |
88 | 0 | } else if pre.is_effective(at) { |
89 | 0 | match pre.next_candidate(bytes, at).into_option() { |
90 | 0 | None => return Ok(None), |
91 | 0 | Some(i) => { |
92 | 0 | at = i; |
93 | 0 | } |
94 | | } |
95 | 0 | } |
96 | 0 | } |
97 | 0 | while at < end { |
98 | 0 | let byte = bytes[at]; |
99 | 0 | state = dfa.next_state(state, byte); |
100 | 0 | at += 1; |
101 | 0 | if dfa.is_special_state(state) { |
102 | 0 | if dfa.is_start_state(state) { |
103 | 0 | if let Some(ref mut pre) = pre { |
104 | 0 | if pre.is_effective(at) { |
105 | 0 | match pre.next_candidate(bytes, at).into_option() { |
106 | 0 | None => return Ok(None), |
107 | 0 | Some(i) => { |
108 | 0 | at = i; |
109 | 0 | } |
110 | | } |
111 | 0 | } |
112 | 0 | } else if dfa.is_accel_state(state) { |
113 | 0 | let needles = dfa.accelerator(state); |
114 | 0 | at = accel::find_fwd(needles, bytes, at) |
115 | 0 | .unwrap_or(bytes.len()); |
116 | 0 | } |
117 | 0 | } else if dfa.is_match_state(state) { |
118 | 0 | last_match = Some(HalfMatch { |
119 | 0 | pattern: dfa.match_pattern(state, 0), |
120 | 0 | offset: at - MATCH_OFFSET, |
121 | 0 | }); |
122 | 0 | if earliest { |
123 | 0 | return Ok(last_match); |
124 | 0 | } |
125 | 0 | if dfa.is_accel_state(state) { |
126 | 0 | let needles = dfa.accelerator(state); |
127 | 0 | at = accel::find_fwd(needles, bytes, at) |
128 | 0 | .unwrap_or(bytes.len()); |
129 | 0 | } |
130 | 0 | } else if dfa.is_accel_state(state) { |
131 | 0 | let needs = dfa.accelerator(state); |
132 | 0 | at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); |
133 | 0 | } else if dfa.is_dead_state(state) { |
134 | 0 | return Ok(last_match); |
135 | | } else { |
136 | 0 | debug_assert!(dfa.is_quit_state(state)); |
137 | 0 | if last_match.is_some() { |
138 | 0 | return Ok(last_match); |
139 | 0 | } |
140 | 0 | return Err(MatchError::Quit { byte, offset: at - 1 }); |
141 | | } |
142 | 0 | } |
143 | 0 | while at < end && dfa.next_state(state, bytes[at]) == state { |
144 | 0 | at += 1; |
145 | 0 | } |
146 | | } |
147 | 0 | Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match)) |
148 | 0 | } |
149 | | |
150 | | #[inline(never)] |
151 | 0 | pub fn find_earliest_rev<A: Automaton + ?Sized>( |
152 | 0 | dfa: &A, |
153 | 0 | pattern_id: Option<PatternID>, |
154 | 0 | bytes: &[u8], |
155 | 0 | start: usize, |
156 | 0 | end: usize, |
157 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
158 | 0 | find_rev(true, dfa, pattern_id, bytes, start, end) |
159 | 0 | } |
160 | | |
161 | | #[inline(never)] |
162 | 0 | pub fn find_leftmost_rev<A: Automaton + ?Sized>( |
163 | 0 | dfa: &A, |
164 | 0 | pattern_id: Option<PatternID>, |
165 | 0 | bytes: &[u8], |
166 | 0 | start: usize, |
167 | 0 | end: usize, |
168 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
169 | 0 | find_rev(false, dfa, pattern_id, bytes, start, end) |
170 | 0 | } |
171 | | |
172 | | /// This is marked as `inline(always)` specifically because it supports |
173 | | /// multiple modes of searching. Namely, the 'earliest' boolean getting inlined |
174 | | /// permits eliminating a few crucial branches. |
175 | | #[inline(always)] |
176 | 0 | fn find_rev<A: Automaton + ?Sized>( |
177 | 0 | earliest: bool, |
178 | 0 | dfa: &A, |
179 | 0 | pattern_id: Option<PatternID>, |
180 | 0 | bytes: &[u8], |
181 | 0 | start: usize, |
182 | 0 | end: usize, |
183 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
184 | 0 | assert!(start <= end); |
185 | 0 | assert!(start <= bytes.len()); |
186 | 0 | assert!(end <= bytes.len()); |
187 | | |
188 | 0 | let mut state = init_rev(dfa, pattern_id, bytes, start, end)?; |
189 | 0 | let mut last_match = None; |
190 | 0 | let mut at = end; |
191 | 0 | while at > start { |
192 | 0 | at -= 1; |
193 | 0 | while at > start && dfa.next_state(state, bytes[at]) == state { |
194 | 0 | at -= 1; |
195 | 0 | } |
196 | | |
197 | 0 | let byte = bytes[at]; |
198 | 0 | state = dfa.next_state(state, byte); |
199 | 0 | if dfa.is_special_state(state) { |
200 | 0 | if dfa.is_start_state(state) { |
201 | 0 | if dfa.is_accel_state(state) { |
202 | 0 | let needles = dfa.accelerator(state); |
203 | 0 | at = accel::find_rev(needles, bytes, at) |
204 | 0 | .map(|i| i + 1) |
205 | 0 | .unwrap_or(0); |
206 | 0 | } |
207 | 0 | } else if dfa.is_match_state(state) { |
208 | 0 | last_match = Some(HalfMatch { |
209 | 0 | pattern: dfa.match_pattern(state, 0), |
210 | 0 | offset: at + MATCH_OFFSET, |
211 | 0 | }); |
212 | 0 | if earliest { |
213 | 0 | return Ok(last_match); |
214 | 0 | } |
215 | 0 | if dfa.is_accel_state(state) { |
216 | 0 | let needles = dfa.accelerator(state); |
217 | 0 | at = accel::find_rev(needles, bytes, at) |
218 | 0 | .map(|i| i + 1) |
219 | 0 | .unwrap_or(0); |
220 | 0 | } |
221 | 0 | } else if dfa.is_accel_state(state) { |
222 | 0 | let needles = dfa.accelerator(state); |
223 | 0 | at = accel::find_rev(needles, bytes, at) |
224 | 0 | .map(|i| i + 1) |
225 | 0 | .unwrap_or(0); |
226 | 0 | } else if dfa.is_dead_state(state) { |
227 | 0 | return Ok(last_match); |
228 | | } else { |
229 | 0 | debug_assert!(dfa.is_quit_state(state)); |
230 | 0 | if last_match.is_some() { |
231 | 0 | return Ok(last_match); |
232 | 0 | } |
233 | 0 | return Err(MatchError::Quit { byte, offset: at }); |
234 | | } |
235 | 0 | } |
236 | | } |
237 | 0 | Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match)) |
238 | 0 | } |
239 | | |
240 | | #[inline(never)] |
241 | 0 | pub fn find_overlapping_fwd<A: Automaton + ?Sized>( |
242 | 0 | pre: Option<&mut prefilter::Scanner>, |
243 | 0 | dfa: &A, |
244 | 0 | pattern_id: Option<PatternID>, |
245 | 0 | bytes: &[u8], |
246 | 0 | start: usize, |
247 | 0 | end: usize, |
248 | 0 | caller_state: &mut OverlappingState, |
249 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
250 | 0 | // Searching with a pattern ID is always anchored, so we should only ever |
251 | 0 | // use a prefilter when no pattern ID is given. |
252 | 0 | if pre.is_some() && pattern_id.is_none() { |
253 | 0 | find_overlapping_fwd_imp( |
254 | 0 | pre, |
255 | 0 | dfa, |
256 | 0 | pattern_id, |
257 | 0 | bytes, |
258 | 0 | start, |
259 | 0 | end, |
260 | 0 | caller_state, |
261 | 0 | ) |
262 | | } else { |
263 | 0 | find_overlapping_fwd_imp( |
264 | 0 | None, |
265 | 0 | dfa, |
266 | 0 | pattern_id, |
267 | 0 | bytes, |
268 | 0 | start, |
269 | 0 | end, |
270 | 0 | caller_state, |
271 | 0 | ) |
272 | | } |
273 | 0 | } |
274 | | |
275 | | /// This is marked as `inline(always)` specifically because it supports |
276 | | /// multiple modes of searching. Namely, the 'pre' prefilter getting inlined |
277 | | /// permits eliminating a few crucial branches and reduces code size when it is |
278 | | /// not used. |
279 | | #[inline(always)] |
280 | 0 | fn find_overlapping_fwd_imp<A: Automaton + ?Sized>( |
281 | 0 | mut pre: Option<&mut prefilter::Scanner>, |
282 | 0 | dfa: &A, |
283 | 0 | pattern_id: Option<PatternID>, |
284 | 0 | bytes: &[u8], |
285 | 0 | mut start: usize, |
286 | 0 | end: usize, |
287 | 0 | caller_state: &mut OverlappingState, |
288 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
289 | 0 | assert!(start <= end); |
290 | 0 | assert!(start <= bytes.len()); |
291 | 0 | assert!(end <= bytes.len()); |
292 | | |
293 | 0 | let mut state = match caller_state.id() { |
294 | 0 | None => init_fwd(dfa, pattern_id, bytes, start, end)?, |
295 | 0 | Some(id) => { |
296 | 0 | if let Some(last) = caller_state.last_match() { |
297 | 0 | let match_count = dfa.match_count(id); |
298 | 0 | if last.match_index < match_count { |
299 | 0 | let m = HalfMatch { |
300 | 0 | pattern: dfa.match_pattern(id, last.match_index), |
301 | 0 | offset: last.offset, |
302 | 0 | }; |
303 | 0 | last.match_index += 1; |
304 | 0 | return Ok(Some(m)); |
305 | 0 | } |
306 | 0 | } |
307 | | |
308 | | // This is a subtle but critical detail. If the caller provides a |
309 | | // non-None state ID, then it must be the case that the state ID |
310 | | // corresponds to one set by this function. The state ID therefore |
311 | | // corresponds to a match state, a dead state or some other state. |
312 | | // However, "some other" state _only_ occurs when the input has |
313 | | // been exhausted because the only way to stop before then is to |
314 | | // see a match or a dead/quit state. |
315 | | // |
316 | | // If the input is exhausted or if it's a dead state, then |
317 | | // incrementing the starting position has no relevance on |
318 | | // correctness, since the loop below will either not execute |
319 | | // at all or will immediately stop due to being in a dead state. |
320 | | // (Once in a dead state it is impossible to leave it.) |
321 | | // |
322 | | // Therefore, the only case we need to consider is when |
323 | | // caller_state is a match state. In this case, since our machines |
324 | | // support the ability to delay a match by a certain number of |
325 | | // bytes (to support look-around), it follows that we actually |
326 | | // consumed that many additional bytes on our previous search. When |
327 | | // the caller resumes their search to find subsequent matches, they |
328 | | // will use the ending location from the previous match as the next |
329 | | // starting point, which is `MATCH_OFFSET` bytes PRIOR to where |
330 | | // we scanned to on the previous search. Therefore, we need to |
331 | | // compensate by bumping `start` up by `MATCH_OFFSET` bytes. |
332 | | // |
333 | | // Incidentally, since MATCH_OFFSET is non-zero, this also makes |
334 | | // dealing with empty matches convenient. Namely, callers needn't |
335 | | // special case them when implementing an iterator. Instead, this |
336 | | // ensures that forward progress is always made. |
337 | 0 | start += MATCH_OFFSET; |
338 | 0 | id |
339 | | } |
340 | | }; |
341 | | |
342 | 0 | let mut at = start; |
343 | 0 | while at < end { |
344 | 0 | let byte = bytes[at]; |
345 | 0 | state = dfa.next_state(state, byte); |
346 | 0 | at += 1; |
347 | 0 | if dfa.is_special_state(state) { |
348 | 0 | caller_state.set_id(state); |
349 | 0 | if dfa.is_start_state(state) { |
350 | 0 | if let Some(ref mut pre) = pre { |
351 | 0 | if pre.is_effective(at) { |
352 | 0 | match pre.next_candidate(bytes, at).into_option() { |
353 | 0 | None => return Ok(None), |
354 | 0 | Some(i) => { |
355 | 0 | at = i; |
356 | 0 | } |
357 | | } |
358 | 0 | } |
359 | 0 | } else if dfa.is_accel_state(state) { |
360 | 0 | let needles = dfa.accelerator(state); |
361 | 0 | at = accel::find_fwd(needles, bytes, at) |
362 | 0 | .unwrap_or(bytes.len()); |
363 | 0 | } |
364 | 0 | } else if dfa.is_match_state(state) { |
365 | 0 | let offset = at - MATCH_OFFSET; |
366 | 0 | caller_state |
367 | 0 | .set_last_match(StateMatch { match_index: 1, offset }); |
368 | 0 | return Ok(Some(HalfMatch { |
369 | 0 | pattern: dfa.match_pattern(state, 0), |
370 | 0 | offset, |
371 | 0 | })); |
372 | 0 | } else if dfa.is_accel_state(state) { |
373 | 0 | let needs = dfa.accelerator(state); |
374 | 0 | at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); |
375 | 0 | } else if dfa.is_dead_state(state) { |
376 | 0 | return Ok(None); |
377 | | } else { |
378 | 0 | debug_assert!(dfa.is_quit_state(state)); |
379 | 0 | return Err(MatchError::Quit { byte, offset: at - 1 }); |
380 | | } |
381 | 0 | } |
382 | | } |
383 | | |
384 | 0 | let result = eoi_fwd(dfa, bytes, end, &mut state); |
385 | 0 | caller_state.set_id(state); |
386 | 0 | if let Ok(Some(ref last_match)) = result { |
387 | 0 | caller_state.set_last_match(StateMatch { |
388 | 0 | match_index: 1, |
389 | 0 | offset: last_match.offset(), |
390 | 0 | }); |
391 | 0 | } |
392 | 0 | result |
393 | 0 | } |
394 | | |
395 | 0 | fn init_fwd<A: Automaton + ?Sized>( |
396 | 0 | dfa: &A, |
397 | 0 | pattern_id: Option<PatternID>, |
398 | 0 | bytes: &[u8], |
399 | 0 | start: usize, |
400 | 0 | end: usize, |
401 | 0 | ) -> Result<StateID, MatchError> { |
402 | 0 | let state = dfa.start_state_forward(pattern_id, bytes, start, end); |
403 | 0 | // Start states can never be match states, since all matches are delayed |
404 | 0 | // by 1 byte. |
405 | 0 | assert!(!dfa.is_match_state(state)); |
406 | 0 | Ok(state) |
407 | 0 | } |
408 | | |
409 | 0 | fn init_rev<A: Automaton + ?Sized>( |
410 | 0 | dfa: &A, |
411 | 0 | pattern_id: Option<PatternID>, |
412 | 0 | bytes: &[u8], |
413 | 0 | start: usize, |
414 | 0 | end: usize, |
415 | 0 | ) -> Result<StateID, MatchError> { |
416 | 0 | let state = dfa.start_state_reverse(pattern_id, bytes, start, end); |
417 | 0 | // Start states can never be match states, since all matches are delayed |
418 | 0 | // by 1 byte. |
419 | 0 | assert!(!dfa.is_match_state(state)); |
420 | 0 | Ok(state) |
421 | 0 | } |
422 | | |
423 | 0 | fn eoi_fwd<A: Automaton + ?Sized>( |
424 | 0 | dfa: &A, |
425 | 0 | bytes: &[u8], |
426 | 0 | end: usize, |
427 | 0 | state: &mut StateID, |
428 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
429 | 0 | match bytes.get(end) { |
430 | 0 | Some(&b) => { |
431 | 0 | *state = dfa.next_state(*state, b); |
432 | 0 | if dfa.is_match_state(*state) { |
433 | 0 | Ok(Some(HalfMatch { |
434 | 0 | pattern: dfa.match_pattern(*state, 0), |
435 | 0 | offset: end, |
436 | 0 | })) |
437 | | } else { |
438 | 0 | Ok(None) |
439 | | } |
440 | | } |
441 | | None => { |
442 | 0 | *state = dfa.next_eoi_state(*state); |
443 | 0 | if dfa.is_match_state(*state) { |
444 | 0 | Ok(Some(HalfMatch { |
445 | 0 | pattern: dfa.match_pattern(*state, 0), |
446 | 0 | offset: bytes.len(), |
447 | 0 | })) |
448 | | } else { |
449 | 0 | Ok(None) |
450 | | } |
451 | | } |
452 | | } |
453 | 0 | } |
454 | | |
455 | 0 | fn eoi_rev<A: Automaton + ?Sized>( |
456 | 0 | dfa: &A, |
457 | 0 | bytes: &[u8], |
458 | 0 | start: usize, |
459 | 0 | state: StateID, |
460 | 0 | ) -> Result<Option<HalfMatch>, MatchError> { |
461 | 0 | if start > 0 { |
462 | 0 | let state = dfa.next_state(state, bytes[start - 1]); |
463 | 0 | if dfa.is_match_state(state) { |
464 | 0 | Ok(Some(HalfMatch { |
465 | 0 | pattern: dfa.match_pattern(state, 0), |
466 | 0 | offset: start, |
467 | 0 | })) |
468 | | } else { |
469 | 0 | Ok(None) |
470 | | } |
471 | | } else { |
472 | 0 | let state = dfa.next_eoi_state(state); |
473 | 0 | if dfa.is_match_state(state) { |
474 | 0 | Ok(Some(HalfMatch { |
475 | 0 | pattern: dfa.match_pattern(state, 0), |
476 | 0 | offset: 0, |
477 | 0 | })) |
478 | | } else { |
479 | 0 | Ok(None) |
480 | | } |
481 | | } |
482 | 0 | } |
483 | | |
484 | | // Currently unused, but is useful to keep around. This was originally used |
485 | | // when the code above used raw pointers for its main loop. |
486 | | // /// Returns the distance between the given pointer and the start of `bytes`. |
487 | | // /// This assumes that the given pointer points to somewhere in the `bytes` |
488 | | // /// slice given. |
489 | | // fn offset(bytes: &[u8], p: *const u8) -> usize { |
490 | | // debug_assert!(bytes.as_ptr() <= p); |
491 | | // debug_assert!(bytes[bytes.len()..].as_ptr() >= p); |
492 | | // ((p as isize) - (bytes.as_ptr() as isize)) as usize |
493 | | // } |