Coverage Report

Created: 2025-08-12 06:35

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.2.0/src/dfa/search.rs
Line
Count
Source (jump to first uncovered line)
1
use crate::{
2
    dfa::{
3
        accel,
4
        automaton::{Automaton, OverlappingState, StateMatch},
5
    },
6
    util::{
7
        id::{PatternID, StateID},
8
        matchtypes::HalfMatch,
9
        prefilter, MATCH_OFFSET,
10
    },
11
    MatchError,
12
};
13
14
#[inline(never)]
15
0
pub fn find_earliest_fwd<A: Automaton + ?Sized>(
16
0
    pre: Option<&mut prefilter::Scanner>,
17
0
    dfa: &A,
18
0
    pattern_id: Option<PatternID>,
19
0
    bytes: &[u8],
20
0
    start: usize,
21
0
    end: usize,
22
0
) -> Result<Option<HalfMatch>, MatchError> {
23
0
    // Searching with a pattern ID is always anchored, so we should never use
24
0
    // a prefilter.
25
0
    if pre.is_some() && pattern_id.is_none() {
26
0
        find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
27
    } else {
28
0
        find_fwd(None, true, dfa, pattern_id, bytes, start, end)
29
    }
30
0
}
31
32
#[inline(never)]
33
0
pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
34
0
    pre: Option<&mut prefilter::Scanner>,
35
0
    dfa: &A,
36
0
    pattern_id: Option<PatternID>,
37
0
    bytes: &[u8],
38
0
    start: usize,
39
0
    end: usize,
40
0
) -> Result<Option<HalfMatch>, MatchError> {
41
0
    // Searching with a pattern ID is always anchored, so we should never use
42
0
    // a prefilter.
43
0
    if pre.is_some() && pattern_id.is_none() {
44
0
        find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
45
    } else {
46
0
        find_fwd(None, false, dfa, pattern_id, bytes, start, end)
47
    }
48
0
}
49
50
/// This is marked as `inline(always)` specifically because it supports
51
/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
52
/// getting inlined eliminate some critical branches. To avoid bloating binary
53
/// size, we only call this function in a fixed number of places.
54
#[inline(always)]
55
0
fn find_fwd<A: Automaton + ?Sized>(
56
0
    mut pre: Option<&mut prefilter::Scanner>,
57
0
    earliest: bool,
58
0
    dfa: &A,
59
0
    pattern_id: Option<PatternID>,
60
0
    haystack: &[u8],
61
0
    start: usize,
62
0
    end: usize,
63
0
) -> Result<Option<HalfMatch>, MatchError> {
64
0
    assert!(start <= end);
65
0
    assert!(start <= haystack.len());
66
0
    assert!(end <= haystack.len());
67
68
    // Why do this? This lets 'bytes[at]' work without bounds checks below.
69
    // It seems the assert on 'end <= haystack.len()' above is otherwise
70
    // not enough. Why not just make 'bytes' scoped this way anyway? Well,
71
    // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
72
    // for resolving look-ahead.
73
0
    let bytes = &haystack[..end];
74
75
0
    let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
76
0
    let mut last_match = None;
77
0
    let mut at = start;
78
0
    if let Some(ref mut pre) = pre {
79
        // If a prefilter doesn't report false positives, then we don't need to
80
        // touch the DFA at all. However, since all matches include the pattern
81
        // ID, and the prefilter infrastructure doesn't report pattern IDs, we
82
        // limit this optimization to cases where there is exactly one pattern.
83
        // In that case, any match must be the 0th pattern.
84
0
        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
85
0
            return Ok(pre.next_candidate(bytes, at).into_option().map(
86
0
                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
87
0
            ));
88
0
        } else if pre.is_effective(at) {
89
0
            match pre.next_candidate(bytes, at).into_option() {
90
0
                None => return Ok(None),
91
0
                Some(i) => {
92
0
                    at = i;
93
0
                }
94
            }
95
0
        }
96
0
    }
97
0
    while at < end {
98
0
        let byte = bytes[at];
99
0
        state = dfa.next_state(state, byte);
100
0
        at += 1;
101
0
        if dfa.is_special_state(state) {
102
0
            if dfa.is_start_state(state) {
103
0
                if let Some(ref mut pre) = pre {
104
0
                    if pre.is_effective(at) {
105
0
                        match pre.next_candidate(bytes, at).into_option() {
106
0
                            None => return Ok(None),
107
0
                            Some(i) => {
108
0
                                at = i;
109
0
                            }
110
                        }
111
0
                    }
112
0
                } else if dfa.is_accel_state(state) {
113
0
                    let needles = dfa.accelerator(state);
114
0
                    at = accel::find_fwd(needles, bytes, at)
115
0
                        .unwrap_or(bytes.len());
116
0
                }
117
0
            } else if dfa.is_match_state(state) {
118
0
                last_match = Some(HalfMatch {
119
0
                    pattern: dfa.match_pattern(state, 0),
120
0
                    offset: at - MATCH_OFFSET,
121
0
                });
122
0
                if earliest {
123
0
                    return Ok(last_match);
124
0
                }
125
0
                if dfa.is_accel_state(state) {
126
0
                    let needles = dfa.accelerator(state);
127
0
                    at = accel::find_fwd(needles, bytes, at)
128
0
                        .unwrap_or(bytes.len());
129
0
                }
130
0
            } else if dfa.is_accel_state(state) {
131
0
                let needs = dfa.accelerator(state);
132
0
                at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
133
0
            } else if dfa.is_dead_state(state) {
134
0
                return Ok(last_match);
135
            } else {
136
0
                debug_assert!(dfa.is_quit_state(state));
137
0
                if last_match.is_some() {
138
0
                    return Ok(last_match);
139
0
                }
140
0
                return Err(MatchError::Quit { byte, offset: at - 1 });
141
            }
142
0
        }
143
0
        while at < end && dfa.next_state(state, bytes[at]) == state {
144
0
            at += 1;
145
0
        }
146
    }
147
0
    Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
148
0
}
149
150
#[inline(never)]
151
0
pub fn find_earliest_rev<A: Automaton + ?Sized>(
152
0
    dfa: &A,
153
0
    pattern_id: Option<PatternID>,
154
0
    bytes: &[u8],
155
0
    start: usize,
156
0
    end: usize,
157
0
) -> Result<Option<HalfMatch>, MatchError> {
158
0
    find_rev(true, dfa, pattern_id, bytes, start, end)
159
0
}
160
161
#[inline(never)]
162
0
pub fn find_leftmost_rev<A: Automaton + ?Sized>(
163
0
    dfa: &A,
164
0
    pattern_id: Option<PatternID>,
165
0
    bytes: &[u8],
166
0
    start: usize,
167
0
    end: usize,
168
0
) -> Result<Option<HalfMatch>, MatchError> {
169
0
    find_rev(false, dfa, pattern_id, bytes, start, end)
170
0
}
171
172
/// This is marked as `inline(always)` specifically because it supports
173
/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
174
/// permits eliminating a few crucial branches.
175
#[inline(always)]
176
0
fn find_rev<A: Automaton + ?Sized>(
177
0
    earliest: bool,
178
0
    dfa: &A,
179
0
    pattern_id: Option<PatternID>,
180
0
    bytes: &[u8],
181
0
    start: usize,
182
0
    end: usize,
183
0
) -> Result<Option<HalfMatch>, MatchError> {
184
0
    assert!(start <= end);
185
0
    assert!(start <= bytes.len());
186
0
    assert!(end <= bytes.len());
187
188
0
    let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
189
0
    let mut last_match = None;
190
0
    let mut at = end;
191
0
    while at > start {
192
0
        at -= 1;
193
0
        while at > start && dfa.next_state(state, bytes[at]) == state {
194
0
            at -= 1;
195
0
        }
196
197
0
        let byte = bytes[at];
198
0
        state = dfa.next_state(state, byte);
199
0
        if dfa.is_special_state(state) {
200
0
            if dfa.is_start_state(state) {
201
0
                if dfa.is_accel_state(state) {
202
0
                    let needles = dfa.accelerator(state);
203
0
                    at = accel::find_rev(needles, bytes, at)
204
0
                        .map(|i| i + 1)
205
0
                        .unwrap_or(0);
206
0
                }
207
0
            } else if dfa.is_match_state(state) {
208
0
                last_match = Some(HalfMatch {
209
0
                    pattern: dfa.match_pattern(state, 0),
210
0
                    offset: at + MATCH_OFFSET,
211
0
                });
212
0
                if earliest {
213
0
                    return Ok(last_match);
214
0
                }
215
0
                if dfa.is_accel_state(state) {
216
0
                    let needles = dfa.accelerator(state);
217
0
                    at = accel::find_rev(needles, bytes, at)
218
0
                        .map(|i| i + 1)
219
0
                        .unwrap_or(0);
220
0
                }
221
0
            } else if dfa.is_accel_state(state) {
222
0
                let needles = dfa.accelerator(state);
223
0
                at = accel::find_rev(needles, bytes, at)
224
0
                    .map(|i| i + 1)
225
0
                    .unwrap_or(0);
226
0
            } else if dfa.is_dead_state(state) {
227
0
                return Ok(last_match);
228
            } else {
229
0
                debug_assert!(dfa.is_quit_state(state));
230
0
                if last_match.is_some() {
231
0
                    return Ok(last_match);
232
0
                }
233
0
                return Err(MatchError::Quit { byte, offset: at });
234
            }
235
0
        }
236
    }
237
0
    Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
238
0
}
239
240
#[inline(never)]
241
0
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
242
0
    pre: Option<&mut prefilter::Scanner>,
243
0
    dfa: &A,
244
0
    pattern_id: Option<PatternID>,
245
0
    bytes: &[u8],
246
0
    start: usize,
247
0
    end: usize,
248
0
    caller_state: &mut OverlappingState,
249
0
) -> Result<Option<HalfMatch>, MatchError> {
250
0
    // Searching with a pattern ID is always anchored, so we should only ever
251
0
    // use a prefilter when no pattern ID is given.
252
0
    if pre.is_some() && pattern_id.is_none() {
253
0
        find_overlapping_fwd_imp(
254
0
            pre,
255
0
            dfa,
256
0
            pattern_id,
257
0
            bytes,
258
0
            start,
259
0
            end,
260
0
            caller_state,
261
0
        )
262
    } else {
263
0
        find_overlapping_fwd_imp(
264
0
            None,
265
0
            dfa,
266
0
            pattern_id,
267
0
            bytes,
268
0
            start,
269
0
            end,
270
0
            caller_state,
271
0
        )
272
    }
273
0
}
274
275
/// This is marked as `inline(always)` specifically because it supports
276
/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
277
/// permits eliminating a few crucial branches and reduces code size when it is
278
/// not used.
279
#[inline(always)]
280
0
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
281
0
    mut pre: Option<&mut prefilter::Scanner>,
282
0
    dfa: &A,
283
0
    pattern_id: Option<PatternID>,
284
0
    bytes: &[u8],
285
0
    mut start: usize,
286
0
    end: usize,
287
0
    caller_state: &mut OverlappingState,
288
0
) -> Result<Option<HalfMatch>, MatchError> {
289
0
    assert!(start <= end);
290
0
    assert!(start <= bytes.len());
291
0
    assert!(end <= bytes.len());
292
293
0
    let mut state = match caller_state.id() {
294
0
        None => init_fwd(dfa, pattern_id, bytes, start, end)?,
295
0
        Some(id) => {
296
0
            if let Some(last) = caller_state.last_match() {
297
0
                let match_count = dfa.match_count(id);
298
0
                if last.match_index < match_count {
299
0
                    let m = HalfMatch {
300
0
                        pattern: dfa.match_pattern(id, last.match_index),
301
0
                        offset: last.offset,
302
0
                    };
303
0
                    last.match_index += 1;
304
0
                    return Ok(Some(m));
305
0
                }
306
0
            }
307
308
            // This is a subtle but critical detail. If the caller provides a
309
            // non-None state ID, then it must be the case that the state ID
310
            // corresponds to one set by this function. The state ID therefore
311
            // corresponds to a match state, a dead state or some other state.
312
            // However, "some other" state _only_ occurs when the input has
313
            // been exhausted because the only way to stop before then is to
314
            // see a match or a dead/quit state.
315
            //
316
            // If the input is exhausted or if it's a dead state, then
317
            // incrementing the starting position has no relevance on
318
            // correctness, since the loop below will either not execute
319
            // at all or will immediately stop due to being in a dead state.
320
            // (Once in a dead state it is impossible to leave it.)
321
            //
322
            // Therefore, the only case we need to consider is when
323
            // caller_state is a match state. In this case, since our machines
324
            // support the ability to delay a match by a certain number of
325
            // bytes (to support look-around), it follows that we actually
326
            // consumed that many additional bytes on our previous search. When
327
            // the caller resumes their search to find subsequent matches, they
328
            // will use the ending location from the previous match as the next
329
            // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
330
            // we scanned to on the previous search. Therefore, we need to
331
            // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
332
            //
333
            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
334
            // dealing with empty matches convenient. Namely, callers needn't
335
            // special case them when implementing an iterator. Instead, this
336
            // ensures that forward progress is always made.
337
0
            start += MATCH_OFFSET;
338
0
            id
339
        }
340
    };
341
342
0
    let mut at = start;
343
0
    while at < end {
344
0
        let byte = bytes[at];
345
0
        state = dfa.next_state(state, byte);
346
0
        at += 1;
347
0
        if dfa.is_special_state(state) {
348
0
            caller_state.set_id(state);
349
0
            if dfa.is_start_state(state) {
350
0
                if let Some(ref mut pre) = pre {
351
0
                    if pre.is_effective(at) {
352
0
                        match pre.next_candidate(bytes, at).into_option() {
353
0
                            None => return Ok(None),
354
0
                            Some(i) => {
355
0
                                at = i;
356
0
                            }
357
                        }
358
0
                    }
359
0
                } else if dfa.is_accel_state(state) {
360
0
                    let needles = dfa.accelerator(state);
361
0
                    at = accel::find_fwd(needles, bytes, at)
362
0
                        .unwrap_or(bytes.len());
363
0
                }
364
0
            } else if dfa.is_match_state(state) {
365
0
                let offset = at - MATCH_OFFSET;
366
0
                caller_state
367
0
                    .set_last_match(StateMatch { match_index: 1, offset });
368
0
                return Ok(Some(HalfMatch {
369
0
                    pattern: dfa.match_pattern(state, 0),
370
0
                    offset,
371
0
                }));
372
0
            } else if dfa.is_accel_state(state) {
373
0
                let needs = dfa.accelerator(state);
374
0
                at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
375
0
            } else if dfa.is_dead_state(state) {
376
0
                return Ok(None);
377
            } else {
378
0
                debug_assert!(dfa.is_quit_state(state));
379
0
                return Err(MatchError::Quit { byte, offset: at - 1 });
380
            }
381
0
        }
382
    }
383
384
0
    let result = eoi_fwd(dfa, bytes, end, &mut state);
385
0
    caller_state.set_id(state);
386
0
    if let Ok(Some(ref last_match)) = result {
387
0
        caller_state.set_last_match(StateMatch {
388
0
            match_index: 1,
389
0
            offset: last_match.offset(),
390
0
        });
391
0
    }
392
0
    result
393
0
}
394
395
0
fn init_fwd<A: Automaton + ?Sized>(
396
0
    dfa: &A,
397
0
    pattern_id: Option<PatternID>,
398
0
    bytes: &[u8],
399
0
    start: usize,
400
0
    end: usize,
401
0
) -> Result<StateID, MatchError> {
402
0
    let state = dfa.start_state_forward(pattern_id, bytes, start, end);
403
0
    // Start states can never be match states, since all matches are delayed
404
0
    // by 1 byte.
405
0
    assert!(!dfa.is_match_state(state));
406
0
    Ok(state)
407
0
}
408
409
0
fn init_rev<A: Automaton + ?Sized>(
410
0
    dfa: &A,
411
0
    pattern_id: Option<PatternID>,
412
0
    bytes: &[u8],
413
0
    start: usize,
414
0
    end: usize,
415
0
) -> Result<StateID, MatchError> {
416
0
    let state = dfa.start_state_reverse(pattern_id, bytes, start, end);
417
0
    // Start states can never be match states, since all matches are delayed
418
0
    // by 1 byte.
419
0
    assert!(!dfa.is_match_state(state));
420
0
    Ok(state)
421
0
}
422
423
0
fn eoi_fwd<A: Automaton + ?Sized>(
424
0
    dfa: &A,
425
0
    bytes: &[u8],
426
0
    end: usize,
427
0
    state: &mut StateID,
428
0
) -> Result<Option<HalfMatch>, MatchError> {
429
0
    match bytes.get(end) {
430
0
        Some(&b) => {
431
0
            *state = dfa.next_state(*state, b);
432
0
            if dfa.is_match_state(*state) {
433
0
                Ok(Some(HalfMatch {
434
0
                    pattern: dfa.match_pattern(*state, 0),
435
0
                    offset: end,
436
0
                }))
437
            } else {
438
0
                Ok(None)
439
            }
440
        }
441
        None => {
442
0
            *state = dfa.next_eoi_state(*state);
443
0
            if dfa.is_match_state(*state) {
444
0
                Ok(Some(HalfMatch {
445
0
                    pattern: dfa.match_pattern(*state, 0),
446
0
                    offset: bytes.len(),
447
0
                }))
448
            } else {
449
0
                Ok(None)
450
            }
451
        }
452
    }
453
0
}
454
455
0
fn eoi_rev<A: Automaton + ?Sized>(
456
0
    dfa: &A,
457
0
    bytes: &[u8],
458
0
    start: usize,
459
0
    state: StateID,
460
0
) -> Result<Option<HalfMatch>, MatchError> {
461
0
    if start > 0 {
462
0
        let state = dfa.next_state(state, bytes[start - 1]);
463
0
        if dfa.is_match_state(state) {
464
0
            Ok(Some(HalfMatch {
465
0
                pattern: dfa.match_pattern(state, 0),
466
0
                offset: start,
467
0
            }))
468
        } else {
469
0
            Ok(None)
470
        }
471
    } else {
472
0
        let state = dfa.next_eoi_state(state);
473
0
        if dfa.is_match_state(state) {
474
0
            Ok(Some(HalfMatch {
475
0
                pattern: dfa.match_pattern(state, 0),
476
0
                offset: 0,
477
0
            }))
478
        } else {
479
0
            Ok(None)
480
        }
481
    }
482
0
}
483
484
// Currently unused, but is useful to keep around. This was originally used
485
// when the code above used raw pointers for its main loop.
486
// /// Returns the distance between the given pointer and the start of `bytes`.
487
// /// This assumes that the given pointer points to somewhere in the `bytes`
488
// /// slice given.
489
// fn offset(bytes: &[u8], p: *const u8) -> usize {
490
// debug_assert!(bytes.as_ptr() <= p);
491
// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
492
// ((p as isize) - (bytes.as_ptr() as isize)) as usize
493
// }