Coverage Report

Created: 2025-07-23 07:29

/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.4.1/src/memmem/twoway.rs
Line
Count
Source (jump to first uncovered line)
1
use core::cmp;
2
3
use crate::memmem::{prefilter::Pre, util};
4
5
/// Two-Way search in the forward direction.
6
#[derive(Clone, Copy, Debug)]
7
pub(crate) struct Forward(TwoWay);
8
9
/// Two-Way search in the reverse direction.
10
#[derive(Clone, Copy, Debug)]
11
pub(crate) struct Reverse(TwoWay);
12
13
/// An implementation of the TwoWay substring search algorithm, with heuristics
14
/// for accelerating search based on frequency analysis.
15
///
16
/// This searcher supports forward and reverse search, although not
17
/// simultaneously. It runs in O(n + m) time and O(1) space, where
18
/// `n ~ len(needle)` and `m ~ len(haystack)`.
19
///
20
/// The implementation here roughly matches that which was developed by
21
/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
22
/// changes in this implementation are 1) the use of zero-based indices, 2) a
23
/// heuristic skip table based on the last byte (borrowed from Rust's standard
24
/// library) and 3) the addition of heuristics for a fast skip loop. That is,
25
/// (3) this will detect bytes that are believed to be rare in the needle and
26
/// use fast vectorized instructions to find their occurrences quickly. The
27
/// Two-Way algorithm is then used to confirm whether a match at that location
28
/// occurred.
29
///
30
/// The heuristic for fast skipping is automatically shut off if it's
31
/// detected to be ineffective at search time. Generally, this only occurs in
32
/// pathological cases. But this is generally necessary in order to preserve
33
/// a `O(n + m)` time bound.
34
///
35
/// The code below is fairly complex and not obviously correct at all. It's
36
/// likely necessary to read the Two-Way paper cited above in order to fully
37
/// grok this code. The essence of it is:
38
///
39
/// 1) Do something to detect a "critical" position in the needle.
40
/// 2) For the current position in the haystack, look if needle[critical..]
41
///    matches at that position.
42
/// 3) If so, look if needle[..critical] matches.
43
/// 4) If a mismatch occurs, shift the search by some amount based on the
44
///    critical position and a pre-computed shift.
45
///
46
/// This type is wrapped in Forward and Reverse types that expose consistent
47
/// forward or reverse APIs.
48
#[derive(Clone, Copy, Debug)]
49
struct TwoWay {
50
    /// A small bitset used as a quick prefilter (in addition to the faster
51
    /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i
52
    /// for any b in the needle.
53
    ///
54
    /// When used as a prefilter, if the last byte at the current candidate
55
    /// position is NOT in this set, then we can skip that entire candidate
56
    /// position (the length of the needle). This is essentially the shift
57
    /// trick found in Boyer-Moore, but only applied to bytes that don't appear
58
    /// in the needle.
59
    ///
60
    /// N.B. This trick was inspired by something similar in std's
61
    /// implementation of Two-Way.
62
    byteset: ApproximateByteSet,
63
    /// A critical position in needle. Specifically, this position corresponds
64
    /// to beginning of either the minimal or maximal suffix in needle. (N.B.
65
    /// See SuffixType below for why "minimal" isn't quite the correct word
66
    /// here.)
67
    ///
68
    /// This is the position at which every search begins. Namely, search
69
    /// starts by scanning text to the right of this position, and only if
70
    /// there's a match does the text to the left of this position get scanned.
71
    critical_pos: usize,
72
    /// The amount we shift by in the Two-Way search algorithm. This
73
    /// corresponds to the "small period" and "large period" cases.
74
    shift: Shift,
75
}
76
77
impl Forward {
78
    /// Create a searcher that uses the Two-Way algorithm by searching forwards
79
    /// through any haystack.
80
0
    pub(crate) fn new(needle: &[u8]) -> Forward {
81
0
        if needle.is_empty() {
82
0
            return Forward(TwoWay::empty());
83
0
        }
84
0
85
0
        let byteset = ApproximateByteSet::new(needle);
86
0
        let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
87
0
        let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
88
0
        let (period_lower_bound, critical_pos) =
89
0
            if min_suffix.pos > max_suffix.pos {
90
0
                (min_suffix.period, min_suffix.pos)
91
            } else {
92
0
                (max_suffix.period, max_suffix.pos)
93
            };
94
0
        let shift = Shift::forward(needle, period_lower_bound, critical_pos);
95
0
        Forward(TwoWay { byteset, critical_pos, shift })
96
0
    }
97
98
    /// Find the position of the first occurrence of this searcher's needle in
99
    /// the given haystack. If one does not exist, then return None.
100
    ///
101
    /// This accepts prefilter state that is useful when using the same
102
    /// searcher multiple times, such as in an iterator.
103
    ///
104
    /// Callers must guarantee that the needle is non-empty and its length is
105
    /// <= the haystack's length.
106
    #[inline(always)]
107
0
    pub(crate) fn find(
108
0
        &self,
109
0
        pre: Option<&mut Pre<'_>>,
110
0
        haystack: &[u8],
111
0
        needle: &[u8],
112
0
    ) -> Option<usize> {
113
0
        debug_assert!(!needle.is_empty(), "needle should not be empty");
114
0
        debug_assert!(needle.len() <= haystack.len(), "haystack too short");
115
116
0
        match self.0.shift {
117
0
            Shift::Small { period } => {
118
0
                self.find_small_imp(pre, haystack, needle, period)
119
            }
120
0
            Shift::Large { shift } => {
121
0
                self.find_large_imp(pre, haystack, needle, shift)
122
            }
123
        }
124
0
    }
125
126
    /// Like find, but handles the degenerate substring test cases. This is
127
    /// only useful for conveniently testing this substring implementation in
128
    /// isolation.
129
    #[cfg(test)]
130
    fn find_general(
131
        &self,
132
        pre: Option<&mut Pre<'_>>,
133
        haystack: &[u8],
134
        needle: &[u8],
135
    ) -> Option<usize> {
136
        if needle.is_empty() {
137
            Some(0)
138
        } else if haystack.len() < needle.len() {
139
            None
140
        } else {
141
            self.find(pre, haystack, needle)
142
        }
143
    }
144
145
    // Each of the two search implementations below can be accelerated by a
146
    // prefilter, but it is not always enabled. To avoid its overhead when
147
    // its disabled, we explicitly inline each search implementation based on
148
    // whether a prefilter will be used or not. The decision on which to use
149
    // is made in the parent meta searcher.
150
151
    #[inline(always)]
152
0
    fn find_small_imp(
153
0
        &self,
154
0
        mut pre: Option<&mut Pre<'_>>,
155
0
        haystack: &[u8],
156
0
        needle: &[u8],
157
0
        period: usize,
158
0
    ) -> Option<usize> {
159
0
        let last_byte = needle.len() - 1;
160
0
        let mut pos = 0;
161
0
        let mut shift = 0;
162
0
        while pos + needle.len() <= haystack.len() {
163
0
            let mut i = cmp::max(self.0.critical_pos, shift);
164
0
            if let Some(pre) = pre.as_mut() {
165
0
                if pre.should_call() {
166
0
                    pos += pre.call(&haystack[pos..], needle)?;
167
0
                    shift = 0;
168
0
                    i = self.0.critical_pos;
169
0
                    if pos + needle.len() > haystack.len() {
170
0
                        return None;
171
0
                    }
172
0
                }
173
0
            }
174
0
            if !self.0.byteset.contains(haystack[pos + last_byte]) {
175
0
                pos += needle.len();
176
0
                shift = 0;
177
0
                continue;
178
0
            }
179
0
            while i < needle.len() && needle[i] == haystack[pos + i] {
180
0
                i += 1;
181
0
            }
182
0
            if i < needle.len() {
183
0
                pos += i - self.0.critical_pos + 1;
184
0
                shift = 0;
185
0
            } else {
186
0
                let mut j = self.0.critical_pos;
187
0
                while j > shift && needle[j] == haystack[pos + j] {
188
0
                    j -= 1;
189
0
                }
190
0
                if j <= shift && needle[shift] == haystack[pos + shift] {
191
0
                    return Some(pos);
192
0
                }
193
0
                pos += period;
194
0
                shift = needle.len() - period;
195
            }
196
        }
197
0
        None
198
0
    }
199
200
    #[inline(always)]
201
0
    fn find_large_imp(
202
0
        &self,
203
0
        mut pre: Option<&mut Pre<'_>>,
204
0
        haystack: &[u8],
205
0
        needle: &[u8],
206
0
        shift: usize,
207
0
    ) -> Option<usize> {
208
0
        let last_byte = needle.len() - 1;
209
0
        let mut pos = 0;
210
0
        'outer: while pos + needle.len() <= haystack.len() {
211
0
            if let Some(pre) = pre.as_mut() {
212
0
                if pre.should_call() {
213
0
                    pos += pre.call(&haystack[pos..], needle)?;
214
0
                    if pos + needle.len() > haystack.len() {
215
0
                        return None;
216
0
                    }
217
0
                }
218
0
            }
219
220
0
            if !self.0.byteset.contains(haystack[pos + last_byte]) {
221
0
                pos += needle.len();
222
0
                continue;
223
0
            }
224
0
            let mut i = self.0.critical_pos;
225
0
            while i < needle.len() && needle[i] == haystack[pos + i] {
226
0
                i += 1;
227
0
            }
228
0
            if i < needle.len() {
229
0
                pos += i - self.0.critical_pos + 1;
230
0
            } else {
231
0
                for j in (0..self.0.critical_pos).rev() {
232
0
                    if needle[j] != haystack[pos + j] {
233
0
                        pos += shift;
234
0
                        continue 'outer;
235
0
                    }
236
                }
237
0
                return Some(pos);
238
            }
239
        }
240
0
        None
241
0
    }
242
}
243
244
impl Reverse {
245
    /// Create a searcher that uses the Two-Way algorithm by searching in
246
    /// reverse through any haystack.
247
0
    pub(crate) fn new(needle: &[u8]) -> Reverse {
248
0
        if needle.is_empty() {
249
0
            return Reverse(TwoWay::empty());
250
0
        }
251
0
252
0
        let byteset = ApproximateByteSet::new(needle);
253
0
        let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
254
0
        let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
255
0
        let (period_lower_bound, critical_pos) =
256
0
            if min_suffix.pos < max_suffix.pos {
257
0
                (min_suffix.period, min_suffix.pos)
258
            } else {
259
0
                (max_suffix.period, max_suffix.pos)
260
            };
261
        // let critical_pos = needle.len() - critical_pos;
262
0
        let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
263
0
        Reverse(TwoWay { byteset, critical_pos, shift })
264
0
    }
265
266
    /// Find the position of the last occurrence of this searcher's needle
267
    /// in the given haystack. If one does not exist, then return None.
268
    ///
269
    /// This will automatically initialize prefilter state. This should only
270
    /// be used for one-off searches.
271
    ///
272
    /// Callers must guarantee that the needle is non-empty and its length is
273
    /// <= the haystack's length.
274
    #[inline(always)]
275
0
    pub(crate) fn rfind(
276
0
        &self,
277
0
        haystack: &[u8],
278
0
        needle: &[u8],
279
0
    ) -> Option<usize> {
280
0
        debug_assert!(!needle.is_empty(), "needle should not be empty");
281
0
        debug_assert!(needle.len() <= haystack.len(), "haystack too short");
282
        // For the reverse case, we don't use a prefilter. It's plausible that
283
        // perhaps we should, but it's a lot of additional code to do it, and
284
        // it's not clear that it's actually worth it. If you have a really
285
        // compelling use case for this, please file an issue.
286
0
        match self.0.shift {
287
0
            Shift::Small { period } => {
288
0
                self.rfind_small_imp(haystack, needle, period)
289
            }
290
0
            Shift::Large { shift } => {
291
0
                self.rfind_large_imp(haystack, needle, shift)
292
            }
293
        }
294
0
    }
295
296
    /// Like rfind, but handles the degenerate substring test cases. This is
297
    /// only useful for conveniently testing this substring implementation in
298
    /// isolation.
299
    #[cfg(test)]
300
    fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
301
        if needle.is_empty() {
302
            Some(haystack.len())
303
        } else if haystack.len() < needle.len() {
304
            None
305
        } else {
306
            self.rfind(haystack, needle)
307
        }
308
    }
309
310
    #[inline(always)]
311
0
    fn rfind_small_imp(
312
0
        &self,
313
0
        haystack: &[u8],
314
0
        needle: &[u8],
315
0
        period: usize,
316
0
    ) -> Option<usize> {
317
0
        let nlen = needle.len();
318
0
        let mut pos = haystack.len();
319
0
        let mut shift = nlen;
320
0
        while pos >= nlen {
321
0
            if !self.0.byteset.contains(haystack[pos - nlen]) {
322
0
                pos -= nlen;
323
0
                shift = nlen;
324
0
                continue;
325
0
            }
326
0
            let mut i = cmp::min(self.0.critical_pos, shift);
327
0
            while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
328
0
                i -= 1;
329
0
            }
330
0
            if i > 0 || needle[0] != haystack[pos - nlen] {
331
0
                pos -= self.0.critical_pos - i + 1;
332
0
                shift = nlen;
333
0
            } else {
334
0
                let mut j = self.0.critical_pos;
335
0
                while j < shift && needle[j] == haystack[pos - nlen + j] {
336
0
                    j += 1;
337
0
                }
338
0
                if j >= shift {
339
0
                    return Some(pos - nlen);
340
0
                }
341
0
                pos -= period;
342
0
                shift = period;
343
            }
344
        }
345
0
        None
346
0
    }
347
348
    #[inline(always)]
349
0
    fn rfind_large_imp(
350
0
        &self,
351
0
        haystack: &[u8],
352
0
        needle: &[u8],
353
0
        shift: usize,
354
0
    ) -> Option<usize> {
355
0
        let nlen = needle.len();
356
0
        let mut pos = haystack.len();
357
0
        while pos >= nlen {
358
0
            if !self.0.byteset.contains(haystack[pos - nlen]) {
359
0
                pos -= nlen;
360
0
                continue;
361
0
            }
362
0
            let mut i = self.0.critical_pos;
363
0
            while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
364
0
                i -= 1;
365
0
            }
366
0
            if i > 0 || needle[0] != haystack[pos - nlen] {
367
0
                pos -= self.0.critical_pos - i + 1;
368
0
            } else {
369
0
                let mut j = self.0.critical_pos;
370
0
                while j < nlen && needle[j] == haystack[pos - nlen + j] {
371
0
                    j += 1;
372
0
                }
373
0
                if j == nlen {
374
0
                    return Some(pos - nlen);
375
0
                }
376
0
                pos -= shift;
377
            }
378
        }
379
0
        None
380
0
    }
381
}
382
383
impl TwoWay {
384
0
    fn empty() -> TwoWay {
385
0
        TwoWay {
386
0
            byteset: ApproximateByteSet::new(b""),
387
0
            critical_pos: 0,
388
0
            shift: Shift::Large { shift: 0 },
389
0
        }
390
0
    }
391
}
392
393
/// A representation of the amount we're allowed to shift by during Two-Way
394
/// search.
395
///
396
/// When computing a critical factorization of the needle, we find the position
397
/// of the critical factorization by finding the needle's maximal (or minimal)
398
/// suffix, along with the period of that suffix. It turns out that the period
399
/// of that suffix is a lower bound on the period of the needle itself.
400
///
401
/// This lower bound is equivalent to the actual period of the needle in
402
/// some cases. To describe that case, we denote the needle as `x` where
403
/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
404
/// bound given here is always the period of `v`, which is `<= period(x)`. The
405
/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
406
/// where `u` is a suffix of `v[0..period(v)]`.
407
///
408
/// This case is important because the search algorithm for when the
409
/// periods are equivalent is slightly different than the search algorithm
410
/// for when the periods are not equivalent. In particular, when they aren't
411
/// equivalent, we know that the period of the needle is no less than half its
412
/// length. In this case, we shift by an amount less than or equal to the
413
/// period of the needle (determined by the maximum length of the components
414
/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
415
///
416
/// The above two cases are represented by the variants below. Each entails
417
/// a different instantiation of the Two-Way search algorithm.
418
///
419
/// N.B. If we could find a way to compute the exact period in all cases,
420
/// then we could collapse this case analysis and simplify the algorithm. The
421
/// Two-Way paper suggests this is possible, but more reading is required to
422
/// grok why the authors didn't pursue that path.
423
#[derive(Clone, Copy, Debug)]
424
enum Shift {
425
    Small { period: usize },
426
    Large { shift: usize },
427
}
428
429
impl Shift {
430
    /// Compute the shift for a given needle in the forward direction.
431
    ///
432
    /// This requires a lower bound on the period and a critical position.
433
    /// These can be computed by extracting both the minimal and maximal
434
    /// lexicographic suffixes, and choosing the right-most starting position.
435
    /// The lower bound on the period is then the period of the chosen suffix.
436
0
    fn forward(
437
0
        needle: &[u8],
438
0
        period_lower_bound: usize,
439
0
        critical_pos: usize,
440
0
    ) -> Shift {
441
0
        let large = cmp::max(critical_pos, needle.len() - critical_pos);
442
0
        if critical_pos * 2 >= needle.len() {
443
0
            return Shift::Large { shift: large };
444
0
        }
445
0
446
0
        let (u, v) = needle.split_at(critical_pos);
447
0
        if !util::is_suffix(&v[..period_lower_bound], u) {
448
0
            return Shift::Large { shift: large };
449
0
        }
450
0
        Shift::Small { period: period_lower_bound }
451
0
    }
452
453
    /// Compute the shift for a given needle in the reverse direction.
454
    ///
455
    /// This requires a lower bound on the period and a critical position.
456
    /// These can be computed by extracting both the minimal and maximal
457
    /// lexicographic suffixes, and choosing the left-most starting position.
458
    /// The lower bound on the period is then the period of the chosen suffix.
459
0
    fn reverse(
460
0
        needle: &[u8],
461
0
        period_lower_bound: usize,
462
0
        critical_pos: usize,
463
0
    ) -> Shift {
464
0
        let large = cmp::max(critical_pos, needle.len() - critical_pos);
465
0
        if (needle.len() - critical_pos) * 2 >= needle.len() {
466
0
            return Shift::Large { shift: large };
467
0
        }
468
0
469
0
        let (v, u) = needle.split_at(critical_pos);
470
0
        if !util::is_prefix(&v[v.len() - period_lower_bound..], u) {
471
0
            return Shift::Large { shift: large };
472
0
        }
473
0
        Shift::Small { period: period_lower_bound }
474
0
    }
475
}
476
477
/// A suffix extracted from a needle along with its period.
478
#[derive(Debug)]
479
struct Suffix {
480
    /// The starting position of this suffix.
481
    ///
482
    /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
483
    /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
484
    /// forward suffixes, this is an inclusive starting position, where as for
485
    /// reverse suffixes, this is an exclusive ending position.
486
    pos: usize,
487
    /// The period of this suffix.
488
    ///
489
    /// Note that this is NOT necessarily the period of the string from which
490
    /// this suffix comes from. (It is always less than or equal to the period
491
    /// of the original string.)
492
    period: usize,
493
}
494
495
impl Suffix {
496
0
    fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
497
0
        debug_assert!(!needle.is_empty());
498
499
        // suffix represents our maximal (or minimal) suffix, along with
500
        // its period.
501
0
        let mut suffix = Suffix { pos: 0, period: 1 };
502
0
        // The start of a suffix in `needle` that we are considering as a
503
0
        // more maximal (or minimal) suffix than what's in `suffix`.
504
0
        let mut candidate_start = 1;
505
0
        // The current offset of our suffixes that we're comparing.
506
0
        //
507
0
        // When the characters at this offset are the same, then we mush on
508
0
        // to the next position since no decision is possible. When the
509
0
        // candidate's character is greater (or lesser) than the corresponding
510
0
        // character than our current maximal (or minimal) suffix, then the
511
0
        // current suffix is changed over to the candidate and we restart our
512
0
        // search. Otherwise, the candidate suffix is no good and we restart
513
0
        // our search on the next candidate.
514
0
        //
515
0
        // The three cases above correspond to the three cases in the loop
516
0
        // below.
517
0
        let mut offset = 0;
518
519
0
        while candidate_start + offset < needle.len() {
520
0
            let current = needle[suffix.pos + offset];
521
0
            let candidate = needle[candidate_start + offset];
522
0
            match kind.cmp(current, candidate) {
523
0
                SuffixOrdering::Accept => {
524
0
                    suffix = Suffix { pos: candidate_start, period: 1 };
525
0
                    candidate_start += 1;
526
0
                    offset = 0;
527
0
                }
528
0
                SuffixOrdering::Skip => {
529
0
                    candidate_start += offset + 1;
530
0
                    offset = 0;
531
0
                    suffix.period = candidate_start - suffix.pos;
532
0
                }
533
                SuffixOrdering::Push => {
534
0
                    if offset + 1 == suffix.period {
535
0
                        candidate_start += suffix.period;
536
0
                        offset = 0;
537
0
                    } else {
538
0
                        offset += 1;
539
0
                    }
540
                }
541
            }
542
        }
543
0
        suffix
544
0
    }
545
546
0
    fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
547
0
        debug_assert!(!needle.is_empty());
548
549
        // See the comments in `forward` for how this works.
550
0
        let mut suffix = Suffix { pos: needle.len(), period: 1 };
551
0
        if needle.len() == 1 {
552
0
            return suffix;
553
0
        }
554
0
        let mut candidate_start = needle.len() - 1;
555
0
        let mut offset = 0;
556
557
0
        while offset < candidate_start {
558
0
            let current = needle[suffix.pos - offset - 1];
559
0
            let candidate = needle[candidate_start - offset - 1];
560
0
            match kind.cmp(current, candidate) {
561
0
                SuffixOrdering::Accept => {
562
0
                    suffix = Suffix { pos: candidate_start, period: 1 };
563
0
                    candidate_start -= 1;
564
0
                    offset = 0;
565
0
                }
566
0
                SuffixOrdering::Skip => {
567
0
                    candidate_start -= offset + 1;
568
0
                    offset = 0;
569
0
                    suffix.period = suffix.pos - candidate_start;
570
0
                }
571
                SuffixOrdering::Push => {
572
0
                    if offset + 1 == suffix.period {
573
0
                        candidate_start -= suffix.period;
574
0
                        offset = 0;
575
0
                    } else {
576
0
                        offset += 1;
577
0
                    }
578
                }
579
            }
580
        }
581
0
        suffix
582
0
    }
583
}
584
585
/// The kind of suffix to extract.
586
#[derive(Clone, Copy, Debug)]
587
enum SuffixKind {
588
    /// Extract the smallest lexicographic suffix from a string.
589
    ///
590
    /// Technically, this doesn't actually pick the smallest lexicographic
591
    /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
592
    /// the latter over the former, even though `a < aa`. The reasoning for
593
    /// this isn't clear from the paper, but it still smells like a minimal
594
    /// suffix.
595
    Minimal,
596
    /// Extract the largest lexicographic suffix from a string.
597
    ///
598
    /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
599
    /// the choice between `z` and `zz`, this will choose the latter over the
600
    /// former.
601
    Maximal,
602
}
603
604
/// The result of comparing corresponding bytes between two suffixes.
605
#[derive(Clone, Copy, Debug)]
606
enum SuffixOrdering {
607
    /// This occurs when the given candidate byte indicates that the candidate
608
    /// suffix is better than the current maximal (or minimal) suffix. That is,
609
    /// the current candidate suffix should supplant the current maximal (or
610
    /// minimal) suffix.
611
    Accept,
612
    /// This occurs when the given candidate byte excludes the candidate suffix
613
    /// from being better than the current maximal (or minimal) suffix. That
614
    /// is, the current candidate suffix should be dropped and the next one
615
    /// should be considered.
616
    Skip,
617
    /// This occurs when no decision to accept or skip the candidate suffix
618
    /// can be made, e.g., when corresponding bytes are equivalent. In this
619
    /// case, the next corresponding bytes should be compared.
620
    Push,
621
}
622
623
impl SuffixKind {
624
    /// Returns true if and only if the given candidate byte indicates that
625
    /// it should replace the current suffix as the maximal (or minimal)
626
    /// suffix.
627
0
    fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
628
        use self::SuffixOrdering::*;
629
630
0
        match self {
631
0
            SuffixKind::Minimal if candidate < current => Accept,
632
0
            SuffixKind::Minimal if candidate > current => Skip,
633
0
            SuffixKind::Minimal => Push,
634
0
            SuffixKind::Maximal if candidate > current => Accept,
635
0
            SuffixKind::Maximal if candidate < current => Skip,
636
0
            SuffixKind::Maximal => Push,
637
        }
638
0
    }
639
}
640
641
/// A bitset used to track whether a particular byte exists in a needle or not.
642
///
643
/// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
644
/// needle. If a particular byte in the haystack is NOT in this set, then one
645
/// can conclude that it is also not in the needle, and thus, one can advance
646
/// in the haystack by needle.len() bytes.
647
#[derive(Clone, Copy, Debug)]
648
struct ApproximateByteSet(u64);
649
650
impl ApproximateByteSet {
651
    /// Create a new set from the given needle.
652
0
    fn new(needle: &[u8]) -> ApproximateByteSet {
653
0
        let mut bits = 0;
654
0
        for &b in needle {
655
0
            bits |= 1 << (b % 64);
656
0
        }
657
0
        ApproximateByteSet(bits)
658
0
    }
659
660
    /// Return true if and only if the given byte might be in this set. This
661
    /// may return a false positive, but will never return a false negative.
662
    #[inline(always)]
663
0
    fn contains(&self, byte: u8) -> bool {
664
0
        self.0 & (1 << (byte % 64)) != 0
665
0
    }
666
}
667
668
#[cfg(all(test, feature = "std", not(miri)))]
669
mod tests {
670
    use quickcheck::quickcheck;
671
672
    use super::*;
673
674
    define_memmem_quickcheck_tests!(
675
        super::simpletests::twoway_find,
676
        super::simpletests::twoway_rfind
677
    );
678
679
    /// Convenience wrapper for computing the suffix as a byte string.
680
    fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
681
        let s = Suffix::forward(needle, kind);
682
        (&needle[s.pos..], s.period)
683
    }
684
685
    /// Convenience wrapper for computing the reverse suffix as a byte string.
686
    fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
687
        let s = Suffix::reverse(needle, kind);
688
        (&needle[..s.pos], s.period)
689
    }
690
691
    /// Return all of the non-empty suffixes in the given byte string.
692
    fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
693
        (0..bytes.len()).map(|i| &bytes[i..]).collect()
694
    }
695
696
    /// Return the lexicographically maximal suffix of the given byte string.
697
    fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
698
        let mut sufs = suffixes(needle);
699
        sufs.sort();
700
        sufs.pop().unwrap()
701
    }
702
703
    /// Return the lexicographically maximal suffix of the reverse of the given
704
    /// byte string.
705
    fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
706
        let mut reversed = needle.to_vec();
707
        reversed.reverse();
708
        let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
709
        got.reverse();
710
        got
711
    }
712
713
    #[test]
714
    fn suffix_forward() {
715
        macro_rules! assert_suffix_min {
716
            ($given:expr, $expected:expr, $period:expr) => {
717
                let (got_suffix, got_period) =
718
                    get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
719
                let got_suffix = std::str::from_utf8(got_suffix).unwrap();
720
                assert_eq!(($expected, $period), (got_suffix, got_period));
721
            };
722
        }
723
724
        macro_rules! assert_suffix_max {
725
            ($given:expr, $expected:expr, $period:expr) => {
726
                let (got_suffix, got_period) =
727
                    get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
728
                let got_suffix = std::str::from_utf8(got_suffix).unwrap();
729
                assert_eq!(($expected, $period), (got_suffix, got_period));
730
            };
731
        }
732
733
        assert_suffix_min!("a", "a", 1);
734
        assert_suffix_max!("a", "a", 1);
735
736
        assert_suffix_min!("ab", "ab", 2);
737
        assert_suffix_max!("ab", "b", 1);
738
739
        assert_suffix_min!("ba", "a", 1);
740
        assert_suffix_max!("ba", "ba", 2);
741
742
        assert_suffix_min!("abc", "abc", 3);
743
        assert_suffix_max!("abc", "c", 1);
744
745
        assert_suffix_min!("acb", "acb", 3);
746
        assert_suffix_max!("acb", "cb", 2);
747
748
        assert_suffix_min!("cba", "a", 1);
749
        assert_suffix_max!("cba", "cba", 3);
750
751
        assert_suffix_min!("abcabc", "abcabc", 3);
752
        assert_suffix_max!("abcabc", "cabc", 3);
753
754
        assert_suffix_min!("abcabcabc", "abcabcabc", 3);
755
        assert_suffix_max!("abcabcabc", "cabcabc", 3);
756
757
        assert_suffix_min!("abczz", "abczz", 5);
758
        assert_suffix_max!("abczz", "zz", 1);
759
760
        assert_suffix_min!("zzabc", "abc", 3);
761
        assert_suffix_max!("zzabc", "zzabc", 5);
762
763
        assert_suffix_min!("aaa", "aaa", 1);
764
        assert_suffix_max!("aaa", "aaa", 1);
765
766
        assert_suffix_min!("foobar", "ar", 2);
767
        assert_suffix_max!("foobar", "r", 1);
768
    }
769
770
    #[test]
771
    fn suffix_reverse() {
772
        macro_rules! assert_suffix_min {
773
            ($given:expr, $expected:expr, $period:expr) => {
774
                let (got_suffix, got_period) =
775
                    get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
776
                let got_suffix = std::str::from_utf8(got_suffix).unwrap();
777
                assert_eq!(($expected, $period), (got_suffix, got_period));
778
            };
779
        }
780
781
        macro_rules! assert_suffix_max {
782
            ($given:expr, $expected:expr, $period:expr) => {
783
                let (got_suffix, got_period) =
784
                    get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
785
                let got_suffix = std::str::from_utf8(got_suffix).unwrap();
786
                assert_eq!(($expected, $period), (got_suffix, got_period));
787
            };
788
        }
789
790
        assert_suffix_min!("a", "a", 1);
791
        assert_suffix_max!("a", "a", 1);
792
793
        assert_suffix_min!("ab", "a", 1);
794
        assert_suffix_max!("ab", "ab", 2);
795
796
        assert_suffix_min!("ba", "ba", 2);
797
        assert_suffix_max!("ba", "b", 1);
798
799
        assert_suffix_min!("abc", "a", 1);
800
        assert_suffix_max!("abc", "abc", 3);
801
802
        assert_suffix_min!("acb", "a", 1);
803
        assert_suffix_max!("acb", "ac", 2);
804
805
        assert_suffix_min!("cba", "cba", 3);
806
        assert_suffix_max!("cba", "c", 1);
807
808
        assert_suffix_min!("abcabc", "abca", 3);
809
        assert_suffix_max!("abcabc", "abcabc", 3);
810
811
        assert_suffix_min!("abcabcabc", "abcabca", 3);
812
        assert_suffix_max!("abcabcabc", "abcabcabc", 3);
813
814
        assert_suffix_min!("abczz", "a", 1);
815
        assert_suffix_max!("abczz", "abczz", 5);
816
817
        assert_suffix_min!("zzabc", "zza", 3);
818
        assert_suffix_max!("zzabc", "zz", 1);
819
820
        assert_suffix_min!("aaa", "aaa", 1);
821
        assert_suffix_max!("aaa", "aaa", 1);
822
    }
823
824
    quickcheck! {
825
        fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
826
            if bytes.is_empty() {
827
                return true;
828
            }
829
830
            let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
831
            let expected = naive_maximal_suffix_forward(&bytes);
832
            got == expected
833
        }
834
835
        fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
836
            if bytes.is_empty() {
837
                return true;
838
            }
839
840
            let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
841
            let expected = naive_maximal_suffix_reverse(&bytes);
842
            expected == got
843
        }
844
    }
845
}
846
847
#[cfg(test)]
848
mod simpletests {
849
    use super::*;
850
851
    pub(crate) fn twoway_find(
852
        haystack: &[u8],
853
        needle: &[u8],
854
    ) -> Option<usize> {
855
        Forward::new(needle).find_general(None, haystack, needle)
856
    }
857
858
    pub(crate) fn twoway_rfind(
859
        haystack: &[u8],
860
        needle: &[u8],
861
    ) -> Option<usize> {
862
        Reverse::new(needle).rfind_general(haystack, needle)
863
    }
864
865
    define_memmem_simple_tests!(twoway_find, twoway_rfind);
866
867
    // This is a regression test caught by quickcheck that exercised a bug in
868
    // the reverse small period handling. The bug was that we were using 'if j
869
    // == shift' to determine if a match occurred, but the correct guard is 'if
870
    // j >= shift', which matches the corresponding guard in the forward impl.
871
    #[test]
872
    fn regression_rev_small_period() {
873
        let rfind = super::simpletests::twoway_rfind;
874
        let haystack = "ababaz";
875
        let needle = "abab";
876
        assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
877
    }
878
}