/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.4.1/src/memmem/twoway.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use core::cmp; |
2 | | |
3 | | use crate::memmem::{prefilter::Pre, util}; |
4 | | |
5 | | /// Two-Way search in the forward direction. |
6 | | #[derive(Clone, Copy, Debug)] |
7 | | pub(crate) struct Forward(TwoWay); |
8 | | |
9 | | /// Two-Way search in the reverse direction. |
10 | | #[derive(Clone, Copy, Debug)] |
11 | | pub(crate) struct Reverse(TwoWay); |
12 | | |
13 | | /// An implementation of the TwoWay substring search algorithm, with heuristics |
14 | | /// for accelerating search based on frequency analysis. |
15 | | /// |
16 | | /// This searcher supports forward and reverse search, although not |
17 | | /// simultaneously. It runs in O(n + m) time and O(1) space, where |
18 | | /// `n ~ len(needle)` and `m ~ len(haystack)`. |
19 | | /// |
20 | | /// The implementation here roughly matches that which was developed by |
21 | | /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The |
22 | | /// changes in this implementation are 1) the use of zero-based indices, 2) a |
23 | | /// heuristic skip table based on the last byte (borrowed from Rust's standard |
24 | | /// library) and 3) the addition of heuristics for a fast skip loop. That is, |
25 | | /// (3) this will detect bytes that are believed to be rare in the needle and |
26 | | /// use fast vectorized instructions to find their occurrences quickly. The |
27 | | /// Two-Way algorithm is then used to confirm whether a match at that location |
28 | | /// occurred. |
29 | | /// |
30 | | /// The heuristic for fast skipping is automatically shut off if it's |
31 | | /// detected to be ineffective at search time. Generally, this only occurs in |
32 | | /// pathological cases. But this is generally necessary in order to preserve |
33 | | /// a `O(n + m)` time bound. |
34 | | /// |
35 | | /// The code below is fairly complex and not obviously correct at all. It's |
36 | | /// likely necessary to read the Two-Way paper cited above in order to fully |
37 | | /// grok this code. The essence of it is: |
38 | | /// |
39 | | /// 1) Do something to detect a "critical" position in the needle. |
40 | | /// 2) For the current position in the haystack, look if needle[critical..] |
41 | | /// matches at that position. |
42 | | /// 3) If so, look if needle[..critical] matches. |
43 | | /// 4) If a mismatch occurs, shift the search by some amount based on the |
44 | | /// critical position and a pre-computed shift. |
45 | | /// |
46 | | /// This type is wrapped in Forward and Reverse types that expose consistent |
47 | | /// forward or reverse APIs. |
48 | | #[derive(Clone, Copy, Debug)] |
49 | | struct TwoWay { |
50 | | /// A small bitset used as a quick prefilter (in addition to the faster |
51 | | /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i |
52 | | /// for any b in the needle. |
53 | | /// |
54 | | /// When used as a prefilter, if the last byte at the current candidate |
55 | | /// position is NOT in this set, then we can skip that entire candidate |
56 | | /// position (the length of the needle). This is essentially the shift |
57 | | /// trick found in Boyer-Moore, but only applied to bytes that don't appear |
58 | | /// in the needle. |
59 | | /// |
60 | | /// N.B. This trick was inspired by something similar in std's |
61 | | /// implementation of Two-Way. |
62 | | byteset: ApproximateByteSet, |
63 | | /// A critical position in needle. Specifically, this position corresponds |
64 | | /// to beginning of either the minimal or maximal suffix in needle. (N.B. |
65 | | /// See SuffixType below for why "minimal" isn't quite the correct word |
66 | | /// here.) |
67 | | /// |
68 | | /// This is the position at which every search begins. Namely, search |
69 | | /// starts by scanning text to the right of this position, and only if |
70 | | /// there's a match does the text to the left of this position get scanned. |
71 | | critical_pos: usize, |
72 | | /// The amount we shift by in the Two-Way search algorithm. This |
73 | | /// corresponds to the "small period" and "large period" cases. |
74 | | shift: Shift, |
75 | | } |
76 | | |
77 | | impl Forward { |
78 | | /// Create a searcher that uses the Two-Way algorithm by searching forwards |
79 | | /// through any haystack. |
80 | 0 | pub(crate) fn new(needle: &[u8]) -> Forward { |
81 | 0 | if needle.is_empty() { |
82 | 0 | return Forward(TwoWay::empty()); |
83 | 0 | } |
84 | 0 |
|
85 | 0 | let byteset = ApproximateByteSet::new(needle); |
86 | 0 | let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); |
87 | 0 | let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); |
88 | 0 | let (period_lower_bound, critical_pos) = |
89 | 0 | if min_suffix.pos > max_suffix.pos { |
90 | 0 | (min_suffix.period, min_suffix.pos) |
91 | | } else { |
92 | 0 | (max_suffix.period, max_suffix.pos) |
93 | | }; |
94 | 0 | let shift = Shift::forward(needle, period_lower_bound, critical_pos); |
95 | 0 | Forward(TwoWay { byteset, critical_pos, shift }) |
96 | 0 | } |
97 | | |
98 | | /// Find the position of the first occurrence of this searcher's needle in |
99 | | /// the given haystack. If one does not exist, then return None. |
100 | | /// |
101 | | /// This accepts prefilter state that is useful when using the same |
102 | | /// searcher multiple times, such as in an iterator. |
103 | | /// |
104 | | /// Callers must guarantee that the needle is non-empty and its length is |
105 | | /// <= the haystack's length. |
106 | | #[inline(always)] |
107 | 0 | pub(crate) fn find( |
108 | 0 | &self, |
109 | 0 | pre: Option<&mut Pre<'_>>, |
110 | 0 | haystack: &[u8], |
111 | 0 | needle: &[u8], |
112 | 0 | ) -> Option<usize> { |
113 | 0 | debug_assert!(!needle.is_empty(), "needle should not be empty"); |
114 | 0 | debug_assert!(needle.len() <= haystack.len(), "haystack too short"); |
115 | | |
116 | 0 | match self.0.shift { |
117 | 0 | Shift::Small { period } => { |
118 | 0 | self.find_small_imp(pre, haystack, needle, period) |
119 | | } |
120 | 0 | Shift::Large { shift } => { |
121 | 0 | self.find_large_imp(pre, haystack, needle, shift) |
122 | | } |
123 | | } |
124 | 0 | } |
125 | | |
126 | | /// Like find, but handles the degenerate substring test cases. This is |
127 | | /// only useful for conveniently testing this substring implementation in |
128 | | /// isolation. |
129 | | #[cfg(test)] |
130 | | fn find_general( |
131 | | &self, |
132 | | pre: Option<&mut Pre<'_>>, |
133 | | haystack: &[u8], |
134 | | needle: &[u8], |
135 | | ) -> Option<usize> { |
136 | | if needle.is_empty() { |
137 | | Some(0) |
138 | | } else if haystack.len() < needle.len() { |
139 | | None |
140 | | } else { |
141 | | self.find(pre, haystack, needle) |
142 | | } |
143 | | } |
144 | | |
145 | | // Each of the two search implementations below can be accelerated by a |
146 | | // prefilter, but it is not always enabled. To avoid its overhead when |
147 | | // its disabled, we explicitly inline each search implementation based on |
148 | | // whether a prefilter will be used or not. The decision on which to use |
149 | | // is made in the parent meta searcher. |
150 | | |
151 | | #[inline(always)] |
152 | 0 | fn find_small_imp( |
153 | 0 | &self, |
154 | 0 | mut pre: Option<&mut Pre<'_>>, |
155 | 0 | haystack: &[u8], |
156 | 0 | needle: &[u8], |
157 | 0 | period: usize, |
158 | 0 | ) -> Option<usize> { |
159 | 0 | let last_byte = needle.len() - 1; |
160 | 0 | let mut pos = 0; |
161 | 0 | let mut shift = 0; |
162 | 0 | while pos + needle.len() <= haystack.len() { |
163 | 0 | let mut i = cmp::max(self.0.critical_pos, shift); |
164 | 0 | if let Some(pre) = pre.as_mut() { |
165 | 0 | if pre.should_call() { |
166 | 0 | pos += pre.call(&haystack[pos..], needle)?; |
167 | 0 | shift = 0; |
168 | 0 | i = self.0.critical_pos; |
169 | 0 | if pos + needle.len() > haystack.len() { |
170 | 0 | return None; |
171 | 0 | } |
172 | 0 | } |
173 | 0 | } |
174 | 0 | if !self.0.byteset.contains(haystack[pos + last_byte]) { |
175 | 0 | pos += needle.len(); |
176 | 0 | shift = 0; |
177 | 0 | continue; |
178 | 0 | } |
179 | 0 | while i < needle.len() && needle[i] == haystack[pos + i] { |
180 | 0 | i += 1; |
181 | 0 | } |
182 | 0 | if i < needle.len() { |
183 | 0 | pos += i - self.0.critical_pos + 1; |
184 | 0 | shift = 0; |
185 | 0 | } else { |
186 | 0 | let mut j = self.0.critical_pos; |
187 | 0 | while j > shift && needle[j] == haystack[pos + j] { |
188 | 0 | j -= 1; |
189 | 0 | } |
190 | 0 | if j <= shift && needle[shift] == haystack[pos + shift] { |
191 | 0 | return Some(pos); |
192 | 0 | } |
193 | 0 | pos += period; |
194 | 0 | shift = needle.len() - period; |
195 | | } |
196 | | } |
197 | 0 | None |
198 | 0 | } |
199 | | |
200 | | #[inline(always)] |
201 | 0 | fn find_large_imp( |
202 | 0 | &self, |
203 | 0 | mut pre: Option<&mut Pre<'_>>, |
204 | 0 | haystack: &[u8], |
205 | 0 | needle: &[u8], |
206 | 0 | shift: usize, |
207 | 0 | ) -> Option<usize> { |
208 | 0 | let last_byte = needle.len() - 1; |
209 | 0 | let mut pos = 0; |
210 | 0 | 'outer: while pos + needle.len() <= haystack.len() { |
211 | 0 | if let Some(pre) = pre.as_mut() { |
212 | 0 | if pre.should_call() { |
213 | 0 | pos += pre.call(&haystack[pos..], needle)?; |
214 | 0 | if pos + needle.len() > haystack.len() { |
215 | 0 | return None; |
216 | 0 | } |
217 | 0 | } |
218 | 0 | } |
219 | | |
220 | 0 | if !self.0.byteset.contains(haystack[pos + last_byte]) { |
221 | 0 | pos += needle.len(); |
222 | 0 | continue; |
223 | 0 | } |
224 | 0 | let mut i = self.0.critical_pos; |
225 | 0 | while i < needle.len() && needle[i] == haystack[pos + i] { |
226 | 0 | i += 1; |
227 | 0 | } |
228 | 0 | if i < needle.len() { |
229 | 0 | pos += i - self.0.critical_pos + 1; |
230 | 0 | } else { |
231 | 0 | for j in (0..self.0.critical_pos).rev() { |
232 | 0 | if needle[j] != haystack[pos + j] { |
233 | 0 | pos += shift; |
234 | 0 | continue 'outer; |
235 | 0 | } |
236 | | } |
237 | 0 | return Some(pos); |
238 | | } |
239 | | } |
240 | 0 | None |
241 | 0 | } |
242 | | } |
243 | | |
244 | | impl Reverse { |
245 | | /// Create a searcher that uses the Two-Way algorithm by searching in |
246 | | /// reverse through any haystack. |
247 | 0 | pub(crate) fn new(needle: &[u8]) -> Reverse { |
248 | 0 | if needle.is_empty() { |
249 | 0 | return Reverse(TwoWay::empty()); |
250 | 0 | } |
251 | 0 |
|
252 | 0 | let byteset = ApproximateByteSet::new(needle); |
253 | 0 | let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); |
254 | 0 | let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); |
255 | 0 | let (period_lower_bound, critical_pos) = |
256 | 0 | if min_suffix.pos < max_suffix.pos { |
257 | 0 | (min_suffix.period, min_suffix.pos) |
258 | | } else { |
259 | 0 | (max_suffix.period, max_suffix.pos) |
260 | | }; |
261 | | // let critical_pos = needle.len() - critical_pos; |
262 | 0 | let shift = Shift::reverse(needle, period_lower_bound, critical_pos); |
263 | 0 | Reverse(TwoWay { byteset, critical_pos, shift }) |
264 | 0 | } |
265 | | |
266 | | /// Find the position of the last occurrence of this searcher's needle |
267 | | /// in the given haystack. If one does not exist, then return None. |
268 | | /// |
269 | | /// This will automatically initialize prefilter state. This should only |
270 | | /// be used for one-off searches. |
271 | | /// |
272 | | /// Callers must guarantee that the needle is non-empty and its length is |
273 | | /// <= the haystack's length. |
274 | | #[inline(always)] |
275 | 0 | pub(crate) fn rfind( |
276 | 0 | &self, |
277 | 0 | haystack: &[u8], |
278 | 0 | needle: &[u8], |
279 | 0 | ) -> Option<usize> { |
280 | 0 | debug_assert!(!needle.is_empty(), "needle should not be empty"); |
281 | 0 | debug_assert!(needle.len() <= haystack.len(), "haystack too short"); |
282 | | // For the reverse case, we don't use a prefilter. It's plausible that |
283 | | // perhaps we should, but it's a lot of additional code to do it, and |
284 | | // it's not clear that it's actually worth it. If you have a really |
285 | | // compelling use case for this, please file an issue. |
286 | 0 | match self.0.shift { |
287 | 0 | Shift::Small { period } => { |
288 | 0 | self.rfind_small_imp(haystack, needle, period) |
289 | | } |
290 | 0 | Shift::Large { shift } => { |
291 | 0 | self.rfind_large_imp(haystack, needle, shift) |
292 | | } |
293 | | } |
294 | 0 | } |
295 | | |
296 | | /// Like rfind, but handles the degenerate substring test cases. This is |
297 | | /// only useful for conveniently testing this substring implementation in |
298 | | /// isolation. |
299 | | #[cfg(test)] |
300 | | fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { |
301 | | if needle.is_empty() { |
302 | | Some(haystack.len()) |
303 | | } else if haystack.len() < needle.len() { |
304 | | None |
305 | | } else { |
306 | | self.rfind(haystack, needle) |
307 | | } |
308 | | } |
309 | | |
310 | | #[inline(always)] |
311 | 0 | fn rfind_small_imp( |
312 | 0 | &self, |
313 | 0 | haystack: &[u8], |
314 | 0 | needle: &[u8], |
315 | 0 | period: usize, |
316 | 0 | ) -> Option<usize> { |
317 | 0 | let nlen = needle.len(); |
318 | 0 | let mut pos = haystack.len(); |
319 | 0 | let mut shift = nlen; |
320 | 0 | while pos >= nlen { |
321 | 0 | if !self.0.byteset.contains(haystack[pos - nlen]) { |
322 | 0 | pos -= nlen; |
323 | 0 | shift = nlen; |
324 | 0 | continue; |
325 | 0 | } |
326 | 0 | let mut i = cmp::min(self.0.critical_pos, shift); |
327 | 0 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |
328 | 0 | i -= 1; |
329 | 0 | } |
330 | 0 | if i > 0 || needle[0] != haystack[pos - nlen] { |
331 | 0 | pos -= self.0.critical_pos - i + 1; |
332 | 0 | shift = nlen; |
333 | 0 | } else { |
334 | 0 | let mut j = self.0.critical_pos; |
335 | 0 | while j < shift && needle[j] == haystack[pos - nlen + j] { |
336 | 0 | j += 1; |
337 | 0 | } |
338 | 0 | if j >= shift { |
339 | 0 | return Some(pos - nlen); |
340 | 0 | } |
341 | 0 | pos -= period; |
342 | 0 | shift = period; |
343 | | } |
344 | | } |
345 | 0 | None |
346 | 0 | } |
347 | | |
348 | | #[inline(always)] |
349 | 0 | fn rfind_large_imp( |
350 | 0 | &self, |
351 | 0 | haystack: &[u8], |
352 | 0 | needle: &[u8], |
353 | 0 | shift: usize, |
354 | 0 | ) -> Option<usize> { |
355 | 0 | let nlen = needle.len(); |
356 | 0 | let mut pos = haystack.len(); |
357 | 0 | while pos >= nlen { |
358 | 0 | if !self.0.byteset.contains(haystack[pos - nlen]) { |
359 | 0 | pos -= nlen; |
360 | 0 | continue; |
361 | 0 | } |
362 | 0 | let mut i = self.0.critical_pos; |
363 | 0 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |
364 | 0 | i -= 1; |
365 | 0 | } |
366 | 0 | if i > 0 || needle[0] != haystack[pos - nlen] { |
367 | 0 | pos -= self.0.critical_pos - i + 1; |
368 | 0 | } else { |
369 | 0 | let mut j = self.0.critical_pos; |
370 | 0 | while j < nlen && needle[j] == haystack[pos - nlen + j] { |
371 | 0 | j += 1; |
372 | 0 | } |
373 | 0 | if j == nlen { |
374 | 0 | return Some(pos - nlen); |
375 | 0 | } |
376 | 0 | pos -= shift; |
377 | | } |
378 | | } |
379 | 0 | None |
380 | 0 | } |
381 | | } |
382 | | |
383 | | impl TwoWay { |
384 | 0 | fn empty() -> TwoWay { |
385 | 0 | TwoWay { |
386 | 0 | byteset: ApproximateByteSet::new(b""), |
387 | 0 | critical_pos: 0, |
388 | 0 | shift: Shift::Large { shift: 0 }, |
389 | 0 | } |
390 | 0 | } |
391 | | } |
392 | | |
393 | | /// A representation of the amount we're allowed to shift by during Two-Way |
394 | | /// search. |
395 | | /// |
396 | | /// When computing a critical factorization of the needle, we find the position |
397 | | /// of the critical factorization by finding the needle's maximal (or minimal) |
398 | | /// suffix, along with the period of that suffix. It turns out that the period |
399 | | /// of that suffix is a lower bound on the period of the needle itself. |
400 | | /// |
401 | | /// This lower bound is equivalent to the actual period of the needle in |
402 | | /// some cases. To describe that case, we denote the needle as `x` where |
403 | | /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower |
404 | | /// bound given here is always the period of `v`, which is `<= period(x)`. The |
405 | | /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and |
406 | | /// where `u` is a suffix of `v[0..period(v)]`. |
407 | | /// |
408 | | /// This case is important because the search algorithm for when the |
409 | | /// periods are equivalent is slightly different than the search algorithm |
410 | | /// for when the periods are not equivalent. In particular, when they aren't |
411 | | /// equivalent, we know that the period of the needle is no less than half its |
412 | | /// length. In this case, we shift by an amount less than or equal to the |
413 | | /// period of the needle (determined by the maximum length of the components |
414 | | /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. |
415 | | /// |
416 | | /// The above two cases are represented by the variants below. Each entails |
417 | | /// a different instantiation of the Two-Way search algorithm. |
418 | | /// |
419 | | /// N.B. If we could find a way to compute the exact period in all cases, |
420 | | /// then we could collapse this case analysis and simplify the algorithm. The |
421 | | /// Two-Way paper suggests this is possible, but more reading is required to |
422 | | /// grok why the authors didn't pursue that path. |
423 | | #[derive(Clone, Copy, Debug)] |
424 | | enum Shift { |
425 | | Small { period: usize }, |
426 | | Large { shift: usize }, |
427 | | } |
428 | | |
429 | | impl Shift { |
430 | | /// Compute the shift for a given needle in the forward direction. |
431 | | /// |
432 | | /// This requires a lower bound on the period and a critical position. |
433 | | /// These can be computed by extracting both the minimal and maximal |
434 | | /// lexicographic suffixes, and choosing the right-most starting position. |
435 | | /// The lower bound on the period is then the period of the chosen suffix. |
436 | 0 | fn forward( |
437 | 0 | needle: &[u8], |
438 | 0 | period_lower_bound: usize, |
439 | 0 | critical_pos: usize, |
440 | 0 | ) -> Shift { |
441 | 0 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |
442 | 0 | if critical_pos * 2 >= needle.len() { |
443 | 0 | return Shift::Large { shift: large }; |
444 | 0 | } |
445 | 0 |
|
446 | 0 | let (u, v) = needle.split_at(critical_pos); |
447 | 0 | if !util::is_suffix(&v[..period_lower_bound], u) { |
448 | 0 | return Shift::Large { shift: large }; |
449 | 0 | } |
450 | 0 | Shift::Small { period: period_lower_bound } |
451 | 0 | } |
452 | | |
453 | | /// Compute the shift for a given needle in the reverse direction. |
454 | | /// |
455 | | /// This requires a lower bound on the period and a critical position. |
456 | | /// These can be computed by extracting both the minimal and maximal |
457 | | /// lexicographic suffixes, and choosing the left-most starting position. |
458 | | /// The lower bound on the period is then the period of the chosen suffix. |
459 | 0 | fn reverse( |
460 | 0 | needle: &[u8], |
461 | 0 | period_lower_bound: usize, |
462 | 0 | critical_pos: usize, |
463 | 0 | ) -> Shift { |
464 | 0 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |
465 | 0 | if (needle.len() - critical_pos) * 2 >= needle.len() { |
466 | 0 | return Shift::Large { shift: large }; |
467 | 0 | } |
468 | 0 |
|
469 | 0 | let (v, u) = needle.split_at(critical_pos); |
470 | 0 | if !util::is_prefix(&v[v.len() - period_lower_bound..], u) { |
471 | 0 | return Shift::Large { shift: large }; |
472 | 0 | } |
473 | 0 | Shift::Small { period: period_lower_bound } |
474 | 0 | } |
475 | | } |
476 | | |
477 | | /// A suffix extracted from a needle along with its period. |
478 | | #[derive(Debug)] |
479 | | struct Suffix { |
480 | | /// The starting position of this suffix. |
481 | | /// |
482 | | /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this |
483 | | /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for |
484 | | /// forward suffixes, this is an inclusive starting position, where as for |
485 | | /// reverse suffixes, this is an exclusive ending position. |
486 | | pos: usize, |
487 | | /// The period of this suffix. |
488 | | /// |
489 | | /// Note that this is NOT necessarily the period of the string from which |
490 | | /// this suffix comes from. (It is always less than or equal to the period |
491 | | /// of the original string.) |
492 | | period: usize, |
493 | | } |
494 | | |
495 | | impl Suffix { |
496 | 0 | fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { |
497 | 0 | debug_assert!(!needle.is_empty()); |
498 | | |
499 | | // suffix represents our maximal (or minimal) suffix, along with |
500 | | // its period. |
501 | 0 | let mut suffix = Suffix { pos: 0, period: 1 }; |
502 | 0 | // The start of a suffix in `needle` that we are considering as a |
503 | 0 | // more maximal (or minimal) suffix than what's in `suffix`. |
504 | 0 | let mut candidate_start = 1; |
505 | 0 | // The current offset of our suffixes that we're comparing. |
506 | 0 | // |
507 | 0 | // When the characters at this offset are the same, then we mush on |
508 | 0 | // to the next position since no decision is possible. When the |
509 | 0 | // candidate's character is greater (or lesser) than the corresponding |
510 | 0 | // character than our current maximal (or minimal) suffix, then the |
511 | 0 | // current suffix is changed over to the candidate and we restart our |
512 | 0 | // search. Otherwise, the candidate suffix is no good and we restart |
513 | 0 | // our search on the next candidate. |
514 | 0 | // |
515 | 0 | // The three cases above correspond to the three cases in the loop |
516 | 0 | // below. |
517 | 0 | let mut offset = 0; |
518 | | |
519 | 0 | while candidate_start + offset < needle.len() { |
520 | 0 | let current = needle[suffix.pos + offset]; |
521 | 0 | let candidate = needle[candidate_start + offset]; |
522 | 0 | match kind.cmp(current, candidate) { |
523 | 0 | SuffixOrdering::Accept => { |
524 | 0 | suffix = Suffix { pos: candidate_start, period: 1 }; |
525 | 0 | candidate_start += 1; |
526 | 0 | offset = 0; |
527 | 0 | } |
528 | 0 | SuffixOrdering::Skip => { |
529 | 0 | candidate_start += offset + 1; |
530 | 0 | offset = 0; |
531 | 0 | suffix.period = candidate_start - suffix.pos; |
532 | 0 | } |
533 | | SuffixOrdering::Push => { |
534 | 0 | if offset + 1 == suffix.period { |
535 | 0 | candidate_start += suffix.period; |
536 | 0 | offset = 0; |
537 | 0 | } else { |
538 | 0 | offset += 1; |
539 | 0 | } |
540 | | } |
541 | | } |
542 | | } |
543 | 0 | suffix |
544 | 0 | } |
545 | | |
546 | 0 | fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { |
547 | 0 | debug_assert!(!needle.is_empty()); |
548 | | |
549 | | // See the comments in `forward` for how this works. |
550 | 0 | let mut suffix = Suffix { pos: needle.len(), period: 1 }; |
551 | 0 | if needle.len() == 1 { |
552 | 0 | return suffix; |
553 | 0 | } |
554 | 0 | let mut candidate_start = needle.len() - 1; |
555 | 0 | let mut offset = 0; |
556 | | |
557 | 0 | while offset < candidate_start { |
558 | 0 | let current = needle[suffix.pos - offset - 1]; |
559 | 0 | let candidate = needle[candidate_start - offset - 1]; |
560 | 0 | match kind.cmp(current, candidate) { |
561 | 0 | SuffixOrdering::Accept => { |
562 | 0 | suffix = Suffix { pos: candidate_start, period: 1 }; |
563 | 0 | candidate_start -= 1; |
564 | 0 | offset = 0; |
565 | 0 | } |
566 | 0 | SuffixOrdering::Skip => { |
567 | 0 | candidate_start -= offset + 1; |
568 | 0 | offset = 0; |
569 | 0 | suffix.period = suffix.pos - candidate_start; |
570 | 0 | } |
571 | | SuffixOrdering::Push => { |
572 | 0 | if offset + 1 == suffix.period { |
573 | 0 | candidate_start -= suffix.period; |
574 | 0 | offset = 0; |
575 | 0 | } else { |
576 | 0 | offset += 1; |
577 | 0 | } |
578 | | } |
579 | | } |
580 | | } |
581 | 0 | suffix |
582 | 0 | } |
583 | | } |
584 | | |
585 | | /// The kind of suffix to extract. |
586 | | #[derive(Clone, Copy, Debug)] |
587 | | enum SuffixKind { |
588 | | /// Extract the smallest lexicographic suffix from a string. |
589 | | /// |
590 | | /// Technically, this doesn't actually pick the smallest lexicographic |
591 | | /// suffix. e.g., Given the choice between `a` and `aa`, this will choose |
592 | | /// the latter over the former, even though `a < aa`. The reasoning for |
593 | | /// this isn't clear from the paper, but it still smells like a minimal |
594 | | /// suffix. |
595 | | Minimal, |
596 | | /// Extract the largest lexicographic suffix from a string. |
597 | | /// |
598 | | /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given |
599 | | /// the choice between `z` and `zz`, this will choose the latter over the |
600 | | /// former. |
601 | | Maximal, |
602 | | } |
603 | | |
604 | | /// The result of comparing corresponding bytes between two suffixes. |
605 | | #[derive(Clone, Copy, Debug)] |
606 | | enum SuffixOrdering { |
607 | | /// This occurs when the given candidate byte indicates that the candidate |
608 | | /// suffix is better than the current maximal (or minimal) suffix. That is, |
609 | | /// the current candidate suffix should supplant the current maximal (or |
610 | | /// minimal) suffix. |
611 | | Accept, |
612 | | /// This occurs when the given candidate byte excludes the candidate suffix |
613 | | /// from being better than the current maximal (or minimal) suffix. That |
614 | | /// is, the current candidate suffix should be dropped and the next one |
615 | | /// should be considered. |
616 | | Skip, |
617 | | /// This occurs when no decision to accept or skip the candidate suffix |
618 | | /// can be made, e.g., when corresponding bytes are equivalent. In this |
619 | | /// case, the next corresponding bytes should be compared. |
620 | | Push, |
621 | | } |
622 | | |
623 | | impl SuffixKind { |
624 | | /// Returns true if and only if the given candidate byte indicates that |
625 | | /// it should replace the current suffix as the maximal (or minimal) |
626 | | /// suffix. |
627 | 0 | fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { |
628 | | use self::SuffixOrdering::*; |
629 | | |
630 | 0 | match self { |
631 | 0 | SuffixKind::Minimal if candidate < current => Accept, |
632 | 0 | SuffixKind::Minimal if candidate > current => Skip, |
633 | 0 | SuffixKind::Minimal => Push, |
634 | 0 | SuffixKind::Maximal if candidate > current => Accept, |
635 | 0 | SuffixKind::Maximal if candidate < current => Skip, |
636 | 0 | SuffixKind::Maximal => Push, |
637 | | } |
638 | 0 | } |
639 | | } |
640 | | |
641 | | /// A bitset used to track whether a particular byte exists in a needle or not. |
642 | | /// |
643 | | /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the |
644 | | /// needle. If a particular byte in the haystack is NOT in this set, then one |
645 | | /// can conclude that it is also not in the needle, and thus, one can advance |
646 | | /// in the haystack by needle.len() bytes. |
647 | | #[derive(Clone, Copy, Debug)] |
648 | | struct ApproximateByteSet(u64); |
649 | | |
650 | | impl ApproximateByteSet { |
651 | | /// Create a new set from the given needle. |
652 | 0 | fn new(needle: &[u8]) -> ApproximateByteSet { |
653 | 0 | let mut bits = 0; |
654 | 0 | for &b in needle { |
655 | 0 | bits |= 1 << (b % 64); |
656 | 0 | } |
657 | 0 | ApproximateByteSet(bits) |
658 | 0 | } |
659 | | |
660 | | /// Return true if and only if the given byte might be in this set. This |
661 | | /// may return a false positive, but will never return a false negative. |
662 | | #[inline(always)] |
663 | 0 | fn contains(&self, byte: u8) -> bool { |
664 | 0 | self.0 & (1 << (byte % 64)) != 0 |
665 | 0 | } |
666 | | } |
667 | | |
668 | | #[cfg(all(test, feature = "std", not(miri)))] |
669 | | mod tests { |
670 | | use quickcheck::quickcheck; |
671 | | |
672 | | use super::*; |
673 | | |
674 | | define_memmem_quickcheck_tests!( |
675 | | super::simpletests::twoway_find, |
676 | | super::simpletests::twoway_rfind |
677 | | ); |
678 | | |
679 | | /// Convenience wrapper for computing the suffix as a byte string. |
680 | | fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |
681 | | let s = Suffix::forward(needle, kind); |
682 | | (&needle[s.pos..], s.period) |
683 | | } |
684 | | |
685 | | /// Convenience wrapper for computing the reverse suffix as a byte string. |
686 | | fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |
687 | | let s = Suffix::reverse(needle, kind); |
688 | | (&needle[..s.pos], s.period) |
689 | | } |
690 | | |
691 | | /// Return all of the non-empty suffixes in the given byte string. |
692 | | fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { |
693 | | (0..bytes.len()).map(|i| &bytes[i..]).collect() |
694 | | } |
695 | | |
696 | | /// Return the lexicographically maximal suffix of the given byte string. |
697 | | fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { |
698 | | let mut sufs = suffixes(needle); |
699 | | sufs.sort(); |
700 | | sufs.pop().unwrap() |
701 | | } |
702 | | |
703 | | /// Return the lexicographically maximal suffix of the reverse of the given |
704 | | /// byte string. |
705 | | fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { |
706 | | let mut reversed = needle.to_vec(); |
707 | | reversed.reverse(); |
708 | | let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); |
709 | | got.reverse(); |
710 | | got |
711 | | } |
712 | | |
713 | | #[test] |
714 | | fn suffix_forward() { |
715 | | macro_rules! assert_suffix_min { |
716 | | ($given:expr, $expected:expr, $period:expr) => { |
717 | | let (got_suffix, got_period) = |
718 | | get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); |
719 | | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
720 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
721 | | }; |
722 | | } |
723 | | |
724 | | macro_rules! assert_suffix_max { |
725 | | ($given:expr, $expected:expr, $period:expr) => { |
726 | | let (got_suffix, got_period) = |
727 | | get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); |
728 | | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
729 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
730 | | }; |
731 | | } |
732 | | |
733 | | assert_suffix_min!("a", "a", 1); |
734 | | assert_suffix_max!("a", "a", 1); |
735 | | |
736 | | assert_suffix_min!("ab", "ab", 2); |
737 | | assert_suffix_max!("ab", "b", 1); |
738 | | |
739 | | assert_suffix_min!("ba", "a", 1); |
740 | | assert_suffix_max!("ba", "ba", 2); |
741 | | |
742 | | assert_suffix_min!("abc", "abc", 3); |
743 | | assert_suffix_max!("abc", "c", 1); |
744 | | |
745 | | assert_suffix_min!("acb", "acb", 3); |
746 | | assert_suffix_max!("acb", "cb", 2); |
747 | | |
748 | | assert_suffix_min!("cba", "a", 1); |
749 | | assert_suffix_max!("cba", "cba", 3); |
750 | | |
751 | | assert_suffix_min!("abcabc", "abcabc", 3); |
752 | | assert_suffix_max!("abcabc", "cabc", 3); |
753 | | |
754 | | assert_suffix_min!("abcabcabc", "abcabcabc", 3); |
755 | | assert_suffix_max!("abcabcabc", "cabcabc", 3); |
756 | | |
757 | | assert_suffix_min!("abczz", "abczz", 5); |
758 | | assert_suffix_max!("abczz", "zz", 1); |
759 | | |
760 | | assert_suffix_min!("zzabc", "abc", 3); |
761 | | assert_suffix_max!("zzabc", "zzabc", 5); |
762 | | |
763 | | assert_suffix_min!("aaa", "aaa", 1); |
764 | | assert_suffix_max!("aaa", "aaa", 1); |
765 | | |
766 | | assert_suffix_min!("foobar", "ar", 2); |
767 | | assert_suffix_max!("foobar", "r", 1); |
768 | | } |
769 | | |
770 | | #[test] |
771 | | fn suffix_reverse() { |
772 | | macro_rules! assert_suffix_min { |
773 | | ($given:expr, $expected:expr, $period:expr) => { |
774 | | let (got_suffix, got_period) = |
775 | | get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); |
776 | | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
777 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
778 | | }; |
779 | | } |
780 | | |
781 | | macro_rules! assert_suffix_max { |
782 | | ($given:expr, $expected:expr, $period:expr) => { |
783 | | let (got_suffix, got_period) = |
784 | | get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); |
785 | | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
786 | | assert_eq!(($expected, $period), (got_suffix, got_period)); |
787 | | }; |
788 | | } |
789 | | |
790 | | assert_suffix_min!("a", "a", 1); |
791 | | assert_suffix_max!("a", "a", 1); |
792 | | |
793 | | assert_suffix_min!("ab", "a", 1); |
794 | | assert_suffix_max!("ab", "ab", 2); |
795 | | |
796 | | assert_suffix_min!("ba", "ba", 2); |
797 | | assert_suffix_max!("ba", "b", 1); |
798 | | |
799 | | assert_suffix_min!("abc", "a", 1); |
800 | | assert_suffix_max!("abc", "abc", 3); |
801 | | |
802 | | assert_suffix_min!("acb", "a", 1); |
803 | | assert_suffix_max!("acb", "ac", 2); |
804 | | |
805 | | assert_suffix_min!("cba", "cba", 3); |
806 | | assert_suffix_max!("cba", "c", 1); |
807 | | |
808 | | assert_suffix_min!("abcabc", "abca", 3); |
809 | | assert_suffix_max!("abcabc", "abcabc", 3); |
810 | | |
811 | | assert_suffix_min!("abcabcabc", "abcabca", 3); |
812 | | assert_suffix_max!("abcabcabc", "abcabcabc", 3); |
813 | | |
814 | | assert_suffix_min!("abczz", "a", 1); |
815 | | assert_suffix_max!("abczz", "abczz", 5); |
816 | | |
817 | | assert_suffix_min!("zzabc", "zza", 3); |
818 | | assert_suffix_max!("zzabc", "zz", 1); |
819 | | |
820 | | assert_suffix_min!("aaa", "aaa", 1); |
821 | | assert_suffix_max!("aaa", "aaa", 1); |
822 | | } |
823 | | |
824 | | quickcheck! { |
825 | | fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { |
826 | | if bytes.is_empty() { |
827 | | return true; |
828 | | } |
829 | | |
830 | | let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); |
831 | | let expected = naive_maximal_suffix_forward(&bytes); |
832 | | got == expected |
833 | | } |
834 | | |
835 | | fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { |
836 | | if bytes.is_empty() { |
837 | | return true; |
838 | | } |
839 | | |
840 | | let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); |
841 | | let expected = naive_maximal_suffix_reverse(&bytes); |
842 | | expected == got |
843 | | } |
844 | | } |
845 | | } |
846 | | |
847 | | #[cfg(test)] |
848 | | mod simpletests { |
849 | | use super::*; |
850 | | |
851 | | pub(crate) fn twoway_find( |
852 | | haystack: &[u8], |
853 | | needle: &[u8], |
854 | | ) -> Option<usize> { |
855 | | Forward::new(needle).find_general(None, haystack, needle) |
856 | | } |
857 | | |
858 | | pub(crate) fn twoway_rfind( |
859 | | haystack: &[u8], |
860 | | needle: &[u8], |
861 | | ) -> Option<usize> { |
862 | | Reverse::new(needle).rfind_general(haystack, needle) |
863 | | } |
864 | | |
865 | | define_memmem_simple_tests!(twoway_find, twoway_rfind); |
866 | | |
867 | | // This is a regression test caught by quickcheck that exercised a bug in |
868 | | // the reverse small period handling. The bug was that we were using 'if j |
869 | | // == shift' to determine if a match occurred, but the correct guard is 'if |
870 | | // j >= shift', which matches the corresponding guard in the forward impl. |
871 | | #[test] |
872 | | fn regression_rev_small_period() { |
873 | | let rfind = super::simpletests::twoway_rfind; |
874 | | let haystack = "ababaz"; |
875 | | let needle = "abab"; |
876 | | assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); |
877 | | } |
878 | | } |