/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.5.0/src/memmem/mod.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | This module provides forward and reverse substring search routines. |
3 | | |
4 | | Unlike the standard library's substring search routines, these work on |
5 | | arbitrary bytes. For all non-empty needles, these routines will report exactly |
6 | | the same values as the corresponding routines in the standard library. For |
7 | | the empty needle, the standard library reports matches only at valid UTF-8 |
8 | | boundaries, where as these routines will report matches at every position. |
9 | | |
10 | | Other than being able to work on arbitrary bytes, the primary reason to prefer |
11 | | these routines over the standard library routines is that these will generally |
12 | | be faster. In some cases, significantly so. |
13 | | |
14 | | # Example: iterating over substring matches |
15 | | |
16 | | This example shows how to use [`find_iter`] to find occurrences of a substring |
17 | | in a haystack. |
18 | | |
19 | | ``` |
20 | | use memchr::memmem; |
21 | | |
22 | | let haystack = b"foo bar foo baz foo"; |
23 | | |
24 | | let mut it = memmem::find_iter(haystack, "foo"); |
25 | | assert_eq!(Some(0), it.next()); |
26 | | assert_eq!(Some(8), it.next()); |
27 | | assert_eq!(Some(16), it.next()); |
28 | | assert_eq!(None, it.next()); |
29 | | ``` |
30 | | |
31 | | # Example: iterating over substring matches in reverse |
32 | | |
33 | | This example shows how to use [`rfind_iter`] to find occurrences of a substring |
34 | | in a haystack starting from the end of the haystack. |
35 | | |
36 | | **NOTE:** This module does not implement double ended iterators, so reverse |
37 | | searches aren't done by calling `rev` on a forward iterator. |
38 | | |
39 | | ``` |
40 | | use memchr::memmem; |
41 | | |
42 | | let haystack = b"foo bar foo baz foo"; |
43 | | |
44 | | let mut it = memmem::rfind_iter(haystack, "foo"); |
45 | | assert_eq!(Some(16), it.next()); |
46 | | assert_eq!(Some(8), it.next()); |
47 | | assert_eq!(Some(0), it.next()); |
48 | | assert_eq!(None, it.next()); |
49 | | ``` |
50 | | |
51 | | # Example: repeating a search for the same needle |
52 | | |
53 | | It may be possible for the overhead of constructing a substring searcher to be |
54 | | measurable in some workloads. In cases where the same needle is used to search |
55 | | many haystacks, it is possible to do construction once and thus to avoid it for |
56 | | subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for |
57 | | reverse searches). |
58 | | |
59 | | ``` |
60 | | use memchr::memmem; |
61 | | |
62 | | let finder = memmem::Finder::new("foo"); |
63 | | |
64 | | assert_eq!(Some(4), finder.find(b"baz foo quux")); |
65 | | assert_eq!(None, finder.find(b"quux baz bar")); |
66 | | ``` |
67 | | */ |
68 | | |
69 | | pub use self::prefilter::Prefilter; |
70 | | |
71 | | use crate::{ |
72 | | cow::CowBytes, |
73 | | memmem::{ |
74 | | prefilter::{Pre, PrefilterFn, PrefilterState}, |
75 | | rabinkarp::NeedleHash, |
76 | | rarebytes::RareNeedleBytes, |
77 | | }, |
78 | | }; |
79 | | |
80 | | /// Defines a suite of quickcheck properties for forward and reverse |
81 | | /// substring searching. |
82 | | /// |
83 | | /// This is defined in this specific spot so that it can be used freely among |
84 | | /// the different substring search implementations. I couldn't be bothered to |
85 | | /// fight with the macro-visibility rules enough to figure out how to stuff it |
86 | | /// somewhere more convenient. |
87 | | #[cfg(all(test, feature = "std"))] |
88 | | macro_rules! define_memmem_quickcheck_tests { |
89 | | ($fwd:expr, $rev:expr) => { |
90 | | use crate::memmem::proptests; |
91 | | |
92 | | quickcheck::quickcheck! { |
93 | | fn qc_fwd_prefix_is_substring(bs: Vec<u8>) -> bool { |
94 | | proptests::prefix_is_substring(false, &bs, $fwd) |
95 | | } |
96 | | |
97 | | fn qc_fwd_suffix_is_substring(bs: Vec<u8>) -> bool { |
98 | | proptests::suffix_is_substring(false, &bs, $fwd) |
99 | | } |
100 | | |
101 | | fn qc_fwd_matches_naive( |
102 | | haystack: Vec<u8>, |
103 | | needle: Vec<u8> |
104 | | ) -> bool { |
105 | | proptests::matches_naive(false, &haystack, &needle, $fwd) |
106 | | } |
107 | | |
108 | | fn qc_rev_prefix_is_substring(bs: Vec<u8>) -> bool { |
109 | | proptests::prefix_is_substring(true, &bs, $rev) |
110 | | } |
111 | | |
112 | | fn qc_rev_suffix_is_substring(bs: Vec<u8>) -> bool { |
113 | | proptests::suffix_is_substring(true, &bs, $rev) |
114 | | } |
115 | | |
116 | | fn qc_rev_matches_naive( |
117 | | haystack: Vec<u8>, |
118 | | needle: Vec<u8> |
119 | | ) -> bool { |
120 | | proptests::matches_naive(true, &haystack, &needle, $rev) |
121 | | } |
122 | | } |
123 | | }; |
124 | | } |
125 | | |
126 | | /// Defines a suite of "simple" hand-written tests for a substring |
127 | | /// implementation. |
128 | | /// |
129 | | /// This is defined here for the same reason that |
130 | | /// define_memmem_quickcheck_tests is defined here. |
131 | | #[cfg(test)] |
132 | | macro_rules! define_memmem_simple_tests { |
133 | | ($fwd:expr, $rev:expr) => { |
134 | | use crate::memmem::testsimples; |
135 | | |
136 | | #[test] |
137 | | fn simple_forward() { |
138 | | testsimples::run_search_tests_fwd($fwd); |
139 | | } |
140 | | |
141 | | #[test] |
142 | | fn simple_reverse() { |
143 | | testsimples::run_search_tests_rev($rev); |
144 | | } |
145 | | }; |
146 | | } |
147 | | |
148 | | mod byte_frequencies; |
149 | | #[cfg(memchr_runtime_simd)] |
150 | | mod genericsimd; |
151 | | mod prefilter; |
152 | | mod rabinkarp; |
153 | | mod rarebytes; |
154 | | mod twoway; |
155 | | mod util; |
156 | | #[cfg(memchr_runtime_simd)] |
157 | | mod vector; |
158 | | #[cfg(all(memchr_runtime_wasm128))] |
159 | | mod wasm; |
160 | | #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
161 | | mod x86; |
162 | | |
163 | | /// Returns an iterator over all non-overlapping occurrences of a substring in |
164 | | /// a haystack. |
165 | | /// |
166 | | /// # Complexity |
167 | | /// |
168 | | /// This routine is guaranteed to have worst case linear time complexity |
169 | | /// with respect to both the needle and the haystack. That is, this runs |
170 | | /// in `O(needle.len() + haystack.len())` time. |
171 | | /// |
172 | | /// This routine is also guaranteed to have worst case constant space |
173 | | /// complexity. |
174 | | /// |
175 | | /// # Examples |
176 | | /// |
177 | | /// Basic usage: |
178 | | /// |
179 | | /// ``` |
180 | | /// use memchr::memmem; |
181 | | /// |
182 | | /// let haystack = b"foo bar foo baz foo"; |
183 | | /// let mut it = memmem::find_iter(haystack, b"foo"); |
184 | | /// assert_eq!(Some(0), it.next()); |
185 | | /// assert_eq!(Some(8), it.next()); |
186 | | /// assert_eq!(Some(16), it.next()); |
187 | | /// assert_eq!(None, it.next()); |
188 | | /// ``` |
189 | | #[inline] |
190 | 0 | pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( |
191 | 0 | haystack: &'h [u8], |
192 | 0 | needle: &'n N, |
193 | 0 | ) -> FindIter<'h, 'n> { |
194 | 0 | FindIter::new(haystack, Finder::new(needle)) |
195 | 0 | } |
196 | | |
197 | | /// Returns a reverse iterator over all non-overlapping occurrences of a |
198 | | /// substring in a haystack. |
199 | | /// |
200 | | /// # Complexity |
201 | | /// |
202 | | /// This routine is guaranteed to have worst case linear time complexity |
203 | | /// with respect to both the needle and the haystack. That is, this runs |
204 | | /// in `O(needle.len() + haystack.len())` time. |
205 | | /// |
206 | | /// This routine is also guaranteed to have worst case constant space |
207 | | /// complexity. |
208 | | /// |
209 | | /// # Examples |
210 | | /// |
211 | | /// Basic usage: |
212 | | /// |
213 | | /// ``` |
214 | | /// use memchr::memmem; |
215 | | /// |
216 | | /// let haystack = b"foo bar foo baz foo"; |
217 | | /// let mut it = memmem::rfind_iter(haystack, b"foo"); |
218 | | /// assert_eq!(Some(16), it.next()); |
219 | | /// assert_eq!(Some(8), it.next()); |
220 | | /// assert_eq!(Some(0), it.next()); |
221 | | /// assert_eq!(None, it.next()); |
222 | | /// ``` |
223 | | #[inline] |
224 | 0 | pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( |
225 | 0 | haystack: &'h [u8], |
226 | 0 | needle: &'n N, |
227 | 0 | ) -> FindRevIter<'h, 'n> { |
228 | 0 | FindRevIter::new(haystack, FinderRev::new(needle)) |
229 | 0 | } |
230 | | |
231 | | /// Returns the index of the first occurrence of the given needle. |
232 | | /// |
233 | | /// Note that if you're are searching for the same needle in many different |
234 | | /// small haystacks, it may be faster to initialize a [`Finder`] once, |
235 | | /// and reuse it for each search. |
236 | | /// |
237 | | /// # Complexity |
238 | | /// |
239 | | /// This routine is guaranteed to have worst case linear time complexity |
240 | | /// with respect to both the needle and the haystack. That is, this runs |
241 | | /// in `O(needle.len() + haystack.len())` time. |
242 | | /// |
243 | | /// This routine is also guaranteed to have worst case constant space |
244 | | /// complexity. |
245 | | /// |
246 | | /// # Examples |
247 | | /// |
248 | | /// Basic usage: |
249 | | /// |
250 | | /// ``` |
251 | | /// use memchr::memmem; |
252 | | /// |
253 | | /// let haystack = b"foo bar baz"; |
254 | | /// assert_eq!(Some(0), memmem::find(haystack, b"foo")); |
255 | | /// assert_eq!(Some(4), memmem::find(haystack, b"bar")); |
256 | | /// assert_eq!(None, memmem::find(haystack, b"quux")); |
257 | | /// ``` |
258 | | #[inline] |
259 | 0 | pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
260 | 0 | if haystack.len() < 64 { |
261 | 0 | rabinkarp::find(haystack, needle) |
262 | | } else { |
263 | 0 | Finder::new(needle).find(haystack) |
264 | | } |
265 | 0 | } |
266 | | |
267 | | /// Returns the index of the last occurrence of the given needle. |
268 | | /// |
269 | | /// Note that if you're are searching for the same needle in many different |
270 | | /// small haystacks, it may be faster to initialize a [`FinderRev`] once, |
271 | | /// and reuse it for each search. |
272 | | /// |
273 | | /// # Complexity |
274 | | /// |
275 | | /// This routine is guaranteed to have worst case linear time complexity |
276 | | /// with respect to both the needle and the haystack. That is, this runs |
277 | | /// in `O(needle.len() + haystack.len())` time. |
278 | | /// |
279 | | /// This routine is also guaranteed to have worst case constant space |
280 | | /// complexity. |
281 | | /// |
282 | | /// # Examples |
283 | | /// |
284 | | /// Basic usage: |
285 | | /// |
286 | | /// ``` |
287 | | /// use memchr::memmem; |
288 | | /// |
289 | | /// let haystack = b"foo bar baz"; |
290 | | /// assert_eq!(Some(0), memmem::rfind(haystack, b"foo")); |
291 | | /// assert_eq!(Some(4), memmem::rfind(haystack, b"bar")); |
292 | | /// assert_eq!(Some(8), memmem::rfind(haystack, b"ba")); |
293 | | /// assert_eq!(None, memmem::rfind(haystack, b"quux")); |
294 | | /// ``` |
295 | | #[inline] |
296 | 0 | pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
297 | 0 | if haystack.len() < 64 { |
298 | 0 | rabinkarp::rfind(haystack, needle) |
299 | | } else { |
300 | 0 | FinderRev::new(needle).rfind(haystack) |
301 | | } |
302 | 0 | } |
303 | | |
304 | | /// An iterator over non-overlapping substring matches. |
305 | | /// |
306 | | /// Matches are reported by the byte offset at which they begin. |
307 | | /// |
308 | | /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the |
309 | | /// needle. |
310 | | #[derive(Debug)] |
311 | | pub struct FindIter<'h, 'n> { |
312 | | haystack: &'h [u8], |
313 | | prestate: PrefilterState, |
314 | | finder: Finder<'n>, |
315 | | pos: usize, |
316 | | } |
317 | | |
318 | | impl<'h, 'n> FindIter<'h, 'n> { |
319 | | #[inline(always)] |
320 | 0 | pub(crate) fn new( |
321 | 0 | haystack: &'h [u8], |
322 | 0 | finder: Finder<'n>, |
323 | 0 | ) -> FindIter<'h, 'n> { |
324 | 0 | let prestate = finder.searcher.prefilter_state(); |
325 | 0 | FindIter { haystack, prestate, finder, pos: 0 } |
326 | 0 | } |
327 | | |
328 | | /// Convert this iterator into its owned variant, such that it no longer |
329 | | /// borrows the finder and needle. |
330 | | /// |
331 | | /// If this is already an owned iterator, then this is a no-op. Otherwise, |
332 | | /// this copies the needle. |
333 | | /// |
334 | | /// This is only available when the `std` feature is enabled. |
335 | | #[cfg(feature = "std")] |
336 | | #[inline] |
337 | 0 | pub fn into_owned(self) -> FindIter<'h, 'static> { |
338 | 0 | FindIter { |
339 | 0 | haystack: self.haystack, |
340 | 0 | prestate: self.prestate, |
341 | 0 | finder: self.finder.into_owned(), |
342 | 0 | pos: self.pos, |
343 | 0 | } |
344 | 0 | } |
345 | | } |
346 | | |
347 | | impl<'h, 'n> Iterator for FindIter<'h, 'n> { |
348 | | type Item = usize; |
349 | | |
350 | 0 | fn next(&mut self) -> Option<usize> { |
351 | 0 | if self.pos > self.haystack.len() { |
352 | 0 | return None; |
353 | 0 | } |
354 | 0 | let result = self |
355 | 0 | .finder |
356 | 0 | .searcher |
357 | 0 | .find(&mut self.prestate, &self.haystack[self.pos..]); |
358 | 0 | match result { |
359 | 0 | None => None, |
360 | 0 | Some(i) => { |
361 | 0 | let pos = self.pos + i; |
362 | 0 | self.pos = pos + core::cmp::max(1, self.finder.needle().len()); |
363 | 0 | Some(pos) |
364 | | } |
365 | | } |
366 | 0 | } |
367 | | } |
368 | | |
369 | | /// An iterator over non-overlapping substring matches in reverse. |
370 | | /// |
371 | | /// Matches are reported by the byte offset at which they begin. |
372 | | /// |
373 | | /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the |
374 | | /// needle. |
375 | | #[derive(Debug)] |
376 | | pub struct FindRevIter<'h, 'n> { |
377 | | haystack: &'h [u8], |
378 | | finder: FinderRev<'n>, |
379 | | /// When searching with an empty needle, this gets set to `None` after |
380 | | /// we've yielded the last element at `0`. |
381 | | pos: Option<usize>, |
382 | | } |
383 | | |
384 | | impl<'h, 'n> FindRevIter<'h, 'n> { |
385 | | #[inline(always)] |
386 | 0 | pub(crate) fn new( |
387 | 0 | haystack: &'h [u8], |
388 | 0 | finder: FinderRev<'n>, |
389 | 0 | ) -> FindRevIter<'h, 'n> { |
390 | 0 | let pos = Some(haystack.len()); |
391 | 0 | FindRevIter { haystack, finder, pos } |
392 | 0 | } |
393 | | |
394 | | /// Convert this iterator into its owned variant, such that it no longer |
395 | | /// borrows the finder and needle. |
396 | | /// |
397 | | /// If this is already an owned iterator, then this is a no-op. Otherwise, |
398 | | /// this copies the needle. |
399 | | /// |
400 | | /// This is only available when the `std` feature is enabled. |
401 | | #[cfg(feature = "std")] |
402 | | #[inline] |
403 | 0 | pub fn into_owned(self) -> FindRevIter<'h, 'static> { |
404 | 0 | FindRevIter { |
405 | 0 | haystack: self.haystack, |
406 | 0 | finder: self.finder.into_owned(), |
407 | 0 | pos: self.pos, |
408 | 0 | } |
409 | 0 | } |
410 | | } |
411 | | |
412 | | impl<'h, 'n> Iterator for FindRevIter<'h, 'n> { |
413 | | type Item = usize; |
414 | | |
415 | 0 | fn next(&mut self) -> Option<usize> { |
416 | 0 | let pos = match self.pos { |
417 | 0 | None => return None, |
418 | 0 | Some(pos) => pos, |
419 | 0 | }; |
420 | 0 | let result = self.finder.rfind(&self.haystack[..pos]); |
421 | 0 | match result { |
422 | 0 | None => None, |
423 | 0 | Some(i) => { |
424 | 0 | if pos == i { |
425 | 0 | self.pos = pos.checked_sub(1); |
426 | 0 | } else { |
427 | 0 | self.pos = Some(i); |
428 | 0 | } |
429 | 0 | Some(i) |
430 | | } |
431 | | } |
432 | 0 | } |
433 | | } |
434 | | |
435 | | /// A single substring searcher fixed to a particular needle. |
436 | | /// |
437 | | /// The purpose of this type is to permit callers to construct a substring |
438 | | /// searcher that can be used to search haystacks without the overhead of |
439 | | /// constructing the searcher in the first place. This is a somewhat niche |
440 | | /// concern when it's necessary to re-use the same needle to search multiple |
441 | | /// different haystacks with as little overhead as possible. In general, using |
442 | | /// [`find`] is good enough, but `Finder` is useful when you can meaningfully |
443 | | /// observe searcher construction time in a profile. |
444 | | /// |
445 | | /// When the `std` feature is enabled, then this type has an `into_owned` |
446 | | /// version which permits building a `Finder` that is not connected to |
447 | | /// the lifetime of its needle. |
448 | | #[derive(Clone, Debug)] |
449 | | pub struct Finder<'n> { |
450 | | searcher: Searcher<'n>, |
451 | | } |
452 | | |
453 | | impl<'n> Finder<'n> { |
454 | | /// Create a new finder for the given needle. |
455 | | #[inline] |
456 | 0 | pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> { |
457 | 0 | FinderBuilder::new().build_forward(needle) |
458 | 0 | } |
459 | | |
460 | | /// Returns the index of the first occurrence of this needle in the given |
461 | | /// haystack. |
462 | | /// |
463 | | /// # Complexity |
464 | | /// |
465 | | /// This routine is guaranteed to have worst case linear time complexity |
466 | | /// with respect to both the needle and the haystack. That is, this runs |
467 | | /// in `O(needle.len() + haystack.len())` time. |
468 | | /// |
469 | | /// This routine is also guaranteed to have worst case constant space |
470 | | /// complexity. |
471 | | /// |
472 | | /// # Examples |
473 | | /// |
474 | | /// Basic usage: |
475 | | /// |
476 | | /// ``` |
477 | | /// use memchr::memmem::Finder; |
478 | | /// |
479 | | /// let haystack = b"foo bar baz"; |
480 | | /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); |
481 | | /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); |
482 | | /// assert_eq!(None, Finder::new("quux").find(haystack)); |
483 | | /// ``` |
484 | 0 | pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
485 | 0 | self.searcher.find(&mut self.searcher.prefilter_state(), haystack) |
486 | 0 | } |
487 | | |
488 | | /// Returns an iterator over all occurrences of a substring in a haystack. |
489 | | /// |
490 | | /// # Complexity |
491 | | /// |
492 | | /// This routine is guaranteed to have worst case linear time complexity |
493 | | /// with respect to both the needle and the haystack. That is, this runs |
494 | | /// in `O(needle.len() + haystack.len())` time. |
495 | | /// |
496 | | /// This routine is also guaranteed to have worst case constant space |
497 | | /// complexity. |
498 | | /// |
499 | | /// # Examples |
500 | | /// |
501 | | /// Basic usage: |
502 | | /// |
503 | | /// ``` |
504 | | /// use memchr::memmem::Finder; |
505 | | /// |
506 | | /// let haystack = b"foo bar foo baz foo"; |
507 | | /// let finder = Finder::new(b"foo"); |
508 | | /// let mut it = finder.find_iter(haystack); |
509 | | /// assert_eq!(Some(0), it.next()); |
510 | | /// assert_eq!(Some(8), it.next()); |
511 | | /// assert_eq!(Some(16), it.next()); |
512 | | /// assert_eq!(None, it.next()); |
513 | | /// ``` |
514 | | #[inline] |
515 | 0 | pub fn find_iter<'a, 'h>( |
516 | 0 | &'a self, |
517 | 0 | haystack: &'h [u8], |
518 | 0 | ) -> FindIter<'h, 'a> { |
519 | 0 | FindIter::new(haystack, self.as_ref()) |
520 | 0 | } |
521 | | |
522 | | /// Convert this finder into its owned variant, such that it no longer |
523 | | /// borrows the needle. |
524 | | /// |
525 | | /// If this is already an owned finder, then this is a no-op. Otherwise, |
526 | | /// this copies the needle. |
527 | | /// |
528 | | /// This is only available when the `std` feature is enabled. |
529 | | #[cfg(feature = "std")] |
530 | | #[inline] |
531 | 0 | pub fn into_owned(self) -> Finder<'static> { |
532 | 0 | Finder { searcher: self.searcher.into_owned() } |
533 | 0 | } |
534 | | |
535 | | /// Convert this finder into its borrowed variant. |
536 | | /// |
537 | | /// This is primarily useful if your finder is owned and you'd like to |
538 | | /// store its borrowed variant in some intermediate data structure. |
539 | | /// |
540 | | /// Note that the lifetime parameter of the returned finder is tied to the |
541 | | /// lifetime of `self`, and may be shorter than the `'n` lifetime of the |
542 | | /// needle itself. Namely, a finder's needle can be either borrowed or |
543 | | /// owned, so the lifetime of the needle returned must necessarily be the |
544 | | /// shorter of the two. |
545 | | #[inline] |
546 | 0 | pub fn as_ref(&self) -> Finder<'_> { |
547 | 0 | Finder { searcher: self.searcher.as_ref() } |
548 | 0 | } |
549 | | |
550 | | /// Returns the needle that this finder searches for. |
551 | | /// |
552 | | /// Note that the lifetime of the needle returned is tied to the lifetime |
553 | | /// of the finder, and may be shorter than the `'n` lifetime. Namely, a |
554 | | /// finder's needle can be either borrowed or owned, so the lifetime of the |
555 | | /// needle returned must necessarily be the shorter of the two. |
556 | | #[inline] |
557 | 0 | pub fn needle(&self) -> &[u8] { |
558 | 0 | self.searcher.needle() |
559 | 0 | } |
560 | | } |
561 | | |
562 | | /// A single substring reverse searcher fixed to a particular needle. |
563 | | /// |
564 | | /// The purpose of this type is to permit callers to construct a substring |
565 | | /// searcher that can be used to search haystacks without the overhead of |
566 | | /// constructing the searcher in the first place. This is a somewhat niche |
567 | | /// concern when it's necessary to re-use the same needle to search multiple |
568 | | /// different haystacks with as little overhead as possible. In general, |
569 | | /// using [`rfind`] is good enough, but `FinderRev` is useful when you can |
570 | | /// meaningfully observe searcher construction time in a profile. |
571 | | /// |
572 | | /// When the `std` feature is enabled, then this type has an `into_owned` |
573 | | /// version which permits building a `FinderRev` that is not connected to |
574 | | /// the lifetime of its needle. |
575 | | #[derive(Clone, Debug)] |
576 | | pub struct FinderRev<'n> { |
577 | | searcher: SearcherRev<'n>, |
578 | | } |
579 | | |
580 | | impl<'n> FinderRev<'n> { |
581 | | /// Create a new reverse finder for the given needle. |
582 | | #[inline] |
583 | 0 | pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> { |
584 | 0 | FinderBuilder::new().build_reverse(needle) |
585 | 0 | } |
586 | | |
587 | | /// Returns the index of the last occurrence of this needle in the given |
588 | | /// haystack. |
589 | | /// |
590 | | /// The haystack may be any type that can be cheaply converted into a |
591 | | /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
592 | | /// |
593 | | /// # Complexity |
594 | | /// |
595 | | /// This routine is guaranteed to have worst case linear time complexity |
596 | | /// with respect to both the needle and the haystack. That is, this runs |
597 | | /// in `O(needle.len() + haystack.len())` time. |
598 | | /// |
599 | | /// This routine is also guaranteed to have worst case constant space |
600 | | /// complexity. |
601 | | /// |
602 | | /// # Examples |
603 | | /// |
604 | | /// Basic usage: |
605 | | /// |
606 | | /// ``` |
607 | | /// use memchr::memmem::FinderRev; |
608 | | /// |
609 | | /// let haystack = b"foo bar baz"; |
610 | | /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack)); |
611 | | /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack)); |
612 | | /// assert_eq!(None, FinderRev::new("quux").rfind(haystack)); |
613 | | /// ``` |
614 | 0 | pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { |
615 | 0 | self.searcher.rfind(haystack.as_ref()) |
616 | 0 | } |
617 | | |
618 | | /// Returns a reverse iterator over all occurrences of a substring in a |
619 | | /// haystack. |
620 | | /// |
621 | | /// # Complexity |
622 | | /// |
623 | | /// This routine is guaranteed to have worst case linear time complexity |
624 | | /// with respect to both the needle and the haystack. That is, this runs |
625 | | /// in `O(needle.len() + haystack.len())` time. |
626 | | /// |
627 | | /// This routine is also guaranteed to have worst case constant space |
628 | | /// complexity. |
629 | | /// |
630 | | /// # Examples |
631 | | /// |
632 | | /// Basic usage: |
633 | | /// |
634 | | /// ``` |
635 | | /// use memchr::memmem::FinderRev; |
636 | | /// |
637 | | /// let haystack = b"foo bar foo baz foo"; |
638 | | /// let finder = FinderRev::new(b"foo"); |
639 | | /// let mut it = finder.rfind_iter(haystack); |
640 | | /// assert_eq!(Some(16), it.next()); |
641 | | /// assert_eq!(Some(8), it.next()); |
642 | | /// assert_eq!(Some(0), it.next()); |
643 | | /// assert_eq!(None, it.next()); |
644 | | /// ``` |
645 | | #[inline] |
646 | 0 | pub fn rfind_iter<'a, 'h>( |
647 | 0 | &'a self, |
648 | 0 | haystack: &'h [u8], |
649 | 0 | ) -> FindRevIter<'h, 'a> { |
650 | 0 | FindRevIter::new(haystack, self.as_ref()) |
651 | 0 | } |
652 | | |
653 | | /// Convert this finder into its owned variant, such that it no longer |
654 | | /// borrows the needle. |
655 | | /// |
656 | | /// If this is already an owned finder, then this is a no-op. Otherwise, |
657 | | /// this copies the needle. |
658 | | /// |
659 | | /// This is only available when the `std` feature is enabled. |
660 | | #[cfg(feature = "std")] |
661 | | #[inline] |
662 | 0 | pub fn into_owned(self) -> FinderRev<'static> { |
663 | 0 | FinderRev { searcher: self.searcher.into_owned() } |
664 | 0 | } |
665 | | |
666 | | /// Convert this finder into its borrowed variant. |
667 | | /// |
668 | | /// This is primarily useful if your finder is owned and you'd like to |
669 | | /// store its borrowed variant in some intermediate data structure. |
670 | | /// |
671 | | /// Note that the lifetime parameter of the returned finder is tied to the |
672 | | /// lifetime of `self`, and may be shorter than the `'n` lifetime of the |
673 | | /// needle itself. Namely, a finder's needle can be either borrowed or |
674 | | /// owned, so the lifetime of the needle returned must necessarily be the |
675 | | /// shorter of the two. |
676 | | #[inline] |
677 | 0 | pub fn as_ref(&self) -> FinderRev<'_> { |
678 | 0 | FinderRev { searcher: self.searcher.as_ref() } |
679 | 0 | } |
680 | | |
681 | | /// Returns the needle that this finder searches for. |
682 | | /// |
683 | | /// Note that the lifetime of the needle returned is tied to the lifetime |
684 | | /// of the finder, and may be shorter than the `'n` lifetime. Namely, a |
685 | | /// finder's needle can be either borrowed or owned, so the lifetime of the |
686 | | /// needle returned must necessarily be the shorter of the two. |
687 | | #[inline] |
688 | 0 | pub fn needle(&self) -> &[u8] { |
689 | 0 | self.searcher.needle() |
690 | 0 | } |
691 | | } |
692 | | |
693 | | /// A builder for constructing non-default forward or reverse memmem finders. |
694 | | /// |
695 | | /// A builder is primarily useful for configuring a substring searcher. |
696 | | /// Currently, the only configuration exposed is the ability to disable |
697 | | /// heuristic prefilters used to speed up certain searches. |
698 | | #[derive(Clone, Debug, Default)] |
699 | | pub struct FinderBuilder { |
700 | | config: SearcherConfig, |
701 | | } |
702 | | |
703 | | impl FinderBuilder { |
704 | | /// Create a new finder builder with default settings. |
705 | 0 | pub fn new() -> FinderBuilder { |
706 | 0 | FinderBuilder::default() |
707 | 0 | } |
708 | | |
709 | | /// Build a forward finder using the given needle from the current |
710 | | /// settings. |
711 | 0 | pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>( |
712 | 0 | &self, |
713 | 0 | needle: &'n B, |
714 | 0 | ) -> Finder<'n> { |
715 | 0 | Finder { searcher: Searcher::new(self.config, needle.as_ref()) } |
716 | 0 | } |
717 | | |
718 | | /// Build a reverse finder using the given needle from the current |
719 | | /// settings. |
720 | 0 | pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>( |
721 | 0 | &self, |
722 | 0 | needle: &'n B, |
723 | 0 | ) -> FinderRev<'n> { |
724 | 0 | FinderRev { searcher: SearcherRev::new(needle.as_ref()) } |
725 | 0 | } |
726 | | |
727 | | /// Configure the prefilter setting for the finder. |
728 | | /// |
729 | | /// See the documentation for [`Prefilter`] for more discussion on why |
730 | | /// you might want to configure this. |
731 | 0 | pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder { |
732 | 0 | self.config.prefilter = prefilter; |
733 | 0 | self |
734 | 0 | } |
735 | | } |
736 | | |
737 | | /// The internal implementation of a forward substring searcher. |
738 | | /// |
739 | | /// The reality is that this is a "meta" searcher. Namely, depending on a |
740 | | /// variety of parameters (CPU support, target, needle size, haystack size and |
741 | | /// even dynamic properties such as prefilter effectiveness), the actual |
742 | | /// algorithm employed to do substring search may change. |
743 | | #[derive(Clone, Debug)] |
744 | | struct Searcher<'n> { |
745 | | /// The actual needle we're searching for. |
746 | | /// |
747 | | /// A CowBytes is like a Cow<[u8]>, except in no_std environments, it is |
748 | | /// specialized to a single variant (the borrowed form). |
749 | | needle: CowBytes<'n>, |
750 | | /// A collection of facts computed on the needle that are useful for more |
751 | | /// than one substring search algorithm. |
752 | | ninfo: NeedleInfo, |
753 | | /// A prefilter function, if it was deemed appropriate. |
754 | | /// |
755 | | /// Some substring search implementations (like Two-Way) benefit greatly |
756 | | /// if we can quickly find candidate starting positions for a match. |
757 | | prefn: Option<PrefilterFn>, |
758 | | /// The actual substring implementation in use. |
759 | | kind: SearcherKind, |
760 | | } |
761 | | |
762 | | /// A collection of facts computed about a search needle. |
763 | | /// |
764 | | /// We group these things together because it's useful to be able to hand them |
765 | | /// to prefilters or substring algorithms that want them. |
766 | | #[derive(Clone, Copy, Debug)] |
767 | | pub(crate) struct NeedleInfo { |
768 | | /// The offsets of "rare" bytes detected in the needle. |
769 | | /// |
770 | | /// This is meant to be a heuristic in order to maximize the effectiveness |
771 | | /// of vectorized code. Namely, vectorized code tends to focus on only |
772 | | /// one or two bytes. If we pick bytes from the needle that occur |
773 | | /// infrequently, then more time will be spent in the vectorized code and |
774 | | /// will likely make the overall search (much) faster. |
775 | | /// |
776 | | /// Of course, this is only a heuristic based on a background frequency |
777 | | /// distribution of bytes. But it tends to work very well in practice. |
778 | | pub(crate) rarebytes: RareNeedleBytes, |
779 | | /// A Rabin-Karp hash of the needle. |
780 | | /// |
781 | | /// This is store here instead of in a more specific Rabin-Karp search |
782 | | /// since Rabin-Karp may be used even if another SearchKind corresponds |
783 | | /// to some other search implementation. e.g., If measurements suggest RK |
784 | | /// is faster in some cases or if a search implementation can't handle |
785 | | /// particularly small haystack. (Moreover, we cannot use RK *generally*, |
786 | | /// since its worst case time is multiplicative. Instead, we only use it |
787 | | /// some small haystacks, where "small" is a constant.) |
788 | | pub(crate) nhash: NeedleHash, |
789 | | } |
790 | | |
791 | | /// Configuration for substring search. |
792 | | #[derive(Clone, Copy, Debug, Default)] |
793 | | struct SearcherConfig { |
794 | | /// This permits changing the behavior of the prefilter, since it can have |
795 | | /// a variable impact on performance. |
796 | | prefilter: Prefilter, |
797 | | } |
798 | | |
799 | | #[derive(Clone, Debug)] |
800 | | enum SearcherKind { |
801 | | /// A special case for empty needles. An empty needle always matches, even |
802 | | /// in an empty haystack. |
803 | | Empty, |
804 | | /// This is used whenever the needle is a single byte. In this case, we |
805 | | /// always use memchr. |
806 | | OneByte(u8), |
807 | | /// Two-Way is the generic work horse and is what provides our additive |
808 | | /// linear time guarantee. In general, it's used when the needle is bigger |
809 | | /// than 8 bytes or so. |
810 | | TwoWay(twoway::Forward), |
811 | | #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
812 | | GenericSIMD128(x86::sse::Forward), |
813 | | #[cfg(memchr_runtime_wasm128)] |
814 | | GenericSIMD128(wasm::Forward), |
815 | | #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
816 | | GenericSIMD256(x86::avx::Forward), |
817 | | } |
818 | | |
819 | | impl<'n> Searcher<'n> { |
820 | 0 | fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { |
821 | | use self::SearcherKind::*; |
822 | | |
823 | 0 | let ninfo = NeedleInfo::new(needle); |
824 | 0 | let mk = |kind: SearcherKind| { |
825 | 0 | let prefn = prefilter::forward( |
826 | 0 | &config.prefilter, |
827 | 0 | &ninfo.rarebytes, |
828 | 0 | needle, |
829 | 0 | ); |
830 | 0 | Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } |
831 | 0 | }; |
832 | 0 | if needle.len() == 0 { |
833 | 0 | return mk(Empty); |
834 | 0 | } |
835 | 0 | if needle.len() == 1 { |
836 | 0 | return mk(OneByte(needle[0])); |
837 | 0 | } |
838 | | #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] |
839 | | { |
840 | 0 | if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) { |
841 | 0 | return mk(GenericSIMD256(fwd)); |
842 | 0 | } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) { |
843 | 0 | return mk(GenericSIMD128(fwd)); |
844 | 0 | } |
845 | 0 | } |
846 | 0 | #[cfg(all(target_arch = "wasm32", memchr_runtime_simd))] |
847 | 0 | { |
848 | 0 | if let Some(fwd) = wasm::Forward::new(&ninfo, needle) { |
849 | 0 | return mk(GenericSIMD128(fwd)); |
850 | 0 | } |
851 | 0 | } |
852 | 0 |
|
853 | 0 | mk(TwoWay(twoway::Forward::new(needle))) |
854 | 0 | } |
855 | | |
856 | | /// Return a fresh prefilter state that can be used with this searcher. |
857 | | /// A prefilter state is used to track the effectiveness of a searcher's |
858 | | /// prefilter for speeding up searches. Therefore, the prefilter state |
859 | | /// should generally be reused on subsequent searches (such as in an |
860 | | /// iterator). For searches on a different haystack, then a new prefilter |
861 | | /// state should be used. |
862 | | /// |
863 | | /// This always initializes a valid (but possibly inert) prefilter state |
864 | | /// even if this searcher does not have a prefilter enabled. |
865 | 0 | fn prefilter_state(&self) -> PrefilterState { |
866 | 0 | if self.prefn.is_none() { |
867 | 0 | PrefilterState::inert() |
868 | | } else { |
869 | 0 | PrefilterState::new() |
870 | | } |
871 | 0 | } |
872 | | |
873 | 0 | fn needle(&self) -> &[u8] { |
874 | 0 | self.needle.as_slice() |
875 | 0 | } |
876 | | |
877 | 0 | fn as_ref(&self) -> Searcher<'_> { |
878 | | use self::SearcherKind::*; |
879 | | |
880 | 0 | let kind = match self.kind { |
881 | 0 | Empty => Empty, |
882 | 0 | OneByte(b) => OneByte(b), |
883 | 0 | TwoWay(tw) => TwoWay(tw), |
884 | | #[cfg(all(not(miri), memchr_runtime_simd))] |
885 | 0 | GenericSIMD128(gs) => GenericSIMD128(gs), |
886 | | #[cfg(all( |
887 | | not(miri), |
888 | | target_arch = "x86_64", |
889 | | memchr_runtime_simd |
890 | | ))] |
891 | 0 | GenericSIMD256(gs) => GenericSIMD256(gs), |
892 | | }; |
893 | 0 | Searcher { |
894 | 0 | needle: CowBytes::new(self.needle()), |
895 | 0 | ninfo: self.ninfo, |
896 | 0 | prefn: self.prefn, |
897 | 0 | kind, |
898 | 0 | } |
899 | 0 | } |
900 | | |
901 | | #[cfg(feature = "std")] |
902 | 0 | fn into_owned(self) -> Searcher<'static> { |
903 | | use self::SearcherKind::*; |
904 | | |
905 | 0 | let kind = match self.kind { |
906 | 0 | Empty => Empty, |
907 | 0 | OneByte(b) => OneByte(b), |
908 | 0 | TwoWay(tw) => TwoWay(tw), |
909 | | #[cfg(all(not(miri), memchr_runtime_simd))] |
910 | 0 | GenericSIMD128(gs) => GenericSIMD128(gs), |
911 | | #[cfg(all( |
912 | | not(miri), |
913 | | target_arch = "x86_64", |
914 | | memchr_runtime_simd |
915 | | ))] |
916 | 0 | GenericSIMD256(gs) => GenericSIMD256(gs), |
917 | | }; |
918 | 0 | Searcher { |
919 | 0 | needle: self.needle.into_owned(), |
920 | 0 | ninfo: self.ninfo, |
921 | 0 | prefn: self.prefn, |
922 | 0 | kind, |
923 | 0 | } |
924 | 0 | } |
925 | | |
926 | | /// Implements forward substring search by selecting the implementation |
927 | | /// chosen at construction and executing it on the given haystack with the |
928 | | /// prefilter's current state of effectiveness. |
929 | | #[inline(always)] |
930 | 0 | fn find( |
931 | 0 | &self, |
932 | 0 | state: &mut PrefilterState, |
933 | 0 | haystack: &[u8], |
934 | 0 | ) -> Option<usize> { |
935 | | use self::SearcherKind::*; |
936 | | |
937 | 0 | let needle = self.needle(); |
938 | 0 | if haystack.len() < needle.len() { |
939 | 0 | return None; |
940 | 0 | } |
941 | 0 | match self.kind { |
942 | 0 | Empty => Some(0), |
943 | 0 | OneByte(b) => crate::memchr(b, haystack), |
944 | 0 | TwoWay(ref tw) => { |
945 | 0 | // For very short haystacks (e.g., where the prefilter probably |
946 | 0 | // can't run), it's faster to just run RK. |
947 | 0 | if rabinkarp::is_fast(haystack, needle) { |
948 | 0 | rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) |
949 | | } else { |
950 | 0 | self.find_tw(tw, state, haystack, needle) |
951 | | } |
952 | | } |
953 | | #[cfg(all(not(miri), memchr_runtime_simd))] |
954 | 0 | GenericSIMD128(ref gs) => { |
955 | 0 | // The SIMD matcher can't handle particularly short haystacks, |
956 | 0 | // so we fall back to RK in these cases. |
957 | 0 | if haystack.len() < gs.min_haystack_len() { |
958 | 0 | rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) |
959 | | } else { |
960 | 0 | gs.find(haystack, needle) |
961 | | } |
962 | | } |
963 | | #[cfg(all( |
964 | | not(miri), |
965 | | target_arch = "x86_64", |
966 | | memchr_runtime_simd |
967 | | ))] |
968 | 0 | GenericSIMD256(ref gs) => { |
969 | 0 | // The SIMD matcher can't handle particularly short haystacks, |
970 | 0 | // so we fall back to RK in these cases. |
971 | 0 | if haystack.len() < gs.min_haystack_len() { |
972 | 0 | rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) |
973 | | } else { |
974 | 0 | gs.find(haystack, needle) |
975 | | } |
976 | | } |
977 | | } |
978 | 0 | } |
979 | | |
980 | | /// Calls Two-Way on the given haystack/needle. |
981 | | /// |
982 | | /// This is marked as unlineable since it seems to have a better overall |
983 | | /// effect on benchmarks. However, this is one of those cases where |
984 | | /// inlining it results an improvement in other benchmarks too, so I |
985 | | /// suspect we just don't have enough data yet to make the right call here. |
986 | | /// |
987 | | /// I suspect the main problem is that this function contains two different |
988 | | /// inlined copies of Two-Way: one with and one without prefilters enabled. |
989 | | #[inline(never)] |
990 | 0 | fn find_tw( |
991 | 0 | &self, |
992 | 0 | tw: &twoway::Forward, |
993 | 0 | state: &mut PrefilterState, |
994 | 0 | haystack: &[u8], |
995 | 0 | needle: &[u8], |
996 | 0 | ) -> Option<usize> { |
997 | 0 | if let Some(prefn) = self.prefn { |
998 | | // We used to look at the length of a haystack here. That is, if |
999 | | // it was too small, then don't bother with the prefilter. But two |
1000 | | // things changed: the prefilter falls back to memchr for small |
1001 | | // haystacks, and, above, Rabin-Karp is employed for tiny haystacks |
1002 | | // anyway. |
1003 | 0 | if state.is_effective() { |
1004 | 0 | let mut pre = Pre { state, prefn, ninfo: &self.ninfo }; |
1005 | 0 | return tw.find(Some(&mut pre), haystack, needle); |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | tw.find(None, haystack, needle) |
1009 | 0 | } |
1010 | | } |
1011 | | |
1012 | | impl NeedleInfo { |
1013 | 0 | pub(crate) fn new(needle: &[u8]) -> NeedleInfo { |
1014 | 0 | NeedleInfo { |
1015 | 0 | rarebytes: RareNeedleBytes::forward(needle), |
1016 | 0 | nhash: NeedleHash::forward(needle), |
1017 | 0 | } |
1018 | 0 | } |
1019 | | } |
1020 | | |
1021 | | /// The internal implementation of a reverse substring searcher. |
1022 | | /// |
1023 | | /// See the forward searcher docs for more details. Currently, the reverse |
1024 | | /// searcher is considerably simpler since it lacks prefilter support. This |
1025 | | /// was done because it adds a lot of code, and more surface area to test. And |
1026 | | /// in particular, it's not clear whether a prefilter on reverse searching is |
1027 | | /// worth it. (If you have a compelling use case, please file an issue!) |
1028 | | #[derive(Clone, Debug)] |
1029 | | struct SearcherRev<'n> { |
1030 | | /// The actual needle we're searching for. |
1031 | | needle: CowBytes<'n>, |
1032 | | /// A Rabin-Karp hash of the needle. |
1033 | | nhash: NeedleHash, |
1034 | | /// The actual substring implementation in use. |
1035 | | kind: SearcherRevKind, |
1036 | | } |
1037 | | |
1038 | | #[derive(Clone, Debug)] |
1039 | | enum SearcherRevKind { |
1040 | | /// A special case for empty needles. An empty needle always matches, even |
1041 | | /// in an empty haystack. |
1042 | | Empty, |
1043 | | /// This is used whenever the needle is a single byte. In this case, we |
1044 | | /// always use memchr. |
1045 | | OneByte(u8), |
1046 | | /// Two-Way is the generic work horse and is what provides our additive |
1047 | | /// linear time guarantee. In general, it's used when the needle is bigger |
1048 | | /// than 8 bytes or so. |
1049 | | TwoWay(twoway::Reverse), |
1050 | | } |
1051 | | |
1052 | | impl<'n> SearcherRev<'n> { |
1053 | 0 | fn new(needle: &'n [u8]) -> SearcherRev<'n> { |
1054 | | use self::SearcherRevKind::*; |
1055 | | |
1056 | 0 | let kind = if needle.len() == 0 { |
1057 | 0 | Empty |
1058 | 0 | } else if needle.len() == 1 { |
1059 | 0 | OneByte(needle[0]) |
1060 | | } else { |
1061 | 0 | TwoWay(twoway::Reverse::new(needle)) |
1062 | | }; |
1063 | 0 | SearcherRev { |
1064 | 0 | needle: CowBytes::new(needle), |
1065 | 0 | nhash: NeedleHash::reverse(needle), |
1066 | 0 | kind, |
1067 | 0 | } |
1068 | 0 | } |
1069 | | |
1070 | 0 | fn needle(&self) -> &[u8] { |
1071 | 0 | self.needle.as_slice() |
1072 | 0 | } |
1073 | | |
1074 | 0 | fn as_ref(&self) -> SearcherRev<'_> { |
1075 | | use self::SearcherRevKind::*; |
1076 | | |
1077 | 0 | let kind = match self.kind { |
1078 | 0 | Empty => Empty, |
1079 | 0 | OneByte(b) => OneByte(b), |
1080 | 0 | TwoWay(tw) => TwoWay(tw), |
1081 | | }; |
1082 | 0 | SearcherRev { |
1083 | 0 | needle: CowBytes::new(self.needle()), |
1084 | 0 | nhash: self.nhash, |
1085 | 0 | kind, |
1086 | 0 | } |
1087 | 0 | } |
1088 | | |
1089 | | #[cfg(feature = "std")] |
1090 | 0 | fn into_owned(self) -> SearcherRev<'static> { |
1091 | | use self::SearcherRevKind::*; |
1092 | | |
1093 | 0 | let kind = match self.kind { |
1094 | 0 | Empty => Empty, |
1095 | 0 | OneByte(b) => OneByte(b), |
1096 | 0 | TwoWay(tw) => TwoWay(tw), |
1097 | | }; |
1098 | 0 | SearcherRev { |
1099 | 0 | needle: self.needle.into_owned(), |
1100 | 0 | nhash: self.nhash, |
1101 | 0 | kind, |
1102 | 0 | } |
1103 | 0 | } |
1104 | | |
1105 | | /// Implements reverse substring search by selecting the implementation |
1106 | | /// chosen at construction and executing it on the given haystack with the |
1107 | | /// prefilter's current state of effectiveness. |
1108 | | #[inline(always)] |
1109 | 0 | fn rfind(&self, haystack: &[u8]) -> Option<usize> { |
1110 | | use self::SearcherRevKind::*; |
1111 | | |
1112 | 0 | let needle = self.needle(); |
1113 | 0 | if haystack.len() < needle.len() { |
1114 | 0 | return None; |
1115 | 0 | } |
1116 | 0 | match self.kind { |
1117 | 0 | Empty => Some(haystack.len()), |
1118 | 0 | OneByte(b) => crate::memrchr(b, haystack), |
1119 | 0 | TwoWay(ref tw) => { |
1120 | 0 | // For very short haystacks (e.g., where the prefilter probably |
1121 | 0 | // can't run), it's faster to just run RK. |
1122 | 0 | if rabinkarp::is_fast(haystack, needle) { |
1123 | 0 | rabinkarp::rfind_with(&self.nhash, haystack, needle) |
1124 | | } else { |
1125 | 0 | tw.rfind(haystack, needle) |
1126 | | } |
1127 | | } |
1128 | | } |
1129 | 0 | } |
1130 | | } |
1131 | | |
1132 | | /// This module defines some generic quickcheck properties useful for testing |
1133 | | /// any substring search algorithm. It also runs those properties for the |
1134 | | /// top-level public API memmem routines. (The properties are also used to |
1135 | | /// test various substring search implementations more granularly elsewhere as |
1136 | | /// well.) |
1137 | | #[cfg(all(test, feature = "std", not(miri)))] |
1138 | | mod proptests { |
1139 | | // N.B. This defines the quickcheck tests using the properties defined |
1140 | | // below. Because of macro-visibility weirdness, the actual macro is |
1141 | | // defined at the top of this file. |
1142 | | define_memmem_quickcheck_tests!(super::find, super::rfind); |
1143 | | |
1144 | | /// Check that every prefix of the given byte string is a substring. |
1145 | | pub(crate) fn prefix_is_substring( |
1146 | | reverse: bool, |
1147 | | bs: &[u8], |
1148 | | mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
1149 | | ) -> bool { |
1150 | | if bs.is_empty() { |
1151 | | return true; |
1152 | | } |
1153 | | for i in 0..(bs.len() - 1) { |
1154 | | let prefix = &bs[..i]; |
1155 | | if reverse { |
1156 | | assert_eq!(naive_rfind(bs, prefix), search(bs, prefix)); |
1157 | | } else { |
1158 | | assert_eq!(naive_find(bs, prefix), search(bs, prefix)); |
1159 | | } |
1160 | | } |
1161 | | true |
1162 | | } |
1163 | | |
1164 | | /// Check that every suffix of the given byte string is a substring. |
1165 | | pub(crate) fn suffix_is_substring( |
1166 | | reverse: bool, |
1167 | | bs: &[u8], |
1168 | | mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
1169 | | ) -> bool { |
1170 | | if bs.is_empty() { |
1171 | | return true; |
1172 | | } |
1173 | | for i in 0..(bs.len() - 1) { |
1174 | | let suffix = &bs[i..]; |
1175 | | if reverse { |
1176 | | assert_eq!(naive_rfind(bs, suffix), search(bs, suffix)); |
1177 | | } else { |
1178 | | assert_eq!(naive_find(bs, suffix), search(bs, suffix)); |
1179 | | } |
1180 | | } |
1181 | | true |
1182 | | } |
1183 | | |
1184 | | /// Check that naive substring search matches the result of the given search |
1185 | | /// algorithm. |
1186 | | pub(crate) fn matches_naive( |
1187 | | reverse: bool, |
1188 | | haystack: &[u8], |
1189 | | needle: &[u8], |
1190 | | mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
1191 | | ) -> bool { |
1192 | | if reverse { |
1193 | | naive_rfind(haystack, needle) == search(haystack, needle) |
1194 | | } else { |
1195 | | naive_find(haystack, needle) == search(haystack, needle) |
1196 | | } |
1197 | | } |
1198 | | |
1199 | | /// Naively search forwards for the given needle in the given haystack. |
1200 | | fn naive_find(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
1201 | | if needle.is_empty() { |
1202 | | return Some(0); |
1203 | | } else if haystack.len() < needle.len() { |
1204 | | return None; |
1205 | | } |
1206 | | for i in 0..(haystack.len() - needle.len() + 1) { |
1207 | | if needle == &haystack[i..i + needle.len()] { |
1208 | | return Some(i); |
1209 | | } |
1210 | | } |
1211 | | None |
1212 | | } |
1213 | | |
1214 | | /// Naively search in reverse for the given needle in the given haystack. |
1215 | | fn naive_rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
1216 | | if needle.is_empty() { |
1217 | | return Some(haystack.len()); |
1218 | | } else if haystack.len() < needle.len() { |
1219 | | return None; |
1220 | | } |
1221 | | for i in (0..(haystack.len() - needle.len() + 1)).rev() { |
1222 | | if needle == &haystack[i..i + needle.len()] { |
1223 | | return Some(i); |
1224 | | } |
1225 | | } |
1226 | | None |
1227 | | } |
1228 | | } |
1229 | | |
1230 | | /// This module defines some hand-written "simple" substring tests. It |
1231 | | /// also provides routines for easily running them on any substring search |
1232 | | /// implementation. |
1233 | | #[cfg(test)] |
1234 | | mod testsimples { |
1235 | | define_memmem_simple_tests!(super::find, super::rfind); |
1236 | | |
1237 | | /// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple. |
1238 | | type SearchTest = |
1239 | | (&'static str, &'static str, Option<usize>, Option<usize>); |
1240 | | |
1241 | | const SEARCH_TESTS: &'static [SearchTest] = &[ |
1242 | | ("", "", Some(0), Some(0)), |
1243 | | ("", "a", Some(0), Some(1)), |
1244 | | ("", "ab", Some(0), Some(2)), |
1245 | | ("", "abc", Some(0), Some(3)), |
1246 | | ("a", "", None, None), |
1247 | | ("a", "a", Some(0), Some(0)), |
1248 | | ("a", "aa", Some(0), Some(1)), |
1249 | | ("a", "ba", Some(1), Some(1)), |
1250 | | ("a", "bba", Some(2), Some(2)), |
1251 | | ("a", "bbba", Some(3), Some(3)), |
1252 | | ("a", "bbbab", Some(3), Some(3)), |
1253 | | ("a", "bbbabb", Some(3), Some(3)), |
1254 | | ("a", "bbbabbb", Some(3), Some(3)), |
1255 | | ("a", "bbbbbb", None, None), |
1256 | | ("ab", "", None, None), |
1257 | | ("ab", "a", None, None), |
1258 | | ("ab", "b", None, None), |
1259 | | ("ab", "ab", Some(0), Some(0)), |
1260 | | ("ab", "aab", Some(1), Some(1)), |
1261 | | ("ab", "aaab", Some(2), Some(2)), |
1262 | | ("ab", "abaab", Some(0), Some(3)), |
1263 | | ("ab", "baaab", Some(3), Some(3)), |
1264 | | ("ab", "acb", None, None), |
1265 | | ("ab", "abba", Some(0), Some(0)), |
1266 | | ("abc", "ab", None, None), |
1267 | | ("abc", "abc", Some(0), Some(0)), |
1268 | | ("abc", "abcz", Some(0), Some(0)), |
1269 | | ("abc", "abczz", Some(0), Some(0)), |
1270 | | ("abc", "zabc", Some(1), Some(1)), |
1271 | | ("abc", "zzabc", Some(2), Some(2)), |
1272 | | ("abc", "azbc", None, None), |
1273 | | ("abc", "abzc", None, None), |
1274 | | ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)), |
1275 | | ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)), |
1276 | | ("xyz", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz", Some(32), Some(32)), |
1277 | | // Failures caught by quickcheck. |
1278 | | ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)), |
1279 | | ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None), |
1280 | | ]; |
1281 | | |
1282 | | /// Run the substring search tests. `search` should be a closure that |
1283 | | /// accepts a haystack and a needle and returns the starting position |
1284 | | /// of the first occurrence of needle in the haystack, or `None` if one |
1285 | | /// doesn't exist. |
1286 | | pub(crate) fn run_search_tests_fwd( |
1287 | | mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
1288 | | ) { |
1289 | | for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS { |
1290 | | let (n, h) = (needle.as_bytes(), haystack.as_bytes()); |
1291 | | assert_eq!( |
1292 | | expected_fwd, |
1293 | | search(h, n), |
1294 | | "needle: {:?}, haystack: {:?}, expected: {:?}", |
1295 | | n, |
1296 | | h, |
1297 | | expected_fwd |
1298 | | ); |
1299 | | } |
1300 | | } |
1301 | | |
1302 | | /// Run the substring search tests. `search` should be a closure that |
1303 | | /// accepts a haystack and a needle and returns the starting position of |
1304 | | /// the last occurrence of needle in the haystack, or `None` if one doesn't |
1305 | | /// exist. |
1306 | | pub(crate) fn run_search_tests_rev( |
1307 | | mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, |
1308 | | ) { |
1309 | | for &(needle, haystack, _, expected_rev) in SEARCH_TESTS { |
1310 | | let (n, h) = (needle.as_bytes(), haystack.as_bytes()); |
1311 | | assert_eq!( |
1312 | | expected_rev, |
1313 | | search(h, n), |
1314 | | "needle: {:?}, haystack: {:?}, expected: {:?}", |
1315 | | n, |
1316 | | h, |
1317 | | expected_rev |
1318 | | ); |
1319 | | } |
1320 | | } |
1321 | | } |