/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.5.0/src/memmem/rabinkarp.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | This module implements the classical Rabin-Karp substring search algorithm, |
3 | | with no extra frills. While its use would seem to break our time complexity |
4 | | guarantee of O(m+n) (RK's time complexity is O(mn)), we are careful to only |
5 | | ever use RK on a constant subset of haystacks. The main point here is that |
6 | | RK has good latency properties for small needles/haystacks. It's very quick |
7 | | to compute a needle hash and zip through the haystack when compared to |
8 | | initializing Two-Way, for example. And this is especially useful for cases |
9 | | where the haystack is just too short for vector instructions to do much good. |
10 | | |
11 | | The hashing function used here is the same one recommended by ESMAJ. |
12 | | |
13 | | Another choice instead of Rabin-Karp would be Shift-Or. But its latency |
14 | | isn't quite as good since its preprocessing time is a bit more expensive |
15 | | (both in practice and in theory). However, perhaps Shift-Or has a place |
16 | | somewhere else for short patterns. I think the main problem is that it |
17 | | requires space proportional to the alphabet and the needle. If we, for |
18 | | example, supported needles up to length 16, then the total table size would be |
19 | | len(alphabet)*size_of::<u16>()==512 bytes. Which isn't exactly small, and it's |
20 | | probably bad to put that on the stack. So ideally, we'd throw it on the heap, |
21 | | but we'd really like to write as much code without using alloc/std as possible. |
22 | | But maybe it's worth the special casing. It's a TODO to benchmark. |
23 | | |
24 | | Wikipedia has a decent explanation, if a bit heavy on the theory: |
25 | | https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm |
26 | | |
27 | | But ESMAJ provides something a bit more concrete: |
28 | | http://www-igm.univ-mlv.fr/~lecroq/string/node5.html |
29 | | |
30 | | Finally, aho-corasick uses Rabin-Karp for multiple pattern match in some cases: |
31 | | https://github.com/BurntSushi/aho-corasick/blob/3852632f10587db0ff72ef29e88d58bf305a0946/src/packed/rabinkarp.rs |
32 | | */ |
33 | | |
34 | | /// Whether RK is believed to be very fast for the given needle/haystack. |
35 | 0 | pub(crate) fn is_fast(haystack: &[u8], _needle: &[u8]) -> bool { |
36 | 0 | haystack.len() < 16 |
37 | 0 | } |
38 | | |
39 | | /// Search for the first occurrence of needle in haystack using Rabin-Karp. |
40 | 0 | pub(crate) fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
41 | 0 | find_with(&NeedleHash::forward(needle), haystack, needle) |
42 | 0 | } |
43 | | |
44 | | /// Search for the first occurrence of needle in haystack using Rabin-Karp with |
45 | | /// a pre-computed needle hash. |
46 | 0 | pub(crate) fn find_with( |
47 | 0 | nhash: &NeedleHash, |
48 | 0 | mut haystack: &[u8], |
49 | 0 | needle: &[u8], |
50 | 0 | ) -> Option<usize> { |
51 | 0 | if haystack.len() < needle.len() { |
52 | 0 | return None; |
53 | 0 | } |
54 | 0 | let start = haystack.as_ptr() as usize; |
55 | 0 | let mut hash = Hash::from_bytes_fwd(&haystack[..needle.len()]); |
56 | | // N.B. I've experimented with unrolling this loop, but couldn't realize |
57 | | // any obvious gains. |
58 | | loop { |
59 | 0 | if nhash.eq(hash) && is_prefix(haystack, needle) { |
60 | 0 | return Some(haystack.as_ptr() as usize - start); |
61 | 0 | } |
62 | 0 | if needle.len() >= haystack.len() { |
63 | 0 | return None; |
64 | 0 | } |
65 | 0 | hash.roll(&nhash, haystack[0], haystack[needle.len()]); |
66 | 0 | haystack = &haystack[1..]; |
67 | | } |
68 | 0 | } |
69 | | |
70 | | /// Search for the last occurrence of needle in haystack using Rabin-Karp. |
71 | 0 | pub(crate) fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { |
72 | 0 | rfind_with(&NeedleHash::reverse(needle), haystack, needle) |
73 | 0 | } |
74 | | |
75 | | /// Search for the last occurrence of needle in haystack using Rabin-Karp with |
76 | | /// a pre-computed needle hash. |
77 | 0 | pub(crate) fn rfind_with( |
78 | 0 | nhash: &NeedleHash, |
79 | 0 | mut haystack: &[u8], |
80 | 0 | needle: &[u8], |
81 | 0 | ) -> Option<usize> { |
82 | 0 | if haystack.len() < needle.len() { |
83 | 0 | return None; |
84 | 0 | } |
85 | 0 | let mut hash = |
86 | 0 | Hash::from_bytes_rev(&haystack[haystack.len() - needle.len()..]); |
87 | | loop { |
88 | 0 | if nhash.eq(hash) && is_suffix(haystack, needle) { |
89 | 0 | return Some(haystack.len() - needle.len()); |
90 | 0 | } |
91 | 0 | if needle.len() >= haystack.len() { |
92 | 0 | return None; |
93 | 0 | } |
94 | 0 | hash.roll( |
95 | 0 | &nhash, |
96 | 0 | haystack[haystack.len() - 1], |
97 | 0 | haystack[haystack.len() - needle.len() - 1], |
98 | 0 | ); |
99 | 0 | haystack = &haystack[..haystack.len() - 1]; |
100 | | } |
101 | 0 | } |
102 | | |
103 | | /// A hash derived from a needle. |
104 | | #[derive(Clone, Copy, Debug, Default)] |
105 | | pub(crate) struct NeedleHash { |
106 | | /// The actual hash. |
107 | | hash: Hash, |
108 | | /// The factor needed to multiply a byte by in order to subtract it from |
109 | | /// the hash. It is defined to be 2^(n-1) (using wrapping exponentiation), |
110 | | /// where n is the length of the needle. This is how we "remove" a byte |
111 | | /// from the hash once the hash window rolls past it. |
112 | | hash_2pow: u32, |
113 | | } |
114 | | |
115 | | impl NeedleHash { |
116 | | /// Create a new Rabin-Karp hash for the given needle for use in forward |
117 | | /// searching. |
118 | 0 | pub(crate) fn forward(needle: &[u8]) -> NeedleHash { |
119 | 0 | let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 }; |
120 | 0 | if needle.is_empty() { |
121 | 0 | return nh; |
122 | 0 | } |
123 | 0 | nh.hash.add(needle[0]); |
124 | 0 | for &b in needle.iter().skip(1) { |
125 | 0 | nh.hash.add(b); |
126 | 0 | nh.hash_2pow = nh.hash_2pow.wrapping_shl(1); |
127 | 0 | } |
128 | 0 | nh |
129 | 0 | } |
130 | | |
131 | | /// Create a new Rabin-Karp hash for the given needle for use in reverse |
132 | | /// searching. |
133 | 0 | pub(crate) fn reverse(needle: &[u8]) -> NeedleHash { |
134 | 0 | let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 }; |
135 | 0 | if needle.is_empty() { |
136 | 0 | return nh; |
137 | 0 | } |
138 | 0 | nh.hash.add(needle[needle.len() - 1]); |
139 | 0 | for &b in needle.iter().rev().skip(1) { |
140 | 0 | nh.hash.add(b); |
141 | 0 | nh.hash_2pow = nh.hash_2pow.wrapping_shl(1); |
142 | 0 | } |
143 | 0 | nh |
144 | 0 | } |
145 | | |
146 | | /// Return true if the hashes are equivalent. |
147 | 0 | fn eq(&self, hash: Hash) -> bool { |
148 | 0 | self.hash == hash |
149 | 0 | } |
150 | | } |
151 | | |
152 | | /// A Rabin-Karp hash. This might represent the hash of a needle, or the hash |
153 | | /// of a rolling window in the haystack. |
154 | | #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] |
155 | | pub(crate) struct Hash(u32); |
156 | | |
157 | | impl Hash { |
158 | | /// Create a new hash that represents the empty string. |
159 | 0 | pub(crate) fn new() -> Hash { |
160 | 0 | Hash(0) |
161 | 0 | } |
162 | | |
163 | | /// Create a new hash from the bytes given for use in forward searches. |
164 | 0 | pub(crate) fn from_bytes_fwd(bytes: &[u8]) -> Hash { |
165 | 0 | let mut hash = Hash::new(); |
166 | 0 | for &b in bytes { |
167 | 0 | hash.add(b); |
168 | 0 | } |
169 | 0 | hash |
170 | 0 | } |
171 | | |
172 | | /// Create a new hash from the bytes given for use in reverse searches. |
173 | 0 | fn from_bytes_rev(bytes: &[u8]) -> Hash { |
174 | 0 | let mut hash = Hash::new(); |
175 | 0 | for &b in bytes.iter().rev() { |
176 | 0 | hash.add(b); |
177 | 0 | } |
178 | 0 | hash |
179 | 0 | } |
180 | | |
181 | | /// Add 'new' and remove 'old' from this hash. The given needle hash should |
182 | | /// correspond to the hash computed for the needle being searched for. |
183 | | /// |
184 | | /// This is meant to be used when the rolling window of the haystack is |
185 | | /// advanced. |
186 | 0 | fn roll(&mut self, nhash: &NeedleHash, old: u8, new: u8) { |
187 | 0 | self.del(nhash, old); |
188 | 0 | self.add(new); |
189 | 0 | } |
190 | | |
191 | | /// Add a byte to this hash. |
192 | 0 | fn add(&mut self, byte: u8) { |
193 | 0 | self.0 = self.0.wrapping_shl(1).wrapping_add(byte as u32); |
194 | 0 | } |
195 | | |
196 | | /// Remove a byte from this hash. The given needle hash should correspond |
197 | | /// to the hash computed for the needle being searched for. |
198 | 0 | fn del(&mut self, nhash: &NeedleHash, byte: u8) { |
199 | 0 | let factor = nhash.hash_2pow; |
200 | 0 | self.0 = self.0.wrapping_sub((byte as u32).wrapping_mul(factor)); |
201 | 0 | } |
202 | | } |
203 | | |
204 | | /// Returns true if the given needle is a prefix of the given haystack. |
205 | | /// |
206 | | /// We forcefully don't inline the is_prefix call and hint at the compiler that |
207 | | /// it is unlikely to be called. This causes the inner rabinkarp loop above |
208 | | /// to be a bit tighter and leads to some performance improvement. See the |
209 | | /// memmem/krate/prebuilt/sliceslice-words/words benchmark. |
210 | | #[cold] |
211 | | #[inline(never)] |
212 | 0 | fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool { |
213 | 0 | crate::memmem::util::is_prefix(haystack, needle) |
214 | 0 | } |
215 | | |
216 | | /// Returns true if the given needle is a suffix of the given haystack. |
217 | | /// |
218 | | /// See is_prefix for why this is forcefully not inlined. |
219 | | #[cold] |
220 | | #[inline(never)] |
221 | 0 | fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool { |
222 | 0 | crate::memmem::util::is_suffix(haystack, needle) |
223 | 0 | } |
224 | | |
225 | | #[cfg(test)] |
226 | | mod simpletests { |
227 | | define_memmem_simple_tests!(super::find, super::rfind); |
228 | | } |
229 | | |
230 | | #[cfg(all(test, feature = "std", not(miri)))] |
231 | | mod proptests { |
232 | | define_memmem_quickcheck_tests!(super::find, super::rfind); |
233 | | } |