/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.4.1/src/memmem/genericsimd.rs
use core::mem::size_of;

use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};

/// The minimum length of a needle required for this algorithm. The minimum
/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
/// a case handled by this searcher.
pub(crate) const MIN_NEEDLE_LEN: usize = 2;

/// The maximum length of a needle permitted for this algorithm.
///
/// In reality, there is no hard max here. The code below can handle any
/// length needle. (Perhaps that suggests there are missing optimizations.)
/// Instead, this is a heuristic and a bound guaranteeing our linear time
/// complexity.
///
/// It is a heuristic because when a candidate match is found, memcmp is run.
/// For very large needles with lots of false positives, memcmp can make the
/// code run quite slow.
///
/// It is a bound because the worst case behavior with memcmp is multiplicative
/// in the size of the needle and haystack, and we want to keep that additive.
/// This cap ensures we still meet that bound theoretically, since it's just
/// a constant. We aren't acting in bad faith here: memcmp on tiny needles
/// is so fast that even in pathological cases (see the pathological vector
/// benchmarks), this is still just as fast or faster in practice.
///
/// This specific number was chosen by tweaking a bit and running benchmarks.
/// The rare-medium-needle benchmark, for example, gets about 5% faster by
/// using this algorithm instead of a prefilter-accelerated Two-Way. There's
/// also a theoretical desire to keep this number reasonably low, to mitigate
/// the impact of pathological cases. I did try 64, and some benchmarks got a
/// little better, and others (particularly the pathological ones) got a lot
/// worse. So... 32 it is?
pub(crate) const MAX_NEEDLE_LEN: usize = 32;

/// The implementation of the forward vector accelerated substring search.
///
/// This is extremely similar to the prefilter vector module by the same name.
/// The key difference is that this is not a prefilter. Instead, it handles
/// confirming its own matches. The trade-off is that this only works with
/// smaller needles. The speed-up here is that an inlined memcmp on a tiny
/// needle is very quick, even on pathological inputs. This is much better than
/// combining a prefilter with Two-Way, where using Two-Way to confirm the
/// match has higher latency.
///
/// So why not use this for all needles? We could, and it would probably work
/// really well on most inputs. But its worst case is multiplicative and we
/// want to guarantee worst case additive time. Some of the benchmarks try to
/// justify this (see the pathological ones).
///
/// The prefilter variant of this has more comments. Also note that we only
/// implement this for forward searches for now. If you have a compelling use
/// case for accelerated reverse search, please file an issue.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward {
    rare1i: u8,
    rare2i: u8,
}
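// Illustrative note (the needle and rare byte choices below are hypothetical,
// not taken from the crate's tests): if the rarest bytes of a needle were
// judged to be, say, b'q' at index 2 and b'z' at index 5, then `rare1i == 2`
// and `rare2i == 5`. The search below loads one vector of haystack bytes at
// `ptr + 2` and compares it against a vector filled with b'q', loads another
// at `ptr + 5` and compares it against b'z', and only runs memcmp at the
// positions where both comparisons hit.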

impl Forward {
    /// Create a new "generic simd" forward searcher. If one could not be
    /// created from the given inputs, then None is returned.
    pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
        let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
        // If the needle is too short or too long, give up. Also, give up
        // if the rare bytes detected are at the same position. (It likely
        // suggests a degenerate case, although it should technically not be
        // possible.)
        if needle.len() < MIN_NEEDLE_LEN
            || needle.len() > MAX_NEEDLE_LEN
            || rare1i == rare2i
        {
            return None;
        }
        Some(Forward { rare1i, rare2i })
    }

    /// Returns the minimum length of haystack that is needed for this searcher
    /// to work for a particular vector. Passing a haystack with a length
    /// smaller than this will cause `fwd_find` to panic.
    #[inline(always)]
    pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
        self.rare2i as usize + size_of::<V>()
    }
}
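// Illustrative arithmetic for `min_haystack_len` above (the numbers are
// hypothetical): with a 32-byte vector such as __m256i and `rare2i == 5`,
// the minimum haystack length is 5 + 32 = 37. That is exactly what is needed
// for the unaligned load at `ptr + rare2i` in `fwd_find_in_chunk` to stay in
// bounds when `ptr` sits at its maximum value of `end_ptr - min_haystack_len`.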

/// Searches the given haystack for the given needle. The needle given should
/// be the same as the needle that this searcher was initialized with.
///
/// # Panics
///
/// When the given haystack has a length smaller than `min_haystack_len`.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn fwd_find<V: Vector>(
    fwd: &Forward,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    // It would be nice if we didn't have this check here, since the meta
    // searcher should handle it for us. But without this, I don't think we
    // guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
    // put it as part of the safety contract, but it makes it more complicated
    // than necessary.
    if haystack.len() < needle.len() {
        return None;
    }
    let min_haystack_len = fwd.min_haystack_len::<V>();
    assert!(haystack.len() >= min_haystack_len, "haystack too small");
    debug_assert!(needle.len() <= haystack.len());
    debug_assert!(
        needle.len() >= MIN_NEEDLE_LEN,
        "needle must be at least {} bytes",
        MIN_NEEDLE_LEN,
    );
    debug_assert!(
        needle.len() <= MAX_NEEDLE_LEN,
        "needle must be at most {} bytes",
        MAX_NEEDLE_LEN,
    );

    let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);

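    // The pointers below bracket the haystack: `max_ptr` is the last position
    // at which a vector load at `ptr + rare2i` (the larger of the two rare
    // byte offsets) still fits inside the haystack, i.e.
    // `end_ptr - min_haystack_len`. The main loop only advances `ptr` while
    // `ptr <= max_ptr`; whatever is left over is handled by the masked final
    // load below.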
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;

    // N.B. I did experiment with unrolling the loop to deal with size(V)
    // bytes at a time and 2*size(V) bytes at a time. The double unroll was
    // marginally faster while the quadruple unroll was unambiguously slower.
    // In the end, I decided the complexity from unrolling wasn't worth it. I
    // used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
    while ptr <= max_ptr {
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    if ptr < end_ptr {
        let remaining = diff(end_ptr, ptr);
        debug_assert!(
            remaining < min_haystack_len,
            "remaining bytes should be smaller than the minimum haystack \
             length of {}, but there are {} bytes remaining",
            min_haystack_len,
            remaining,
        );
        if remaining < needle.len() {
            return None;
        }
        debug_assert!(
            max_ptr < ptr,
            "after main loop, ptr should have exceeded max_ptr",
        );
        let overlap = diff(ptr, max_ptr);
        debug_assert!(
            overlap > 0,
            "overlap ({}) must always be non-zero",
            overlap,
        );
        debug_assert!(
            overlap < size_of::<V>(),
            "overlap ({}) cannot possibly be >= the size of a vector ({})",
            overlap,
            size_of::<V>(),
        );
        // The mask has all of its bits set except for the first N least
        // significant bits, where N=overlap. This way, any matches that
        // occur in fwd_find_in_chunk within the overlap are automatically
        // ignored.
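        // Worked example (with illustrative numbers): for a 16-byte vector
        // and overlap = 3, mask = !((1 << 3) - 1) = !0b111, so the three
        // least significant bits are zero and every other bit is set. Those
        // low bits correspond to the positions already examined by the
        // previous, unmasked load.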
        let mask = !((1 << overlap) - 1);
        ptr = max_ptr;
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
    }
    None
}

/// Search for an occurrence of two rare bytes from the needle in the chunk
/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
/// an occurrence is found, memcmp is run to check if a match occurs at the
/// corresponding position.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// mask should have bits set corresponding to the positions in the chunk in
/// which matches are considered. This is only used for the last vector load,
/// where the beginning of the vector might have overlapped with the last load
/// in the main loop. The mask lets us avoid visiting positions that have
/// already been discarded as matches.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
/// loads on ptr up to (end_ptr - needle.len()).
#[inline(always)]
unsafe fn fwd_find_in_chunk<V: Vector>(
    fwd: &Forward,
    needle: &[u8],
    ptr: *const u8,
    end_ptr: *const u8,
    rare1chunk: V,
    rare2chunk: V,
    mask: u32,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize));
    let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize));

    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);

    let mut match_offsets = eq0.and(eq1).movemask() & mask;
    while match_offsets != 0 {
        let offset = match_offsets.trailing_zeros() as usize;
        let ptr = ptr.add(offset);
        if end_ptr.sub(needle.len()) < ptr {
            return None;
        }
        let chunk = core::slice::from_raw_parts(ptr, needle.len());
        if memcmp(needle, chunk) {
            return Some(offset);
        }
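        // Clear the lowest set bit of match_offsets so that the next
        // iteration moves on to the next candidate position, if any.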
        match_offsets &= match_offsets - 1;
    }
    None
}

/// Accepts a chunk-relative offset and returns a haystack-relative offset.
///
/// See the function with the same name in the prefilter variant of this
/// algorithm to learn why it's tagged with inline(never). Even here, where
/// the function is simpler, inlining it leads to poorer codegen. (Although
/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
#[cold]
#[inline(never)]
fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
    diff(ptr, start_ptr) + chunki
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
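
// A minimal sketch, not part of the memchr crate itself, exercising the two
// bit tricks used above: the tail mask `!((1 << overlap) - 1)` in `fwd_find`
// and the `x &= x - 1` idiom that walks candidate positions in
// `fwd_find_in_chunk`. The concrete values are illustrative assumptions.
#[cfg(test)]
mod genericsimd_bit_tricks_sketch {
    #[test]
    fn tail_mask_clears_only_the_overlap() {
        // With overlap = 3, the three least significant bits are cleared and
        // all higher bits are kept, so candidates that fall in the
        // overlapping region of the final load (shifted back to max_ptr) are
        // ignored.
        let overlap = 3u32;
        let mask: u32 = !((1 << overlap) - 1);
        assert_eq!(mask & 0b0111, 0);
        assert_eq!(mask & 0b1000, 0b1000);
    }

    #[test]
    fn clearing_lowest_set_bit_walks_candidates_in_order() {
        // Simulate a movemask result with candidate offsets 1, 4 and 9.
        let mut match_offsets: u32 = (1 << 1) | (1 << 4) | (1 << 9);
        let mut seen = [0usize; 3];
        let mut n = 0;
        while match_offsets != 0 {
            seen[n] = match_offsets.trailing_zeros() as usize;
            n += 1;
            // Same idiom as in fwd_find_in_chunk: clear the lowest set bit.
            match_offsets &= match_offsets - 1;
        }
        assert_eq!((n, seen), (3, [1, 4, 9]));
    }
}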