/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.7.4/src/arch/generic/memchr.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Generic crate-internal routines for the `memchr` family of functions. |
3 | | */ |
4 | | |
5 | | // What follows is a vector algorithm generic over the specific vector |
6 | | // type to detect the position of one, two or three needles in a haystack. |
7 | | // From what I know, this is a "classic" algorithm, although I don't |
8 | | // believe it has been published in any peer reviewed journal. I believe |
9 | | // it can be found in places like glibc and Go's standard library. It |
10 | | // appears to be well known and is elaborated on in more detail here: |
11 | | // https://gms.tf/stdfind-and-memchr-optimizations.html |
12 | | // |
13 | | // While the routine below is fairly long and perhaps intimidating, the basic |
14 | | // idea is actually very simple and can be expressed straight-forwardly in |
15 | | // pseudo code. The psuedo code below is written for 128 bit vectors, but the |
16 | | // actual code below works for anything that implements the Vector trait. |
17 | | // |
18 | | // needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1 |
19 | | // // Note: shift amount is in bytes |
20 | | // |
21 | | // while i <= haystack.len() - 16: |
22 | | // // A 16 byte vector. Each byte in chunk corresponds to a byte in |
23 | | // // the haystack. |
24 | | // chunk = haystack[i:i+16] |
25 | | // // Compare bytes in needle with bytes in chunk. The result is a 16 |
26 | | // // byte chunk where each byte is 0xFF if the corresponding bytes |
27 | | // // in needle and chunk were equal, or 0x00 otherwise. |
28 | | // eqs = cmpeq(needle, chunk) |
29 | | // // Return a 32 bit integer where the most significant 16 bits |
30 | | // // are always 0 and the lower 16 bits correspond to whether the |
31 | | // // most significant bit in the correspond byte in `eqs` is set. |
32 | | // // In other words, `mask as u16` has bit i set if and only if |
33 | | // // needle[i] == chunk[i]. |
34 | | // mask = movemask(eqs) |
35 | | // |
36 | | // // Mask is 0 if there is no match, and non-zero otherwise. |
37 | | // if mask != 0: |
38 | | // // trailing_zeros tells us the position of the least significant |
39 | | // // bit that is set. |
40 | | // return i + trailing_zeros(mask) |
41 | | // |
42 | | // // haystack length may not be a multiple of 16, so search the rest. |
43 | | // while i < haystack.len(): |
44 | | // if haystack[i] == n1: |
45 | | // return i |
46 | | // |
47 | | // // No match found. |
48 | | // return NULL |
49 | | // |
50 | | // In fact, we could loosely translate the above code to Rust line-for-line |
51 | | // and it would be a pretty fast algorithm. But, we pull out all the stops |
52 | | // to go as fast as possible: |
53 | | // |
54 | | // 1. We use aligned loads. That is, we do some finagling to make sure our |
55 | | // primary loop not only proceeds in increments of 16 bytes, but that |
56 | | // the address of haystack's pointer that we dereference is aligned to |
57 | | // 16 bytes. 16 is a magic number here because it is the size of SSE2 |
58 | | // 128-bit vector. (For the AVX2 algorithm, 32 is the magic number.) |
59 | | // Therefore, to get aligned loads, our pointer's address must be evenly |
60 | | // divisible by 16. |
61 | | // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's |
62 | | // kind of like loop unrolling, but we combine the equality comparisons |
63 | | // using a vector OR such that we only need to extract a single mask to |
64 | | // determine whether a match exists or not. If so, then we do some |
65 | | // book-keeping to determine the precise location but otherwise mush on. |
66 | | // 3. We use our "chunk" comparison routine in as many places as possible, |
67 | | // even if it means using unaligned loads. In particular, if haystack |
68 | | // starts with an unaligned address, then we do an unaligned load to |
69 | | // search the first 16 bytes. We then start our primary loop at the |
70 | | // smallest subsequent aligned address, which will actually overlap with |
71 | | // previously searched bytes. But we're OK with that. We do a similar |
72 | | // dance at the end of our primary loop. Finally, to avoid a |
73 | | // byte-at-a-time loop at the end, we do a final 16 byte unaligned load |
74 | | // that may overlap with a previous load. This is OK because it converts |
75 | | // a loop into a small number of very fast vector instructions. The overlap |
76 | | // is OK because we know the place where the overlap occurs does not |
77 | | // contain a match. |
78 | | // |
79 | | // And that's pretty all there is to it. Note that since the below is |
80 | | // generic and since it's meant to be inlined into routines with a |
81 | | // `#[target_feature(enable = "...")]` annotation, we must mark all routines as |
82 | | // both unsafe and `#[inline(always)]`. |
83 | | // |
84 | | // The fact that the code below is generic does somewhat inhibit us. For |
85 | | // example, I've noticed that introducing an unlineable `#[cold]` function to |
86 | | // handle the match case in the loop generates tighter assembly, but there is |
87 | | // no way to do this in the generic code below because the generic code doesn't |
88 | | // know what `target_feature` annotation to apply to the unlineable function. |
89 | | // We could make such functions part of the `Vector` trait, but we instead live |
90 | | // with the slightly sub-optimal codegen for now since it doesn't seem to have |
91 | | // a noticeable perf difference. |
92 | | |
93 | | use crate::{ |
94 | | ext::Pointer, |
95 | | vector::{MoveMask, Vector}, |
96 | | }; |
97 | | |
98 | | /// Finds all occurrences of a single byte in a haystack. |
99 | | #[derive(Clone, Copy, Debug)] |
100 | | pub(crate) struct One<V> { |
101 | | s1: u8, |
102 | | v1: V, |
103 | | } |
104 | | |
105 | | impl<V: Vector> One<V> { |
106 | | /// The number of bytes we examine per each iteration of our search loop. |
107 | | const LOOP_SIZE: usize = 4 * V::BYTES; |
108 | | |
109 | | /// Create a new searcher that finds occurrences of the byte given. |
110 | | #[inline(always)] |
111 | 1.79M | pub(crate) unsafe fn new(needle: u8) -> One<V> { |
112 | 1.79M | One { s1: needle, v1: V::splat(needle) } |
113 | 1.79M | } <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::new Line | Count | Source | 111 | 895k | pub(crate) unsafe fn new(needle: u8) -> One<V> { | 112 | 895k | One { s1: needle, v1: V::splat(needle) } | 113 | 895k | } |
<memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::new Line | Count | Source | 111 | 895k | pub(crate) unsafe fn new(needle: u8) -> One<V> { | 112 | 895k | One { s1: needle, v1: V::splat(needle) } | 113 | 895k | } |
|
114 | | |
115 | | /// Returns the needle given to `One::new`. |
116 | | #[inline(always)] |
117 | 27.2k | pub(crate) fn needle1(&self) -> u8 { |
118 | 27.2k | self.s1 |
119 | 27.2k | } <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::needle1 Line | Count | Source | 117 | 27.2k | pub(crate) fn needle1(&self) -> u8 { | 118 | 27.2k | self.s1 | 119 | 27.2k | } |
Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::needle1 |
120 | | |
121 | | /// Return a pointer to the first occurrence of the needle in the given |
122 | | /// haystack. If no such occurrence exists, then `None` is returned. |
123 | | /// |
124 | | /// When a match is found, the pointer returned is guaranteed to be |
125 | | /// `>= start` and `< end`. |
126 | | /// |
127 | | /// # Safety |
128 | | /// |
129 | | /// * It must be the case that `start < end` and that the distance between |
130 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
131 | | /// to do at least an unaligned load of `V` at `start`. |
132 | | /// * Both `start` and `end` must be valid for reads. |
133 | | /// * Both `start` and `end` must point to an initialized value. |
134 | | /// * Both `start` and `end` must point to the same allocated object and |
135 | | /// must either be in bounds or at most one byte past the end of the |
136 | | /// allocated object. |
137 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
138 | | /// object. |
139 | | /// * The distance between `start` and `end` must not overflow `isize`. |
140 | | /// * The distance being in bounds must not rely on "wrapping around" the |
141 | | /// address space. |
142 | | #[inline(always)] |
143 | 891k | pub(crate) unsafe fn find_raw( |
144 | 891k | &self, |
145 | 891k | start: *const u8, |
146 | 891k | end: *const u8, |
147 | 891k | ) -> Option<*const u8> { |
148 | 891k | // If we want to support vectors bigger than 256 bits, we probably |
149 | 891k | // need to move up to using a u64 for the masks used below. Currently |
150 | 891k | // they are 32 bits, which means we're SOL for vectors that need masks |
151 | 891k | // bigger than 32 bits. Overall unclear until there's a use case. |
152 | 891k | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
153 | | |
154 | 891k | let topos = V::Mask::first_offset; |
155 | 891k | let len = end.distance(start); |
156 | 891k | debug_assert!( |
157 | 0 | len >= V::BYTES, |
158 | 0 | "haystack has length {}, but must be at least {}", |
159 | | len, |
160 | | V::BYTES |
161 | | ); |
162 | | |
163 | | // Search a possibly unaligned chunk at `start`. This covers any part |
164 | | // of the haystack prior to where aligned loads can start. |
165 | 891k | if let Some(cur) = self.search_chunk(start, topos) { |
166 | 891k | return Some(cur); |
167 | 0 | } |
168 | 0 | // Set `cur` to the first V-aligned pointer greater than `start`. |
169 | 0 | let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); |
170 | 0 | debug_assert!(cur > start && end.sub(V::BYTES) >= start); |
171 | 0 | if len >= Self::LOOP_SIZE { |
172 | 0 | while cur <= end.sub(Self::LOOP_SIZE) { |
173 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
174 | | |
175 | 0 | let a = V::load_aligned(cur); |
176 | 0 | let b = V::load_aligned(cur.add(1 * V::BYTES)); |
177 | 0 | let c = V::load_aligned(cur.add(2 * V::BYTES)); |
178 | 0 | let d = V::load_aligned(cur.add(3 * V::BYTES)); |
179 | 0 | let eqa = self.v1.cmpeq(a); |
180 | 0 | let eqb = self.v1.cmpeq(b); |
181 | 0 | let eqc = self.v1.cmpeq(c); |
182 | 0 | let eqd = self.v1.cmpeq(d); |
183 | 0 | let or1 = eqa.or(eqb); |
184 | 0 | let or2 = eqc.or(eqd); |
185 | 0 | let or3 = or1.or(or2); |
186 | 0 | if or3.movemask_will_have_non_zero() { |
187 | 0 | let mask = eqa.movemask(); |
188 | 0 | if mask.has_non_zero() { |
189 | 0 | return Some(cur.add(topos(mask))); |
190 | 0 | } |
191 | 0 |
|
192 | 0 | let mask = eqb.movemask(); |
193 | 0 | if mask.has_non_zero() { |
194 | 0 | return Some(cur.add(1 * V::BYTES).add(topos(mask))); |
195 | 0 | } |
196 | 0 |
|
197 | 0 | let mask = eqc.movemask(); |
198 | 0 | if mask.has_non_zero() { |
199 | 0 | return Some(cur.add(2 * V::BYTES).add(topos(mask))); |
200 | 0 | } |
201 | 0 |
|
202 | 0 | let mask = eqd.movemask(); |
203 | 0 | debug_assert!(mask.has_non_zero()); |
204 | 0 | return Some(cur.add(3 * V::BYTES).add(topos(mask))); |
205 | 0 | } |
206 | 0 | cur = cur.add(Self::LOOP_SIZE); |
207 | | } |
208 | 0 | } |
209 | | // Handle any leftovers after the aligned loop above. We use unaligned |
210 | | // loads here, but I believe we are guaranteed that they are aligned |
211 | | // since `cur` is aligned. |
212 | 0 | while cur <= end.sub(V::BYTES) { |
213 | 0 | debug_assert!(end.distance(cur) >= V::BYTES); |
214 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { |
215 | 0 | return Some(cur); |
216 | 0 | } |
217 | 0 | cur = cur.add(V::BYTES); |
218 | | } |
219 | | // Finally handle any remaining bytes less than the size of V. In this |
220 | | // case, our pointer may indeed be unaligned and the load may overlap |
221 | | // with the previous one. But that's okay since we know the previous |
222 | | // load didn't lead to a match (otherwise we wouldn't be here). |
223 | 0 | if cur < end { |
224 | 0 | debug_assert!(end.distance(cur) < V::BYTES); |
225 | 0 | cur = cur.sub(V::BYTES - end.distance(cur)); |
226 | 0 | debug_assert_eq!(end.distance(cur), V::BYTES); |
227 | 0 | return self.search_chunk(cur, topos); |
228 | 0 | } |
229 | 0 | None |
230 | 891k | } <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::find_raw Line | Count | Source | 143 | 57.8k | pub(crate) unsafe fn find_raw( | 144 | 57.8k | &self, | 145 | 57.8k | start: *const u8, | 146 | 57.8k | end: *const u8, | 147 | 57.8k | ) -> Option<*const u8> { | 148 | 57.8k | // If we want to support vectors bigger than 256 bits, we probably | 149 | 57.8k | // need to move up to using a u64 for the masks used below. Currently | 150 | 57.8k | // they are 32 bits, which means we're SOL for vectors that need masks | 151 | 57.8k | // bigger than 32 bits. Overall unclear until there's a use case. | 152 | 57.8k | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); | 153 | | | 154 | 57.8k | let topos = V::Mask::first_offset; | 155 | 57.8k | let len = end.distance(start); | 156 | 57.8k | debug_assert!( | 157 | 0 | len >= V::BYTES, | 158 | 0 | "haystack has length {}, but must be at least {}", | 159 | | len, | 160 | | V::BYTES | 161 | | ); | 162 | | | 163 | | // Search a possibly unaligned chunk at `start`. This covers any part | 164 | | // of the haystack prior to where aligned loads can start. | 165 | 57.8k | if let Some(cur) = self.search_chunk(start, topos) { | 166 | 57.8k | return Some(cur); | 167 | 0 | } | 168 | 0 | // Set `cur` to the first V-aligned pointer greater than `start`. 
| 169 | 0 | let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); | 170 | 0 | debug_assert!(cur > start && end.sub(V::BYTES) >= start); | 171 | 0 | if len >= Self::LOOP_SIZE { | 172 | 0 | while cur <= end.sub(Self::LOOP_SIZE) { | 173 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); | 174 | | | 175 | 0 | let a = V::load_aligned(cur); | 176 | 0 | let b = V::load_aligned(cur.add(1 * V::BYTES)); | 177 | 0 | let c = V::load_aligned(cur.add(2 * V::BYTES)); | 178 | 0 | let d = V::load_aligned(cur.add(3 * V::BYTES)); | 179 | 0 | let eqa = self.v1.cmpeq(a); | 180 | 0 | let eqb = self.v1.cmpeq(b); | 181 | 0 | let eqc = self.v1.cmpeq(c); | 182 | 0 | let eqd = self.v1.cmpeq(d); | 183 | 0 | let or1 = eqa.or(eqb); | 184 | 0 | let or2 = eqc.or(eqd); | 185 | 0 | let or3 = or1.or(or2); | 186 | 0 | if or3.movemask_will_have_non_zero() { | 187 | 0 | let mask = eqa.movemask(); | 188 | 0 | if mask.has_non_zero() { | 189 | 0 | return Some(cur.add(topos(mask))); | 190 | 0 | } | 191 | 0 |
| 192 | 0 | let mask = eqb.movemask(); | 193 | 0 | if mask.has_non_zero() { | 194 | 0 | return Some(cur.add(1 * V::BYTES).add(topos(mask))); | 195 | 0 | } | 196 | 0 |
| 197 | 0 | let mask = eqc.movemask(); | 198 | 0 | if mask.has_non_zero() { | 199 | 0 | return Some(cur.add(2 * V::BYTES).add(topos(mask))); | 200 | 0 | } | 201 | 0 |
| 202 | 0 | let mask = eqd.movemask(); | 203 | 0 | debug_assert!(mask.has_non_zero()); | 204 | 0 | return Some(cur.add(3 * V::BYTES).add(topos(mask))); | 205 | 0 | } | 206 | 0 | cur = cur.add(Self::LOOP_SIZE); | 207 | | } | 208 | 0 | } | 209 | | // Handle any leftovers after the aligned loop above. We use unaligned | 210 | | // loads here, but I believe we are guaranteed that they are aligned | 211 | | // since `cur` is aligned. | 212 | 0 | while cur <= end.sub(V::BYTES) { | 213 | 0 | debug_assert!(end.distance(cur) >= V::BYTES); | 214 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { | 215 | 0 | return Some(cur); | 216 | 0 | } | 217 | 0 | cur = cur.add(V::BYTES); | 218 | | } | 219 | | // Finally handle any remaining bytes less than the size of V. In this | 220 | | // case, our pointer may indeed be unaligned and the load may overlap | 221 | | // with the previous one. But that's okay since we know the previous | 222 | | // load didn't lead to a match (otherwise we wouldn't be here). | 223 | 0 | if cur < end { | 224 | 0 | debug_assert!(end.distance(cur) < V::BYTES); | 225 | 0 | cur = cur.sub(V::BYTES - end.distance(cur)); | 226 | 0 | debug_assert_eq!(end.distance(cur), V::BYTES); | 227 | 0 | return self.search_chunk(cur, topos); | 228 | 0 | } | 229 | 0 | None | 230 | 57.8k | } |
<memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::find_raw Line | Count | Source | 143 | 833k | pub(crate) unsafe fn find_raw( | 144 | 833k | &self, | 145 | 833k | start: *const u8, | 146 | 833k | end: *const u8, | 147 | 833k | ) -> Option<*const u8> { | 148 | 833k | // If we want to support vectors bigger than 256 bits, we probably | 149 | 833k | // need to move up to using a u64 for the masks used below. Currently | 150 | 833k | // they are 32 bits, which means we're SOL for vectors that need masks | 151 | 833k | // bigger than 32 bits. Overall unclear until there's a use case. | 152 | 833k | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); | 153 | | | 154 | 833k | let topos = V::Mask::first_offset; | 155 | 833k | let len = end.distance(start); | 156 | 833k | debug_assert!( | 157 | 0 | len >= V::BYTES, | 158 | 0 | "haystack has length {}, but must be at least {}", | 159 | | len, | 160 | | V::BYTES | 161 | | ); | 162 | | | 163 | | // Search a possibly unaligned chunk at `start`. This covers any part | 164 | | // of the haystack prior to where aligned loads can start. | 165 | 833k | if let Some(cur) = self.search_chunk(start, topos) { | 166 | 833k | return Some(cur); | 167 | 0 | } | 168 | 0 | // Set `cur` to the first V-aligned pointer greater than `start`. 
| 169 | 0 | let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); | 170 | 0 | debug_assert!(cur > start && end.sub(V::BYTES) >= start); | 171 | 0 | if len >= Self::LOOP_SIZE { | 172 | 0 | while cur <= end.sub(Self::LOOP_SIZE) { | 173 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); | 174 | | | 175 | 0 | let a = V::load_aligned(cur); | 176 | 0 | let b = V::load_aligned(cur.add(1 * V::BYTES)); | 177 | 0 | let c = V::load_aligned(cur.add(2 * V::BYTES)); | 178 | 0 | let d = V::load_aligned(cur.add(3 * V::BYTES)); | 179 | 0 | let eqa = self.v1.cmpeq(a); | 180 | 0 | let eqb = self.v1.cmpeq(b); | 181 | 0 | let eqc = self.v1.cmpeq(c); | 182 | 0 | let eqd = self.v1.cmpeq(d); | 183 | 0 | let or1 = eqa.or(eqb); | 184 | 0 | let or2 = eqc.or(eqd); | 185 | 0 | let or3 = or1.or(or2); | 186 | 0 | if or3.movemask_will_have_non_zero() { | 187 | 0 | let mask = eqa.movemask(); | 188 | 0 | if mask.has_non_zero() { | 189 | 0 | return Some(cur.add(topos(mask))); | 190 | 0 | } | 191 | 0 |
| 192 | 0 | let mask = eqb.movemask(); | 193 | 0 | if mask.has_non_zero() { | 194 | 0 | return Some(cur.add(1 * V::BYTES).add(topos(mask))); | 195 | 0 | } | 196 | 0 |
| 197 | 0 | let mask = eqc.movemask(); | 198 | 0 | if mask.has_non_zero() { | 199 | 0 | return Some(cur.add(2 * V::BYTES).add(topos(mask))); | 200 | 0 | } | 201 | 0 |
| 202 | 0 | let mask = eqd.movemask(); | 203 | 0 | debug_assert!(mask.has_non_zero()); | 204 | 0 | return Some(cur.add(3 * V::BYTES).add(topos(mask))); | 205 | 0 | } | 206 | 0 | cur = cur.add(Self::LOOP_SIZE); | 207 | | } | 208 | 0 | } | 209 | | // Handle any leftovers after the aligned loop above. We use unaligned | 210 | | // loads here, but I believe we are guaranteed that they are aligned | 211 | | // since `cur` is aligned. | 212 | 0 | while cur <= end.sub(V::BYTES) { | 213 | 0 | debug_assert!(end.distance(cur) >= V::BYTES); | 214 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { | 215 | 0 | return Some(cur); | 216 | 0 | } | 217 | 0 | cur = cur.add(V::BYTES); | 218 | | } | 219 | | // Finally handle any remaining bytes less than the size of V. In this | 220 | | // case, our pointer may indeed be unaligned and the load may overlap | 221 | | // with the previous one. But that's okay since we know the previous | 222 | | // load didn't lead to a match (otherwise we wouldn't be here). | 223 | 0 | if cur < end { | 224 | 0 | debug_assert!(end.distance(cur) < V::BYTES); | 225 | 0 | cur = cur.sub(V::BYTES - end.distance(cur)); | 226 | 0 | debug_assert_eq!(end.distance(cur), V::BYTES); | 227 | 0 | return self.search_chunk(cur, topos); | 228 | 0 | } | 229 | 0 | None | 230 | 833k | } |
|
231 | | |
232 | | /// Return a pointer to the last occurrence of the needle in the given |
233 | | /// haystack. If no such occurrence exists, then `None` is returned. |
234 | | /// |
235 | | /// When a match is found, the pointer returned is guaranteed to be |
236 | | /// `>= start` and `< end`. |
237 | | /// |
238 | | /// # Safety |
239 | | /// |
240 | | /// * It must be the case that `start < end` and that the distance between |
241 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
242 | | /// to do at least an unaligned load of `V` at `start`. |
243 | | /// * Both `start` and `end` must be valid for reads. |
244 | | /// * Both `start` and `end` must point to an initialized value. |
245 | | /// * Both `start` and `end` must point to the same allocated object and |
246 | | /// must either be in bounds or at most one byte past the end of the |
247 | | /// allocated object. |
248 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
249 | | /// object. |
250 | | /// * The distance between `start` and `end` must not overflow `isize`. |
251 | | /// * The distance being in bounds must not rely on "wrapping around" the |
252 | | /// address space. |
253 | | #[inline(always)] |
254 | 0 | pub(crate) unsafe fn rfind_raw( |
255 | 0 | &self, |
256 | 0 | start: *const u8, |
257 | 0 | end: *const u8, |
258 | 0 | ) -> Option<*const u8> { |
259 | 0 | // If we want to support vectors bigger than 256 bits, we probably |
260 | 0 | // need to move up to using a u64 for the masks used below. Currently |
261 | 0 | // they are 32 bits, which means we're SOL for vectors that need masks |
262 | 0 | // bigger than 32 bits. Overall unclear until there's a use case. |
263 | 0 | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
264 | | |
265 | 0 | let topos = V::Mask::last_offset; |
266 | 0 | let len = end.distance(start); |
267 | 0 | debug_assert!( |
268 | 0 | len >= V::BYTES, |
269 | 0 | "haystack has length {}, but must be at least {}", |
270 | | len, |
271 | | V::BYTES |
272 | | ); |
273 | | |
274 | 0 | if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) { |
275 | 0 | return Some(cur); |
276 | 0 | } |
277 | 0 | let mut cur = end.sub(end.as_usize() & V::ALIGN); |
278 | 0 | debug_assert!(start <= cur && cur <= end); |
279 | 0 | if len >= Self::LOOP_SIZE { |
280 | 0 | while cur >= start.add(Self::LOOP_SIZE) { |
281 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
282 | | |
283 | 0 | cur = cur.sub(Self::LOOP_SIZE); |
284 | 0 | let a = V::load_aligned(cur); |
285 | 0 | let b = V::load_aligned(cur.add(1 * V::BYTES)); |
286 | 0 | let c = V::load_aligned(cur.add(2 * V::BYTES)); |
287 | 0 | let d = V::load_aligned(cur.add(3 * V::BYTES)); |
288 | 0 | let eqa = self.v1.cmpeq(a); |
289 | 0 | let eqb = self.v1.cmpeq(b); |
290 | 0 | let eqc = self.v1.cmpeq(c); |
291 | 0 | let eqd = self.v1.cmpeq(d); |
292 | 0 | let or1 = eqa.or(eqb); |
293 | 0 | let or2 = eqc.or(eqd); |
294 | 0 | let or3 = or1.or(or2); |
295 | 0 | if or3.movemask_will_have_non_zero() { |
296 | 0 | let mask = eqd.movemask(); |
297 | 0 | if mask.has_non_zero() { |
298 | 0 | return Some(cur.add(3 * V::BYTES).add(topos(mask))); |
299 | 0 | } |
300 | 0 |
|
301 | 0 | let mask = eqc.movemask(); |
302 | 0 | if mask.has_non_zero() { |
303 | 0 | return Some(cur.add(2 * V::BYTES).add(topos(mask))); |
304 | 0 | } |
305 | 0 |
|
306 | 0 | let mask = eqb.movemask(); |
307 | 0 | if mask.has_non_zero() { |
308 | 0 | return Some(cur.add(1 * V::BYTES).add(topos(mask))); |
309 | 0 | } |
310 | 0 |
|
311 | 0 | let mask = eqa.movemask(); |
312 | 0 | debug_assert!(mask.has_non_zero()); |
313 | 0 | return Some(cur.add(topos(mask))); |
314 | 0 | } |
315 | | } |
316 | 0 | } |
317 | 0 | while cur >= start.add(V::BYTES) { |
318 | 0 | debug_assert!(cur.distance(start) >= V::BYTES); |
319 | 0 | cur = cur.sub(V::BYTES); |
320 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { |
321 | 0 | return Some(cur); |
322 | 0 | } |
323 | | } |
324 | 0 | if cur > start { |
325 | 0 | debug_assert!(cur.distance(start) < V::BYTES); |
326 | 0 | return self.search_chunk(start, topos); |
327 | 0 | } |
328 | 0 | None |
329 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::rfind_raw Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::rfind_raw |
330 | | |
331 | | /// Return a count of all matching bytes in the given haystack. |
332 | | /// |
333 | | /// # Safety |
334 | | /// |
335 | | /// * It must be the case that `start < end` and that the distance between |
336 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
337 | | /// to do at least an unaligned load of `V` at `start`. |
338 | | /// * Both `start` and `end` must be valid for reads. |
339 | | /// * Both `start` and `end` must point to an initialized value. |
340 | | /// * Both `start` and `end` must point to the same allocated object and |
341 | | /// must either be in bounds or at most one byte past the end of the |
342 | | /// allocated object. |
343 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
344 | | /// object. |
345 | | /// * The distance between `start` and `end` must not overflow `isize`. |
346 | | /// * The distance being in bounds must not rely on "wrapping around" the |
347 | | /// address space. |
348 | | #[inline(always)] |
349 | 0 | pub(crate) unsafe fn count_raw( |
350 | 0 | &self, |
351 | 0 | start: *const u8, |
352 | 0 | end: *const u8, |
353 | 0 | ) -> usize { |
354 | 0 | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
355 | | |
356 | 0 | let confirm = |b| b == self.needle1(); Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::count_raw::{closure#0}Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::count_raw::{closure#0} |
357 | 0 | let len = end.distance(start); |
358 | 0 | debug_assert!( |
359 | 0 | len >= V::BYTES, |
360 | 0 | "haystack has length {}, but must be at least {}", |
361 | | len, |
362 | | V::BYTES |
363 | | ); |
364 | | |
365 | | // Set `cur` to the first V-aligned pointer greater than `start`. |
366 | 0 | let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); |
367 | 0 | // Count any matching bytes before we start our aligned loop. |
368 | 0 | let mut count = count_byte_by_byte(start, cur, confirm); |
369 | 0 | debug_assert!(cur > start && end.sub(V::BYTES) >= start); |
370 | 0 | if len >= Self::LOOP_SIZE { |
371 | 0 | while cur <= end.sub(Self::LOOP_SIZE) { |
372 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
373 | | |
374 | 0 | let a = V::load_aligned(cur); |
375 | 0 | let b = V::load_aligned(cur.add(1 * V::BYTES)); |
376 | 0 | let c = V::load_aligned(cur.add(2 * V::BYTES)); |
377 | 0 | let d = V::load_aligned(cur.add(3 * V::BYTES)); |
378 | 0 | let eqa = self.v1.cmpeq(a); |
379 | 0 | let eqb = self.v1.cmpeq(b); |
380 | 0 | let eqc = self.v1.cmpeq(c); |
381 | 0 | let eqd = self.v1.cmpeq(d); |
382 | 0 | count += eqa.movemask().count_ones(); |
383 | 0 | count += eqb.movemask().count_ones(); |
384 | 0 | count += eqc.movemask().count_ones(); |
385 | 0 | count += eqd.movemask().count_ones(); |
386 | 0 | cur = cur.add(Self::LOOP_SIZE); |
387 | | } |
388 | 0 | } |
389 | | // Handle any leftovers after the aligned loop above. We use unaligned |
390 | | // loads here, but I believe we are guaranteed that they are aligned |
391 | | // since `cur` is aligned. |
392 | 0 | while cur <= end.sub(V::BYTES) { |
393 | 0 | debug_assert!(end.distance(cur) >= V::BYTES); |
394 | 0 | let chunk = V::load_unaligned(cur); |
395 | 0 | count += self.v1.cmpeq(chunk).movemask().count_ones(); |
396 | 0 | cur = cur.add(V::BYTES); |
397 | | } |
398 | | // And finally count any leftovers that weren't caught above. |
399 | 0 | count += count_byte_by_byte(cur, end, confirm); |
400 | 0 | count |
401 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::count_raw Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::count_raw |
402 | | |
403 | | /// Search `V::BYTES` starting at `cur` via an unaligned load. |
404 | | /// |
405 | | /// `mask_to_offset` should be a function that converts a `movemask` to |
406 | | /// an offset such that `cur.add(offset)` corresponds to a pointer to the |
407 | | /// match location if one is found. Generally it is expected to use either |
408 | | /// `mask_to_first_offset` or `mask_to_last_offset`, depending on whether |
409 | | /// one is implementing a forward or reverse search, respectively. |
410 | | /// |
411 | | /// # Safety |
412 | | /// |
413 | | /// `cur` must be a valid pointer and it must be valid to do an unaligned |
414 | | /// load of size `V::BYTES` at `cur`. |
415 | | #[inline(always)] |
416 | 891k | unsafe fn search_chunk( |
417 | 891k | &self, |
418 | 891k | cur: *const u8, |
419 | 891k | mask_to_offset: impl Fn(V::Mask) -> usize, |
420 | 891k | ) -> Option<*const u8> { |
421 | 891k | let chunk = V::load_unaligned(cur); |
422 | 891k | let mask = self.v1.cmpeq(chunk).movemask(); |
423 | 891k | if mask.has_non_zero() { |
424 | 891k | Some(cur.add(mask_to_offset(mask))) |
425 | | } else { |
426 | 0 | None |
427 | | } |
428 | 891k | } Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::last_offset> <memchr::arch::generic::memchr::One<core::core_arch::x86::__m128i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::first_offset> Line | Count | Source | 416 | 57.8k | unsafe fn search_chunk( | 417 | 57.8k | &self, | 418 | 57.8k | cur: *const u8, | 419 | 57.8k | mask_to_offset: impl Fn(V::Mask) -> usize, | 420 | 57.8k | ) -> Option<*const u8> { | 421 | 57.8k | let chunk = V::load_unaligned(cur); | 422 | 57.8k | let mask = self.v1.cmpeq(chunk).movemask(); | 423 | 57.8k | if mask.has_non_zero() { | 424 | 57.8k | Some(cur.add(mask_to_offset(mask))) | 425 | | } else { | 426 | 0 | None | 427 | | } | 428 | 57.8k | } |
Unexecuted instantiation: <memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::last_offset> <memchr::arch::generic::memchr::One<core::core_arch::x86::__m256i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::first_offset> Line | Count | Source | 416 | 833k | unsafe fn search_chunk( | 417 | 833k | &self, | 418 | 833k | cur: *const u8, | 419 | 833k | mask_to_offset: impl Fn(V::Mask) -> usize, | 420 | 833k | ) -> Option<*const u8> { | 421 | 833k | let chunk = V::load_unaligned(cur); | 422 | 833k | let mask = self.v1.cmpeq(chunk).movemask(); | 423 | 833k | if mask.has_non_zero() { | 424 | 833k | Some(cur.add(mask_to_offset(mask))) | 425 | | } else { | 426 | 0 | None | 427 | | } | 428 | 833k | } |
|
429 | | } |
430 | | |
431 | | /// Finds all occurrences of two bytes in a haystack. |
432 | | /// |
433 | | /// That is, this reports matches of one of two possible bytes. For example, |
434 | | /// searching for `a` or `b` in `afoobar` would report matches at offsets `0`, |
435 | | /// `4` and `5`. |
436 | | #[derive(Clone, Copy, Debug)] |
437 | | pub(crate) struct Two<V> { |
438 | | s1: u8, |
439 | | s2: u8, |
440 | | v1: V, |
441 | | v2: V, |
442 | | } |
443 | | |
444 | | impl<V: Vector> Two<V> { |
445 | | /// The number of bytes we examine per each iteration of our search loop. |
446 | | const LOOP_SIZE: usize = 2 * V::BYTES; |
447 | | |
448 | | /// Create a new searcher that finds occurrences of the byte given. |
449 | | #[inline(always)] |
450 | 0 | pub(crate) unsafe fn new(needle1: u8, needle2: u8) -> Two<V> { |
451 | 0 | Two { |
452 | 0 | s1: needle1, |
453 | 0 | s2: needle2, |
454 | 0 | v1: V::splat(needle1), |
455 | 0 | v2: V::splat(needle2), |
456 | 0 | } |
457 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m128i>>::new Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m256i>>::new |
458 | | |
459 | | /// Returns the first needle given to `Two::new`. |
460 | | #[inline(always)] |
461 | 0 | pub(crate) fn needle1(&self) -> u8 { |
462 | 0 | self.s1 |
463 | 0 | } |
464 | | |
465 | | /// Returns the second needle given to `Two::new`. |
466 | | #[inline(always)] |
467 | 0 | pub(crate) fn needle2(&self) -> u8 { |
468 | 0 | self.s2 |
469 | 0 | } |
470 | | |
471 | | /// Return a pointer to the first occurrence of one of the needles in the |
472 | | /// given haystack. If no such occurrence exists, then `None` is returned. |
473 | | /// |
474 | | /// When a match is found, the pointer returned is guaranteed to be |
475 | | /// `>= start` and `< end`. |
476 | | /// |
477 | | /// # Safety |
478 | | /// |
479 | | /// * It must be the case that `start < end` and that the distance between |
480 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
481 | | /// to do at least an unaligned load of `V` at `start`. |
482 | | /// * Both `start` and `end` must be valid for reads. |
483 | | /// * Both `start` and `end` must point to an initialized value. |
484 | | /// * Both `start` and `end` must point to the same allocated object and |
485 | | /// must either be in bounds or at most one byte past the end of the |
486 | | /// allocated object. |
487 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
488 | | /// object. |
489 | | /// * The distance between `start` and `end` must not overflow `isize`. |
490 | | /// * The distance being in bounds must not rely on "wrapping around" the |
491 | | /// address space. |
492 | | #[inline(always)] |
493 | 0 | pub(crate) unsafe fn find_raw( |
494 | 0 | &self, |
495 | 0 | start: *const u8, |
496 | 0 | end: *const u8, |
497 | 0 | ) -> Option<*const u8> { |
498 | 0 | // If we want to support vectors bigger than 256 bits, we probably |
499 | 0 | // need to move up to using a u64 for the masks used below. Currently |
500 | 0 | // they are 32 bits, which means we're SOL for vectors that need masks |
501 | 0 | // bigger than 32 bits. Overall unclear until there's a use case. |
502 | 0 | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
503 | | |
504 | 0 | let topos = V::Mask::first_offset; |
505 | 0 | let len = end.distance(start); |
506 | 0 | debug_assert!( |
507 | 0 | len >= V::BYTES, |
508 | 0 | "haystack has length {}, but must be at least {}", |
509 | | len, |
510 | | V::BYTES |
511 | | ); |
512 | | |
513 | | // Search a possibly unaligned chunk at `start`. This covers any part |
514 | | // of the haystack prior to where aligned loads can start. |
515 | 0 | if let Some(cur) = self.search_chunk(start, topos) { |
516 | 0 | return Some(cur); |
517 | 0 | } |
518 | 0 | // Set `cur` to the first V-aligned pointer greater than `start`. |
519 | 0 | let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); |
520 | 0 | debug_assert!(cur > start && end.sub(V::BYTES) >= start); |
521 | 0 | if len >= Self::LOOP_SIZE { |
522 | 0 | while cur <= end.sub(Self::LOOP_SIZE) { |
523 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
524 | | |
525 | 0 | let a = V::load_aligned(cur); |
526 | 0 | let b = V::load_aligned(cur.add(V::BYTES)); |
527 | 0 | let eqa1 = self.v1.cmpeq(a); |
528 | 0 | let eqb1 = self.v1.cmpeq(b); |
529 | 0 | let eqa2 = self.v2.cmpeq(a); |
530 | 0 | let eqb2 = self.v2.cmpeq(b); |
531 | 0 | let or1 = eqa1.or(eqb1); |
532 | 0 | let or2 = eqa2.or(eqb2); |
533 | 0 | let or3 = or1.or(or2); |
534 | 0 | if or3.movemask_will_have_non_zero() { |
535 | 0 | let mask = eqa1.movemask().or(eqa2.movemask()); |
536 | 0 | if mask.has_non_zero() { |
537 | 0 | return Some(cur.add(topos(mask))); |
538 | 0 | } |
539 | 0 |
|
540 | 0 | let mask = eqb1.movemask().or(eqb2.movemask()); |
541 | 0 | debug_assert!(mask.has_non_zero()); |
542 | 0 | return Some(cur.add(V::BYTES).add(topos(mask))); |
543 | 0 | } |
544 | 0 | cur = cur.add(Self::LOOP_SIZE); |
545 | | } |
546 | 0 | } |
547 | | // Handle any leftovers after the aligned loop above. We use unaligned |
548 | | // loads here, but I believe we are guaranteed that they are aligned |
549 | | // since `cur` is aligned. |
550 | 0 | while cur <= end.sub(V::BYTES) { |
551 | 0 | debug_assert!(end.distance(cur) >= V::BYTES); |
552 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { |
553 | 0 | return Some(cur); |
554 | 0 | } |
555 | 0 | cur = cur.add(V::BYTES); |
556 | | } |
557 | | // Finally handle any remaining bytes less than the size of V. In this |
558 | | // case, our pointer may indeed be unaligned and the load may overlap |
559 | | // with the previous one. But that's okay since we know the previous |
560 | | // load didn't lead to a match (otherwise we wouldn't be here). |
561 | 0 | if cur < end { |
562 | 0 | debug_assert!(end.distance(cur) < V::BYTES); |
563 | 0 | cur = cur.sub(V::BYTES - end.distance(cur)); |
564 | 0 | debug_assert_eq!(end.distance(cur), V::BYTES); |
565 | 0 | return self.search_chunk(cur, topos); |
566 | 0 | } |
567 | 0 | None |
568 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m128i>>::find_raw Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m256i>>::find_raw |
569 | | |
570 | | /// Return a pointer to the last occurrence of the needle in the given |
571 | | /// haystack. If no such occurrence exists, then `None` is returned. |
572 | | /// |
573 | | /// When a match is found, the pointer returned is guaranteed to be |
574 | | /// `>= start` and `< end`. |
575 | | /// |
576 | | /// # Safety |
577 | | /// |
578 | | /// * It must be the case that `start < end` and that the distance between |
579 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
580 | | /// to do at least an unaligned load of `V` at `start`. |
581 | | /// * Both `start` and `end` must be valid for reads. |
582 | | /// * Both `start` and `end` must point to an initialized value. |
583 | | /// * Both `start` and `end` must point to the same allocated object and |
584 | | /// must either be in bounds or at most one byte past the end of the |
585 | | /// allocated object. |
586 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
587 | | /// object. |
588 | | /// * The distance between `start` and `end` must not overflow `isize`. |
589 | | /// * The distance being in bounds must not rely on "wrapping around" the |
590 | | /// address space. |
591 | | #[inline(always)] |
592 | 0 | pub(crate) unsafe fn rfind_raw( |
593 | 0 | &self, |
594 | 0 | start: *const u8, |
595 | 0 | end: *const u8, |
596 | 0 | ) -> Option<*const u8> { |
597 | 0 | // If we want to support vectors bigger than 256 bits, we probably |
598 | 0 | // need to move up to using a u64 for the masks used below. Currently |
599 | 0 | // they are 32 bits, which means we're SOL for vectors that need masks |
600 | 0 | // bigger than 32 bits. Overall unclear until there's a use case. |
601 | 0 | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
602 | | |
603 | 0 | let topos = V::Mask::last_offset; |
604 | 0 | let len = end.distance(start); |
605 | 0 | debug_assert!( |
606 | 0 | len >= V::BYTES, |
607 | 0 | "haystack has length {}, but must be at least {}", |
608 | | len, |
609 | | V::BYTES |
610 | | ); |
611 | | |
612 | 0 | if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) { |
613 | 0 | return Some(cur); |
614 | 0 | } |
615 | 0 | let mut cur = end.sub(end.as_usize() & V::ALIGN); |
616 | 0 | debug_assert!(start <= cur && cur <= end); |
617 | 0 | if len >= Self::LOOP_SIZE { |
618 | 0 | while cur >= start.add(Self::LOOP_SIZE) { |
619 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
620 | | |
621 | 0 | cur = cur.sub(Self::LOOP_SIZE); |
622 | 0 | let a = V::load_aligned(cur); |
623 | 0 | let b = V::load_aligned(cur.add(V::BYTES)); |
624 | 0 | let eqa1 = self.v1.cmpeq(a); |
625 | 0 | let eqb1 = self.v1.cmpeq(b); |
626 | 0 | let eqa2 = self.v2.cmpeq(a); |
627 | 0 | let eqb2 = self.v2.cmpeq(b); |
628 | 0 | let or1 = eqa1.or(eqb1); |
629 | 0 | let or2 = eqa2.or(eqb2); |
630 | 0 | let or3 = or1.or(or2); |
631 | 0 | if or3.movemask_will_have_non_zero() { |
632 | 0 | let mask = eqb1.movemask().or(eqb2.movemask()); |
633 | 0 | if mask.has_non_zero() { |
634 | 0 | return Some(cur.add(V::BYTES).add(topos(mask))); |
635 | 0 | } |
636 | 0 |
|
637 | 0 | let mask = eqa1.movemask().or(eqa2.movemask()); |
638 | 0 | debug_assert!(mask.has_non_zero()); |
639 | 0 | return Some(cur.add(topos(mask))); |
640 | 0 | } |
641 | | } |
642 | 0 | } |
643 | 0 | while cur >= start.add(V::BYTES) { |
644 | 0 | debug_assert!(cur.distance(start) >= V::BYTES); |
645 | 0 | cur = cur.sub(V::BYTES); |
646 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { |
647 | 0 | return Some(cur); |
648 | 0 | } |
649 | | } |
650 | 0 | if cur > start { |
651 | 0 | debug_assert!(cur.distance(start) < V::BYTES); |
652 | 0 | return self.search_chunk(start, topos); |
653 | 0 | } |
654 | 0 | None |
655 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m128i>>::rfind_raw Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m256i>>::rfind_raw |
656 | | |
657 | | /// Search `V::BYTES` starting at `cur` via an unaligned load. |
658 | | /// |
659 | | /// `mask_to_offset` should be a function that converts a `movemask` to |
660 | | /// an offset such that `cur.add(offset)` corresponds to a pointer to the |
661 | | /// match location if one is found. Generally it is expected to use either |
662 | | /// `mask_to_first_offset` or `mask_to_last_offset`, depending on whether |
663 | | /// one is implementing a forward or reverse search, respectively. |
664 | | /// |
665 | | /// # Safety |
666 | | /// |
667 | | /// `cur` must be a valid pointer and it must be valid to do an unaligned |
668 | | /// load of size `V::BYTES` at `cur`. |
669 | | #[inline(always)] |
670 | 0 | unsafe fn search_chunk( |
671 | 0 | &self, |
672 | 0 | cur: *const u8, |
673 | 0 | mask_to_offset: impl Fn(V::Mask) -> usize, |
674 | 0 | ) -> Option<*const u8> { |
675 | 0 | let chunk = V::load_unaligned(cur); |
676 | 0 | let eq1 = self.v1.cmpeq(chunk); |
677 | 0 | let eq2 = self.v2.cmpeq(chunk); |
678 | 0 | let mask = eq1.or(eq2).movemask(); |
679 | 0 | if mask.has_non_zero() { |
680 | 0 | let mask1 = eq1.movemask(); |
681 | 0 | let mask2 = eq2.movemask(); |
682 | 0 | Some(cur.add(mask_to_offset(mask1.or(mask2)))) |
683 | | } else { |
684 | 0 | None |
685 | | } |
686 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m128i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::last_offset> Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m128i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::first_offset> Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m256i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::last_offset> Unexecuted instantiation: <memchr::arch::generic::memchr::Two<core::core_arch::x86::__m256i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::first_offset> |
687 | | } |
688 | | |
689 | | /// Finds all occurrences of two bytes in a haystack. |
690 | | /// |
691 | | /// That is, this reports matches of one of two possible bytes. For example, |
692 | | /// searching for `a` or `b` in `afoobar` would report matches at offsets `0`, |
693 | | /// `4` and `5`. |
694 | | #[derive(Clone, Copy, Debug)] |
695 | | pub(crate) struct Three<V> { |
696 | | s1: u8, |
697 | | s2: u8, |
698 | | s3: u8, |
699 | | v1: V, |
700 | | v2: V, |
701 | | v3: V, |
702 | | } |
703 | | |
704 | | impl<V: Vector> Three<V> { |
705 | | /// The number of bytes we examine per each iteration of our search loop. |
706 | | const LOOP_SIZE: usize = 2 * V::BYTES; |
707 | | |
708 | | /// Create a new searcher that finds occurrences of the byte given. |
709 | | #[inline(always)] |
710 | 0 | pub(crate) unsafe fn new( |
711 | 0 | needle1: u8, |
712 | 0 | needle2: u8, |
713 | 0 | needle3: u8, |
714 | 0 | ) -> Three<V> { |
715 | 0 | Three { |
716 | 0 | s1: needle1, |
717 | 0 | s2: needle2, |
718 | 0 | s3: needle3, |
719 | 0 | v1: V::splat(needle1), |
720 | 0 | v2: V::splat(needle2), |
721 | 0 | v3: V::splat(needle3), |
722 | 0 | } |
723 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m128i>>::new Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m256i>>::new |
724 | | |
725 | | /// Returns the first needle given to `Three::new`. |
726 | | #[inline(always)] |
727 | 0 | pub(crate) fn needle1(&self) -> u8 { |
728 | 0 | self.s1 |
729 | 0 | } |
730 | | |
731 | | /// Returns the second needle given to `Three::new`. |
732 | | #[inline(always)] |
733 | 0 | pub(crate) fn needle2(&self) -> u8 { |
734 | 0 | self.s2 |
735 | 0 | } |
736 | | |
737 | | /// Returns the third needle given to `Three::new`. |
738 | | #[inline(always)] |
739 | 0 | pub(crate) fn needle3(&self) -> u8 { |
740 | 0 | self.s3 |
741 | 0 | } |
742 | | |
743 | | /// Return a pointer to the first occurrence of one of the needles in the |
744 | | /// given haystack. If no such occurrence exists, then `None` is returned. |
745 | | /// |
746 | | /// When a match is found, the pointer returned is guaranteed to be |
747 | | /// `>= start` and `< end`. |
748 | | /// |
749 | | /// # Safety |
750 | | /// |
751 | | /// * It must be the case that `start < end` and that the distance between |
752 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
753 | | /// to do at least an unaligned load of `V` at `start`. |
754 | | /// * Both `start` and `end` must be valid for reads. |
755 | | /// * Both `start` and `end` must point to an initialized value. |
756 | | /// * Both `start` and `end` must point to the same allocated object and |
757 | | /// must either be in bounds or at most one byte past the end of the |
758 | | /// allocated object. |
759 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
760 | | /// object. |
761 | | /// * The distance between `start` and `end` must not overflow `isize`. |
762 | | /// * The distance being in bounds must not rely on "wrapping around" the |
763 | | /// address space. |
764 | | #[inline(always)] |
765 | 0 | pub(crate) unsafe fn find_raw( |
766 | 0 | &self, |
767 | 0 | start: *const u8, |
768 | 0 | end: *const u8, |
769 | 0 | ) -> Option<*const u8> { |
770 | 0 | // If we want to support vectors bigger than 256 bits, we probably |
771 | 0 | // need to move up to using a u64 for the masks used below. Currently |
772 | 0 | // they are 32 bits, which means we're SOL for vectors that need masks |
773 | 0 | // bigger than 32 bits. Overall unclear until there's a use case. |
774 | 0 | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
775 | | |
776 | 0 | let topos = V::Mask::first_offset; |
777 | 0 | let len = end.distance(start); |
778 | 0 | debug_assert!( |
779 | 0 | len >= V::BYTES, |
780 | 0 | "haystack has length {}, but must be at least {}", |
781 | | len, |
782 | | V::BYTES |
783 | | ); |
784 | | |
785 | | // Search a possibly unaligned chunk at `start`. This covers any part |
786 | | // of the haystack prior to where aligned loads can start. |
787 | 0 | if let Some(cur) = self.search_chunk(start, topos) { |
788 | 0 | return Some(cur); |
789 | 0 | } |
790 | 0 | // Set `cur` to the first V-aligned pointer greater than `start`. |
791 | 0 | let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); |
792 | 0 | debug_assert!(cur > start && end.sub(V::BYTES) >= start); |
793 | 0 | if len >= Self::LOOP_SIZE { |
794 | 0 | while cur <= end.sub(Self::LOOP_SIZE) { |
795 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
796 | | |
797 | 0 | let a = V::load_aligned(cur); |
798 | 0 | let b = V::load_aligned(cur.add(V::BYTES)); |
799 | 0 | let eqa1 = self.v1.cmpeq(a); |
800 | 0 | let eqb1 = self.v1.cmpeq(b); |
801 | 0 | let eqa2 = self.v2.cmpeq(a); |
802 | 0 | let eqb2 = self.v2.cmpeq(b); |
803 | 0 | let eqa3 = self.v3.cmpeq(a); |
804 | 0 | let eqb3 = self.v3.cmpeq(b); |
805 | 0 | let or1 = eqa1.or(eqb1); |
806 | 0 | let or2 = eqa2.or(eqb2); |
807 | 0 | let or3 = eqa3.or(eqb3); |
808 | 0 | let or4 = or1.or(or2); |
809 | 0 | let or5 = or3.or(or4); |
810 | 0 | if or5.movemask_will_have_non_zero() { |
811 | 0 | let mask = eqa1 |
812 | 0 | .movemask() |
813 | 0 | .or(eqa2.movemask()) |
814 | 0 | .or(eqa3.movemask()); |
815 | 0 | if mask.has_non_zero() { |
816 | 0 | return Some(cur.add(topos(mask))); |
817 | 0 | } |
818 | 0 |
|
819 | 0 | let mask = eqb1 |
820 | 0 | .movemask() |
821 | 0 | .or(eqb2.movemask()) |
822 | 0 | .or(eqb3.movemask()); |
823 | 0 | debug_assert!(mask.has_non_zero()); |
824 | 0 | return Some(cur.add(V::BYTES).add(topos(mask))); |
825 | 0 | } |
826 | 0 | cur = cur.add(Self::LOOP_SIZE); |
827 | | } |
828 | 0 | } |
829 | | // Handle any leftovers after the aligned loop above. We use unaligned |
830 | | // loads here, but I believe we are guaranteed that they are aligned |
831 | | // since `cur` is aligned. |
832 | 0 | while cur <= end.sub(V::BYTES) { |
833 | 0 | debug_assert!(end.distance(cur) >= V::BYTES); |
834 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { |
835 | 0 | return Some(cur); |
836 | 0 | } |
837 | 0 | cur = cur.add(V::BYTES); |
838 | | } |
839 | | // Finally handle any remaining bytes less than the size of V. In this |
840 | | // case, our pointer may indeed be unaligned and the load may overlap |
841 | | // with the previous one. But that's okay since we know the previous |
842 | | // load didn't lead to a match (otherwise we wouldn't be here). |
843 | 0 | if cur < end { |
844 | 0 | debug_assert!(end.distance(cur) < V::BYTES); |
845 | 0 | cur = cur.sub(V::BYTES - end.distance(cur)); |
846 | 0 | debug_assert_eq!(end.distance(cur), V::BYTES); |
847 | 0 | return self.search_chunk(cur, topos); |
848 | 0 | } |
849 | 0 | None |
850 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m128i>>::find_raw Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m256i>>::find_raw |
851 | | |
852 | | /// Return a pointer to the last occurrence of the needle in the given |
853 | | /// haystack. If no such occurrence exists, then `None` is returned. |
854 | | /// |
855 | | /// When a match is found, the pointer returned is guaranteed to be |
856 | | /// `>= start` and `< end`. |
857 | | /// |
858 | | /// # Safety |
859 | | /// |
860 | | /// * It must be the case that `start < end` and that the distance between |
861 | | /// them is at least equal to `V::BYTES`. That is, it must always be valid |
862 | | /// to do at least an unaligned load of `V` at `start`. |
863 | | /// * Both `start` and `end` must be valid for reads. |
864 | | /// * Both `start` and `end` must point to an initialized value. |
865 | | /// * Both `start` and `end` must point to the same allocated object and |
866 | | /// must either be in bounds or at most one byte past the end of the |
867 | | /// allocated object. |
868 | | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
869 | | /// object. |
870 | | /// * The distance between `start` and `end` must not overflow `isize`. |
871 | | /// * The distance being in bounds must not rely on "wrapping around" the |
872 | | /// address space. |
873 | | #[inline(always)] |
874 | 0 | pub(crate) unsafe fn rfind_raw( |
875 | 0 | &self, |
876 | 0 | start: *const u8, |
877 | 0 | end: *const u8, |
878 | 0 | ) -> Option<*const u8> { |
879 | 0 | // If we want to support vectors bigger than 256 bits, we probably |
880 | 0 | // need to move up to using a u64 for the masks used below. Currently |
881 | 0 | // they are 32 bits, which means we're SOL for vectors that need masks |
882 | 0 | // bigger than 32 bits. Overall unclear until there's a use case. |
883 | 0 | debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes"); |
884 | | |
885 | 0 | let topos = V::Mask::last_offset; |
886 | 0 | let len = end.distance(start); |
887 | 0 | debug_assert!( |
888 | 0 | len >= V::BYTES, |
889 | 0 | "haystack has length {}, but must be at least {}", |
890 | | len, |
891 | | V::BYTES |
892 | | ); |
893 | | |
894 | 0 | if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) { |
895 | 0 | return Some(cur); |
896 | 0 | } |
897 | 0 | let mut cur = end.sub(end.as_usize() & V::ALIGN); |
898 | 0 | debug_assert!(start <= cur && cur <= end); |
899 | 0 | if len >= Self::LOOP_SIZE { |
900 | 0 | while cur >= start.add(Self::LOOP_SIZE) { |
901 | 0 | debug_assert_eq!(0, cur.as_usize() % V::BYTES); |
902 | | |
903 | 0 | cur = cur.sub(Self::LOOP_SIZE); |
904 | 0 | let a = V::load_aligned(cur); |
905 | 0 | let b = V::load_aligned(cur.add(V::BYTES)); |
906 | 0 | let eqa1 = self.v1.cmpeq(a); |
907 | 0 | let eqb1 = self.v1.cmpeq(b); |
908 | 0 | let eqa2 = self.v2.cmpeq(a); |
909 | 0 | let eqb2 = self.v2.cmpeq(b); |
910 | 0 | let eqa3 = self.v3.cmpeq(a); |
911 | 0 | let eqb3 = self.v3.cmpeq(b); |
912 | 0 | let or1 = eqa1.or(eqb1); |
913 | 0 | let or2 = eqa2.or(eqb2); |
914 | 0 | let or3 = eqa3.or(eqb3); |
915 | 0 | let or4 = or1.or(or2); |
916 | 0 | let or5 = or3.or(or4); |
917 | 0 | if or5.movemask_will_have_non_zero() { |
918 | 0 | let mask = eqb1 |
919 | 0 | .movemask() |
920 | 0 | .or(eqb2.movemask()) |
921 | 0 | .or(eqb3.movemask()); |
922 | 0 | if mask.has_non_zero() { |
923 | 0 | return Some(cur.add(V::BYTES).add(topos(mask))); |
924 | 0 | } |
925 | 0 |
|
926 | 0 | let mask = eqa1 |
927 | 0 | .movemask() |
928 | 0 | .or(eqa2.movemask()) |
929 | 0 | .or(eqa3.movemask()); |
930 | 0 | debug_assert!(mask.has_non_zero()); |
931 | 0 | return Some(cur.add(topos(mask))); |
932 | 0 | } |
933 | | } |
934 | 0 | } |
935 | 0 | while cur >= start.add(V::BYTES) { |
936 | 0 | debug_assert!(cur.distance(start) >= V::BYTES); |
937 | 0 | cur = cur.sub(V::BYTES); |
938 | 0 | if let Some(cur) = self.search_chunk(cur, topos) { |
939 | 0 | return Some(cur); |
940 | 0 | } |
941 | | } |
942 | 0 | if cur > start { |
943 | 0 | debug_assert!(cur.distance(start) < V::BYTES); |
944 | 0 | return self.search_chunk(start, topos); |
945 | 0 | } |
946 | 0 | None |
947 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m128i>>::rfind_raw Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m256i>>::rfind_raw |
948 | | |
949 | | /// Search `V::BYTES` starting at `cur` via an unaligned load. |
950 | | /// |
951 | | /// `mask_to_offset` should be a function that converts a `movemask` to |
952 | | /// an offset such that `cur.add(offset)` corresponds to a pointer to the |
953 | | /// match location if one is found. Generally it is expected to use either |
954 | | /// `mask_to_first_offset` or `mask_to_last_offset`, depending on whether |
955 | | /// one is implementing a forward or reverse search, respectively. |
956 | | /// |
957 | | /// # Safety |
958 | | /// |
959 | | /// `cur` must be a valid pointer and it must be valid to do an unaligned |
960 | | /// load of size `V::BYTES` at `cur`. |
961 | | #[inline(always)] |
962 | 0 | unsafe fn search_chunk( |
963 | 0 | &self, |
964 | 0 | cur: *const u8, |
965 | 0 | mask_to_offset: impl Fn(V::Mask) -> usize, |
966 | 0 | ) -> Option<*const u8> { |
967 | 0 | let chunk = V::load_unaligned(cur); |
968 | 0 | let eq1 = self.v1.cmpeq(chunk); |
969 | 0 | let eq2 = self.v2.cmpeq(chunk); |
970 | 0 | let eq3 = self.v3.cmpeq(chunk); |
971 | 0 | let mask = eq1.or(eq2).or(eq3).movemask(); |
972 | 0 | if mask.has_non_zero() { |
973 | 0 | let mask1 = eq1.movemask(); |
974 | 0 | let mask2 = eq2.movemask(); |
975 | 0 | let mask3 = eq3.movemask(); |
976 | 0 | Some(cur.add(mask_to_offset(mask1.or(mask2).or(mask3)))) |
977 | | } else { |
978 | 0 | None |
979 | | } |
980 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m128i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::last_offset> Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m128i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::first_offset> Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m256i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::last_offset> Unexecuted instantiation: <memchr::arch::generic::memchr::Three<core::core_arch::x86::__m256i>>::search_chunk::<<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::first_offset> |
981 | | } |
982 | | |
983 | | /// An iterator over all occurrences of a set of bytes in a haystack. |
984 | | /// |
985 | | /// This iterator implements the routines necessary to provide a |
986 | | /// `DoubleEndedIterator` impl, which means it can also be used to find |
987 | | /// occurrences in reverse order. |
988 | | /// |
989 | | /// The lifetime parameters are as follows: |
990 | | /// |
991 | | /// * `'h` refers to the lifetime of the haystack being searched. |
992 | | /// |
993 | | /// This type is intended to be used to implement all iterators for the |
994 | | /// `memchr` family of functions. It handles a tiny bit of marginally tricky |
995 | | /// raw pointer math, but otherwise expects the caller to provide `find_raw` |
996 | | /// and `rfind_raw` routines for each call of `next` and `next_back`, |
997 | | /// respectively. |
998 | | #[derive(Clone, Debug)] |
999 | | pub(crate) struct Iter<'h> { |
1000 | | /// The original starting point into the haystack. We use this to convert |
1001 | | /// pointers to offsets. |
1002 | | original_start: *const u8, |
1003 | | /// The current starting point into the haystack. That is, where the next |
1004 | | /// search will begin. |
1005 | | start: *const u8, |
1006 | | /// The current ending point into the haystack. That is, where the next |
1007 | | /// reverse search will begin. |
1008 | | end: *const u8, |
1009 | | /// A marker for tracking the lifetime of the start/cur_start/cur_end |
1010 | | /// pointers above, which all point into the haystack. |
1011 | | haystack: core::marker::PhantomData<&'h [u8]>, |
1012 | | } |
1013 | | |
1014 | | // SAFETY: Iter contains no shared references to anything that performs any |
1015 | | // interior mutations. Also, the lifetime guarantees that Iter will not outlive |
1016 | | // the haystack. |
1017 | | unsafe impl<'h> Send for Iter<'h> {} |
1018 | | |
1019 | | // SAFETY: Iter perform no interior mutations, therefore no explicit |
1020 | | // synchronization is necessary. Also, the lifetime guarantees that Iter will |
1021 | | // not outlive the haystack. |
1022 | | unsafe impl<'h> Sync for Iter<'h> {} |
1023 | | |
1024 | | impl<'h> Iter<'h> { |
1025 | | /// Create a new generic memchr iterator. |
1026 | | #[inline(always)] |
1027 | 0 | pub(crate) fn new(haystack: &'h [u8]) -> Iter<'h> { |
1028 | 0 | Iter { |
1029 | 0 | original_start: haystack.as_ptr(), |
1030 | 0 | start: haystack.as_ptr(), |
1031 | 0 | end: haystack.as_ptr().wrapping_add(haystack.len()), |
1032 | 0 | haystack: core::marker::PhantomData, |
1033 | 0 | } |
1034 | 0 | } |
1035 | | |
1036 | | /// Returns the next occurrence in the forward direction. |
1037 | | /// |
1038 | | /// # Safety |
1039 | | /// |
1040 | | /// Callers must ensure that if a pointer is returned from the closure |
1041 | | /// provided, then it must be greater than or equal to the start pointer |
1042 | | /// and less than the end pointer. |
1043 | | #[inline(always)] |
1044 | 0 | pub(crate) unsafe fn next( |
1045 | 0 | &mut self, |
1046 | 0 | mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>, |
1047 | 0 | ) -> Option<usize> { |
1048 | | // SAFETY: Pointers are derived directly from the same &[u8] haystack. |
1049 | | // We only ever modify start/end corresponding to a matching offset |
1050 | | // found between start and end. Thus all changes to start/end maintain |
1051 | | // our safety requirements. |
1052 | | // |
1053 | | // The only other assumption we rely on is that the pointer returned |
1054 | | // by `find_raw` satisfies `self.start <= found < self.end`, and that |
1055 | | // safety contract is forwarded to the caller. |
1056 | 0 | let found = find_raw(self.start, self.end)?; |
1057 | 0 | let result = found.distance(self.original_start); |
1058 | 0 | self.start = found.add(1); |
1059 | 0 | Some(result) |
1060 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::memchr::Memchr as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::all::memchr::TwoIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::memchr::Memchr2 as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::x86_64::avx2::memchr::TwoIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::x86_64::sse2::memchr::TwoIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::all::memchr::ThreeIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::memchr::Memchr3 as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::x86_64::avx2::memchr::ThreeIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::x86_64::sse2::memchr::ThreeIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::memchr::Memchr as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::all::memchr::OneIter as core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::x86_64::avx2::memchr::OneIter as 
core::iter::traits::iterator::Iterator>::next::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next::<<memchr::arch::x86_64::sse2::memchr::OneIter as core::iter::traits::iterator::Iterator>::next::{closure#0}> |
1061 | | |
1062 | | /// Returns the number of remaining elements in this iterator. |
1063 | | #[inline(always)] |
1064 | 0 | pub(crate) fn count( |
1065 | 0 | self, |
1066 | 0 | mut count_raw: impl FnMut(*const u8, *const u8) -> usize, |
1067 | 0 | ) -> usize { |
1068 | 0 | // SAFETY: Pointers are derived directly from the same &[u8] haystack. |
1069 | 0 | // We only ever modify start/end corresponding to a matching offset |
1070 | 0 | // found between start and end. Thus all changes to start/end maintain |
1071 | 0 | // our safety requirements. |
1072 | 0 | count_raw(self.start, self.end) |
1073 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::count::<<memchr::memchr::Memchr as core::iter::traits::iterator::Iterator>::count::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::count::<<memchr::arch::all::memchr::OneIter as core::iter::traits::iterator::Iterator>::count::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::count::<<memchr::arch::x86_64::avx2::memchr::OneIter as core::iter::traits::iterator::Iterator>::count::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::count::<<memchr::arch::x86_64::sse2::memchr::OneIter as core::iter::traits::iterator::Iterator>::count::{closure#0}> |
1074 | | |
1075 | | /// Returns the next occurrence in reverse. |
1076 | | /// |
1077 | | /// # Safety |
1078 | | /// |
1079 | | /// Callers must ensure that if a pointer is returned from the closure |
1080 | | /// provided, then it must be greater than or equal to the start pointer |
1081 | | /// and less than the end pointer. |
1082 | | #[inline(always)] |
1083 | 0 | pub(crate) unsafe fn next_back( |
1084 | 0 | &mut self, |
1085 | 0 | mut rfind_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>, |
1086 | 0 | ) -> Option<usize> { |
1087 | | // SAFETY: Pointers are derived directly from the same &[u8] haystack. |
1088 | | // We only ever modify start/end corresponding to a matching offset |
1089 | | // found between start and end. Thus all changes to start/end maintain |
1090 | | // our safety requirements. |
1091 | | // |
1092 | | // The only other assumption we rely on is that the pointer returned |
1093 | | // by `rfind_raw` satisfies `self.start <= found < self.end`, and that |
1094 | | // safety contract is forwarded to the caller. |
1095 | 0 | let found = rfind_raw(self.start, self.end)?; |
1096 | 0 | let result = found.distance(self.original_start); |
1097 | 0 | self.end = found; |
1098 | 0 | Some(result) |
1099 | 0 | } Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::memchr::Memchr as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::all::memchr::OneIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::x86_64::avx2::memchr::OneIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::x86_64::sse2::memchr::OneIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::all::memchr::TwoIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::memchr::Memchr2 as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::x86_64::avx2::memchr::TwoIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::x86_64::sse2::memchr::TwoIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::all::memchr::ThreeIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::memchr::Memchr3 as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: 
<memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::x86_64::avx2::memchr::ThreeIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}>Unexecuted instantiation: <memchr::arch::generic::memchr::Iter>::next_back::<<memchr::arch::x86_64::sse2::memchr::ThreeIter as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::{closure#0}> |
1100 | | |
1101 | | /// Provides an implementation of `Iterator::size_hint`. |
1102 | | #[inline(always)] |
1103 | 0 | pub(crate) fn size_hint(&self) -> (usize, Option<usize>) { |
1104 | 0 | (0, Some(self.end.as_usize().saturating_sub(self.start.as_usize()))) |
1105 | 0 | } |
1106 | | } |
1107 | | |
1108 | | /// Search a slice using a function that operates on raw pointers. |
1109 | | /// |
1110 | | /// Given a function to search a contiguous sequence of memory for the location |
1111 | | /// of a non-empty set of bytes, this will execute that search on a slice of |
1112 | | /// bytes. The pointer returned by the given function will be converted to an |
1113 | | /// offset relative to the starting point of the given slice. That is, if a |
1114 | | /// match is found, the offset returned by this routine is guaranteed to be a |
1115 | | /// valid index into `haystack`. |
1116 | | /// |
1117 | | /// Callers may use this for a forward or reverse search. |
1118 | | /// |
1119 | | /// # Safety |
1120 | | /// |
1121 | | /// Callers must ensure that if a pointer is returned by `find_raw`, then the |
1122 | | /// pointer must be greater than or equal to the starting pointer and less than |
1123 | | /// the end pointer. |
1124 | | #[inline(always)] |
1125 | 895k | pub(crate) unsafe fn search_slice_with_raw( |
1126 | 895k | haystack: &[u8], |
1127 | 895k | mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>, |
1128 | 895k | ) -> Option<usize> { |
1129 | 895k | // SAFETY: We rely on `find_raw` to return a correct and valid pointer, but |
1130 | 895k | // otherwise, `start` and `end` are valid due to the guarantees provided by |
1131 | 895k | // a &[u8]. |
1132 | 895k | let start = haystack.as_ptr(); |
1133 | 895k | let end = start.add(haystack.len()); |
1134 | 895k | let found = find_raw(start, end)?; |
1135 | 895k | Some(found.distance(start)) |
1136 | 895k | } Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr::{closure#0}>memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr::{closure#0}>Line | Count | Source | 1125 | 895k | pub(crate) unsafe fn search_slice_with_raw( | 1126 | 895k | haystack: &[u8], | 1127 | 895k | mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>, | 1128 | 895k | ) -> Option<usize> { | 1129 | 895k | // SAFETY: We rely on `find_raw` to return a correct and valid pointer, but | 1130 | 895k | // otherwise, `start` and `end` are valid due to the guarantees provided by | 1131 | 895k | // a &[u8]. | 1132 | 895k | let start = haystack.as_ptr(); | 1133 | 895k | let end = start.add(haystack.len()); | 1134 | 895k | let found = find_raw(start, end)?; | 1135 | 895k | Some(found.distance(start)) | 1136 | 895k | } |
Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr2::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr2::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::all::memchr::One>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::all::memchr::One>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::avx2::memchr::One>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::avx2::memchr::One>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::sse2::memchr::One>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::sse2::memchr::One>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::all::memchr::Two>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::all::memchr::Two>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::avx2::memchr::Two>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::avx2::memchr::Two>::rfind::{closure#0}>Unexecuted instantiation: 
memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::sse2::memchr::Two>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::sse2::memchr::Two>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::all::memchr::Three>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::all::memchr::Three>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::avx2::memchr::Three>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::avx2::memchr::Three>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::sse2::memchr::Three>::find::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<<memchr::arch::x86_64::sse2::memchr::Three>::rfind::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr2::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memchr3::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memrchr::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memrchr2::{closure#0}>Unexecuted instantiation: memchr::arch::generic::memchr::search_slice_with_raw::<memchr::memchr::memrchr3::{closure#0}> |
1137 | | |
/// Scans forward one byte at a time until `confirm(*ptr)` reports `true`
/// (returning that pointer) or until the end pointer is reached (returning
/// `None`).
///
/// # Safety
///
/// Callers must provide valid pointers satisfying `start_ptr <= ptr` and
/// `ptr <= end_ptr`.
#[inline(always)]
pub(crate) unsafe fn fwd_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);
    let mut cur = start;
    loop {
        // Exhausted the window without a match.
        if cur >= end {
            return None;
        }
        if confirm(*cur) {
            return Some(cur);
        }
        cur = cur.add(1);
    }
}
1163 | | |
/// Scans backward one byte at a time until `confirm(*ptr)` reports `true`
/// (returning that pointer) or until the start pointer is passed (returning
/// `None`).
///
/// # Safety
///
/// Callers must provide valid pointers satisfying `start_ptr <= ptr` and
/// `ptr <= end_ptr`.
#[inline(always)]
pub(crate) unsafe fn rev_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);

    // `end` is one past the last byte, so step before dereferencing.
    let mut cur = end;
    while cur > start {
        cur = cur.sub(1);
        if confirm(*cur) {
            return Some(cur);
        }
    }
    None
}
1190 | | |
/// Walks forward one byte at a time until `ptr >= end_ptr`, returning the
/// number of bytes for which `confirm(*ptr)` reported `true`.
///
/// # Safety
///
/// Callers must provide valid pointers satisfying `start_ptr <= ptr` and
/// `ptr <= end_ptr`.
#[inline(always)]
pub(crate) unsafe fn count_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> usize {
    debug_assert!(start <= end);
    let mut total = 0usize;
    let mut cur = start;
    while cur < end {
        // `usize::from(bool)` folds the branch into an unconditional add.
        total += usize::from(confirm(*cur));
        cur = cur.add(1);
    }
    total
}