Coverage Report

Created: 2024-10-13 21:57

/rust/registry/src/index.crates.io-6f17d22bba15001f/memchr-2.7.4/src/vector.rs
Line
Count
Source
1
/// A trait for describing vector operations used by vectorized searchers.
2
///
3
/// The trait is highly constrained to the low level vector operations needed.
4
/// In general, it was invented mostly to be generic over x86's __m128i and
5
/// __m256i types. At the time of writing, it also supports the wasm and
6
/// aarch64 128-bit vector types.
7
///
8
/// # Safety
9
///
10
/// All methods are unsafe since they are intended to be implemented using
11
/// vendor intrinsics, which are also unsafe. Callers must ensure that the
12
/// appropriate target features are enabled in the calling function, and that
13
/// the current CPU supports them. All implementations should avoid marking the
14
/// routines with #[target_feature] and instead mark them as #[inline(always)]
15
/// to ensure they get appropriately inlined. (inline(always) cannot be used
16
/// with target_feature.)
17
pub(crate) trait Vector: Copy + core::fmt::Debug {
18
    /// The number of bytes in the vector. That is, this is the size of the
19
    /// vector in memory.
20
    const BYTES: usize;
21
    /// The bits that must be zero in order for a `*const u8` pointer to be
22
    /// correctly aligned to read vector values.
23
    const ALIGN: usize;
24
25
    /// The type of the value returned by `Vector::movemask`.
26
    ///
27
    /// This supports abstracting over the specific representation used in
28
    /// order to accommodate different representations in different ISAs.
29
    type Mask: MoveMask;
30
31
    /// Create a vector with 8-bit lanes with the given byte repeated into each
32
    /// lane.
33
    unsafe fn splat(byte: u8) -> Self;
34
35
    /// Read a vector-size number of bytes from the given pointer. The pointer
36
    /// must be aligned to the size of the vector.
37
    ///
38
    /// # Safety
39
    ///
40
    /// Callers must guarantee that at least `BYTES` bytes are readable from
41
    /// `data` and that `data` is aligned to a `BYTES` boundary.
42
    unsafe fn load_aligned(data: *const u8) -> Self;
43
44
    /// Read a vector-size number of bytes from the given pointer. The pointer
45
    /// does not need to be aligned.
46
    ///
47
    /// # Safety
48
    ///
49
    /// Callers must guarantee that at least `BYTES` bytes are readable from
50
    /// `data`.
51
    unsafe fn load_unaligned(data: *const u8) -> Self;
52
53
    /// _mm_movemask_epi8 or _mm256_movemask_epi8
54
    unsafe fn movemask(self) -> Self::Mask;
55
    /// _mm_cmpeq_epi8 or _mm256_cmpeq_epi8
56
    unsafe fn cmpeq(self, vector2: Self) -> Self;
57
    /// _mm_and_si128 or _mm256_and_si256
58
    unsafe fn and(self, vector2: Self) -> Self;
59
    /// _mm_or_si128 or _mm256_or_si256
60
    unsafe fn or(self, vector2: Self) -> Self;
61
    /// Returns true if and only if `Self::movemask` would return a mask that
62
    /// contains at least one non-zero bit.
63
1.48M
    unsafe fn movemask_will_have_non_zero(self) -> bool {
64
1.48M
        self.movemask().has_non_zero()
65
1.48M
    }
Unexecuted instantiation: <core::core_arch::x86::__m128i as memchr::vector::Vector>::movemask_will_have_non_zero
Instantiation: <core::core_arch::x86::__m256i as memchr::vector::Vector>::movemask_will_have_non_zero
Line
Count
Source
63
1.48M
    unsafe fn movemask_will_have_non_zero(self) -> bool {
64
1.48M
        self.movemask().has_non_zero()
65
1.48M
    }
66
}
67
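To make the intended calling pattern concrete, the following is a minimal sketch, not code from this crate: it assumes it lives in-crate next to these traits (with `MoveMask` in scope), and the function name `find_in_chunk` is purely illustrative.

/// Illustrative sketch only: return the offset of the first occurrence of
/// `needle` within one vector-sized chunk starting at `start`.
///
/// # Safety
///
/// Per the trait docs above, callers must ensure that the target features
/// backing `V` are enabled and supported by the current CPU, and that at
/// least `V::BYTES` bytes are readable from `start`.
unsafe fn find_in_chunk<V: Vector>(needle: u8, start: *const u8) -> Option<usize> {
    // `V::ALIGN` (== BYTES - 1) names the address bits that must be zero for
    // an aligned load; this sketch just uses the unaligned load throughout.
    let needles = V::splat(needle);
    let chunk = V::load_unaligned(start);
    let mask = needles.cmpeq(chunk).movemask();
    if mask.has_non_zero() {
        Some(mask.first_offset())
    } else {
        None
    }
}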
68
/// A trait that abstracts over a vector-to-scalar operation called
69
/// "move mask."
70
///
71
/// On x86-64, this is `_mm_movemask_epi8` for SSE2 and `_mm256_movemask_epi8`
72
/// for AVX2. It takes a vector of `u8` lanes and returns a scalar where the
73
/// `i`th bit is set if and only if the most significant bit in the `i`th lane
74
/// of the vector is set. The simd128 ISA for wasm32 also supports this
75
/// exact same operation natively.
76
///
77
/// ... But aarch64 doesn't. So we have to fake it with more instructions and
78
/// a slightly different representation. We could do extra work to unify the
79
/// representations, but that would require additional costs in the hot path
80
/// for `memchr` and `packedpair`. So instead, we abstract over the specific
81
/// representation with this trait and define the operations we actually need.
82
pub(crate) trait MoveMask: Copy + core::fmt::Debug {
83
    /// Return a mask that is all zeros except for the least significant `n`
84
    /// lanes in a corresponding vector.
85
    fn all_zeros_except_least_significant(n: usize) -> Self;
86
87
    /// Returns true if and only if this mask has a non-zero bit anywhere.
88
    fn has_non_zero(self) -> bool;
89
90
    /// Returns the number of bits set to 1 in this mask.
91
    fn count_ones(self) -> usize;
92
93
    /// Does a bitwise `and` operation between `self` and `other`.
94
    fn and(self, other: Self) -> Self;
95
96
    /// Does a bitwise `or` operation between `self` and `other`.
97
    fn or(self, other: Self) -> Self;
98
99
    /// Returns a mask that is equivalent to `self` but with the least
100
    /// significant 1-bit set to 0.
101
    fn clear_least_significant_bit(self) -> Self;
102
103
    /// Returns the offset of the first non-zero lane this mask represents.
104
    fn first_offset(self) -> usize;
105
106
    /// Returns the offset of the last non-zero lane this mask represents.
107
    fn last_offset(self) -> usize;
108
}
109
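The operations above are exactly enough to visit every match in a chunk, regardless of which mask representation is in use. The loop below is a hedged sketch of that pattern; the helper name is illustrative and not code from this file.

/// Illustrative sketch only: call `f` with the offset of every non-zero lane
/// in `mask`, from least significant to most significant.
fn for_each_offset<M: MoveMask>(mut mask: M, mut f: impl FnMut(usize)) {
    while mask.has_non_zero() {
        f(mask.first_offset());
        // Clearing the least significant set bit advances to the next match.
        mask = mask.clear_least_significant_bit();
    }
}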
110
/// This is a "sensible" movemask implementation where each bit represents
111
/// whether the most significant bit is set in each corresponding lane of a
112
/// vector. This is used on x86-64 and wasm, but such a mask is more expensive
113
/// to get on aarch64 so we use something a little different.
114
///
115
/// We call this "sensible" because this is what we get using native sse/avx
116
/// movemask instructions. But neon has no such native equivalent.
117
#[derive(Clone, Copy, Debug)]
118
pub(crate) struct SensibleMoveMask(u32);
119
120
impl SensibleMoveMask {
121
    /// Get the mask in a form suitable for computing offsets.
122
    ///
123
    /// Basically, this normalizes to little endian. On big endian, this swaps
124
    /// the bytes.
125
    #[inline(always)]
126
5.35M
    fn get_for_offset(self) -> u32 {
127
5.35M
        #[cfg(target_endian = "big")]
128
5.35M
        {
129
5.35M
            self.0.swap_bytes()
130
5.35M
        }
131
5.35M
        #[cfg(target_endian = "little")]
132
5.35M
        {
133
5.35M
            self.0
134
5.35M
        }
135
5.35M
    }
136
}
137
138
impl MoveMask for SensibleMoveMask {
139
    #[inline(always)]
140
1.43k
    fn all_zeros_except_least_significant(n: usize) -> SensibleMoveMask {
141
1.43k
        debug_assert!(n < 32);
142
1.43k
        SensibleMoveMask(!((1 << n) - 1))
143
1.43k
    }
144
145
    #[inline(always)]
146
8.63M
    fn has_non_zero(self) -> bool {
147
8.63M
        self.0 != 0
148
8.63M
    }
149
150
    #[inline(always)]
151
0
    fn count_ones(self) -> usize {
152
0
        self.0.count_ones() as usize
153
0
    }
154
155
    #[inline(always)]
156
1.77M
    fn and(self, other: SensibleMoveMask) -> SensibleMoveMask {
157
1.77M
        SensibleMoveMask(self.0 & other.0)
158
1.77M
    }
159
160
    #[inline(always)]
161
5.35M
    fn or(self, other: SensibleMoveMask) -> SensibleMoveMask {
162
5.35M
        SensibleMoveMask(self.0 | other.0)
163
5.35M
    }
164
165
    #[inline(always)]
166
5.38k
    fn clear_least_significant_bit(self) -> SensibleMoveMask {
167
5.38k
        SensibleMoveMask(self.0 & (self.0 - 1))
168
5.38k
    }
169
170
    #[inline(always)]
171
5.35M
    fn first_offset(self) -> usize {
172
5.35M
        // We are dealing with little endian here (and if we aren't, we swap
173
5.35M
        // the bytes so we are in practice), where the most significant byte
174
5.35M
        // is at a higher address. That means the least significant bit that
175
5.35M
        // is set corresponds to the position of our first matching byte.
176
5.35M
        // That position corresponds to the number of zeros after the least
177
5.35M
        // significant bit.
178
5.35M
        self.get_for_offset().trailing_zeros() as usize
179
5.35M
    }
180
181
    #[inline(always)]
182
0
    fn last_offset(self) -> usize {
183
0
        // We are dealing with little endian here (and if we aren't, we swap
184
0
        // the bytes so we are in practice), where the most significant byte is
185
0
        // at a higher address. That means the most significant bit that is set
186
0
        // corresponds to the position of our last matching byte. The position
187
0
        // from the end of the mask is therefore the number of leading zeros
188
0
        // in a 32 bit integer, and the position from the start of the mask is
189
0
        // therefore 32 - (leading zeros) - 1.
190
0
        32 - self.get_for_offset().leading_zeros() as usize - 1
191
0
    }
192
}
193
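As a quick check of the offset arithmetic above, consider a hypothetical mask in which lanes 1 and 4 matched, i.e. a movemask of 0b10010. The snippet assumes it sits alongside the type (so the private field is visible) and a little-endian target, as x86-64 and wasm32 both are.

#[test]
fn sensible_offsets_example() {
    // Hypothetical mask, for illustration only: lanes 1 and 4 matched.
    let m = SensibleMoveMask(0b10010);
    assert_eq!(m.first_offset(), 1); // trailing_zeros(0b10010) == 1
    assert_eq!(m.last_offset(), 4); // 32 - leading_zeros(0b10010) - 1 == 4
    assert_eq!(m.count_ones(), 2);
}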
194
#[cfg(target_arch = "x86_64")]
195
mod x86sse2 {
196
    use core::arch::x86_64::*;
197
198
    use super::{SensibleMoveMask, Vector};
199
200
    impl Vector for __m128i {
201
        const BYTES: usize = 16;
202
        const ALIGN: usize = Self::BYTES - 1;
203
204
        type Mask = SensibleMoveMask;
205
206
        #[inline(always)]
207
10.7M
        unsafe fn splat(byte: u8) -> __m128i {
208
10.7M
            _mm_set1_epi8(byte as i8)
209
10.7M
        }
210
211
        #[inline(always)]
212
0
        unsafe fn load_aligned(data: *const u8) -> __m128i {
213
0
            _mm_load_si128(data as *const __m128i)
214
0
        }
215
216
        #[inline(always)]
217
5.77k
        unsafe fn load_unaligned(data: *const u8) -> __m128i {
218
5.77k
            _mm_loadu_si128(data as *const __m128i)
219
5.77k
        }
220
221
        #[inline(always)]
222
16.9k
        unsafe fn movemask(self) -> SensibleMoveMask {
223
16.9k
            SensibleMoveMask(_mm_movemask_epi8(self) as u32)
224
16.9k
        }
225
226
        #[inline(always)]
227
11.5k
        unsafe fn cmpeq(self, vector2: Self) -> __m128i {
228
11.5k
            _mm_cmpeq_epi8(self, vector2)
229
11.5k
        }
230
231
        #[inline(always)]
232
0
        unsafe fn and(self, vector2: Self) -> __m128i {
233
0
            _mm_and_si128(self, vector2)
234
0
        }
235
236
        #[inline(always)]
237
5.77k
        unsafe fn or(self, vector2: Self) -> __m128i {
238
5.77k
            _mm_or_si128(self, vector2)
239
5.77k
        }
240
    }
241
}
242
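`ALIGN` above is `BYTES - 1`, i.e. a mask of the low address bits that must be zero before `load_aligned` may be used. Below is a hedged sketch of rounding a pointer up to that boundary; the helper is illustrative and assumes crate-internal access to the trait.

#[cfg(target_arch = "x86_64")]
fn round_up_for_sse2_loads(ptr: *const u8) -> *const u8 {
    use core::arch::x86_64::__m128i;
    // ALIGN == 15 for __m128i, so this rounds up to the next multiple of 16.
    let align = <__m128i as Vector>::ALIGN;
    ((ptr as usize + align) & !align) as *const u8
}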
243
#[cfg(target_arch = "x86_64")]
244
mod x86avx2 {
245
    use core::arch::x86_64::*;
246
247
    use super::{SensibleMoveMask, Vector};
248
249
    impl Vector for __m256i {
250
        const BYTES: usize = 32;
251
        const ALIGN: usize = Self::BYTES - 1;
252
253
        type Mask = SensibleMoveMask;
254
255
        #[inline(always)]
256
10.7M
        unsafe fn splat(byte: u8) -> __m256i {
257
10.7M
            _mm256_set1_epi8(byte as i8)
258
10.7M
        }
259
260
        #[inline(always)]
261
3.26M
        unsafe fn load_aligned(data: *const u8) -> __m256i {
262
3.26M
            _mm256_load_si256(data as *const __m256i)
263
3.26M
        }
264
265
        #[inline(always)]
266
8.89M
        unsafe fn load_unaligned(data: *const u8) -> __m256i {
267
8.89M
            _mm256_loadu_si256(data as *const __m256i)
268
8.89M
        }
269
270
        #[inline(always)]
271
19.3M
        unsafe fn movemask(self) -> SensibleMoveMask {
272
19.3M
            SensibleMoveMask(_mm256_movemask_epi8(self) as u32)
273
19.3M
        }
274
275
        #[inline(always)]
276
20.1M
        unsafe fn cmpeq(self, vector2: Self) -> __m256i {
277
20.1M
            _mm256_cmpeq_epi8(self, vector2)
278
20.1M
        }
279
280
        #[inline(always)]
281
1.77M
        unsafe fn and(self, vector2: Self) -> __m256i {
282
1.77M
            _mm256_and_si256(self, vector2)
283
1.77M
        }
284
285
        #[inline(always)]
286
9.78M
        unsafe fn or(self, vector2: Self) -> __m256i {
287
9.78M
            _mm256_or_si256(self, vector2)
288
9.78M
        }
289
    }
290
}
291
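Both x86-64 impls lean on the trait's safety contract: the methods are `#[inline(always)]`, so the calling function is the one that carries `#[target_feature]` and verifies CPU support. A hedged sketch of such a caller follows; the function name and the checks described in the comment are illustrative, not part of this file, and both traits are assumed to be in scope.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn chunk_has_byte_avx2(needle: u8, data: *const u8) -> bool {
    use core::arch::x86_64::__m256i;
    // Sound only if the caller has confirmed AVX2 support at runtime (e.g.
    // via std::is_x86_feature_detected!("avx2")) and guarantees that at
    // least __m256i::BYTES == 32 bytes are readable from `data`.
    let needles = <__m256i as Vector>::splat(needle);
    let chunk = <__m256i as Vector>::load_unaligned(data);
    needles.cmpeq(chunk).movemask().has_non_zero()
}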
292
#[cfg(target_arch = "aarch64")]
293
mod aarch64neon {
294
    use core::arch::aarch64::*;
295
296
    use super::{MoveMask, Vector};
297
298
    impl Vector for uint8x16_t {
299
        const BYTES: usize = 16;
300
        const ALIGN: usize = Self::BYTES - 1;
301
302
        type Mask = NeonMoveMask;
303
304
        #[inline(always)]
305
        unsafe fn splat(byte: u8) -> uint8x16_t {
306
            vdupq_n_u8(byte)
307
        }
308
309
        #[inline(always)]
310
        unsafe fn load_aligned(data: *const u8) -> uint8x16_t {
311
            // I've tried `data.cast::<uint8x16_t>().read()` instead, but
312
            // couldn't observe any benchmark differences.
313
            Self::load_unaligned(data)
314
        }
315
316
        #[inline(always)]
317
        unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
318
            vld1q_u8(data)
319
        }
320
321
        #[inline(always)]
322
        unsafe fn movemask(self) -> NeonMoveMask {
323
            let asu16s = vreinterpretq_u16_u8(self);
324
            let mask = vshrn_n_u16(asu16s, 4);
325
            let asu64 = vreinterpret_u64_u8(mask);
326
            let scalar64 = vget_lane_u64(asu64, 0);
327
            NeonMoveMask(scalar64 & 0x8888888888888888)
328
        }
329
330
        #[inline(always)]
331
        unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
332
            vceqq_u8(self, vector2)
333
        }
334
335
        #[inline(always)]
336
        unsafe fn and(self, vector2: Self) -> uint8x16_t {
337
            vandq_u8(self, vector2)
338
        }
339
340
        #[inline(always)]
341
        unsafe fn or(self, vector2: Self) -> uint8x16_t {
342
            vorrq_u8(self, vector2)
343
        }
344
345
        /// This is the only interesting implementation of this routine.
346
        /// Basically, instead of doing the "shift right narrow" dance, we use
347
        /// adjacent folding max to determine whether there are any non-zero
348
        /// bytes in our mask. If there are, *then* we'll do the "shift right
349
        /// narrow" dance. In benchmarks, this does lead to slightly better
350
        /// throughput, but the win doesn't appear huge.
351
        #[inline(always)]
352
        unsafe fn movemask_will_have_non_zero(self) -> bool {
353
            let low = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
354
            vgetq_lane_u64(low, 0) != 0
355
        }
356
    }
357
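To make the shift-right-narrow trick in `movemask` above easier to follow, here is a hedged scalar walk-through with made-up values: each pair of comparison bytes (0x00 or 0xFF) forms one u16 lane, and shifting that lane right by 4 before narrowing leaves each original byte owning one nibble of the final 64-bit value.

#[test]
fn shift_right_narrow_walkthrough() {
    // A 0xFF in the low byte of a u16 lane lands in the low nibble of the
    // narrowed byte; a 0xFF in the high byte lands in the high nibble.
    assert_eq!((0x00FFu16 >> 4) as u8, 0x0F);
    assert_eq!((0xFF00u16 >> 4) as u8, 0xF0);
    // Masking the assembled u64 with 0x8888888888888888 then keeps exactly
    // one bit (bit 4*i + 3) per original lane i.
}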
358
    /// Neon doesn't have a `movemask` that works like the one in x86-64, so we
359
    /// wind up using a different method[1]. The different method also produces
360
    /// a mask, but 4 bits are set in the neon case instead of a single bit set
361
    /// in the x86-64 case. We do an extra step to zero out 3 of the 4 bits,
362
    /// but we still wind up with at least 3 zeroes between each set bit. This
363
    /// generally means that we need to do some division by 4 before extracting
364
    /// offsets.
365
    ///
366
    /// In fact, the existence of this type is the entire reason that we have
367
    /// the `MoveMask` trait in the first place. This basically lets us keep
368
    /// the different representations of masks without being forced to unify
369
    /// them into a single representation, which could result in extra and
370
    /// unnecessary work.
371
    ///
372
    /// [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
373
    #[derive(Clone, Copy, Debug)]
374
    pub(crate) struct NeonMoveMask(u64);
375
376
    impl NeonMoveMask {
377
        /// Get the mask in a form suitable for computing offsets.
378
        ///
379
        /// Basically, this normalizes to little endian. On big endian, this
380
        /// swaps the bytes.
381
        #[inline(always)]
382
        fn get_for_offset(self) -> u64 {
383
            #[cfg(target_endian = "big")]
384
            {
385
                self.0.swap_bytes()
386
            }
387
            #[cfg(target_endian = "little")]
388
            {
389
                self.0
390
            }
391
        }
392
    }
393
394
    impl MoveMask for NeonMoveMask {
395
        #[inline(always)]
396
        fn all_zeros_except_least_significant(n: usize) -> NeonMoveMask {
397
            debug_assert!(n < 16);
398
            NeonMoveMask(!(((1 << n) << 2) - 1))
399
        }
400
401
        #[inline(always)]
402
        fn has_non_zero(self) -> bool {
403
            self.0 != 0
404
        }
405
406
        #[inline(always)]
407
        fn count_ones(self) -> usize {
408
            self.0.count_ones() as usize
409
        }
410
411
        #[inline(always)]
412
        fn and(self, other: NeonMoveMask) -> NeonMoveMask {
413
            NeonMoveMask(self.0 & other.0)
414
        }
415
416
        #[inline(always)]
417
        fn or(self, other: NeonMoveMask) -> NeonMoveMask {
418
            NeonMoveMask(self.0 | other.0)
419
        }
420
421
        #[inline(always)]
422
        fn clear_least_significant_bit(self) -> NeonMoveMask {
423
            NeonMoveMask(self.0 & (self.0 - 1))
424
        }
425
426
        #[inline(always)]
427
        fn first_offset(self) -> usize {
428
            // We are dealing with little endian here (and if we aren't,
429
            // we swap the bytes so we are in practice), where the most
430
            // significant byte is at a higher address. That means the least
431
            // significant bit that is set corresponds to the position of our
432
            // first matching byte. That position corresponds to the number of
433
            // zeros after the least significant bit.
434
            //
435
            // Note that unlike `SensibleMoveMask`, this mask has its bits
436
            // spread out over 64 bits instead of 16 bits (for a 128 bit
437
            // vector). Namely, whereas x86-64 will turn
438
            //
439
            //   0x00 0xFF 0x00 0x00 0xFF
440
            //
441
            // into 10010, our neon approach will turn it into
442
            //
443
            //   10000000000010000000
444
            //
445
            // And this happens because neon doesn't have a native `movemask`
446
            // instruction, so we kind of fake it[1]. Thus, we divide the
447
            // number of trailing zeros by 4 to get the "real" offset.
448
            //
449
            // [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
450
            (self.get_for_offset().trailing_zeros() >> 2) as usize
451
        }
452
453
        #[inline(always)]
454
        fn last_offset(self) -> usize {
455
            // See comment in `first_offset` above. This is basically the same,
456
            // but coming from the other direction.
457
            16 - (self.get_for_offset().leading_zeros() >> 2) as usize - 1
458
        }
459
    }
460
}
461
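Continuing the 0b10010 example from the `first_offset` comment above: with matches in lanes 1 and 4, the NEON mask sets bits 7 and 19, and the divide-by-four recovers the same offsets the x86-64 path would report. The snippet is illustrative only and assumes it sits inside this module (for field visibility) on a little-endian target.

#[test]
fn neon_offsets_example() {
    // Hypothetical mask: bit 4*1 + 3 == 7 and bit 4*4 + 3 == 19 are set.
    let m = NeonMoveMask(0b1000_0000_0000_1000_0000);
    assert_eq!(m.first_offset(), 1); // trailing_zeros == 7, and 7 >> 2 == 1
    assert_eq!(m.last_offset(), 4); // 16 - (44 >> 2) - 1 == 4
}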
462
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
463
mod wasm_simd128 {
464
    use core::arch::wasm32::*;
465
466
    use super::{SensibleMoveMask, Vector};
467
468
    impl Vector for v128 {
469
        const BYTES: usize = 16;
470
        const ALIGN: usize = Self::BYTES - 1;
471
472
        type Mask = SensibleMoveMask;
473
474
        #[inline(always)]
475
        unsafe fn splat(byte: u8) -> v128 {
476
            u8x16_splat(byte)
477
        }
478
479
        #[inline(always)]
480
        unsafe fn load_aligned(data: *const u8) -> v128 {
481
            *data.cast()
482
        }
483
484
        #[inline(always)]
485
        unsafe fn load_unaligned(data: *const u8) -> v128 {
486
            v128_load(data.cast())
487
        }
488
489
        #[inline(always)]
490
        unsafe fn movemask(self) -> SensibleMoveMask {
491
            SensibleMoveMask(u8x16_bitmask(self).into())
492
        }
493
494
        #[inline(always)]
495
        unsafe fn cmpeq(self, vector2: Self) -> v128 {
496
            u8x16_eq(self, vector2)
497
        }
498
499
        #[inline(always)]
500
        unsafe fn and(self, vector2: Self) -> v128 {
501
            v128_and(self, vector2)
502
        }
503
504
        #[inline(always)]
505
        unsafe fn or(self, vector2: Self) -> v128 {
506
            v128_or(self, vector2)
507
        }
508
    }
509
}