/src/highwayhash/highwayhash/vector256.h
Line | Count | Source |
1 | | // Copyright 2016 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #ifndef HIGHWAYHASH_VECTOR256_H_ |
16 | | #define HIGHWAYHASH_VECTOR256_H_ |
17 | | |
18 | | // Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators: |
19 | | // const V4x64U masked_sum = (a + b) & m; |
20 | | // This is shorter and more readable than compiler intrinsics: |
21 | | // const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m); |
22 | | // There is typically no runtime cost for these abstractions. |
23 | | // |
24 | | // The naming convention is VNxBBT where N is the number of lanes, BB the |
25 | | // number of bits per lane and T is the lane type: unsigned integer (U), |
26 | | // signed integer (I), or floating-point (F). |
27 | | |
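// Editor's illustration (not part of the original header): assuming an
// AVX2-enabled translation unit and the V4x64U / V8x32U aliases defined
// later in this file,
//   const V4x64U a(4, 3, 2, 1);             // four uint64_t lanes; lane 0 = 1
//   const V4x64U b(8, 7, 6, 5);
//   const V4x64U m(~0ULL, 0, ~0ULL, 0);     // mask keeps lanes 3 and 1
//   const V4x64U masked_sum = (a + b) & m;  // one vpaddq plus one vpand
//   const V8x32U broadcast(0x80000000u);    // V8x32U = 8 lanes of uint32_t
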
28 | | // WARNING: this is a "restricted" header because it is included from |
29 | | // translation units compiled with different flags. This header and its |
30 | | // dependencies must not define any function unless it is static inline and/or |
31 | | // within namespace HH_TARGET_NAME. See arch_specific.h for details. |
32 | | |
33 | | #include <stddef.h> |
34 | | #include <stdint.h> |
35 | | |
36 | | #include "highwayhash/arch_specific.h" |
37 | | #include "highwayhash/compiler_specific.h" |
38 | | |
39 | | // For auto-dependency generation, we need to include all headers but not their |
40 | | // contents (otherwise compilation fails because -mavx2 is not specified). |
41 | | #ifndef HH_DISABLE_TARGET_SPECIFIC |
42 | | |
43 | | // (This include cannot be moved within a namespace due to conflicts with |
44 | | // other system headers; see the comment in hh_sse41.h.) |
45 | | #include <immintrin.h> |
46 | | |
47 | | namespace highwayhash { |
48 | | // To prevent ODR violations when including this from multiple translation |
49 | | // units (TU) that are compiled with different flags, the contents must reside |
50 | | // in a namespace whose name is unique to the TU. NOTE: this behavior is |
51 | | // incompatible with precompiled modules and requires textual inclusion instead. |
52 | | namespace HH_TARGET_NAME { |
53 | | |
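// Editor's note (hedged): HH_TARGET_NAME is supplied by arch_specific.h and
// expands to a per-target token (e.g. AVX2 when this header is compiled for
// AVX2), so the classes below are effectively highwayhash::AVX2::V256<T> in
// that build and cannot collide with the same classes compiled for another
// target. Callers outside this namespace would qualify accordingly, e.g.:
//   highwayhash::HH_TARGET_NAME::V4x64U v;  // resolves per target
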
54 | | // Primary template for 256-bit AVX2 vectors; only specializations are used. |
55 | | template <typename T> |
56 | | class V256 {}; |
57 | | |
58 | | template <> |
59 | | class V256<uint8_t> { |
60 | | public: |
61 | | using Intrinsic = __m256i; |
62 | | using T = uint8_t; |
63 | | static constexpr size_t N = 32; |
64 | | |
65 | | // Leaves v_ uninitialized - typically used for output parameters. |
66 | 0 | HH_INLINE V256() {} |
67 | | |
68 | | // Broadcasts i to all lanes. |
69 | | HH_INLINE explicit V256(T i) |
70 | 0 | : v_(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(i))) {} |
71 | | |
72 | | // Copy from other vector. |
73 | 0 | HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} |
74 | | template <typename U> |
75 | | HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} |
76 | 0 | HH_INLINE V256& operator=(const V256& other) { |
77 | 0 | v_ = other.v_; |
78 | 0 | return *this; |
79 | 0 | } |
80 | | |
81 | | // Convert from/to intrinsics. |
82 | 0 | HH_INLINE V256(const Intrinsic& v) : v_(v) {} |
83 | 0 | HH_INLINE V256& operator=(const Intrinsic& v) { |
84 | 0 | v_ = v; |
85 | 0 | return *this; |
86 | 0 | } |
87 | 0 | HH_INLINE operator Intrinsic() const { return v_; } |
88 | | |
89 | | // There are no greater-than comparison instructions for unsigned T. |
90 | 0 | HH_INLINE V256 operator==(const V256& other) const { |
91 | 0 | return V256(_mm256_cmpeq_epi8(v_, other.v_)); |
92 | 0 | } |
93 | | |
94 | 0 | HH_INLINE V256& operator+=(const V256& other) { |
95 | 0 | v_ = _mm256_add_epi8(v_, other.v_); |
96 | 0 | return *this; |
97 | 0 | } |
98 | 0 | HH_INLINE V256& operator-=(const V256& other) { |
99 | 0 | v_ = _mm256_sub_epi8(v_, other.v_); |
100 | 0 | return *this; |
101 | 0 | } |
102 | | |
103 | 0 | HH_INLINE V256& operator&=(const V256& other) { |
104 | 0 | v_ = _mm256_and_si256(v_, other.v_); |
105 | 0 | return *this; |
106 | 0 | } |
107 | 0 | HH_INLINE V256& operator|=(const V256& other) { |
108 | 0 | v_ = _mm256_or_si256(v_, other.v_); |
109 | 0 | return *this; |
110 | 0 | } |
111 | 0 | HH_INLINE V256& operator^=(const V256& other) { |
112 | 0 | v_ = _mm256_xor_si256(v_, other.v_); |
113 | 0 | return *this; |
114 | 0 | } |
115 | | |
116 | | private: |
117 | | Intrinsic v_; |
118 | | }; |
119 | | |
120 | | template <> |
121 | | class V256<uint16_t> { |
122 | | public: |
123 | | using Intrinsic = __m256i; |
124 | | using T = uint16_t; |
125 | | static constexpr size_t N = 16; |
126 | | |
127 | | // Leaves v_ uninitialized - typically used for output parameters. |
128 | 0 | HH_INLINE V256() {} |
129 | | |
130 | | // Lane 0 (p_0) is the lowest. |
131 | | HH_INLINE V256(T p_F, T p_E, T p_D, T p_C, T p_B, T p_A, T p_9, T p_8, T p_7, |
132 | | T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) |
133 | | : v_(_mm256_set_epi16(p_F, p_E, p_D, p_C, p_B, p_A, p_9, p_8, p_7, p_6, |
134 | 0 | p_5, p_4, p_3, p_2, p_1, p_0)) {} |
135 | | |
136 | | // Broadcasts i to all lanes. |
137 | | HH_INLINE explicit V256(T i) |
138 | 0 | : v_(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(i))) {} |
139 | | |
140 | | // Copy from other vector. |
141 | 0 | HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} |
142 | | template <typename U> |
143 | | HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} |
144 | 0 | HH_INLINE V256& operator=(const V256& other) { |
145 | 0 | v_ = other.v_; |
146 | 0 | return *this; |
147 | 0 | } |
148 | | |
149 | | // Convert from/to intrinsics. |
150 | 0 | HH_INLINE V256(const Intrinsic& v) : v_(v) {} |
151 | 0 | HH_INLINE V256& operator=(const Intrinsic& v) { |
152 | 0 | v_ = v; |
153 | 0 | return *this; |
154 | 0 | } |
155 | 0 | HH_INLINE operator Intrinsic() const { return v_; } |
156 | | |
157 | | // There are no greater-than comparison instructions for unsigned T. |
158 | 0 | HH_INLINE V256 operator==(const V256& other) const { |
159 | 0 | return V256(_mm256_cmpeq_epi16(v_, other.v_)); |
160 | 0 | } |
161 | | |
162 | 0 | HH_INLINE V256& operator+=(const V256& other) { |
163 | 0 | v_ = _mm256_add_epi16(v_, other.v_); |
164 | 0 | return *this; |
165 | 0 | } |
166 | 0 | HH_INLINE V256& operator-=(const V256& other) { |
167 | 0 | v_ = _mm256_sub_epi16(v_, other.v_); |
168 | 0 | return *this; |
169 | 0 | } |
170 | | |
171 | 0 | HH_INLINE V256& operator&=(const V256& other) { |
172 | 0 | v_ = _mm256_and_si256(v_, other.v_); |
173 | 0 | return *this; |
174 | 0 | } |
175 | 0 | HH_INLINE V256& operator|=(const V256& other) { |
176 | 0 | v_ = _mm256_or_si256(v_, other.v_); |
177 | 0 | return *this; |
178 | 0 | } |
179 | 0 | HH_INLINE V256& operator^=(const V256& other) { |
180 | 0 | v_ = _mm256_xor_si256(v_, other.v_); |
181 | 0 | return *this; |
182 | 0 | } |
183 | | |
184 | 0 | HH_INLINE V256& operator<<=(const int count) { |
185 | 0 | v_ = _mm256_slli_epi16(v_, count); |
186 | 0 | return *this; |
187 | 0 | } |
188 | | |
189 | 0 | HH_INLINE V256& operator>>=(const int count) { |
190 | 0 | v_ = _mm256_srli_epi16(v_, count); |
191 | 0 | return *this; |
192 | 0 | } |
193 | | |
194 | | private: |
195 | | Intrinsic v_; |
196 | | }; |
197 | | |
198 | | template <> |
199 | | class V256<uint32_t> { |
200 | | public: |
201 | | using Intrinsic = __m256i; |
202 | | using T = uint32_t; |
203 | | static constexpr size_t N = 8; |
204 | | |
205 | | // Leaves v_ uninitialized - typically used for output parameters. |
206 | 0 | HH_INLINE V256() {} |
207 | | |
208 | | // Lane 0 (p_0) is the lowest. |
209 | | HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) |
210 | 0 | : v_(_mm256_set_epi32(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} |
211 | | |
212 | | // Broadcasts i to all lanes. |
213 | | HH_INLINE explicit V256(T i) |
214 | 21 | : v_(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i))) {} |
215 | | |
216 | | // Copy from other vector. |
217 | 21 | HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} |
218 | | template <typename U> |
219 | | HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} |
220 | 0 | HH_INLINE V256& operator=(const V256& other) { |
221 | 0 | v_ = other.v_; |
222 | 0 | return *this; |
223 | 0 | } |
224 | | |
225 | | // Convert from/to intrinsics. |
226 | 42 | HH_INLINE V256(const Intrinsic& v) : v_(v) {} |
227 | 0 | HH_INLINE V256& operator=(const Intrinsic& v) { |
228 | 0 | v_ = v; |
229 | 0 | return *this; |
230 | 0 | } |
231 | 105 | HH_INLINE operator Intrinsic() const { return v_; } |
232 | | |
233 | | // There are no greater-than comparison instructions for unsigned T. |
234 | 0 | HH_INLINE V256 operator==(const V256& other) const { |
235 | 0 | return V256(_mm256_cmpeq_epi32(v_, other.v_)); |
236 | 0 | } |
237 | | |
238 | 0 | HH_INLINE V256& operator+=(const V256& other) { |
239 | 0 | v_ = _mm256_add_epi32(v_, other.v_); |
240 | 0 | return *this; |
241 | 0 | } |
242 | 21 | HH_INLINE V256& operator-=(const V256& other) { |
243 | 21 | v_ = _mm256_sub_epi32(v_, other.v_); |
244 | 21 | return *this; |
245 | 21 | } |
246 | | |
247 | 0 | HH_INLINE V256& operator&=(const V256& other) { |
248 | 0 | v_ = _mm256_and_si256(v_, other.v_); |
249 | 0 | return *this; |
250 | 0 | } |
251 | 0 | HH_INLINE V256& operator|=(const V256& other) { |
252 | 0 | v_ = _mm256_or_si256(v_, other.v_); |
253 | 0 | return *this; |
254 | 0 | } |
255 | 0 | HH_INLINE V256& operator^=(const V256& other) { |
256 | 0 | v_ = _mm256_xor_si256(v_, other.v_); |
257 | 0 | return *this; |
258 | 0 | } |
259 | | |
260 | 0 | HH_INLINE V256& operator<<=(const int count) { |
261 | 0 | v_ = _mm256_slli_epi32(v_, count); |
262 | 0 | return *this; |
263 | 0 | } |
264 | | |
265 | 0 | HH_INLINE V256& operator>>=(const int count) { |
266 | 0 | v_ = _mm256_srli_epi32(v_, count); |
267 | 0 | return *this; |
268 | 0 | } |
269 | | |
270 | | private: |
271 | | Intrinsic v_; |
272 | | }; |
273 | | |
274 | | template <> |
275 | | class V256<uint64_t> { |
276 | | public: |
277 | | using Intrinsic = __m256i; |
278 | | using T = uint64_t; |
279 | | static constexpr size_t N = 4; |
280 | | |
281 | | // Leaves v_ uninitialized - typically used for output parameters. |
282 | 196 | HH_INLINE V256() {} |
283 | | |
284 | | // Lane 0 (p_0) is the lowest. |
285 | | HH_INLINE V256(T p_3, T p_2, T p_1, T p_0) |
286 | 525k | : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {} |
287 | | |
288 | | // Broadcasts i to all lanes. |
289 | | HH_INLINE explicit V256(T i) |
290 | 0 | : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {} |
291 | | |
292 | | // Copy from other vector. |
293 | 525k | HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} |
294 | | template <typename U> |
295 | 21 | HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} |
296 | 217 | HH_INLINE V256& operator=(const V256& other) { |
297 | 217 | v_ = other.v_; |
298 | 217 | return *this; |
299 | 217 | } |
300 | | |
301 | | // Convert from/to intrinsics. |
302 | 1.83M | HH_INLINE V256(const Intrinsic& v) : v_(v) {} |
303 | 0 | HH_INLINE V256& operator=(const Intrinsic& v) { |
304 | 0 | v_ = v; |
305 | 0 | return *this; |
306 | 0 | } |
307 | 2.62M | HH_INLINE operator Intrinsic() const { return v_; } |
308 | | |
309 | | // There are no greater-than comparison instructions for unsigned T. |
310 | 0 | HH_INLINE V256 operator==(const V256& other) const { |
311 | 0 | return V256(_mm256_cmpeq_epi64(v_, other.v_)); |
312 | 0 | } |
313 | | |
314 | 1.31M | HH_INLINE V256& operator+=(const V256& other) { |
315 | 1.31M | v_ = _mm256_add_epi64(v_, other.v_); |
316 | 1.31M | return *this; |
317 | 1.31M | } |
318 | 0 | HH_INLINE V256& operator-=(const V256& other) { |
319 | 0 | v_ = _mm256_sub_epi64(v_, other.v_); |
320 | 0 | return *this; |
321 | 0 | } |
322 | | |
323 | 0 | HH_INLINE V256& operator&=(const V256& other) { |
324 | 0 | v_ = _mm256_and_si256(v_, other.v_); |
325 | 0 | return *this; |
326 | 0 | } |
327 | 21 | HH_INLINE V256& operator|=(const V256& other) { |
328 | 21 | v_ = _mm256_or_si256(v_, other.v_); |
329 | 21 | return *this; |
330 | 21 | } |
331 | 525k | HH_INLINE V256& operator^=(const V256& other) { |
332 | 525k | v_ = _mm256_xor_si256(v_, other.v_); |
333 | 525k | return *this; |
334 | 525k | } |
335 | | |
336 | 0 | HH_INLINE V256& operator<<=(const int count) { |
337 | 0 | v_ = _mm256_slli_epi64(v_, count); |
338 | 0 | return *this; |
339 | 0 | } |
340 | | |
341 | 524k | HH_INLINE V256& operator>>=(const int count) { |
342 | 524k | v_ = _mm256_srli_epi64(v_, count); |
343 | 524k | return *this; |
344 | 524k | } |
345 | | |
346 | | private: |
347 | | Intrinsic v_; |
348 | | }; |
349 | | |
350 | | template <> |
351 | | class V256<float> { |
352 | | public: |
353 | | using Intrinsic = __m256; |
354 | | using T = float; |
355 | | static constexpr size_t N = 8; |
356 | | |
357 | | // Leaves v_ uninitialized - typically used for output parameters. |
358 | 0 | HH_INLINE V256() {} |
359 | | |
360 | | // Lane 0 (p_0) is the lowest. |
361 | | HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) |
362 | 0 | : v_(_mm256_set_ps(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {} |
363 | | |
364 | | // Broadcasts to all lanes. |
365 | 0 | HH_INLINE explicit V256(T f) : v_(_mm256_set1_ps(f)) {} |
366 | | |
367 | | // Copy from other vector. |
368 | 0 | HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} |
369 | | template <typename U> |
370 | | HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} |
371 | 0 | HH_INLINE V256& operator=(const V256& other) { |
372 | 0 | v_ = other.v_; |
373 | 0 | return *this; |
374 | 0 | } |
375 | | |
376 | | // Convert from/to intrinsics. |
377 | 0 | HH_INLINE V256(const Intrinsic& v) : v_(v) {} |
378 | 0 | HH_INLINE V256& operator=(const Intrinsic& v) { |
379 | 0 | v_ = v; |
380 | 0 | return *this; |
381 | 0 | } |
382 | 0 | HH_INLINE operator Intrinsic() const { return v_; } |
383 | | |
384 | 0 | HH_INLINE V256 operator==(const V256& other) const { |
385 | 0 | return V256(_mm256_cmp_ps(v_, other.v_, 0)); |
386 | 0 | } |
387 | 0 | HH_INLINE V256 operator<(const V256& other) const { |
388 | 0 | return V256(_mm256_cmp_ps(v_, other.v_, 1)); |
389 | 0 | } |
390 | 0 | HH_INLINE V256 operator>(const V256& other) const { |
391 | 0 | return V256(_mm256_cmp_ps(other.v_, v_, 1)); |
392 | 0 | } |
393 | | |
394 | 0 | HH_INLINE V256& operator*=(const V256& other) { |
395 | 0 | v_ = _mm256_mul_ps(v_, other.v_); |
396 | 0 | return *this; |
397 | 0 | } |
398 | 0 | HH_INLINE V256& operator/=(const V256& other) { |
399 | 0 | v_ = _mm256_div_ps(v_, other.v_); |
400 | 0 | return *this; |
401 | 0 | } |
402 | 0 | HH_INLINE V256& operator+=(const V256& other) { |
403 | 0 | v_ = _mm256_add_ps(v_, other.v_); |
404 | 0 | return *this; |
405 | 0 | } |
406 | 0 | HH_INLINE V256& operator-=(const V256& other) { |
407 | 0 | v_ = _mm256_sub_ps(v_, other.v_); |
408 | 0 | return *this; |
409 | 0 | } |
410 | | |
411 | 0 | HH_INLINE V256& operator&=(const V256& other) { |
412 | 0 | v_ = _mm256_and_ps(v_, other.v_); |
413 | 0 | return *this; |
414 | 0 | } |
415 | 0 | HH_INLINE V256& operator|=(const V256& other) { |
416 | 0 | v_ = _mm256_or_ps(v_, other.v_); |
417 | 0 | return *this; |
418 | 0 | } |
419 | 0 | HH_INLINE V256& operator^=(const V256& other) { |
420 | 0 | v_ = _mm256_xor_ps(v_, other.v_); |
421 | 0 | return *this; |
422 | 0 | } |
423 | | |
424 | | private: |
425 | | Intrinsic v_; |
426 | | }; |
427 | | |
428 | | template <> |
429 | | class V256<double> { |
430 | | public: |
431 | | using Intrinsic = __m256d; |
432 | | using T = double; |
433 | | static constexpr size_t N = 4; |
434 | | |
435 | | // Leaves v_ uninitialized - typically used for output parameters. |
436 | 0 | HH_INLINE V256() {} |
437 | | |
438 | | // Lane 0 (p_0) is the lowest. |
439 | | HH_INLINE V256(T p_3, T p_2, T p_1, T p_0) |
440 | 0 | : v_(_mm256_set_pd(p_3, p_2, p_1, p_0)) {} |
441 | | |
442 | | // Broadcasts to all lanes. |
443 | 0 | HH_INLINE explicit V256(T f) : v_(_mm256_set1_pd(f)) {} |
444 | | |
445 | | // Copy from other vector. |
446 | 0 | HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} |
447 | | template <typename U> |
448 | | HH_INLINE explicit V256(const V256<U>& other) : v_(other) {} |
449 | 0 | HH_INLINE V256& operator=(const V256& other) { |
450 | 0 | v_ = other.v_; |
451 | 0 | return *this; |
452 | 0 | } |
453 | | |
454 | | // Convert from/to intrinsics. |
455 | 0 | HH_INLINE V256(const Intrinsic& v) : v_(v) {} |
456 | 0 | HH_INLINE V256& operator=(const Intrinsic& v) { |
457 | 0 | v_ = v; |
458 | 0 | return *this; |
459 | 0 | } |
460 | 0 | HH_INLINE operator Intrinsic() const { return v_; } |
461 | | |
462 | 0 | HH_INLINE V256 operator==(const V256& other) const { |
463 | 0 | return V256(_mm256_cmp_pd(v_, other.v_, 0)); |
464 | 0 | } |
465 | 0 | HH_INLINE V256 operator<(const V256& other) const { |
466 | 0 | return V256(_mm256_cmp_pd(v_, other.v_, 1)); |
467 | 0 | } |
468 | 0 | HH_INLINE V256 operator>(const V256& other) const { |
469 | 0 | return V256(_mm256_cmp_pd(other.v_, v_, 1)); |
470 | 0 | } |
471 | | |
472 | 0 | HH_INLINE V256& operator*=(const V256& other) { |
473 | 0 | v_ = _mm256_mul_pd(v_, other.v_); |
474 | 0 | return *this; |
475 | 0 | } |
476 | 0 | HH_INLINE V256& operator/=(const V256& other) { |
477 | 0 | v_ = _mm256_div_pd(v_, other.v_); |
478 | 0 | return *this; |
479 | 0 | } |
480 | 0 | HH_INLINE V256& operator+=(const V256& other) { |
481 | 0 | v_ = _mm256_add_pd(v_, other.v_); |
482 | 0 | return *this; |
483 | 0 | } |
484 | 0 | HH_INLINE V256& operator-=(const V256& other) { |
485 | 0 | v_ = _mm256_sub_pd(v_, other.v_); |
486 | 0 | return *this; |
487 | 0 | } |
488 | | |
489 | 0 | HH_INLINE V256& operator&=(const V256& other) { |
490 | 0 | v_ = _mm256_and_pd(v_, other.v_); |
491 | 0 | return *this; |
492 | 0 | } |
493 | 0 | HH_INLINE V256& operator|=(const V256& other) { |
494 | 0 | v_ = _mm256_or_pd(v_, other.v_); |
495 | 0 | return *this; |
496 | 0 | } |
497 | 0 | HH_INLINE V256& operator^=(const V256& other) { |
498 | 0 | v_ = _mm256_xor_pd(v_, other.v_); |
499 | 0 | return *this; |
500 | 0 | } |
501 | | |
502 | | private: |
503 | | Intrinsic v_; |
504 | | }; |
505 | | |
506 | | // Nonmember functions for any V256 via member functions. |
507 | | |
508 | | template <typename T> |
509 | | HH_INLINE V256<T> operator*(const V256<T>& left, const V256<T>& right) { |
510 | | V256<T> t(left); |
511 | | return t *= right; |
512 | | } |
513 | | |
514 | | template <typename T> |
515 | | HH_INLINE V256<T> operator/(const V256<T>& left, const V256<T>& right) { |
516 | | V256<T> t(left); |
517 | | return t /= right; |
518 | | } |
519 | | |
520 | | template <typename T> |
521 | 98 | HH_INLINE V256<T> operator+(const V256<T>& left, const V256<T>& right) { |
522 | 98 | V256<T> t(left); |
523 | 98 | return t += right; |
524 | 98 | } |
525 | | |
526 | | template <typename T> |
527 | 21 | HH_INLINE V256<T> operator-(const V256<T>& left, const V256<T>& right) { |
528 | 21 | V256<T> t(left); |
529 | 21 | return t -= right; |
530 | 21 | } |
531 | | |
532 | | template <typename T> |
533 | | HH_INLINE V256<T> operator&(const V256<T>& left, const V256<T>& right) { |
534 | | V256<T> t(left); |
535 | | return t &= right; |
536 | | } |
537 | | |
538 | | template <typename T> |
539 | 21 | HH_INLINE V256<T> operator|(const V256<T>& left, const V256<T>& right) {
540 | 21 | V256<T> t(left); |
541 | 21 | return t |= right; |
542 | 21 | } |
543 | | |
544 | | template <typename T> |
545 | 98 | HH_INLINE V256<T> operator^(const V256<T>& left, const V256<T>& right) { |
546 | 98 | V256<T> t(left); |
547 | 98 | return t ^= right; |
548 | 98 | } |
549 | | |
550 | | template <typename T> |
551 | 0 | HH_INLINE V256<T> operator<<(const V256<T>& v, const int count) { |
552 | 0 | V256<T> t(v); |
553 | 0 | return t <<= count; |
554 | 0 | } |
555 | | |
556 | | template <typename T> |
557 | 524k | HH_INLINE V256<T> operator>>(const V256<T>& v, const int count) { |
558 | 524k | V256<T> t(v); |
559 | 524k | return t >>= count; |
560 | 524k | } |
561 | | |
562 | | // We do not provide operator<<(V, __m128i) because it has 4 cycle latency |
563 | | // (to broadcast the shift count). It is faster to use sllv_epi64 etc. instead. |
564 | | |
565 | | using V32x8U = V256<uint8_t>; |
566 | | using V16x16U = V256<uint16_t>; |
567 | | using V8x32U = V256<uint32_t>; |
568 | | using V4x64U = V256<uint64_t>; |
569 | | using V8x32F = V256<float>; |
570 | | using V4x64F = V256<double>; |
571 | | |
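// Editor's sketch (hedged) of the sllv_epi64 alternative mentioned above;
// the helper name is illustrative and relies only on the V4x64U <-> __m256i
// conversions defined in this header:
//   // "counts" supplies an independent shift amount for each 64-bit lane.
//   HH_INLINE V4x64U ShiftLeftPerLane(const V4x64U& v, const V4x64U& counts) {
//     return V4x64U(_mm256_sllv_epi64(v, counts));
//   }
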
572 | | // Load/Store for any V256. |
573 | | |
574 | | // We differentiate between targets' vector types via template specialization. |
575 | | // Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may |
576 | | // generate better code in unoptimized builds. Only declare the primary |
577 | | // templates to avoid needing mutual exclusion with vector128. |
578 | | |
579 | | template <class V> |
580 | | HH_INLINE V Load(const typename V::T* const HH_RESTRICT from); |
581 | | |
582 | | template <class V> |
583 | | HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from); |
584 | | |
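// Editor's usage sketch (not part of the header): round trip through the
// Load/Store specializations defined below; the buffer name and its 32-byte
// alignment are assumptions of the example.
//   alignas(32) uint64_t lanes[4] = {1, 2, 3, 4};
//   V4x64U v = Load<V4x64U>(lanes);   // aligned; use LoadUnaligned otherwise
//   v += V4x64U(40, 30, 20, 10);      // constructor order is lane 3 .. lane 0
//   Store(v, lanes);                  // lanes is now {11, 22, 33, 44}
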
585 | | template <> |
586 | 0 | HH_INLINE V32x8U Load(const V32x8U::T* const HH_RESTRICT from) { |
587 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
588 | 0 | return V32x8U(_mm256_load_si256(p)); |
589 | 0 | } |
590 | | template <> |
591 | 0 | HH_INLINE V16x16U Load(const V16x16U::T* const HH_RESTRICT from) { |
592 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
593 | 0 | return V16x16U(_mm256_load_si256(p)); |
594 | 0 | } |
595 | | template <> |
596 | 0 | HH_INLINE V8x32U Load(const V8x32U::T* const HH_RESTRICT from) { |
597 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
598 | 0 | return V8x32U(_mm256_load_si256(p)); |
599 | 0 | } |
600 | | template <> |
601 | 0 | HH_INLINE V4x64U Load(const V4x64U::T* const HH_RESTRICT from) { |
602 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
603 | 0 | return V4x64U(_mm256_load_si256(p)); |
604 | 0 | } |
605 | | template <> |
606 | 0 | HH_INLINE V8x32F Load(const V8x32F::T* const HH_RESTRICT from) { |
607 | 0 | return V8x32F(_mm256_load_ps(from)); |
608 | 0 | } |
609 | | template <> |
610 | 0 | HH_INLINE V4x64F Load(const V4x64F::T* const HH_RESTRICT from) { |
611 | 0 | return V4x64F(_mm256_load_pd(from)); |
612 | 0 | } |
613 | | |
614 | | template <> |
615 | 0 | HH_INLINE V32x8U LoadUnaligned(const V32x8U::T* const HH_RESTRICT from) { |
616 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
617 | 0 | return V32x8U(_mm256_loadu_si256(p)); |
618 | 0 | } |
619 | | template <> |
620 | 0 | HH_INLINE V16x16U LoadUnaligned(const V16x16U::T* const HH_RESTRICT from) { |
621 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
622 | 0 | return V16x16U(_mm256_loadu_si256(p)); |
623 | 0 | } |
624 | | template <> |
625 | 0 | HH_INLINE V8x32U LoadUnaligned(const V8x32U::T* const HH_RESTRICT from) { |
626 | 0 | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
627 | 0 | return V8x32U(_mm256_loadu_si256(p)); |
628 | 0 | } |
629 | | template <> |
630 | 262k | HH_INLINE V4x64U LoadUnaligned(const V4x64U::T* const HH_RESTRICT from) { |
631 | 262k | const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from); |
632 | 262k | return V4x64U(_mm256_loadu_si256(p)); |
633 | 262k | } |
634 | | template <> |
635 | 0 | HH_INLINE V8x32F LoadUnaligned(const V8x32F::T* const HH_RESTRICT from) { |
636 | 0 | return V8x32F(_mm256_loadu_ps(from)); |
637 | 0 | } |
638 | | template <> |
639 | 0 | HH_INLINE V4x64F LoadUnaligned(const V4x64F::T* const HH_RESTRICT from) { |
640 | 0 | return V4x64F(_mm256_loadu_pd(from)); |
641 | 0 | } |
642 | | |
643 | | // "to" must be vector-aligned. |
644 | | template <typename T> |
645 | | HH_INLINE void Store(const V256<T>& v, T* const HH_RESTRICT to) { |
646 | | _mm256_store_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); |
647 | | } |
648 | 0 | HH_INLINE void Store(const V256<float>& v, float* const HH_RESTRICT to) { |
649 | 0 | _mm256_store_ps(to, v); |
650 | 0 | } |
651 | 0 | HH_INLINE void Store(const V256<double>& v, double* const HH_RESTRICT to) { |
652 | 0 | _mm256_store_pd(to, v); |
653 | 0 | } |
654 | | |
655 | | template <typename T> |
656 | 0 | HH_INLINE void StoreUnaligned(const V256<T>& v, T* const HH_RESTRICT to) { |
657 | 0 | _mm256_storeu_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); |
658 | 0 | } |
659 | | HH_INLINE void StoreUnaligned(const V256<float>& v, |
660 | 0 | float* const HH_RESTRICT to) { |
661 | 0 | _mm256_storeu_ps(to, v); |
662 | 0 | } |
663 | | HH_INLINE void StoreUnaligned(const V256<double>& v, |
664 | 0 | double* const HH_RESTRICT to) { |
665 | 0 | _mm256_storeu_pd(to, v); |
666 | 0 | } |
667 | | |
668 | | // Writes directly to (aligned) memory, bypassing the cache. This is useful for |
669 | | // data that will not be read again in the near future. |
670 | | template <typename T> |
671 | | HH_INLINE void Stream(const V256<T>& v, T* const HH_RESTRICT to) { |
672 | | _mm256_stream_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v); |
673 | | } |
674 | 0 | HH_INLINE void Stream(const V256<float>& v, float* const HH_RESTRICT to) { |
675 | 0 | _mm256_stream_ps(to, v); |
676 | 0 | } |
677 | 0 | HH_INLINE void Stream(const V256<double>& v, double* const HH_RESTRICT to) { |
678 | 0 | _mm256_stream_pd(to, v); |
679 | 0 | } |
680 | | |
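// Editor's usage sketch for the streaming stores above (hedged): because
// non-temporal stores are weakly ordered, producers typically issue a store
// fence before publishing the data to other threads.
//   alignas(32) uint64_t out[4];
//   Stream(V4x64U(4, 3, 2, 1), out);  // bypasses the cache hierarchy
//   _mm_sfence();                     // order the NT store before later stores
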
681 | | // Miscellaneous functions. |
682 | | |
683 | | template <typename T> |
684 | | HH_INLINE V256<T> RotateLeft(const V256<T>& v, const int count) { |
685 | | constexpr size_t num_bits = sizeof(T) * 8; |
686 | | return (v << count) | (v >> (num_bits - count)); |
687 | | } |
688 | | |
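// Editor's worked example for RotateLeft (values chosen for illustration):
// on a V4x64U, RotateLeft(v, 32) swaps the 32-bit halves of every 64-bit
// lane, e.g. a lane holding 0x0000000100000002 becomes 0x0000000200000001.
//   const V4x64U r = RotateLeft(v, 32);  // per lane: (x << 32) | (x >> 32)
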
689 | | template <typename T> |
690 | 0 | HH_INLINE V256<T> AndNot(const V256<T>& neg_mask, const V256<T>& values) { |
691 | 0 | return V256<T>(_mm256_andnot_si256(neg_mask, values)); |
692 | 0 | } |
693 | | template <> |
694 | | HH_INLINE V256<float> AndNot(const V256<float>& neg_mask, |
695 | 0 | const V256<float>& values) { |
696 | 0 | return V256<float>(_mm256_andnot_ps(neg_mask, values)); |
697 | 0 | } |
698 | | template <> |
699 | | HH_INLINE V256<double> AndNot(const V256<double>& neg_mask, |
700 | 0 | const V256<double>& values) { |
701 | 0 | return V256<double>(_mm256_andnot_pd(neg_mask, values)); |
702 | 0 | } |
703 | | |
704 | 0 | HH_INLINE V8x32F Select(const V8x32F& a, const V8x32F& b, const V8x32F& mask) { |
705 | 0 | return V8x32F(_mm256_blendv_ps(a, b, mask)); |
706 | 0 | } |
707 | | |
708 | 0 | HH_INLINE V4x64F Select(const V4x64F& a, const V4x64F& b, const V4x64F& mask) { |
709 | 0 | return V4x64F(_mm256_blendv_pd(a, b, mask)); |
710 | 0 | } |
711 | | |
712 | | // Min/Max |
713 | | |
714 | 0 | HH_INLINE V32x8U Min(const V32x8U& v0, const V32x8U& v1) { |
715 | 0 | return V32x8U(_mm256_min_epu8(v0, v1)); |
716 | 0 | } |
717 | | |
718 | 0 | HH_INLINE V32x8U Max(const V32x8U& v0, const V32x8U& v1) { |
719 | 0 | return V32x8U(_mm256_max_epu8(v0, v1)); |
720 | 0 | } |
721 | | |
722 | 0 | HH_INLINE V16x16U Min(const V16x16U& v0, const V16x16U& v1) { |
723 | 0 | return V16x16U(_mm256_min_epu16(v0, v1)); |
724 | 0 | } |
725 | | |
726 | 0 | HH_INLINE V16x16U Max(const V16x16U& v0, const V16x16U& v1) { |
727 | 0 | return V16x16U(_mm256_max_epu16(v0, v1)); |
728 | 0 | } |
729 | | |
730 | 0 | HH_INLINE V8x32U Min(const V8x32U& v0, const V8x32U& v1) { |
731 | 0 | return V8x32U(_mm256_min_epu32(v0, v1)); |
732 | 0 | } |
733 | | |
734 | 0 | HH_INLINE V8x32U Max(const V8x32U& v0, const V8x32U& v1) { |
735 | 0 | return V8x32U(_mm256_max_epu32(v0, v1)); |
736 | 0 | } |
737 | | |
738 | 0 | HH_INLINE V8x32F Min(const V8x32F& v0, const V8x32F& v1) { |
739 | 0 | return V8x32F(_mm256_min_ps(v0, v1)); |
740 | 0 | } |
741 | | |
742 | 0 | HH_INLINE V8x32F Max(const V8x32F& v0, const V8x32F& v1) { |
743 | 0 | return V8x32F(_mm256_max_ps(v0, v1)); |
744 | 0 | } |
745 | | |
746 | 0 | HH_INLINE V4x64F Min(const V4x64F& v0, const V4x64F& v1) { |
747 | 0 | return V4x64F(_mm256_min_pd(v0, v1)); |
748 | 0 | } |
749 | | |
750 | 0 | HH_INLINE V4x64F Max(const V4x64F& v0, const V4x64F& v1) { |
751 | 0 | return V4x64F(_mm256_max_pd(v0, v1)); |
752 | 0 | } |
753 | | |
754 | | } // namespace HH_TARGET_NAME |
755 | | } // namespace highwayhash |
756 | | |
757 | | #endif // HH_DISABLE_TARGET_SPECIFIC |
758 | | #endif // HIGHWAYHASH_VECTOR256_H_ |