Coverage Report

Created: 2025-08-29 06:43

/src/highwayhash/highwayhash/vector256.h
Line| Count|Source
   1|      |// Copyright 2016 Google Inc. All Rights Reserved.
   2|      |//
   3|      |// Licensed under the Apache License, Version 2.0 (the "License");
   4|      |// you may not use this file except in compliance with the License.
   5|      |// You may obtain a copy of the License at
   6|      |//
   7|      |//     http://www.apache.org/licenses/LICENSE-2.0
   8|      |//
   9|      |// Unless required by applicable law or agreed to in writing, software
  10|      |// distributed under the License is distributed on an "AS IS" BASIS,
  11|      |// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12|      |// See the License for the specific language governing permissions and
  13|      |// limitations under the License.
  14|      |
  15|      |#ifndef HIGHWAYHASH_VECTOR256_H_
  16|      |#define HIGHWAYHASH_VECTOR256_H_
  17|      |
  18|      |// Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators:
  19|      |// const V4x64U masked_sum = (a + b) & m;
  20|      |// This is shorter and more readable than compiler intrinsics:
  21|      |// const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m);
  22|      |// There is typically no runtime cost for these abstractions.
  23|      |//
  24|      |// The naming convention is VNxBBT where N is the number of lanes, BB the
  25|      |// number of bits per lane and T is the lane type: unsigned integer (U),
  26|      |// signed integer (I), or floating-point (F).
  27|      |
  28|      |// WARNING: this is a "restricted" header because it is included from
  29|      |// translation units compiled with different flags. This header and its
  30|      |// dependencies must not define any function unless it is static inline and/or
  31|      |// within namespace HH_TARGET_NAME. See arch_specific.h for details.
  32|      |
  33|      |#include <stddef.h>
  34|      |#include <stdint.h>
  35|      |
  36|      |#include "highwayhash/arch_specific.h"
  37|      |#include "highwayhash/compiler_specific.h"
  38|      |
  39|      |// For auto-dependency generation, we need to include all headers but not their
  40|      |// contents (otherwise compilation fails because -mavx2 is not specified).
  41|      |#ifndef HH_DISABLE_TARGET_SPECIFIC
  42|      |
  43|      |// (This include cannot be moved within a namespace due to conflicts with
  44|      |// other system headers; see the comment in hh_sse41.h.)
  45|      |#include <immintrin.h>
  46|      |
  47|      |namespace highwayhash {
  48|      |// To prevent ODR violations when including this from multiple translation
  49|      |// units (TU) that are compiled with different flags, the contents must reside
  50|      |// in a namespace whose name is unique to the TU. NOTE: this behavior is
  51|      |// incompatible with precompiled modules and requires textual inclusion instead.
  52|      |namespace HH_TARGET_NAME {
  53|      |
  54|      |// Primary template for 256-bit AVX2 vectors; only specializations are used.
  55|      |template <typename T>
  56|      |class V256 {};
  57|      |
  58|      |template <>
  59|      |class V256<uint8_t> {
  60|      | public:
  61|      |  using Intrinsic = __m256i;
  62|      |  using T = uint8_t;
  63|      |  static constexpr size_t N = 32;
  64|      |
  65|      |  // Leaves v_ uninitialized - typically used for output parameters.
  66|     0|  HH_INLINE V256() {}
  67|      |
  68|      |  // Broadcasts i to all lanes.
  69|      |  HH_INLINE explicit V256(T i)
  70|     0|      : v_(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(i))) {}
  71|      |
  72|      |  // Copy from other vector.
  73|     0|  HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
  74|      |  template <typename U>
  75|      |  HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
  76|     0|  HH_INLINE V256& operator=(const V256& other) {
  77|     0|    v_ = other.v_;
  78|     0|    return *this;
  79|     0|  }
  80|      |
  81|      |  // Convert from/to intrinsics.
  82|     0|  HH_INLINE V256(const Intrinsic& v) : v_(v) {}
  83|     0|  HH_INLINE V256& operator=(const Intrinsic& v) {
  84|     0|    v_ = v;
  85|     0|    return *this;
  86|     0|  }
  87|     0|  HH_INLINE operator Intrinsic() const { return v_; }
  88|      |
  89|      |  // There are no greater-than comparison instructions for unsigned T.
  90|     0|  HH_INLINE V256 operator==(const V256& other) const {
  91|     0|    return V256(_mm256_cmpeq_epi8(v_, other.v_));
  92|     0|  }
  93|      |
  94|     0|  HH_INLINE V256& operator+=(const V256& other) {
  95|     0|    v_ = _mm256_add_epi8(v_, other.v_);
  96|     0|    return *this;
  97|     0|  }
  98|     0|  HH_INLINE V256& operator-=(const V256& other) {
  99|     0|    v_ = _mm256_sub_epi8(v_, other.v_);
 100|     0|    return *this;
 101|     0|  }
 102|      |
 103|     0|  HH_INLINE V256& operator&=(const V256& other) {
 104|     0|    v_ = _mm256_and_si256(v_, other.v_);
 105|     0|    return *this;
 106|     0|  }
 107|     0|  HH_INLINE V256& operator|=(const V256& other) {
 108|     0|    v_ = _mm256_or_si256(v_, other.v_);
 109|     0|    return *this;
 110|     0|  }
 111|     0|  HH_INLINE V256& operator^=(const V256& other) {
 112|     0|    v_ = _mm256_xor_si256(v_, other.v_);
 113|     0|    return *this;
 114|     0|  }
 115|      |
 116|      | private:
 117|      |  Intrinsic v_;
 118|      |};
 119|      |
 120|      |template <>
 121|      |class V256<uint16_t> {
 122|      | public:
 123|      |  using Intrinsic = __m256i;
 124|      |  using T = uint16_t;
 125|      |  static constexpr size_t N = 16;
 126|      |
 127|      |  // Leaves v_ uninitialized - typically used for output parameters.
 128|     0|  HH_INLINE V256() {}
 129|      |
 130|      |  // Lane 0 (p_0) is the lowest.
 131|      |  HH_INLINE V256(T p_F, T p_E, T p_D, T p_C, T p_B, T p_A, T p_9, T p_8, T p_7,
 132|      |                 T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
 133|      |      : v_(_mm256_set_epi16(p_F, p_E, p_D, p_C, p_B, p_A, p_9, p_8, p_7, p_6,
 134|     0|                            p_5, p_4, p_3, p_2, p_1, p_0)) {}
 135|      |
 136|      |  // Broadcasts i to all lanes.
 137|      |  HH_INLINE explicit V256(T i)
 138|     0|      : v_(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(i))) {}
 139|      |
 140|      |  // Copy from other vector.
 141|     0|  HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
 142|      |  template <typename U>
 143|      |  HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
 144|     0|  HH_INLINE V256& operator=(const V256& other) {
 145|     0|    v_ = other.v_;
 146|     0|    return *this;
 147|     0|  }
 148|      |
 149|      |  // Convert from/to intrinsics.
 150|     0|  HH_INLINE V256(const Intrinsic& v) : v_(v) {}
 151|     0|  HH_INLINE V256& operator=(const Intrinsic& v) {
 152|     0|    v_ = v;
 153|     0|    return *this;
 154|     0|  }
 155|     0|  HH_INLINE operator Intrinsic() const { return v_; }
 156|      |
 157|      |  // There are no greater-than comparison instructions for unsigned T.
 158|     0|  HH_INLINE V256 operator==(const V256& other) const {
 159|     0|    return V256(_mm256_cmpeq_epi16(v_, other.v_));
 160|     0|  }
 161|      |
 162|     0|  HH_INLINE V256& operator+=(const V256& other) {
 163|     0|    v_ = _mm256_add_epi16(v_, other.v_);
 164|     0|    return *this;
 165|     0|  }
 166|     0|  HH_INLINE V256& operator-=(const V256& other) {
 167|     0|    v_ = _mm256_sub_epi16(v_, other.v_);
 168|     0|    return *this;
 169|     0|  }
 170|      |
 171|     0|  HH_INLINE V256& operator&=(const V256& other) {
 172|     0|    v_ = _mm256_and_si256(v_, other.v_);
 173|     0|    return *this;
 174|     0|  }
 175|     0|  HH_INLINE V256& operator|=(const V256& other) {
 176|     0|    v_ = _mm256_or_si256(v_, other.v_);
 177|     0|    return *this;
 178|     0|  }
 179|     0|  HH_INLINE V256& operator^=(const V256& other) {
 180|     0|    v_ = _mm256_xor_si256(v_, other.v_);
 181|     0|    return *this;
 182|     0|  }
 183|      |
 184|     0|  HH_INLINE V256& operator<<=(const int count) {
 185|     0|    v_ = _mm256_slli_epi16(v_, count);
 186|     0|    return *this;
 187|     0|  }
 188|      |
 189|     0|  HH_INLINE V256& operator>>=(const int count) {
 190|     0|    v_ = _mm256_srli_epi16(v_, count);
 191|     0|    return *this;
 192|     0|  }
 193|      |
 194|      | private:
 195|      |  Intrinsic v_;
 196|      |};
 197|      |
 198|      |template <>
 199|      |class V256<uint32_t> {
 200|      | public:
 201|      |  using Intrinsic = __m256i;
 202|      |  using T = uint32_t;
 203|      |  static constexpr size_t N = 8;
 204|      |
 205|      |  // Leaves v_ uninitialized - typically used for output parameters.
 206|     0|  HH_INLINE V256() {}
 207|      |
 208|      |  // Lane 0 (p_0) is the lowest.
 209|      |  HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
 210|     0|      : v_(_mm256_set_epi32(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
 211|      |
 212|      |  // Broadcasts i to all lanes.
 213|      |  HH_INLINE explicit V256(T i)
 214|    21|      : v_(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i))) {}
 215|      |
 216|      |  // Copy from other vector.
 217|    21|  HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
 218|      |  template <typename U>
 219|      |  HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
 220|     0|  HH_INLINE V256& operator=(const V256& other) {
 221|     0|    v_ = other.v_;
 222|     0|    return *this;
 223|     0|  }
 224|      |
 225|      |  // Convert from/to intrinsics.
 226|    42|  HH_INLINE V256(const Intrinsic& v) : v_(v) {}
 227|     0|  HH_INLINE V256& operator=(const Intrinsic& v) {
 228|     0|    v_ = v;
 229|     0|    return *this;
 230|     0|  }
 231|   105|  HH_INLINE operator Intrinsic() const { return v_; }
 232|      |
 233|      |  // There are no greater-than comparison instructions for unsigned T.
 234|     0|  HH_INLINE V256 operator==(const V256& other) const {
 235|     0|    return V256(_mm256_cmpeq_epi32(v_, other.v_));
 236|     0|  }
 237|      |
 238|     0|  HH_INLINE V256& operator+=(const V256& other) {
 239|     0|    v_ = _mm256_add_epi32(v_, other.v_);
 240|     0|    return *this;
 241|     0|  }
 242|    21|  HH_INLINE V256& operator-=(const V256& other) {
 243|    21|    v_ = _mm256_sub_epi32(v_, other.v_);
 244|    21|    return *this;
 245|    21|  }
 246|      |
 247|     0|  HH_INLINE V256& operator&=(const V256& other) {
 248|     0|    v_ = _mm256_and_si256(v_, other.v_);
 249|     0|    return *this;
 250|     0|  }
 251|     0|  HH_INLINE V256& operator|=(const V256& other) {
 252|     0|    v_ = _mm256_or_si256(v_, other.v_);
 253|     0|    return *this;
 254|     0|  }
 255|     0|  HH_INLINE V256& operator^=(const V256& other) {
 256|     0|    v_ = _mm256_xor_si256(v_, other.v_);
 257|     0|    return *this;
 258|     0|  }
 259|      |
 260|     0|  HH_INLINE V256& operator<<=(const int count) {
 261|     0|    v_ = _mm256_slli_epi32(v_, count);
 262|     0|    return *this;
 263|     0|  }
 264|      |
 265|     0|  HH_INLINE V256& operator>>=(const int count) {
 266|     0|    v_ = _mm256_srli_epi32(v_, count);
 267|     0|    return *this;
 268|     0|  }
 269|      |
 270|      | private:
 271|      |  Intrinsic v_;
 272|      |};
 273|      |
 274|      |template <>
 275|      |class V256<uint64_t> {
 276|      | public:
 277|      |  using Intrinsic = __m256i;
 278|      |  using T = uint64_t;
 279|      |  static constexpr size_t N = 4;
 280|      |
 281|      |  // Leaves v_ uninitialized - typically used for output parameters.
 282|   196|  HH_INLINE V256() {}
 283|      |
 284|      |  // Lane 0 (p_0) is the lowest.
 285|      |  HH_INLINE V256(T p_3, T p_2, T p_1, T p_0)
 286|  525k|      : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {}
 287|      |
 288|      |  // Broadcasts i to all lanes.
 289|      |  HH_INLINE explicit V256(T i)
 290|     0|      : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {}
 291|      |
 292|      |  // Copy from other vector.
 293|  525k|  HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
 294|      |  template <typename U>
 295|    21|  HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
 296|   217|  HH_INLINE V256& operator=(const V256& other) {
 297|   217|    v_ = other.v_;
 298|   217|    return *this;
 299|   217|  }
 300|      |
 301|      |  // Convert from/to intrinsics.
 302| 1.83M|  HH_INLINE V256(const Intrinsic& v) : v_(v) {}
 303|     0|  HH_INLINE V256& operator=(const Intrinsic& v) {
 304|     0|    v_ = v;
 305|     0|    return *this;
 306|     0|  }
 307| 2.62M|  HH_INLINE operator Intrinsic() const { return v_; }
 308|      |
 309|      |  // There are no greater-than comparison instructions for unsigned T.
 310|     0|  HH_INLINE V256 operator==(const V256& other) const {
 311|     0|    return V256(_mm256_cmpeq_epi64(v_, other.v_));
 312|     0|  }
 313|      |
 314| 1.31M|  HH_INLINE V256& operator+=(const V256& other) {
 315| 1.31M|    v_ = _mm256_add_epi64(v_, other.v_);
 316| 1.31M|    return *this;
 317| 1.31M|  }
 318|     0|  HH_INLINE V256& operator-=(const V256& other) {
 319|     0|    v_ = _mm256_sub_epi64(v_, other.v_);
 320|     0|    return *this;
 321|     0|  }
 322|      |
 323|     0|  HH_INLINE V256& operator&=(const V256& other) {
 324|     0|    v_ = _mm256_and_si256(v_, other.v_);
 325|     0|    return *this;
 326|     0|  }
 327|    21|  HH_INLINE V256& operator|=(const V256& other) {
 328|    21|    v_ = _mm256_or_si256(v_, other.v_);
 329|    21|    return *this;
 330|    21|  }
 331|  525k|  HH_INLINE V256& operator^=(const V256& other) {
 332|  525k|    v_ = _mm256_xor_si256(v_, other.v_);
 333|  525k|    return *this;
 334|  525k|  }
 335|      |
 336|     0|  HH_INLINE V256& operator<<=(const int count) {
 337|     0|    v_ = _mm256_slli_epi64(v_, count);
 338|     0|    return *this;
 339|     0|  }
 340|      |
 341|  524k|  HH_INLINE V256& operator>>=(const int count) {
 342|  524k|    v_ = _mm256_srli_epi64(v_, count);
 343|  524k|    return *this;
 344|  524k|  }
 345|      |
 346|      | private:
 347|      |  Intrinsic v_;
 348|      |};
 349|      |
 350|      |template <>
 351|      |class V256<float> {
 352|      | public:
 353|      |  using Intrinsic = __m256;
 354|      |  using T = float;
 355|      |  static constexpr size_t N = 8;
 356|      |
 357|      |  // Leaves v_ uninitialized - typically used for output parameters.
 358|     0|  HH_INLINE V256() {}
 359|      |
 360|      |  // Lane 0 (p_0) is the lowest.
 361|      |  HH_INLINE V256(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
 362|     0|      : v_(_mm256_set_ps(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}
 363|      |
 364|      |  // Broadcasts to all lanes.
 365|     0|  HH_INLINE explicit V256(T f) : v_(_mm256_set1_ps(f)) {}
 366|      |
 367|      |  // Copy from other vector.
 368|     0|  HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
 369|      |  template <typename U>
 370|      |  HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
 371|     0|  HH_INLINE V256& operator=(const V256& other) {
 372|     0|    v_ = other.v_;
 373|     0|    return *this;
 374|     0|  }
 375|      |
 376|      |  // Convert from/to intrinsics.
 377|     0|  HH_INLINE V256(const Intrinsic& v) : v_(v) {}
 378|     0|  HH_INLINE V256& operator=(const Intrinsic& v) {
 379|     0|    v_ = v;
 380|     0|    return *this;
 381|     0|  }
 382|     0|  HH_INLINE operator Intrinsic() const { return v_; }
 383|      |
 384|     0|  HH_INLINE V256 operator==(const V256& other) const {
 385|     0|    return V256(_mm256_cmp_ps(v_, other.v_, 0));
 386|     0|  }
 387|     0|  HH_INLINE V256 operator<(const V256& other) const {
 388|     0|    return V256(_mm256_cmp_ps(v_, other.v_, 1));
 389|     0|  }
 390|     0|  HH_INLINE V256 operator>(const V256& other) const {
 391|     0|    return V256(_mm256_cmp_ps(other.v_, v_, 1));
 392|     0|  }
 393|      |
 394|     0|  HH_INLINE V256& operator*=(const V256& other) {
 395|     0|    v_ = _mm256_mul_ps(v_, other.v_);
 396|     0|    return *this;
 397|     0|  }
 398|     0|  HH_INLINE V256& operator/=(const V256& other) {
 399|     0|    v_ = _mm256_div_ps(v_, other.v_);
 400|     0|    return *this;
 401|     0|  }
 402|     0|  HH_INLINE V256& operator+=(const V256& other) {
 403|     0|    v_ = _mm256_add_ps(v_, other.v_);
 404|     0|    return *this;
 405|     0|  }
 406|     0|  HH_INLINE V256& operator-=(const V256& other) {
 407|     0|    v_ = _mm256_sub_ps(v_, other.v_);
 408|     0|    return *this;
 409|     0|  }
 410|      |
 411|     0|  HH_INLINE V256& operator&=(const V256& other) {
 412|     0|    v_ = _mm256_and_ps(v_, other.v_);
 413|     0|    return *this;
 414|     0|  }
 415|     0|  HH_INLINE V256& operator|=(const V256& other) {
 416|     0|    v_ = _mm256_or_ps(v_, other.v_);
 417|     0|    return *this;
 418|     0|  }
 419|     0|  HH_INLINE V256& operator^=(const V256& other) {
 420|     0|    v_ = _mm256_xor_ps(v_, other.v_);
 421|     0|    return *this;
 422|     0|  }
 423|      |
 424|      | private:
 425|      |  Intrinsic v_;
 426|      |};
 427|      |
 428|      |template <>
 429|      |class V256<double> {
 430|      | public:
 431|      |  using Intrinsic = __m256d;
 432|      |  using T = double;
 433|      |  static constexpr size_t N = 4;
 434|      |
 435|      |  // Leaves v_ uninitialized - typically used for output parameters.
 436|     0|  HH_INLINE V256() {}
 437|      |
 438|      |  // Lane 0 (p_0) is the lowest.
 439|      |  HH_INLINE V256(T p_3, T p_2, T p_1, T p_0)
 440|     0|      : v_(_mm256_set_pd(p_3, p_2, p_1, p_0)) {}
 441|      |
 442|      |  // Broadcasts to all lanes.
 443|     0|  HH_INLINE explicit V256(T f) : v_(_mm256_set1_pd(f)) {}
 444|      |
 445|      |  // Copy from other vector.
 446|     0|  HH_INLINE explicit V256(const V256& other) : v_(other.v_) {}
 447|      |  template <typename U>
 448|      |  HH_INLINE explicit V256(const V256<U>& other) : v_(other) {}
 449|     0|  HH_INLINE V256& operator=(const V256& other) {
 450|     0|    v_ = other.v_;
 451|     0|    return *this;
 452|     0|  }
 453|      |
 454|      |  // Convert from/to intrinsics.
 455|     0|  HH_INLINE V256(const Intrinsic& v) : v_(v) {}
 456|     0|  HH_INLINE V256& operator=(const Intrinsic& v) {
 457|     0|    v_ = v;
 458|     0|    return *this;
 459|     0|  }
 460|     0|  HH_INLINE operator Intrinsic() const { return v_; }
 461|      |
 462|     0|  HH_INLINE V256 operator==(const V256& other) const {
 463|     0|    return V256(_mm256_cmp_pd(v_, other.v_, 0));
 464|     0|  }
 465|     0|  HH_INLINE V256 operator<(const V256& other) const {
 466|     0|    return V256(_mm256_cmp_pd(v_, other.v_, 1));
 467|     0|  }
 468|     0|  HH_INLINE V256 operator>(const V256& other) const {
 469|     0|    return V256(_mm256_cmp_pd(other.v_, v_, 1));
 470|     0|  }
 471|      |
 472|     0|  HH_INLINE V256& operator*=(const V256& other) {
 473|     0|    v_ = _mm256_mul_pd(v_, other.v_);
 474|     0|    return *this;
 475|     0|  }
 476|     0|  HH_INLINE V256& operator/=(const V256& other) {
 477|     0|    v_ = _mm256_div_pd(v_, other.v_);
 478|     0|    return *this;
 479|     0|  }
 480|     0|  HH_INLINE V256& operator+=(const V256& other) {
 481|     0|    v_ = _mm256_add_pd(v_, other.v_);
 482|     0|    return *this;
 483|     0|  }
 484|     0|  HH_INLINE V256& operator-=(const V256& other) {
 485|     0|    v_ = _mm256_sub_pd(v_, other.v_);
 486|     0|    return *this;
 487|     0|  }
 488|      |
 489|     0|  HH_INLINE V256& operator&=(const V256& other) {
 490|     0|    v_ = _mm256_and_pd(v_, other.v_);
 491|     0|    return *this;
 492|     0|  }
 493|     0|  HH_INLINE V256& operator|=(const V256& other) {
 494|     0|    v_ = _mm256_or_pd(v_, other.v_);
 495|     0|    return *this;
 496|     0|  }
 497|     0|  HH_INLINE V256& operator^=(const V256& other) {
 498|     0|    v_ = _mm256_xor_pd(v_, other.v_);
 499|     0|    return *this;
 500|     0|  }
 501|      |
 502|      | private:
 503|      |  Intrinsic v_;
 504|      |};
 505|      |
 506|      |// Nonmember functions for any V256 via member functions.
 507|      |
 508|      |template <typename T>
 509|      |HH_INLINE V256<T> operator*(const V256<T>& left, const V256<T>& right) {
 510|      |  V256<T> t(left);
 511|      |  return t *= right;
 512|      |}
 513|      |
 514|      |template <typename T>
 515|      |HH_INLINE V256<T> operator/(const V256<T>& left, const V256<T>& right) {
 516|      |  V256<T> t(left);
 517|      |  return t /= right;
 518|      |}
 519|      |
 520|      |template <typename T>
 521|    98|HH_INLINE V256<T> operator+(const V256<T>& left, const V256<T>& right) {
 522|    98|  V256<T> t(left);
 523|    98|  return t += right;
 524|    98|}
 525|      |
 526|      |template <typename T>
 527|    21|HH_INLINE V256<T> operator-(const V256<T>& left, const V256<T>& right) {
 528|    21|  V256<T> t(left);
 529|    21|  return t -= right;
 530|    21|}
 531|      |
 532|      |template <typename T>
 533|      |HH_INLINE V256<T> operator&(const V256<T>& left, const V256<T>& right) {
 534|      |  V256<T> t(left);
 535|      |  return t &= right;
 536|      |}
 537|      |
 538|      |template <typename T>
 539|    21|HH_INLINE V256<T> operator|(const V256<T> left, const V256<T>& right) {
 540|    21|  V256<T> t(left);
 541|    21|  return t |= right;
 542|    21|}
 543|      |
 544|      |template <typename T>
 545|    98|HH_INLINE V256<T> operator^(const V256<T>& left, const V256<T>& right) {
 546|    98|  V256<T> t(left);
 547|    98|  return t ^= right;
 548|    98|}
 549|      |
 550|      |template <typename T>
 551|     0|HH_INLINE V256<T> operator<<(const V256<T>& v, const int count) {
 552|     0|  V256<T> t(v);
 553|     0|  return t <<= count;
 554|     0|}
 555|      |
 556|      |template <typename T>
 557|  524k|HH_INLINE V256<T> operator>>(const V256<T>& v, const int count) {
 558|  524k|  V256<T> t(v);
 559|  524k|  return t >>= count;
 560|  524k|}
 561|      |
 562|      |// We do not provide operator<<(V, __m128i) because it has 4 cycle latency
 563|      |// (to broadcast the shift count). It is faster to use sllv_epi64 etc. instead.
 564|      |
 565|      |using V32x8U = V256<uint8_t>;
 566|      |using V16x16U = V256<uint16_t>;
 567|      |using V8x32U = V256<uint32_t>;
 568|      |using V4x64U = V256<uint64_t>;
 569|      |using V8x32F = V256<float>;
 570|      |using V4x64F = V256<double>;
 571|      |
 572|      |// Load/Store for any V256.
 573|      |
 574|      |// We differentiate between targets' vector types via template specialization.
 575|      |// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
 576|      |// generate better code in unoptimized builds. Only declare the primary
 577|      |// templates to avoid needing mutual exclusion with vector128.
 578|      |
 579|      |template <class V>
 580|      |HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
 581|      |
 582|      |template <class V>
 583|      |HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);
 584|      |
 585|      |template <>
 586|     0|HH_INLINE V32x8U Load(const V32x8U::T* const HH_RESTRICT from) {
 587|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 588|     0|  return V32x8U(_mm256_load_si256(p));
 589|     0|}
 590|      |template <>
 591|     0|HH_INLINE V16x16U Load(const V16x16U::T* const HH_RESTRICT from) {
 592|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 593|     0|  return V16x16U(_mm256_load_si256(p));
 594|     0|}
 595|      |template <>
 596|     0|HH_INLINE V8x32U Load(const V8x32U::T* const HH_RESTRICT from) {
 597|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 598|     0|  return V8x32U(_mm256_load_si256(p));
 599|     0|}
 600|      |template <>
 601|     0|HH_INLINE V4x64U Load(const V4x64U::T* const HH_RESTRICT from) {
 602|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 603|     0|  return V4x64U(_mm256_load_si256(p));
 604|     0|}
 605|      |template <>
 606|     0|HH_INLINE V8x32F Load(const V8x32F::T* const HH_RESTRICT from) {
 607|     0|  return V8x32F(_mm256_load_ps(from));
 608|     0|}
 609|      |template <>
 610|     0|HH_INLINE V4x64F Load(const V4x64F::T* const HH_RESTRICT from) {
 611|     0|  return V4x64F(_mm256_load_pd(from));
 612|     0|}
 613|      |
 614|      |template <>
 615|     0|HH_INLINE V32x8U LoadUnaligned(const V32x8U::T* const HH_RESTRICT from) {
 616|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 617|     0|  return V32x8U(_mm256_loadu_si256(p));
 618|     0|}
 619|      |template <>
 620|     0|HH_INLINE V16x16U LoadUnaligned(const V16x16U::T* const HH_RESTRICT from) {
 621|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 622|     0|  return V16x16U(_mm256_loadu_si256(p));
 623|     0|}
 624|      |template <>
 625|     0|HH_INLINE V8x32U LoadUnaligned(const V8x32U::T* const HH_RESTRICT from) {
 626|     0|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 627|     0|  return V8x32U(_mm256_loadu_si256(p));
 628|     0|}
 629|      |template <>
 630|  262k|HH_INLINE V4x64U LoadUnaligned(const V4x64U::T* const HH_RESTRICT from) {
 631|  262k|  const __m256i* const HH_RESTRICT p = reinterpret_cast<const __m256i*>(from);
 632|  262k|  return V4x64U(_mm256_loadu_si256(p));
 633|  262k|}
 634|      |template <>
 635|     0|HH_INLINE V8x32F LoadUnaligned(const V8x32F::T* const HH_RESTRICT from) {
 636|     0|  return V8x32F(_mm256_loadu_ps(from));
 637|     0|}
 638|      |template <>
 639|     0|HH_INLINE V4x64F LoadUnaligned(const V4x64F::T* const HH_RESTRICT from) {
 640|     0|  return V4x64F(_mm256_loadu_pd(from));
 641|     0|}
 642|      |
 643|      |// "to" must be vector-aligned.
 644|      |template <typename T>
 645|      |HH_INLINE void Store(const V256<T>& v, T* const HH_RESTRICT to) {
 646|      |  _mm256_store_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
 647|      |}
 648|     0|HH_INLINE void Store(const V256<float>& v, float* const HH_RESTRICT to) {
 649|     0|  _mm256_store_ps(to, v);
 650|     0|}
 651|     0|HH_INLINE void Store(const V256<double>& v, double* const HH_RESTRICT to) {
 652|     0|  _mm256_store_pd(to, v);
 653|     0|}
 654|      |
 655|      |template <typename T>
 656|     0|HH_INLINE void StoreUnaligned(const V256<T>& v, T* const HH_RESTRICT to) {
 657|     0|  _mm256_storeu_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
 658|     0|}
 659|      |HH_INLINE void StoreUnaligned(const V256<float>& v,
 660|     0|                              float* const HH_RESTRICT to) {
 661|     0|  _mm256_storeu_ps(to, v);
 662|     0|}
 663|      |HH_INLINE void StoreUnaligned(const V256<double>& v,
 664|     0|                              double* const HH_RESTRICT to) {
 665|     0|  _mm256_storeu_pd(to, v);
 666|     0|}
 667|      |
 668|      |// Writes directly to (aligned) memory, bypassing the cache. This is useful for
 669|      |// data that will not be read again in the near future.
 670|      |template <typename T>
 671|      |HH_INLINE void Stream(const V256<T>& v, T* const HH_RESTRICT to) {
 672|      |  _mm256_stream_si256(reinterpret_cast<__m256i * HH_RESTRICT>(to), v);
 673|      |}
 674|     0|HH_INLINE void Stream(const V256<float>& v, float* const HH_RESTRICT to) {
 675|     0|  _mm256_stream_ps(to, v);
 676|     0|}
 677|     0|HH_INLINE void Stream(const V256<double>& v, double* const HH_RESTRICT to) {
 678|     0|  _mm256_stream_pd(to, v);
 679|     0|}
 680|      |
 681|      |// Miscellaneous functions.
 682|      |
 683|      |template <typename T>
 684|      |HH_INLINE V256<T> RotateLeft(const V256<T>& v, const int count) {
 685|      |  constexpr size_t num_bits = sizeof(T) * 8;
 686|      |  return (v << count) | (v >> (num_bits - count));
 687|      |}
 688|      |
 689|      |template <typename T>
 690|     0|HH_INLINE V256<T> AndNot(const V256<T>& neg_mask, const V256<T>& values) {
 691|     0|  return V256<T>(_mm256_andnot_si256(neg_mask, values));
 692|     0|}
 693|      |template <>
 694|      |HH_INLINE V256<float> AndNot(const V256<float>& neg_mask,
 695|     0|                             const V256<float>& values) {
 696|     0|  return V256<float>(_mm256_andnot_ps(neg_mask, values));
 697|     0|}
 698|      |template <>
 699|      |HH_INLINE V256<double> AndNot(const V256<double>& neg_mask,
 700|     0|                              const V256<double>& values) {
 701|     0|  return V256<double>(_mm256_andnot_pd(neg_mask, values));
 702|     0|}
 703|      |
 704|     0|HH_INLINE V8x32F Select(const V8x32F& a, const V8x32F& b, const V8x32F& mask) {
 705|     0|  return V8x32F(_mm256_blendv_ps(a, b, mask));
 706|     0|}
 707|      |
 708|     0|HH_INLINE V4x64F Select(const V4x64F& a, const V4x64F& b, const V4x64F& mask) {
 709|     0|  return V4x64F(_mm256_blendv_pd(a, b, mask));
 710|     0|}
 711|      |
 712|      |// Min/Max
 713|      |
 714|     0|HH_INLINE V32x8U Min(const V32x8U& v0, const V32x8U& v1) {
 715|     0|  return V32x8U(_mm256_min_epu8(v0, v1));
 716|     0|}
 717|      |
 718|     0|HH_INLINE V32x8U Max(const V32x8U& v0, const V32x8U& v1) {
 719|     0|  return V32x8U(_mm256_max_epu8(v0, v1));
 720|     0|}
 721|      |
 722|     0|HH_INLINE V16x16U Min(const V16x16U& v0, const V16x16U& v1) {
 723|     0|  return V16x16U(_mm256_min_epu16(v0, v1));
 724|     0|}
 725|      |
 726|     0|HH_INLINE V16x16U Max(const V16x16U& v0, const V16x16U& v1) {
 727|     0|  return V16x16U(_mm256_max_epu16(v0, v1));
 728|     0|}
 729|      |
 730|     0|HH_INLINE V8x32U Min(const V8x32U& v0, const V8x32U& v1) {
 731|     0|  return V8x32U(_mm256_min_epu32(v0, v1));
 732|     0|}
 733|      |
 734|     0|HH_INLINE V8x32U Max(const V8x32U& v0, const V8x32U& v1) {
 735|     0|  return V8x32U(_mm256_max_epu32(v0, v1));
 736|     0|}
 737|      |
 738|     0|HH_INLINE V8x32F Min(const V8x32F& v0, const V8x32F& v1) {
 739|     0|  return V8x32F(_mm256_min_ps(v0, v1));
 740|     0|}
 741|      |
 742|     0|HH_INLINE V8x32F Max(const V8x32F& v0, const V8x32F& v1) {
 743|     0|  return V8x32F(_mm256_max_ps(v0, v1));
 744|     0|}
 745|      |
 746|     0|HH_INLINE V4x64F Min(const V4x64F& v0, const V4x64F& v1) {
 747|     0|  return V4x64F(_mm256_min_pd(v0, v1));
 748|     0|}
 749|      |
 750|     0|HH_INLINE V4x64F Max(const V4x64F& v0, const V4x64F& v1) {
 751|     0|  return V4x64F(_mm256_max_pd(v0, v1));
 752|     0|}
 753|      |
 754|      |}  // namespace HH_TARGET_NAME
 755|      |}  // namespace highwayhash
 756|      |
 757|      |#endif  // HH_DISABLE_TARGET_SPECIFIC
 758|      |#endif  // HIGHWAYHASH_VECTOR256_H_
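
For orientation, the covered hot path above runs almost entirely through the V256<uint64_t> specialization: LoadUnaligned<V4x64U> (262k executions), the intrinsic conversions (1.83M/2.62M), operator+= (1.31M), operator^= (525k) and operator>>= (524k). The sketch below is illustrative only and is not part of the report; it assumes an AVX2 translation unit that textually includes highwayhash/vector256.h with the usual target-specific flags, and the helper name MaskedSumDemo is made up for this example.

// Hypothetical example, not part of vector256.h: loads two unaligned
// 4x64-bit vectors, adds them lane-wise and applies a mask - the same
// "(a + b) & m" pattern documented at the top of the header.
#include "highwayhash/vector256.h"

namespace highwayhash {
namespace HH_TARGET_NAME {

HH_INLINE V4x64U MaskedSumDemo(const uint64_t* HH_RESTRICT a,
                               const uint64_t* HH_RESTRICT b,
                               const V4x64U& m) {
  const V4x64U va = LoadUnaligned<V4x64U>(a);  // wraps _mm256_loadu_si256
  const V4x64U vb = LoadUnaligned<V4x64U>(b);
  // operator+ and operator& forward to _mm256_add_epi64 / _mm256_and_si256.
  return (va + vb) & m;
}

}  // namespace HH_TARGET_NAME
}  // namespace highwayhash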