/src/botan/build/include/botan/internal/simd_avx2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * (C) 2018 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #ifndef BOTAN_SIMD_AVX2_H_ |
8 | | #define BOTAN_SIMD_AVX2_H_ |
9 | | |
10 | | #include <botan/types.h> |
11 | | #include <immintrin.h> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | class SIMD_8x32 final |
16 | | { |
17 | | public: |
18 | | |
19 | | SIMD_8x32& operator=(const SIMD_8x32& other) = default; |
20 | | SIMD_8x32(const SIMD_8x32& other) = default; |
21 | | |
22 | | SIMD_8x32& operator=(SIMD_8x32&& other) = default; |
23 | | SIMD_8x32(SIMD_8x32&& other) = default; |
24 | | |
25 | | BOTAN_FUNC_ISA("avx2") |
26 | | SIMD_8x32() |
27 | 0 | { |
28 | 0 | m_avx2 = _mm256_setzero_si256(); |
29 | 0 | } |
30 | | |
31 | | BOTAN_FUNC_ISA("avx2") |
32 | | explicit SIMD_8x32(const uint32_t B[8]) |
33 | 0 | { |
34 | 0 | m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B)); |
35 | 0 | } |
36 | | |
37 | | BOTAN_FUNC_ISA("avx2") |
38 | | explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, |
39 | | uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) |
40 | 97.7k | { |
41 | 97.7k | m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0); |
42 | 97.7k | } |
43 | | |
44 | | BOTAN_FUNC_ISA("avx2") |
45 | | static SIMD_8x32 splat(uint32_t B) |
46 | 1.56M | { |
47 | 1.56M | return SIMD_8x32(_mm256_set1_epi32(B)); |
48 | 1.56M | } |
49 | | |
50 | | BOTAN_FUNC_ISA("avx2") |
51 | | static SIMD_8x32 load_le(const uint8_t* in) |
52 | 0 | { |
53 | 0 | return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in))); |
54 | 0 | } |
55 | | |
56 | | BOTAN_FUNC_ISA("avx2") |
57 | | static SIMD_8x32 load_be(const uint8_t* in) |
58 | 0 | { |
59 | 0 | return load_le(in).bswap(); |
60 | 0 | } |
61 | | |
62 | | BOTAN_FUNC_ISA("avx2") |
63 | | void store_le(uint8_t out[]) const |
64 | 781k | { |
65 | 781k | _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); |
66 | 781k | } |
67 | | |
68 | | BOTAN_FUNC_ISA("avx2") |
69 | | void store_be(uint8_t out[]) const |
70 | 0 | { |
71 | 0 | bswap().store_le(out); |
72 | 0 | } |
73 | | |
74 | | template<size_t ROT> |
75 | | BOTAN_FUNC_ISA("avx2") |
76 | | SIMD_8x32 rotl() const |
77 | 15.6M | { |
78 | 15.6M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); |
79 | 15.6M | |
80 | 15.6M | BOTAN_IF_CONSTEXPR(ROT == 8) |
81 | 3.90M | { |
82 | 3.90M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, |
83 | 3.90M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); |
84 | 3.90M | |
85 | 3.90M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); |
86 | 3.90M | } |
87 | 15.6M | else BOTAN_IF_CONSTEXPR(ROT == 16) |
88 | 3.90M | { |
89 | 3.90M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, |
90 | 3.90M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); |
91 | 3.90M | |
92 | 3.90M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); |
93 | 3.90M | } |
94 | 7.81M | else |
95 | 7.81M | { |
96 | 7.81M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), |
97 | 7.81M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); |
98 | 7.81M | } |
99 | 15.6M | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<3ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<1ul>() const Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<7ul>() const Line | Count | Source | 77 | 3.90M | { | 78 | 3.90M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 3.90M | | 80 | 3.90M | BOTAN_IF_CONSTEXPR(ROT == 8) | 81 | 0 | { | 82 | 0 | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 83 | 0 | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 84 | 0 |
| 85 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 86 | 0 | } | 87 | 3.90M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 88 | 0 | { | 89 | 0 | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 90 | 0 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 91 | 0 |
| 92 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 93 | 0 | } | 94 | 3.90M | else | 95 | 3.90M | { | 96 | 3.90M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 97 | 3.90M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 98 | 3.90M | } | 99 | 3.90M | } |
Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<10ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<27ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<31ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<29ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<19ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<26ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<21ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<30ul>() const Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<16ul>() const Line | Count | Source | 77 | 3.90M | { | 78 | 3.90M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 3.90M | | 80 | 3.90M | BOTAN_IF_CONSTEXPR(ROT == 8) | 81 | 0 | { | 82 | 0 | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 83 | 0 | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 84 | 0 |
| 85 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 86 | 0 | } | 87 | 3.90M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 88 | 3.90M | { | 89 | 3.90M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 90 | 3.90M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 91 | 3.90M | | 92 | 3.90M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 93 | 3.90M | } | 94 | 0 | else | 95 | 0 | { | 96 | 0 | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 97 | 0 | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 98 | 0 | } | 99 | 3.90M | } |
Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<12ul>() const Line | Count | Source | 77 | 3.90M | { | 78 | 3.90M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 3.90M | | 80 | 3.90M | BOTAN_IF_CONSTEXPR(ROT == 8) | 81 | 0 | { | 82 | 0 | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 83 | 0 | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 84 | 0 |
| 85 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 86 | 0 | } | 87 | 3.90M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 88 | 0 | { | 89 | 0 | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 90 | 0 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 91 | 0 |
| 92 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 93 | 0 | } | 94 | 3.90M | else | 95 | 3.90M | { | 96 | 3.90M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 97 | 3.90M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 98 | 3.90M | } | 99 | 3.90M | } |
Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<8ul>() const Line | Count | Source | 77 | 3.90M | { | 78 | 3.90M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 3.90M | | 80 | 3.90M | BOTAN_IF_CONSTEXPR(ROT == 8) | 81 | 3.90M | { | 82 | 3.90M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 83 | 3.90M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 84 | 3.90M | | 85 | 3.90M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 86 | 3.90M | } | 87 | 3.90M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 88 | 0 | { | 89 | 0 | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 90 | 0 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 91 | 0 |
| 92 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 93 | 0 | } | 94 | 0 | else | 95 | 0 | { | 96 | 0 | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 97 | 0 | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 98 | 0 | } | 99 | 3.90M | } |
|
100 | | |
101 | | template<size_t ROT> |
102 | | BOTAN_FUNC_ISA("avx2") |
103 | | SIMD_8x32 rotr() const |
104 | 0 | { |
105 | 0 | return this->rotl<32-ROT>(); |
106 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<7ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<1ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<3ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<6ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<11ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<2ul>() const |
107 | | |
108 | | template<size_t ROT1, size_t ROT2, size_t ROT3> |
109 | | SIMD_8x32 rho() const |
110 | 0 | { |
111 | 0 | SIMD_8x32 res; |
112 | 0 |
|
113 | 0 | const SIMD_8x32 rot1 = this->rotr<ROT1>(); |
114 | 0 | const SIMD_8x32 rot2 = this->rotr<ROT2>(); |
115 | 0 | const SIMD_8x32 rot3 = this->rotr<ROT3>(); |
116 | 0 |
|
117 | 0 | return rot1 ^ rot2 ^ rot3; |
118 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rho<6ul, 11ul, 25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rho<2ul, 13ul, 22ul>() const |
119 | | |
120 | | SIMD_8x32 operator+(const SIMD_8x32& other) const |
121 | 195k | { |
122 | 195k | SIMD_8x32 retval(*this); |
123 | 195k | retval += other; |
124 | 195k | return retval; |
125 | 195k | } |
126 | | |
127 | | SIMD_8x32 operator-(const SIMD_8x32& other) const |
128 | 0 | { |
129 | 0 | SIMD_8x32 retval(*this); |
130 | 0 | retval -= other; |
131 | 0 | return retval; |
132 | 0 | } |
133 | | |
134 | | SIMD_8x32 operator^(const SIMD_8x32& other) const |
135 | 0 | { |
136 | 0 | SIMD_8x32 retval(*this); |
137 | 0 | retval ^= other; |
138 | 0 | return retval; |
139 | 0 | } |
140 | | |
141 | | SIMD_8x32 operator|(const SIMD_8x32& other) const |
142 | 0 | { |
143 | 0 | SIMD_8x32 retval(*this); |
144 | 0 | retval |= other; |
145 | 0 | return retval; |
146 | 0 | } |
147 | | |
148 | | SIMD_8x32 operator&(const SIMD_8x32& other) const |
149 | 0 | { |
150 | 0 | SIMD_8x32 retval(*this); |
151 | 0 | retval &= other; |
152 | 0 | return retval; |
153 | 0 | } |
154 | | |
155 | | BOTAN_FUNC_ISA("avx2") |
156 | | void operator+=(const SIMD_8x32& other) |
157 | 16.6M | { |
158 | 16.6M | m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); |
159 | 16.6M | } |
160 | | |
161 | | BOTAN_FUNC_ISA("avx2") |
162 | | void operator-=(const SIMD_8x32& other) |
163 | 0 | { |
164 | 0 | m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); |
165 | 0 | } |
166 | | |
167 | | BOTAN_FUNC_ISA("avx2") |
168 | | void operator^=(const SIMD_8x32& other) |
169 | 15.6M | { |
170 | 15.6M | m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); |
171 | 15.6M | } |
172 | | |
173 | | BOTAN_FUNC_ISA("avx2") |
174 | | void operator|=(const SIMD_8x32& other) |
175 | 0 | { |
176 | 0 | m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); |
177 | 0 | } |
178 | | |
179 | | BOTAN_FUNC_ISA("avx2") |
180 | | void operator&=(const SIMD_8x32& other) |
181 | 0 | { |
182 | 0 | m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); |
183 | 0 | } |
184 | | |
185 | | template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shl() const |
186 | 0 | { |
187 | 0 | return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); |
188 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<3>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<7>() const |
189 | | |
190 | | template<int SHIFT> BOTAN_FUNC_ISA("avx2")SIMD_8x32 shr() const |
191 | | { |
192 | | return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); |
193 | | } |
194 | | |
195 | | BOTAN_FUNC_ISA("avx2") |
196 | | SIMD_8x32 operator~() const |
197 | 0 | { |
198 | 0 | return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); |
199 | 0 | } |
200 | | |
201 | | // (~reg) & other |
202 | | BOTAN_FUNC_ISA("avx2") |
203 | | SIMD_8x32 andc(const SIMD_8x32& other) const |
204 | 0 | { |
205 | 0 | return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); |
206 | 0 | } |
207 | | |
208 | | BOTAN_FUNC_ISA("avx2") |
209 | | SIMD_8x32 bswap() const |
210 | 0 | { |
211 | 0 | const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0, |
212 | 0 | 7, 6, 5, 4, |
213 | 0 | 11, 10, 9, 8, |
214 | 0 | 15, 14, 13, 12, |
215 | 0 | 19, 18, 17, 16, |
216 | 0 | 23, 22, 21, 20, |
217 | 0 | 27, 26, 25, 24, |
218 | 0 | 31, 30, 29, 28 }; |
219 | 0 |
|
220 | 0 | const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK)); |
221 | 0 |
|
222 | 0 | const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap); |
223 | 0 |
|
224 | 0 | return SIMD_8x32(output); |
225 | 0 | } |
226 | | |
227 | | BOTAN_FUNC_ISA("avx2") |
228 | | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, |
229 | | SIMD_8x32& B2, SIMD_8x32& B3) |
230 | 195k | { |
231 | 195k | const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2); |
232 | 195k | const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2); |
233 | 195k | const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2); |
234 | 195k | const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2); |
235 | 195k | |
236 | 195k | B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1); |
237 | 195k | B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1); |
238 | 195k | B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3); |
239 | 195k | B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3); |
240 | 195k | } |
241 | | |
242 | | BOTAN_FUNC_ISA("avx2") |
243 | | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, |
244 | | SIMD_8x32& B2, SIMD_8x32& B3, |
245 | | SIMD_8x32& B4, SIMD_8x32& B5, |
246 | | SIMD_8x32& B6, SIMD_8x32& B7) |
247 | 97.7k | { |
248 | 97.7k | transpose(B0, B1, B2, B3); |
249 | 97.7k | transpose(B4, B5, B6, B7); |
250 | 97.7k | |
251 | 97.7k | swap_tops(B0, B4); |
252 | 97.7k | swap_tops(B1, B5); |
253 | 97.7k | swap_tops(B2, B6); |
254 | 97.7k | swap_tops(B3, B7); |
255 | 97.7k | } |
256 | | |
257 | | BOTAN_FUNC_ISA("avx2") |
258 | | static void reset_registers() |
259 | 48.8k | { |
260 | 48.8k | _mm256_zeroupper(); |
261 | 48.8k | } |
262 | | |
263 | | BOTAN_FUNC_ISA("avx2") |
264 | | static void zero_registers() |
265 | 48.8k | { |
266 | 48.8k | _mm256_zeroall(); |
267 | 48.8k | } |
268 | | |
269 | 1.56M | __m256i BOTAN_FUNC_ISA("avx2") handle() const { return m_avx2; } |
270 | | |
271 | | BOTAN_FUNC_ISA("avx2") |
272 | 17.9M | SIMD_8x32(__m256i x) : m_avx2(x) {} |
273 | | |
274 | | private: |
275 | | |
276 | | BOTAN_FUNC_ISA("avx2") |
277 | | static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) |
278 | 390k | { |
279 | 390k | SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4)); |
280 | 390k | SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4)); |
281 | 390k | A = T0; |
282 | 390k | B = T1; |
283 | 390k | } |
284 | | |
285 | | __m256i m_avx2; |
286 | | }; |
287 | | |
288 | | } |
289 | | |
290 | | #endif |