/src/botan/build/include/botan/internal/simd_avx2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * (C) 2018 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #ifndef BOTAN_SIMD_AVX2_H_ |
8 | | #define BOTAN_SIMD_AVX2_H_ |
9 | | |
10 | | #include <botan/types.h> |
11 | | #include <immintrin.h> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | class SIMD_8x32 final |
16 | | { |
17 | | public: |
18 | | |
19 | | SIMD_8x32& operator=(const SIMD_8x32& other) = default; |
20 | | SIMD_8x32(const SIMD_8x32& other) = default; |
21 | | |
22 | | SIMD_8x32& operator=(SIMD_8x32&& other) = default; |
23 | | SIMD_8x32(SIMD_8x32&& other) = default; |
24 | | |
25 | | BOTAN_FUNC_ISA("avx2") |
26 | | BOTAN_FORCE_INLINE SIMD_8x32() |
27 | 0 | { |
28 | 0 | m_avx2 = _mm256_setzero_si256(); |
29 | 0 | } |
30 | | |
31 | | BOTAN_FUNC_ISA("avx2") |
32 | | explicit SIMD_8x32(const uint32_t B[8]) |
33 | 0 | { |
34 | 0 | m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B)); |
35 | 0 | } |
36 | | |
37 | | BOTAN_FUNC_ISA("avx2") |
38 | | explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, |
39 | | uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) |
40 | 104k | { |
41 | 104k | m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0); |
42 | 104k | } |
43 | | |
44 | | BOTAN_FUNC_ISA("avx2") |
45 | | static SIMD_8x32 splat(uint32_t B) |
46 | 1.67M | { |
47 | 1.67M | return SIMD_8x32(_mm256_set1_epi32(B)); |
48 | 1.67M | } |
49 | | |
50 | | BOTAN_FUNC_ISA("avx2") |
51 | | static SIMD_8x32 load_le(const uint8_t* in) |
52 | 0 | { |
53 | 0 | return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in))); |
54 | 0 | } |
55 | | |
56 | | BOTAN_FUNC_ISA("avx2") |
57 | | static SIMD_8x32 load_be(const uint8_t* in) |
58 | 0 | { |
59 | 0 | return load_le(in).bswap(); |
60 | 0 | } |
61 | | |
62 | | BOTAN_FUNC_ISA("avx2") |
63 | | void store_le(uint8_t out[]) const |
64 | 838k | { |
65 | 838k | _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); |
66 | 838k | } |
67 | | |
68 | | BOTAN_FUNC_ISA("avx2") |
69 | | void store_be(uint8_t out[]) const |
70 | 0 | { |
71 | 0 | bswap().store_le(out); |
72 | 0 | } |
73 | | |
74 | | template<size_t ROT> |
75 | | BOTAN_FUNC_ISA("avx2") |
76 | | SIMD_8x32 rotl() const |
77 | 16.7M | { |
78 | 16.7M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); |
79 | 16.7M | |
80 | | #if defined(__AVX512VL__) |
81 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); |
82 | | #else |
83 | 16.7M | BOTAN_IF_CONSTEXPR(ROT == 8) |
84 | 4.19M | { |
85 | 4.19M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, |
86 | 4.19M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); |
87 | 4.19M | |
88 | 4.19M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); |
89 | 4.19M | } |
90 | 16.7M | else BOTAN_IF_CONSTEXPR(ROT == 16) |
91 | 4.19M | { |
92 | 4.19M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, |
93 | 4.19M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); |
94 | 4.19M | |
95 | 4.19M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); |
96 | 4.19M | } |
97 | 8.38M | else |
98 | 8.38M | { |
99 | 8.38M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), |
100 | 8.38M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); |
101 | 8.38M | } |
102 | 16.7M | #endif |
103 | 16.7M | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<3ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<1ul>() const Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<7ul>() const Line | Count | Source | 77 | 4.19M | { | 78 | 4.19M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 4.19M | | 80 | | #if defined(__AVX512VL__) | 81 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 82 | | #else | 83 | 4.19M | BOTAN_IF_CONSTEXPR(ROT == 8) | 84 | 0 | { | 85 | 0 | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 86 | 0 | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 87 | 0 |
| 88 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 89 | 0 | } | 90 | 4.19M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 91 | 0 | { | 92 | 0 | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 93 | 0 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 94 | 0 |
| 95 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | 0 | } | 97 | 4.19M | else | 98 | 4.19M | { | 99 | 4.19M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 100 | 4.19M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 101 | 4.19M | } | 102 | 4.19M | #endif | 103 | 4.19M | } |
Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<10ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<27ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<31ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<29ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<19ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<26ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<21ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<30ul>() const Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<16ul>() const Line | Count | Source | 77 | 4.19M | { | 78 | 4.19M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 4.19M | | 80 | | #if defined(__AVX512VL__) | 81 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 82 | | #else | 83 | 4.19M | BOTAN_IF_CONSTEXPR(ROT == 8) | 84 | 0 | { | 85 | 0 | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 86 | 0 | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 87 | 0 |
| 88 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 89 | 0 | } | 90 | 4.19M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 91 | 4.19M | { | 92 | 4.19M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 93 | 4.19M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 94 | 4.19M | | 95 | 4.19M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | 4.19M | } | 97 | 0 | else | 98 | 0 | { | 99 | 0 | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 100 | 0 | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 101 | 0 | } | 102 | 4.19M | #endif | 103 | 4.19M | } |
Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<12ul>() const Line | Count | Source | 77 | 4.19M | { | 78 | 4.19M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 4.19M | | 80 | | #if defined(__AVX512VL__) | 81 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 82 | | #else | 83 | 4.19M | BOTAN_IF_CONSTEXPR(ROT == 8) | 84 | 0 | { | 85 | 0 | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 86 | 0 | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 87 | 0 |
| 88 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 89 | 0 | } | 90 | 4.19M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 91 | 0 | { | 92 | 0 | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 93 | 0 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 94 | 0 |
| 95 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | 0 | } | 97 | 4.19M | else | 98 | 4.19M | { | 99 | 4.19M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 100 | 4.19M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 101 | 4.19M | } | 102 | 4.19M | #endif | 103 | 4.19M | } |
Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<8ul>() const Line | Count | Source | 77 | 4.19M | { | 78 | 4.19M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 79 | 4.19M | | 80 | | #if defined(__AVX512VL__) | 81 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 82 | | #else | 83 | 4.19M | BOTAN_IF_CONSTEXPR(ROT == 8) | 84 | 4.19M | { | 85 | 4.19M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 86 | 4.19M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 87 | 4.19M | | 88 | 4.19M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 89 | 4.19M | } | 90 | 4.19M | else BOTAN_IF_CONSTEXPR(ROT == 16) | 91 | 0 | { | 92 | 0 | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 93 | 0 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 94 | 0 |
| 95 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | 0 | } | 97 | 0 | else | 98 | 0 | { | 99 | 0 | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 100 | 0 | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 101 | 0 | } | 102 | 4.19M | #endif | 103 | 4.19M | } |
|
104 | | |
105 | | template<size_t ROT> |
106 | | BOTAN_FUNC_ISA("avx2") |
107 | | SIMD_8x32 rotr() const |
108 | 0 | { |
109 | 0 | return this->rotl<32-ROT>(); |
110 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<7ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<1ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<3ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<6ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<11ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<2ul>() const |
111 | | |
112 | | template<size_t ROT1, size_t ROT2, size_t ROT3> |
113 | | SIMD_8x32 BOTAN_FUNC_ISA("avx2") rho() const |
114 | 0 | { |
115 | 0 | SIMD_8x32 res; |
116 | 0 |
|
117 | 0 | const SIMD_8x32 rot1 = this->rotr<ROT1>(); |
118 | 0 | const SIMD_8x32 rot2 = this->rotr<ROT2>(); |
119 | 0 | const SIMD_8x32 rot3 = this->rotr<ROT3>(); |
120 | 0 |
|
121 | 0 | return rot1 ^ rot2 ^ rot3; |
122 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rho<6ul, 11ul, 25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rho<2ul, 13ul, 22ul>() const |
123 | | |
124 | | BOTAN_FUNC_ISA("avx2") |
125 | | SIMD_8x32 operator+(const SIMD_8x32& other) const |
126 | 209k | { |
127 | 209k | SIMD_8x32 retval(*this); |
128 | 209k | retval += other; |
129 | 209k | return retval; |
130 | 209k | } |
131 | | |
132 | | BOTAN_FUNC_ISA("avx2") |
133 | | SIMD_8x32 operator-(const SIMD_8x32& other) const |
134 | 0 | { |
135 | 0 | SIMD_8x32 retval(*this); |
136 | 0 | retval -= other; |
137 | 0 | return retval; |
138 | 0 | } |
139 | | |
140 | | BOTAN_FUNC_ISA("avx2") |
141 | | SIMD_8x32 operator^(const SIMD_8x32& other) const |
142 | 0 | { |
143 | 0 | SIMD_8x32 retval(*this); |
144 | 0 | retval ^= other; |
145 | 0 | return retval; |
146 | 0 | } |
147 | | |
148 | | BOTAN_FUNC_ISA("avx2") |
149 | | SIMD_8x32 operator|(const SIMD_8x32& other) const |
150 | 0 | { |
151 | 0 | SIMD_8x32 retval(*this); |
152 | 0 | retval |= other; |
153 | 0 | return retval; |
154 | 0 | } |
155 | | |
156 | | BOTAN_FUNC_ISA("avx2") |
157 | | SIMD_8x32 operator&(const SIMD_8x32& other) const |
158 | 0 | { |
159 | 0 | SIMD_8x32 retval(*this); |
160 | 0 | retval &= other; |
161 | 0 | return retval; |
162 | 0 | } |
163 | | |
164 | | BOTAN_FUNC_ISA("avx2") |
165 | | void operator+=(const SIMD_8x32& other) |
166 | 17.8M | { |
167 | 17.8M | m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); |
168 | 17.8M | } |
169 | | |
170 | | BOTAN_FUNC_ISA("avx2") |
171 | | void operator-=(const SIMD_8x32& other) |
172 | 0 | { |
173 | 0 | m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); |
174 | 0 | } |
175 | | |
176 | | BOTAN_FUNC_ISA("avx2") |
177 | | void operator^=(const SIMD_8x32& other) |
178 | 16.7M | { |
179 | 16.7M | m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); |
180 | 16.7M | } |
181 | | |
182 | | BOTAN_FUNC_ISA("avx2") |
183 | | void operator|=(const SIMD_8x32& other) |
184 | 0 | { |
185 | 0 | m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); |
186 | 0 | } |
187 | | |
188 | | BOTAN_FUNC_ISA("avx2") |
189 | | void operator&=(const SIMD_8x32& other) |
190 | 0 | { |
191 | 0 | m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); |
192 | 0 | } |
193 | | |
194 | | template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shl() const |
195 | 0 | { |
196 | 0 | return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); |
197 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<3>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<7>() const |
198 | | |
199 | | template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shr() const |
200 | | { |
201 | | return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); |
202 | | } |
203 | | |
204 | | BOTAN_FUNC_ISA("avx2") |
205 | | SIMD_8x32 operator~() const |
206 | 0 | { |
207 | 0 | return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); |
208 | 0 | } |
209 | | |
210 | | // (~reg) & other |
211 | | BOTAN_FUNC_ISA("avx2") |
212 | | SIMD_8x32 andc(const SIMD_8x32& other) const |
213 | 0 | { |
214 | 0 | return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); |
215 | 0 | } |
216 | | |
217 | | BOTAN_FUNC_ISA("avx2") |
218 | | SIMD_8x32 bswap() const |
219 | 0 | { |
220 | 0 | const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0, |
221 | 0 | 7, 6, 5, 4, |
222 | 0 | 11, 10, 9, 8, |
223 | 0 | 15, 14, 13, 12, |
224 | 0 | 19, 18, 17, 16, |
225 | 0 | 23, 22, 21, 20, |
226 | 0 | 27, 26, 25, 24, |
227 | 0 | 31, 30, 29, 28 }; |
228 | 0 |
|
229 | 0 | const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK)); |
230 | 0 |
|
231 | 0 | const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap); |
232 | 0 |
|
233 | 0 | return SIMD_8x32(output); |
234 | 0 | } |
235 | | |
236 | | BOTAN_FUNC_ISA("avx2") |
237 | | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, |
238 | | SIMD_8x32& B2, SIMD_8x32& B3) |
239 | 209k | { |
240 | 209k | const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2); |
241 | 209k | const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2); |
242 | 209k | const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2); |
243 | 209k | const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2); |
244 | 209k | |
245 | 209k | B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1); |
246 | 209k | B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1); |
247 | 209k | B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3); |
248 | 209k | B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3); |
249 | 209k | } |
250 | | |
251 | | BOTAN_FUNC_ISA("avx2") |
252 | | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, |
253 | | SIMD_8x32& B2, SIMD_8x32& B3, |
254 | | SIMD_8x32& B4, SIMD_8x32& B5, |
255 | | SIMD_8x32& B6, SIMD_8x32& B7) |
256 | 104k | { |
257 | 104k | transpose(B0, B1, B2, B3); |
258 | 104k | transpose(B4, B5, B6, B7); |
259 | 104k | |
260 | 104k | swap_tops(B0, B4); |
261 | 104k | swap_tops(B1, B5); |
262 | 104k | swap_tops(B2, B6); |
263 | 104k | swap_tops(B3, B7); |
264 | 104k | } |
265 | | |
266 | | BOTAN_FUNC_ISA("avx2") |
267 | | static void reset_registers() |
268 | 52.4k | { |
269 | 52.4k | _mm256_zeroupper(); |
270 | 52.4k | } |
271 | | |
272 | | BOTAN_FUNC_ISA("avx2") |
273 | | static void zero_registers() |
274 | 52.4k | { |
275 | 52.4k | _mm256_zeroall(); |
276 | 52.4k | } |
277 | | |
278 | 1.67M | __m256i BOTAN_FUNC_ISA("avx2") handle() const { return m_avx2; } |
279 | | |
280 | | BOTAN_FUNC_ISA("avx2") |
281 | 19.2M | SIMD_8x32(__m256i x) : m_avx2(x) {} |
282 | | |
283 | | private: |
284 | | |
285 | | BOTAN_FUNC_ISA("avx2") |
286 | | static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) |
287 | 419k | { |
288 | 419k | SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4)); |
289 | 419k | SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4)); |
290 | 419k | A = T0; |
291 | 419k | B = T1; |
292 | 419k | } |
293 | | |
294 | | __m256i m_avx2; |
295 | | }; |
296 | | |
297 | | } |
298 | | |
299 | | #endif |