/src/botan/build/include/botan/internal/simd_avx2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * (C) 2018 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #ifndef BOTAN_SIMD_AVX2_H_ |
8 | | #define BOTAN_SIMD_AVX2_H_ |
9 | | |
10 | | #include <botan/types.h> |
11 | | #include <immintrin.h> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | #define BOTAN_AVX2_FN BOTAN_FUNC_ISA("avx2") |
16 | | |
17 | | class SIMD_8x32 final |
18 | | { |
19 | | public: |
20 | | |
21 | | SIMD_8x32& operator=(const SIMD_8x32& other) = default; |
22 | | SIMD_8x32(const SIMD_8x32& other) = default; |
23 | | |
24 | | SIMD_8x32& operator=(SIMD_8x32&& other) = default; |
25 | | SIMD_8x32(SIMD_8x32&& other) = default; |
26 | | |
27 | | BOTAN_AVX2_FN |
28 | | BOTAN_FORCE_INLINE SIMD_8x32() |
29 | 0 | { |
30 | 0 | m_avx2 = _mm256_setzero_si256(); |
31 | 0 | } |
32 | | |
33 | | BOTAN_AVX2_FN |
34 | | explicit SIMD_8x32(const uint32_t B[8]) |
35 | 0 | { |
36 | 0 | m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B)); |
37 | 0 | } |
38 | | |
39 | | BOTAN_AVX2_FN |
40 | | explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, |
41 | | uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) |
42 | 117k | { |
43 | 117k | m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0); |
44 | 117k | } |
45 | | |
46 | | BOTAN_AVX2_FN |
47 | | static SIMD_8x32 splat(uint32_t B) |
48 | 1.87M | { |
49 | 1.87M | return SIMD_8x32(_mm256_set1_epi32(B)); |
50 | 1.87M | } |
51 | | |
52 | | BOTAN_AVX2_FN |
53 | | static SIMD_8x32 load_le(const uint8_t* in) |
54 | 0 | { |
55 | 0 | return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in))); |
56 | 0 | } |
57 | | |
58 | | BOTAN_AVX2_FN |
59 | | static SIMD_8x32 load_be(const uint8_t* in) |
60 | 0 | { |
61 | 0 | return load_le(in).bswap(); |
62 | 0 | } |
63 | | |
64 | | BOTAN_AVX2_FN |
65 | | void store_le(uint8_t out[]) const |
66 | 936k | { |
67 | 936k | _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); |
68 | 936k | } |
69 | | |
70 | | BOTAN_AVX2_FN |
71 | | void store_be(uint8_t out[]) const |
72 | 0 | { |
73 | 0 | bswap().store_le(out); |
74 | 0 | } |
75 | | |
76 | | template<size_t ROT> |
77 | | BOTAN_AVX2_FN |
78 | | SIMD_8x32 rotl() const |
79 | 18.7M | { |
80 | 18.7M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); |
81 | | |
82 | | #if defined(__AVX512VL__) |
83 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); |
84 | | #else |
85 | 18.7M | if constexpr(ROT == 8) |
86 | 4.68M | { |
87 | 14.0M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, |
88 | 14.0M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); |
89 | | |
90 | 14.0M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); |
91 | 14.0M | } |
92 | 14.0M | else if constexpr(ROT == 16) |
93 | 4.68M | { |
94 | 9.36M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, |
95 | 9.36M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); |
96 | | |
97 | 9.36M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); |
98 | 9.36M | } |
99 | 9.36M | else |
100 | 9.36M | { |
101 | 9.36M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), |
102 | 9.36M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); |
103 | 9.36M | } |
104 | 18.7M | #endif |
105 | 18.7M | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<30ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<19ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<10ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<26ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<21ul>() const Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<7ul>() const Line | Count | Source | 79 | 4.68M | { | 80 | 4.68M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 81 | | | 82 | | #if defined(__AVX512VL__) | 83 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 84 | | #else | 85 | 4.68M | if constexpr(ROT == 8) | 86 | 0 | { | 87 | 4.68M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 88 | 4.68M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 89 | | | 90 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | 4.68M | } | 92 | 4.68M | else if constexpr(ROT == 16) | 93 | 0 | { | 94 | 4.68M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 95 | 4.68M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 96 | | | 97 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 98 | 4.68M | } | 99 | 4.68M | else | 100 | 4.68M | { | 101 | 4.68M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 102 | 4.68M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 103 | 4.68M | } | 104 | 4.68M | #endif | 105 | 4.68M | } |
Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<3ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<1ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<27ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<31ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<29ul>() const Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<16ul>() const Line | Count | Source | 79 | 4.68M | { | 80 | 4.68M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 81 | | | 82 | | #if defined(__AVX512VL__) | 83 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 84 | | #else | 85 | 4.68M | if constexpr(ROT == 8) | 86 | 0 | { | 87 | 4.68M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 88 | 4.68M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 89 | | | 90 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | 4.68M | } | 92 | 4.68M | else if constexpr(ROT == 16) | 93 | 4.68M | { | 94 | 4.68M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 95 | 4.68M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 96 | | | 97 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 98 | 4.68M | } | 99 | 4.68M | else | 100 | 4.68M | { | 101 | 4.68M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 102 | 4.68M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 103 | 4.68M | } | 104 | 4.68M | #endif | 105 | 4.68M | } |
Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<12ul>() const Line | Count | Source | 79 | 4.68M | { | 80 | 4.68M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 81 | | | 82 | | #if defined(__AVX512VL__) | 83 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 84 | | #else | 85 | 4.68M | if constexpr(ROT == 8) | 86 | 0 | { | 87 | 4.68M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 88 | 4.68M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 89 | | | 90 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | 4.68M | } | 92 | 4.68M | else if constexpr(ROT == 16) | 93 | 0 | { | 94 | 4.68M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 95 | 4.68M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 96 | | | 97 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 98 | 4.68M | } | 99 | 4.68M | else | 100 | 4.68M | { | 101 | 4.68M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 102 | 4.68M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 103 | 4.68M | } | 104 | 4.68M | #endif | 105 | 4.68M | } |
Botan::SIMD_8x32 Botan::SIMD_8x32::rotl<8ul>() const Line | Count | Source | 79 | 4.68M | { | 80 | 4.68M | static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); | 81 | | | 82 | | #if defined(__AVX512VL__) | 83 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 84 | | #else | 85 | 4.68M | if constexpr(ROT == 8) | 86 | 4.68M | { | 87 | 4.68M | const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, | 88 | 4.68M | 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); | 89 | | | 90 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | 4.68M | } | 92 | 4.68M | else if constexpr(ROT == 16) | 93 | 4.68M | { | 94 | 4.68M | const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, | 95 | 4.68M | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); | 96 | | | 97 | 4.68M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 98 | 4.68M | } | 99 | 4.68M | else | 100 | 4.68M | { | 101 | 4.68M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 102 | 4.68M | _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); | 103 | 4.68M | } | 104 | 4.68M | #endif | 105 | 4.68M | } |
|
106 | | |
107 | | template<size_t ROT> |
108 | | BOTAN_AVX2_FN |
109 | | SIMD_8x32 rotr() const |
110 | 0 | { |
111 | 0 | return this->rotl<32-ROT>(); |
112 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<2ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<6ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<11ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<25ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<7ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<1ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<3ul>() const |
113 | | |
114 | | SIMD_8x32 BOTAN_AVX2_FN sigma0() const |
115 | 0 | { |
116 | 0 | const SIMD_8x32 rot1 = this->rotr<2>(); |
117 | 0 | const SIMD_8x32 rot2 = this->rotr<13>(); |
118 | 0 | const SIMD_8x32 rot3 = this->rotr<22>(); |
119 | 0 | return rot1 ^ rot2 ^ rot3; |
120 | 0 | } |
121 | | |
122 | | SIMD_8x32 BOTAN_AVX2_FN sigma1() const |
123 | 0 | { |
124 | 0 | const SIMD_8x32 rot1 = this->rotr<6>(); |
125 | 0 | const SIMD_8x32 rot2 = this->rotr<11>(); |
126 | 0 | const SIMD_8x32 rot3 = this->rotr<25>(); |
127 | 0 | return rot1 ^ rot2 ^ rot3; |
128 | 0 | } |
129 | | |
130 | | BOTAN_AVX2_FN |
131 | | SIMD_8x32 operator+(const SIMD_8x32& other) const |
132 | 234k | { |
133 | 234k | SIMD_8x32 retval(*this); |
134 | 234k | retval += other; |
135 | 234k | return retval; |
136 | 234k | } |
137 | | |
138 | | BOTAN_AVX2_FN |
139 | | SIMD_8x32 operator-(const SIMD_8x32& other) const |
140 | 0 | { |
141 | 0 | SIMD_8x32 retval(*this); |
142 | 0 | retval -= other; |
143 | 0 | return retval; |
144 | 0 | } |
145 | | |
146 | | BOTAN_AVX2_FN |
147 | | SIMD_8x32 operator^(const SIMD_8x32& other) const |
148 | 0 | { |
149 | 0 | SIMD_8x32 retval(*this); |
150 | 0 | retval ^= other; |
151 | 0 | return retval; |
152 | 0 | } |
153 | | |
154 | | BOTAN_AVX2_FN |
155 | | SIMD_8x32 operator|(const SIMD_8x32& other) const |
156 | 0 | { |
157 | 0 | SIMD_8x32 retval(*this); |
158 | 0 | retval |= other; |
159 | 0 | return retval; |
160 | 0 | } |
161 | | |
162 | | BOTAN_AVX2_FN |
163 | | SIMD_8x32 operator&(const SIMD_8x32& other) const |
164 | 0 | { |
165 | 0 | SIMD_8x32 retval(*this); |
166 | 0 | retval &= other; |
167 | 0 | return retval; |
168 | 0 | } |
169 | | |
170 | | BOTAN_AVX2_FN |
171 | | void operator+=(const SIMD_8x32& other) |
172 | 19.9M | { |
173 | 19.9M | m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); |
174 | 19.9M | } |
175 | | |
176 | | BOTAN_AVX2_FN |
177 | | void operator-=(const SIMD_8x32& other) |
178 | 0 | { |
179 | 0 | m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); |
180 | 0 | } |
181 | | |
182 | | BOTAN_AVX2_FN |
183 | | void operator^=(const SIMD_8x32& other) |
184 | 18.7M | { |
185 | 18.7M | m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); |
186 | 18.7M | } |
187 | | |
188 | | BOTAN_AVX2_FN |
189 | | void operator^=(uint32_t other) |
190 | 0 | { |
191 | 0 | *this ^= SIMD_8x32::splat(other); |
192 | 0 | } |
193 | | |
194 | | BOTAN_AVX2_FN |
195 | | void operator|=(const SIMD_8x32& other) |
196 | 0 | { |
197 | 0 | m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); |
198 | 0 | } |
199 | | |
200 | | BOTAN_AVX2_FN |
201 | | void operator&=(const SIMD_8x32& other) |
202 | 0 | { |
203 | 0 | m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); |
204 | 0 | } |
205 | | |
206 | | template<int SHIFT> BOTAN_AVX2_FN SIMD_8x32 shl() const |
207 | 0 | { |
208 | 0 | return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); |
209 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<3>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<7>() const |
210 | | |
211 | | template<int SHIFT> BOTAN_AVX2_FN SIMD_8x32 shr() const |
212 | | { |
213 | | return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); |
214 | | } |
215 | | |
216 | | BOTAN_AVX2_FN |
217 | | SIMD_8x32 operator~() const |
218 | 0 | { |
219 | 0 | return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); |
220 | 0 | } |
221 | | |
222 | | // (~reg) & other |
223 | | BOTAN_AVX2_FN |
224 | | SIMD_8x32 andc(const SIMD_8x32& other) const |
225 | 0 | { |
226 | 0 | return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); |
227 | 0 | } |
228 | | |
229 | | BOTAN_AVX2_FN |
230 | | SIMD_8x32 bswap() const |
231 | 0 | { |
232 | 0 | const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0, |
233 | 0 | 7, 6, 5, 4, |
234 | 0 | 11, 10, 9, 8, |
235 | 0 | 15, 14, 13, 12, |
236 | 0 | 19, 18, 17, 16, |
237 | 0 | 23, 22, 21, 20, |
238 | 0 | 27, 26, 25, 24, |
239 | 0 | 31, 30, 29, 28 }; |
240 | |
|
241 | 0 | const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK)); |
242 | |
|
243 | 0 | const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap); |
244 | |
|
245 | 0 | return SIMD_8x32(output); |
246 | 0 | } |
247 | | |
248 | | BOTAN_AVX2_FN |
249 | | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, |
250 | | SIMD_8x32& B2, SIMD_8x32& B3) |
251 | 234k | { |
252 | 234k | const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2); |
253 | 234k | const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2); |
254 | 234k | const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2); |
255 | 234k | const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2); |
256 | | |
257 | 234k | B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1); |
258 | 234k | B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1); |
259 | 234k | B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3); |
260 | 234k | B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3); |
261 | 234k | } |
262 | | |
263 | | BOTAN_AVX2_FN |
264 | | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, |
265 | | SIMD_8x32& B2, SIMD_8x32& B3, |
266 | | SIMD_8x32& B4, SIMD_8x32& B5, |
267 | | SIMD_8x32& B6, SIMD_8x32& B7) |
268 | 117k | { |
269 | 117k | transpose(B0, B1, B2, B3); |
270 | 117k | transpose(B4, B5, B6, B7); |
271 | | |
272 | 117k | swap_tops(B0, B4); |
273 | 117k | swap_tops(B1, B5); |
274 | 117k | swap_tops(B2, B6); |
275 | 117k | swap_tops(B3, B7); |
276 | 117k | } |
277 | | |
278 | | BOTAN_AVX2_FN |
279 | | static SIMD_8x32 choose(const SIMD_8x32& mask, const SIMD_8x32& a, const SIMD_8x32& b) |
280 | 0 | { |
281 | | #if defined(__AVX512VL__) |
282 | | return _mm256_ternarylogic_epi32(mask.handle(), a.handle(), b.handle(), 0xca); |
283 | | #else |
284 | 0 | return (mask & a) ^ mask.andc(b); |
285 | 0 | #endif |
286 | 0 | } |
287 | | |
288 | | BOTAN_AVX2_FN |
289 | | static SIMD_8x32 majority(const SIMD_8x32& x, const SIMD_8x32& y, const SIMD_8x32& z) |
290 | 0 | { |
291 | | #if defined(__AVX512VL__) |
292 | | return _mm256_ternarylogic_epi32(x.handle(), y.handle(), z.handle(), 0xe8); |
293 | | #else |
294 | 0 | return SIMD_8x32::choose(x ^ y, z, y); |
295 | 0 | #endif |
296 | 0 | } |
297 | | |
298 | | BOTAN_AVX2_FN |
299 | | static void reset_registers() |
300 | 58.5k | { |
301 | 58.5k | _mm256_zeroupper(); |
302 | 58.5k | } |
303 | | |
304 | | BOTAN_AVX2_FN |
305 | | static void zero_registers() |
306 | 58.5k | { |
307 | 58.5k | _mm256_zeroall(); |
308 | 58.5k | } |
309 | | |
310 | 1.87M | __m256i BOTAN_AVX2_FN handle() const { return m_avx2; } |
311 | | |
312 | | BOTAN_AVX2_FN |
313 | 21.5M | SIMD_8x32(__m256i x) : m_avx2(x) {} |
314 | | |
315 | | private: |
316 | | |
317 | | BOTAN_AVX2_FN |
318 | | static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) |
319 | 468k | { |
320 | 468k | SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4)); |
321 | 468k | SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4)); |
322 | 468k | A = T0; |
323 | 468k | B = T1; |
324 | 468k | } |
325 | | |
326 | | __m256i m_avx2; |
327 | | }; |
328 | | |
329 | | template<size_t R> |
330 | | inline SIMD_8x32 rotl(SIMD_8x32 input) |
331 | | { |
332 | | return input.rotl<R>(); |
333 | | } |
334 | | |
335 | | template<size_t R> |
336 | | inline SIMD_8x32 rotr(SIMD_8x32 input) |
337 | | { |
338 | | return input.rotr<R>(); |
339 | | } |
340 | | |
341 | | // For Serpent: |
342 | | template<size_t S> |
343 | | inline SIMD_8x32 shl(SIMD_8x32 input) |
344 | | { |
345 | | return input.shl<S>(); |
346 | | } |
347 | | |
348 | | } |
349 | | |
350 | | #endif |