/src/botan/build/include/internal/botan/internal/simd_avx2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * (C) 2018 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #ifndef BOTAN_SIMD_AVX2_H_ |
8 | | #define BOTAN_SIMD_AVX2_H_ |
9 | | |
10 | | #include <botan/types.h> |
11 | | #include <immintrin.h> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | #define BOTAN_AVX2_FN BOTAN_FUNC_ISA("avx2") |
16 | | |
17 | | class SIMD_8x32 final { |
18 | | public: |
19 | | SIMD_8x32& operator=(const SIMD_8x32& other) = default; |
20 | | SIMD_8x32(const SIMD_8x32& other) = default; |
21 | | |
22 | | SIMD_8x32& operator=(SIMD_8x32&& other) = default; |
23 | | SIMD_8x32(SIMD_8x32&& other) = default; |
24 | | |
25 | | ~SIMD_8x32() = default; |
26 | | |
27 | | BOTAN_AVX2_FN |
28 | 0 | BOTAN_FORCE_INLINE SIMD_8x32() noexcept { m_avx2 = _mm256_setzero_si256(); } |
29 | | |
30 | | BOTAN_AVX2_FN |
31 | 0 | explicit SIMD_8x32(const uint32_t B[8]) noexcept { |
32 | 0 | m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B)); |
33 | 0 | } |
34 | | |
35 | | BOTAN_AVX2_FN |
36 | | explicit SIMD_8x32(uint32_t B0, |
37 | | uint32_t B1, |
38 | | uint32_t B2, |
39 | | uint32_t B3, |
40 | | uint32_t B4, |
41 | | uint32_t B5, |
42 | | uint32_t B6, |
43 | 37.8k | uint32_t B7) noexcept { |
44 | 37.8k | m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0); |
45 | 37.8k | } |
46 | | |
47 | | BOTAN_AVX2_FN |
48 | 606k | static SIMD_8x32 splat(uint32_t B) noexcept { return SIMD_8x32(_mm256_set1_epi32(B)); } |
49 | | |
50 | | BOTAN_AVX2_FN |
51 | 0 | static SIMD_8x32 load_le(const uint8_t* in) noexcept { |
52 | 0 | return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in))); |
53 | 0 | } |
54 | | |
55 | | BOTAN_AVX2_FN |
56 | 0 | static SIMD_8x32 load_le128(const uint8_t* in) noexcept { |
57 | 0 | return SIMD_8x32(_mm256_broadcastsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)))); |
58 | 0 | } |
59 | | |
60 | | BOTAN_AVX2_FN |
61 | 0 | static SIMD_8x32 load_le128(const uint32_t* in) noexcept { |
62 | 0 | return SIMD_8x32(_mm256_broadcastsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)))); |
63 | 0 | } |
64 | | |
65 | | BOTAN_AVX2_FN |
66 | 0 | static SIMD_8x32 load_be(const uint8_t* in) noexcept { return load_le(in).bswap(); } |
67 | | |
68 | | BOTAN_AVX2_FN |
69 | 303k | void store_le(uint8_t out[]) const noexcept { _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); } |
70 | | |
71 | | BOTAN_AVX2_FN |
72 | 0 | void store_le128(uint8_t out[]) const noexcept { |
73 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(out), _mm256_extracti128_si256(raw(), 0)); |
74 | 0 | } |
75 | | |
76 | | BOTAN_AVX2_FN |
77 | 0 | void store_be(uint8_t out[]) const noexcept { bswap().store_le(out); } |
78 | | |
79 | | template <size_t ROT> |
80 | | BOTAN_AVX2_FN SIMD_8x32 rotl() const noexcept |
81 | | requires(ROT > 0 && ROT < 32) |
82 | 6.06M | { |
83 | | #if defined(__AVX512VL__) |
84 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); |
85 | | #else |
86 | 6.06M | if constexpr(ROT == 8) { |
87 | 1.51M | const __m256i shuf_rotl_8 = |
88 | 1.51M | _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003); |
89 | | |
90 | 1.51M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); |
91 | 1.51M | } else if constexpr(ROT == 16) { |
92 | 1.51M | const __m256i shuf_rotl_16 = |
93 | 1.51M | _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302); |
94 | | |
95 | 1.51M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); |
96 | 1.51M | } else if constexpr(ROT == 24) { |
97 | 0 | const __m256i shuf_rotl_24 = |
98 | 0 | _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201); |
99 | |
|
100 | 0 | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24)); |
101 | 3.03M | } else { |
102 | 3.03M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), |
103 | 3.03M | _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT)))); |
104 | 3.03M | } |
105 | 6.06M | #endif |
106 | 6.06M | } Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm13EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm3EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm1EEES0_vQaagtT_Li0EltT_Li32E _ZNK5Botan9SIMD_8x324rotlILm7EEES0_vQaagtT_Li0EltT_Li32E Line | Count | Source | 82 | 1.51M | { | 83 | | #if defined(__AVX512VL__) | 84 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 85 | | #else | 86 | | if constexpr(ROT == 8) { | 87 | | const __m256i shuf_rotl_8 = | 88 | | _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003); | 89 | | | 90 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | | } else if constexpr(ROT == 16) { | 92 | | const __m256i shuf_rotl_16 = | 93 | | _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302); | 94 | | | 95 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | | } else if constexpr(ROT == 24) { | 97 | | const __m256i shuf_rotl_24 = | 98 | | _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201); | 99 | | | 100 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24)); | 101 | 1.51M | } else { | 102 | 1.51M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 103 | 1.51M | _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT)))); | 104 | 1.51M | } | 105 | 1.51M | #endif | 106 | 1.51M | } |
Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm5EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm22EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm10EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm27EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm25EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm31EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm29EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm19EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm30EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm26EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm21EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm2EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm18EEES0_vQaagtT_Li0EltT_Li32E Unexecuted instantiation: _ZNK5Botan9SIMD_8x324rotlILm24EEES0_vQaagtT_Li0EltT_Li32E _ZNK5Botan9SIMD_8x324rotlILm16EEES0_vQaagtT_Li0EltT_Li32E Line | Count | Source | 82 | 1.51M | { | 83 | | #if defined(__AVX512VL__) | 84 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 85 | | #else | 86 | | if constexpr(ROT == 8) { | 87 | | const __m256i shuf_rotl_8 = | 88 | | _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003); | 89 | | | 90 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | 1.51M | } else if constexpr(ROT == 16) { | 92 | 1.51M | const __m256i shuf_rotl_16 = | 93 | 1.51M | _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302); | 94 | | | 95 | 1.51M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | | } else if constexpr(ROT == 24) { | 97 | | const __m256i shuf_rotl_24 = | 98 | | _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201); | 99 | | | 100 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24)); | 101 | | } else { | 102 | | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 103 | | _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT)))); | 104 | | } | 105 | 1.51M | #endif | 106 | 1.51M | } |
_ZNK5Botan9SIMD_8x324rotlILm12EEES0_vQaagtT_Li0EltT_Li32E Line | Count | Source | 82 | 1.51M | { | 83 | | #if defined(__AVX512VL__) | 84 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 85 | | #else | 86 | | if constexpr(ROT == 8) { | 87 | | const __m256i shuf_rotl_8 = | 88 | | _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003); | 89 | | | 90 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | | } else if constexpr(ROT == 16) { | 92 | | const __m256i shuf_rotl_16 = | 93 | | _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302); | 94 | | | 95 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | | } else if constexpr(ROT == 24) { | 97 | | const __m256i shuf_rotl_24 = | 98 | | _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201); | 99 | | | 100 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24)); | 101 | 1.51M | } else { | 102 | 1.51M | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 103 | 1.51M | _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT)))); | 104 | 1.51M | } | 105 | 1.51M | #endif | 106 | 1.51M | } |
_ZNK5Botan9SIMD_8x324rotlILm8EEES0_vQaagtT_Li0EltT_Li32E Line | Count | Source | 82 | 1.51M | { | 83 | | #if defined(__AVX512VL__) | 84 | | return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); | 85 | | #else | 86 | 1.51M | if constexpr(ROT == 8) { | 87 | 1.51M | const __m256i shuf_rotl_8 = | 88 | 1.51M | _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003); | 89 | | | 90 | 1.51M | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); | 91 | | } else if constexpr(ROT == 16) { | 92 | | const __m256i shuf_rotl_16 = | 93 | | _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302); | 94 | | | 95 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); | 96 | | } else if constexpr(ROT == 24) { | 97 | | const __m256i shuf_rotl_24 = | 98 | | _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201); | 99 | | | 100 | | return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24)); | 101 | | } else { | 102 | | return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), | 103 | | _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT)))); | 104 | | } | 105 | 1.51M | #endif | 106 | 1.51M | } |
|
107 | | |
108 | | template <size_t ROT> |
109 | 0 | BOTAN_AVX2_FN SIMD_8x32 rotr() const noexcept { |
110 | 0 | return this->rotl<32 - ROT>(); |
111 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<22ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<5ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<7ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<1ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<3ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<13ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<2ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<6ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<11ul>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::rotr<25ul>() const |
112 | | |
113 | 0 | SIMD_8x32 BOTAN_AVX2_FN sigma0() const noexcept { |
114 | 0 | const SIMD_8x32 rot1 = this->rotr<2>(); |
115 | 0 | const SIMD_8x32 rot2 = this->rotr<13>(); |
116 | 0 | const SIMD_8x32 rot3 = this->rotr<22>(); |
117 | 0 | return rot1 ^ rot2 ^ rot3; |
118 | 0 | } |
119 | | |
120 | 0 | SIMD_8x32 BOTAN_AVX2_FN sigma1() const noexcept { |
121 | 0 | const SIMD_8x32 rot1 = this->rotr<6>(); |
122 | 0 | const SIMD_8x32 rot2 = this->rotr<11>(); |
123 | 0 | const SIMD_8x32 rot3 = this->rotr<25>(); |
124 | 0 | return rot1 ^ rot2 ^ rot3; |
125 | 0 | } |
126 | | |
127 | | BOTAN_AVX2_FN |
128 | 75.7k | SIMD_8x32 operator+(const SIMD_8x32& other) const noexcept { |
129 | 75.7k | SIMD_8x32 retval(*this); |
130 | 75.7k | retval += other; |
131 | 75.7k | return retval; |
132 | 75.7k | } |
133 | | |
134 | | BOTAN_AVX2_FN |
135 | 0 | SIMD_8x32 operator-(const SIMD_8x32& other) const noexcept { |
136 | 0 | SIMD_8x32 retval(*this); |
137 | 0 | retval -= other; |
138 | 0 | return retval; |
139 | 0 | } |
140 | | |
141 | | BOTAN_AVX2_FN |
142 | 0 | SIMD_8x32 operator^(const SIMD_8x32& other) const noexcept { |
143 | 0 | SIMD_8x32 retval(*this); |
144 | 0 | retval ^= other; |
145 | 0 | return retval; |
146 | 0 | } |
147 | | |
148 | | BOTAN_AVX2_FN |
149 | 0 | SIMD_8x32 operator|(const SIMD_8x32& other) const noexcept { |
150 | 0 | SIMD_8x32 retval(*this); |
151 | 0 | retval |= other; |
152 | 0 | return retval; |
153 | 0 | } |
154 | | |
155 | | BOTAN_AVX2_FN |
156 | 0 | SIMD_8x32 operator&(const SIMD_8x32& other) const noexcept { |
157 | 0 | SIMD_8x32 retval(*this); |
158 | 0 | retval &= other; |
159 | 0 | return retval; |
160 | 0 | } |
161 | | |
162 | | BOTAN_AVX2_FN |
163 | 6.44M | void operator+=(const SIMD_8x32& other) { m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); } |
164 | | |
165 | | BOTAN_AVX2_FN |
166 | 0 | void operator-=(const SIMD_8x32& other) { m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); } |
167 | | |
168 | | BOTAN_AVX2_FN |
169 | 6.06M | void operator^=(const SIMD_8x32& other) { m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); } |
170 | | |
171 | | BOTAN_AVX2_FN |
172 | 0 | void operator^=(uint32_t other) { *this ^= SIMD_8x32::splat(other); } |
173 | | |
174 | | BOTAN_AVX2_FN |
175 | 0 | void operator|=(const SIMD_8x32& other) { m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); } |
176 | | |
177 | | BOTAN_AVX2_FN |
178 | 0 | void operator&=(const SIMD_8x32& other) { m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); } |
179 | | |
180 | | template <int SHIFT> |
181 | 0 | BOTAN_AVX2_FN SIMD_8x32 shl() const noexcept { |
182 | 0 | return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); |
183 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<3>() const Unexecuted instantiation: Botan::SIMD_8x32 Botan::SIMD_8x32::shl<7>() const |
184 | | |
185 | | template <int SHIFT> |
186 | | BOTAN_AVX2_FN SIMD_8x32 shr() const noexcept { |
187 | | return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); |
188 | | } |
189 | | |
190 | | BOTAN_AVX2_FN |
191 | 0 | SIMD_8x32 operator~() const noexcept { |
192 | 0 | return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); |
193 | 0 | } |
194 | | |
195 | | // (~reg) & other |
196 | | BOTAN_AVX2_FN |
197 | 0 | SIMD_8x32 andc(const SIMD_8x32& other) const noexcept { |
198 | 0 | return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); |
199 | 0 | } |
200 | | |
201 | | BOTAN_AVX2_FN |
202 | 0 | SIMD_8x32 bswap() const noexcept { |
203 | 0 | const uint8_t BSWAP_MASK[32] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, |
204 | 0 | 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; |
205 | |
|
206 | 0 | const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK)); |
207 | |
|
208 | 0 | const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap); |
209 | |
|
210 | 0 | return SIMD_8x32(output); |
211 | 0 | } |
212 | | |
213 | | BOTAN_AVX2_FN |
214 | 0 | SIMD_8x32 rev_words() const noexcept { return SIMD_8x32(_mm256_shuffle_epi32(raw(), 0b00011011)); } |
215 | | |
216 | | BOTAN_AVX2_FN |
217 | 75.7k | static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, SIMD_8x32& B2, SIMD_8x32& B3) noexcept { |
218 | 75.7k | const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2); |
219 | 75.7k | const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2); |
220 | 75.7k | const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2); |
221 | 75.7k | const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2); |
222 | | |
223 | 75.7k | B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1); |
224 | 75.7k | B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1); |
225 | 75.7k | B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3); |
226 | 75.7k | B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3); |
227 | 75.7k | } |
228 | | |
229 | | BOTAN_AVX2_FN |
230 | | static void transpose(SIMD_8x32& B0, |
231 | | SIMD_8x32& B1, |
232 | | SIMD_8x32& B2, |
233 | | SIMD_8x32& B3, |
234 | | SIMD_8x32& B4, |
235 | | SIMD_8x32& B5, |
236 | | SIMD_8x32& B6, |
237 | 37.8k | SIMD_8x32& B7) noexcept { |
238 | 37.8k | transpose(B0, B1, B2, B3); |
239 | 37.8k | transpose(B4, B5, B6, B7); |
240 | | |
241 | 37.8k | swap_tops(B0, B4); |
242 | 37.8k | swap_tops(B1, B5); |
243 | 37.8k | swap_tops(B2, B6); |
244 | 37.8k | swap_tops(B3, B7); |
245 | 37.8k | } |
246 | | |
247 | | BOTAN_AVX2_FN |
248 | 0 | static SIMD_8x32 choose(const SIMD_8x32& mask, const SIMD_8x32& a, const SIMD_8x32& b) noexcept { |
249 | | #if defined(__AVX512VL__) |
250 | | return _mm256_ternarylogic_epi32(mask.raw(), a.raw(), b.raw(), 0xca); |
251 | | #else |
252 | 0 | return (mask & a) ^ mask.andc(b); |
253 | 0 | #endif |
254 | 0 | } |
255 | | |
256 | | BOTAN_AVX2_FN |
257 | 0 | static SIMD_8x32 majority(const SIMD_8x32& x, const SIMD_8x32& y, const SIMD_8x32& z) noexcept { |
258 | | #if defined(__AVX512VL__) |
259 | | return _mm256_ternarylogic_epi32(x.raw(), y.raw(), z.raw(), 0xe8); |
260 | | #else |
261 | 0 | return SIMD_8x32::choose(x ^ y, z, y); |
262 | 0 | #endif |
263 | 0 | } |
264 | | |
265 | | BOTAN_AVX2_FN |
266 | 18.9k | static void reset_registers() noexcept { _mm256_zeroupper(); } |
267 | | |
268 | | BOTAN_AVX2_FN |
269 | 18.9k | static void zero_registers() noexcept { _mm256_zeroall(); } |
270 | | |
271 | 606k | __m256i BOTAN_AVX2_FN raw() const noexcept { return m_avx2; } |
272 | | |
273 | | BOTAN_AVX2_FN |
274 | 6.97M | SIMD_8x32(__m256i x) noexcept : m_avx2(x) {} |
275 | | |
276 | | private: |
277 | | BOTAN_AVX2_FN |
278 | 151k | static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) { |
279 | 151k | SIMD_8x32 T0 = _mm256_permute2x128_si256(A.raw(), B.raw(), 0 + (2 << 4)); |
280 | 151k | SIMD_8x32 T1 = _mm256_permute2x128_si256(A.raw(), B.raw(), 1 + (3 << 4)); |
281 | 151k | A = T0; |
282 | 151k | B = T1; |
283 | 151k | } |
284 | | |
285 | | __m256i m_avx2; |
286 | | }; |
287 | | |
288 | | template <size_t R> |
289 | 0 | inline SIMD_8x32 rotl(SIMD_8x32 input) { |
290 | 0 | return input.rotl<R>(); |
291 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotl<13ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotl<3ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotl<1ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotl<7ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotl<5ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotl<22ul>(Botan::SIMD_8x32) |
292 | | |
293 | | template <size_t R> |
294 | 0 | inline SIMD_8x32 rotr(SIMD_8x32 input) { |
295 | 0 | return input.rotr<R>(); |
296 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotr<22ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotr<5ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotr<7ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotr<1ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotr<3ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::rotr<13ul>(Botan::SIMD_8x32) |
297 | | |
298 | | // For Serpent: |
299 | | template <size_t S> |
300 | 0 | inline SIMD_8x32 shl(SIMD_8x32 input) { |
301 | 0 | return input.shl<S>(); |
302 | 0 | } Unexecuted instantiation: Botan::SIMD_8x32 Botan::shl<3ul>(Botan::SIMD_8x32) Unexecuted instantiation: Botan::SIMD_8x32 Botan::shl<7ul>(Botan::SIMD_8x32) |
303 | | |
304 | | } // namespace Botan |
305 | | |
306 | | #endif |