/src/botan/build/include/internal/botan/internal/simd_4x64.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * (C) 2022,2025 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #ifndef BOTAN_SIMD_4X64_H_ |
8 | | #define BOTAN_SIMD_4X64_H_ |
9 | | |
10 | | #include <botan/compiler.h> |
11 | | #include <botan/types.h> |
12 | | #include <botan/internal/isa_extn.h> |
13 | | #include <botan/internal/target_info.h> |
14 | | |
15 | | #if defined(BOTAN_TARGET_CPU_SUPPORTS_AVX2) |
16 | | #include <immintrin.h> |
17 | | #endif |
18 | | |
19 | | namespace Botan { |
20 | | |
// Wrapper around a 256-bit AVX2 register treated as four 64-bit words.
//
// All member functions that touch the register carry BOTAN_FN_ISA_SIMD_4X64,
// which (per the isa_extn.h convention) applies the target ISA attribute so
// these can be compiled into a binary that only uses AVX2 on capable CPUs.
class SIMD_4x64 final {
   public:
      SIMD_4x64& operator=(const SIMD_4x64& other) = default;
      SIMD_4x64(const SIMD_4x64& other) = default;

      SIMD_4x64& operator=(SIMD_4x64&& other) = default;
      SIMD_4x64(SIMD_4x64&& other) = default;

      ~SIMD_4x64() = default;

      // zero initialized
      BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64() { m_simd = _mm256_setzero_si256(); }

      // Load two halves at different addresses
      //
      // NOTE(review): _mm256_loadu2_m128i takes (hiaddr, loaddr), so here inl
      // lands in the high 128 bits and inh in the low 128 bits — confirm this
      // matches the lane ordering callers expect (store_le2 passes outh first).
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_le2(const void* inl, const void* inh) {
         return SIMD_4x64(
            _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(inl), reinterpret_cast<const __m128i*>(inh)));
      }

      // Big-endian variant of load_le2: same two-half load, then a per-word
      // byte swap
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_be2(const void* inl, const void* inh) {
         return SIMD_4x64::load_le2(inl, inh).bswap();
      }

      // Load 32 bytes of little-endian data from a single (possibly
      // unaligned) address
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_le(const void* in) {
         return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
      }

      // Load 32 bytes of big-endian data (little-endian load + byte swap)
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_be(const void* in) { return SIMD_4x64::load_le(in).bswap(); }

      // Reverse the byte order within each of the four 64-bit words.
      // The shuffle index (applied independently to each 128-bit lane by
      // _mm256_shuffle_epi8) maps byte i of each 8-byte word to byte 7-i.
      SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 bswap() const {
         const auto idx = _mm256_set_epi8(
            8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

         return SIMD_4x64(_mm256_shuffle_epi8(m_simd, idx));
      }

      // Store the four words to a uint64_t array (little-endian byte order)
      void store_le(uint64_t out[4]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      // Store all 32 bytes to a (possibly unaligned) byte pointer
      BOTAN_FN_ISA_SIMD_4X64 void store_le(uint8_t out[]) const {
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
      }

      // Store the high 128 bits to outh and the low 128 bits to outl
      // (matches _mm256_storeu2_m128i's (hiaddr, loaddr) argument order)
      BOTAN_FN_ISA_SIMD_4X64 void store_le2(void* outh, void* outl) {
         _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(outh), reinterpret_cast<__m128i*>(outl), m_simd);
      }

      // Lane-wise 64-bit addition
      SIMD_4x64 operator+(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval += other;
         return retval;
      }

      // Bitwise XOR
      SIMD_4x64 operator^(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval ^= other;
         return retval;
      }

      BOTAN_FN_ISA_SIMD_4X64 void operator+=(const SIMD_4x64& other) {
         m_simd = _mm256_add_epi64(m_simd, other.m_simd);
      }

      BOTAN_FN_ISA_SIMD_4X64 void operator^=(const SIMD_4x64& other) {
         m_simd = _mm256_xor_si256(m_simd, other.m_simd);
      }

      // Rotate each 64-bit word right by ROT bits.
      //
      // With AVX-512VL available this is a single _mm256_ror_epi64; otherwise
      // byte-aligned rotations (8/16/24/32) are done with a pshufb byte
      // shuffle (the _mm256_set_epi64x constants are per-lane byte permutation
      // indices), and all other amounts fall back to the usual
      // (x >> ROT) | (x << (64 - ROT)) shift/or pair.
      template <size_t ROT>
      BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 rotr() const
         requires(ROT > 0 && ROT < 64)
      {
#if defined(__AVX512VL__)
         return SIMD_4x64(_mm256_ror_epi64(m_simd, ROT));
#else
         if constexpr(ROT == 8) {
            auto shuf_rot_8 =
               _mm256_set_epi64x(0x080f0e0d0c0b0a09, 0x0007060504030201, 0x080f0e0d0c0b0a09, 0x0007060504030201);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_8));
         } else if constexpr(ROT == 16) {
            auto shuf_rot_16 =
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
         } else if constexpr(ROT == 24) {
            auto shuf_rot_24 =
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
         } else if constexpr(ROT == 32) {
            auto shuf_rot_32 =
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
         } else {
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
         }
#endif
      }

      // Rotate each 64-bit word left by ROT bits, via the right-rotation
      // identity rotl(x, r) == rotr(x, 64 - r)
      template <size_t ROT>
      SIMD_4x64 rotl() const {
         return this->rotr<64 - ROT>();
      }

      // Logical (zero-filling) right shift of each 64-bit word by SHIFT bits
      template <int SHIFT>
      SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 shr() const noexcept {
         return SIMD_4x64(_mm256_srli_epi64(m_simd, SHIFT));
      }

      // Within each 128-bit lane, concatenate a:b and extract the 16 bytes
      // starting at byte offset 8 (i.e. swap-and-merge of 64-bit halves)
      static SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 alignr8(const SIMD_4x64& a, const SIMD_4x64& b) {
         return SIMD_4x64(_mm256_alignr_epi8(a.m_simd, b.m_simd, 8));
      }

      // Argon2 specific operation
      //
      // Computes 2 * (low32(x) * low32(y)) per 64-bit lane:
      // _mm256_mul_epu32 multiplies the low 32 bits of each word to a 64-bit
      // product, and adding m to itself doubles it (cheaper than a shift here,
      // presumably — same result either way)
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y) {
         const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_4x64(_mm256_add_epi64(m, m));
      }

      // Permute the four 64-bit words according to the 2-bit selectors in CTRL
      template <uint8_t CTRL>
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 permute_4x64(SIMD_4x64 x) {
         return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
      }

      // Argon2 specific
      //
      // Rotates the words of B, C, D by 1, 2, 3 positions respectively
      // (diagonalization step of the BLAKE2-style round; untwist inverts it)
      static void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
      }

      // Argon2 specific
      //
      // Inverse of twist(): applies the opposite word rotations to B and D
      // (C's half-rotation is self-inverse)
      static void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
      }

      // Wrap a raw AVX2 register
      explicit BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64(__m256i x) : m_simd(x) {}

   private:
      __m256i m_simd;
};
165 | | |
166 | | } // namespace Botan |
167 | | |
168 | | #endif |