Coverage Report

Created: 2025-04-11 06:34

/src/botan/build/include/internal/botan/internal/simd_4x64.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
* (C) 2022,2025 Jack Lloyd
3
*
4
* Botan is released under the Simplified BSD License (see license.txt)
5
*/
6
7
#ifndef BOTAN_SIMD_4X64_H_
8
#define BOTAN_SIMD_4X64_H_
9
10
#include <botan/compiler.h>
11
#include <botan/types.h>
12
#include <botan/internal/isa_extn.h>
13
#include <botan/internal/target_info.h>
14
15
#if defined(BOTAN_TARGET_CPU_SUPPORTS_AVX2)
16
   #include <immintrin.h>
17
#endif
18
19
namespace Botan {
20
21
class SIMD_4x64 final {
22
   public:
23
      SIMD_4x64& operator=(const SIMD_4x64& other) = default;
24
      SIMD_4x64(const SIMD_4x64& other) = default;
25
26
      SIMD_4x64& operator=(SIMD_4x64&& other) = default;
27
      SIMD_4x64(SIMD_4x64&& other) = default;
28
29
      ~SIMD_4x64() = default;
30
31
      // zero initialized
32
19.2k
      BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64() { m_simd = _mm256_setzero_si256(); }
33
34
      // Load two halves at different addresses
35
19.2k
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_le2(const void* inl, const void* inh) {
36
19.2k
         return SIMD_4x64(
37
19.2k
            _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(inl), reinterpret_cast<const __m128i*>(inh)));
38
19.2k
      }
39
40
19.2k
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_be2(const void* inl, const void* inh) {
41
19.2k
         return SIMD_4x64::load_le2(inl, inh).bswap();
42
19.2k
      }
43
44
96.2k
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_le(const void* in) {
45
96.2k
         return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
46
96.2k
      }
47
48
0
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_be(const void* in) { return SIMD_4x64::load_le(in).bswap(); }
49
50
19.2k
      SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 bswap() const {
51
19.2k
         const auto idx = _mm256_set_epi8(
52
19.2k
            8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
53
54
19.2k
         return SIMD_4x64(_mm256_shuffle_epi8(m_simd, idx));
55
19.2k
      }
56
57
0
      void store_le(uint64_t out[4]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }
58
59
0
      BOTAN_FN_ISA_SIMD_4X64 void store_le(uint8_t out[]) const {
60
0
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
61
0
      }
62
63
96.2k
      BOTAN_FN_ISA_SIMD_4X64 void store_le2(void* outh, void* outl) {
64
96.2k
         _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(outh), reinterpret_cast<__m128i*>(outl), m_simd);
65
96.2k
      }
66
67
327k
      SIMD_4x64 operator+(const SIMD_4x64& other) const {
68
327k
         SIMD_4x64 retval(*this);
69
327k
         retval += other;
70
327k
         return retval;
71
327k
      }
72
73
307k
      SIMD_4x64 operator^(const SIMD_4x64& other) const {
74
307k
         SIMD_4x64 retval(*this);
75
307k
         retval ^= other;
76
307k
         return retval;
77
307k
      }
78
79
327k
      BOTAN_FN_ISA_SIMD_4X64 void operator+=(const SIMD_4x64& other) {
80
327k
         m_simd = _mm256_add_epi64(m_simd, other.m_simd);
81
327k
      }
82
83
307k
      BOTAN_FN_ISA_SIMD_4X64 void operator^=(const SIMD_4x64& other) {
84
307k
         m_simd = _mm256_xor_si256(m_simd, other.m_simd);
85
307k
      }
86
87
      template <size_t ROT>
88
      BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 rotr() const
89
         requires(ROT > 0 && ROT < 64)
90
307k
      {
91
#if defined(__AVX512VL__)
92
         return SIMD_4x64(_mm256_ror_epi64(m_simd, ROT));
93
#else
94
307k
         if constexpr(ROT == 8) {
95
76.9k
            auto shuf_rot_8 =
96
76.9k
               _mm256_set_epi64x(0x080f0e0d0c0b0a09, 0x0007060504030201, 0x080f0e0d0c0b0a09, 0x0007060504030201);
97
98
76.9k
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_8));
99
76.9k
         } else if constexpr(ROT == 16) {
100
0
            auto shuf_rot_16 =
101
0
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);
102
103
0
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
104
0
         } else if constexpr(ROT == 24) {
105
0
            auto shuf_rot_24 =
106
0
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);
107
108
0
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
109
0
         } else if constexpr(ROT == 32) {
110
0
            auto shuf_rot_32 =
111
0
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);
112
113
0
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
114
230k
         } else {
115
230k
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
116
230k
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
117
230k
         }
118
307k
#endif
119
307k
      }
_ZNK5Botan9SIMD_4x644rotrILm1EEES0_vQaagtT_Li0EltT_Li64E
Line
Count
Source
90
76.9k
      {
91
#if defined(__AVX512VL__)
92
         return SIMD_4x64(_mm256_ror_epi64(m_simd, ROT));
93
#else
94
         if constexpr(ROT == 8) {
95
            auto shuf_rot_8 =
96
               _mm256_set_epi64x(0x080f0e0d0c0b0a09, 0x0007060504030201, 0x080f0e0d0c0b0a09, 0x0007060504030201);
97
98
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_8));
99
         } else if constexpr(ROT == 16) {
100
            auto shuf_rot_16 =
101
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);
102
103
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
104
         } else if constexpr(ROT == 24) {
105
            auto shuf_rot_24 =
106
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);
107
108
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
109
         } else if constexpr(ROT == 32) {
110
            auto shuf_rot_32 =
111
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);
112
113
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
114
76.9k
         } else {
115
76.9k
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
116
76.9k
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
117
76.9k
         }
118
76.9k
#endif
119
76.9k
      }
_ZNK5Botan9SIMD_4x644rotrILm8EEES0_vQaagtT_Li0EltT_Li64E
Line
Count
Source
90
76.9k
      {
91
#if defined(__AVX512VL__)
92
         return SIMD_4x64(_mm256_ror_epi64(m_simd, ROT));
93
#else
94
76.9k
         if constexpr(ROT == 8) {
95
76.9k
            auto shuf_rot_8 =
96
76.9k
               _mm256_set_epi64x(0x080f0e0d0c0b0a09, 0x0007060504030201, 0x080f0e0d0c0b0a09, 0x0007060504030201);
97
98
76.9k
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_8));
99
         } else if constexpr(ROT == 16) {
100
            auto shuf_rot_16 =
101
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);
102
103
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
104
         } else if constexpr(ROT == 24) {
105
            auto shuf_rot_24 =
106
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);
107
108
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
109
         } else if constexpr(ROT == 32) {
110
            auto shuf_rot_32 =
111
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);
112
113
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
114
         } else {
115
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
116
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
117
         }
118
76.9k
#endif
119
76.9k
      }
_ZNK5Botan9SIMD_4x644rotrILm19EEES0_vQaagtT_Li0EltT_Li64E
Line
Count
Source
90
76.9k
      {
91
#if defined(__AVX512VL__)
92
         return SIMD_4x64(_mm256_ror_epi64(m_simd, ROT));
93
#else
94
         if constexpr(ROT == 8) {
95
            auto shuf_rot_8 =
96
               _mm256_set_epi64x(0x080f0e0d0c0b0a09, 0x0007060504030201, 0x080f0e0d0c0b0a09, 0x0007060504030201);
97
98
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_8));
99
         } else if constexpr(ROT == 16) {
100
            auto shuf_rot_16 =
101
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);
102
103
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
104
         } else if constexpr(ROT == 24) {
105
            auto shuf_rot_24 =
106
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);
107
108
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
109
         } else if constexpr(ROT == 32) {
110
            auto shuf_rot_32 =
111
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);
112
113
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
114
76.9k
         } else {
115
76.9k
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
116
76.9k
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
117
76.9k
         }
118
76.9k
#endif
119
76.9k
      }
_ZNK5Botan9SIMD_4x644rotrILm61EEES0_vQaagtT_Li0EltT_Li64E
Line
Count
Source
90
76.9k
      {
91
#if defined(__AVX512VL__)
92
         return SIMD_4x64(_mm256_ror_epi64(m_simd, ROT));
93
#else
94
         if constexpr(ROT == 8) {
95
            auto shuf_rot_8 =
96
               _mm256_set_epi64x(0x080f0e0d0c0b0a09, 0x0007060504030201, 0x080f0e0d0c0b0a09, 0x0007060504030201);
97
98
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_8));
99
         } else if constexpr(ROT == 16) {
100
            auto shuf_rot_16 =
101
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);
102
103
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
104
         } else if constexpr(ROT == 24) {
105
            auto shuf_rot_24 =
106
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);
107
108
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
109
         } else if constexpr(ROT == 32) {
110
            auto shuf_rot_32 =
111
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);
112
113
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
114
76.9k
         } else {
115
76.9k
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
116
76.9k
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
117
76.9k
         }
118
76.9k
#endif
119
76.9k
      }
Unexecuted instantiation: _ZNK5Botan9SIMD_4x644rotrILm32EEES0_vQaagtT_Li0EltT_Li64E
Unexecuted instantiation: _ZNK5Botan9SIMD_4x644rotrILm24EEES0_vQaagtT_Li0EltT_Li64E
Unexecuted instantiation: _ZNK5Botan9SIMD_4x644rotrILm16EEES0_vQaagtT_Li0EltT_Li64E
Unexecuted instantiation: _ZNK5Botan9SIMD_4x644rotrILm63EEES0_vQaagtT_Li0EltT_Li64E
120
121
      template <size_t ROT>
122
      SIMD_4x64 rotl() const {
123
         return this->rotr<64 - ROT>();
124
      }
125
126
      template <int SHIFT>
127
153k
      SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 shr() const noexcept {
128
153k
         return SIMD_4x64(_mm256_srli_epi64(m_simd, SHIFT));
129
153k
      }
Botan::SIMD_4x64 Botan::SIMD_4x64::shr<7>() const
Line
Count
Source
127
76.9k
      SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 shr() const noexcept {
128
76.9k
         return SIMD_4x64(_mm256_srli_epi64(m_simd, SHIFT));
129
76.9k
      }
Botan::SIMD_4x64 Botan::SIMD_4x64::shr<6>() const
Line
Count
Source
127
76.9k
      SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 shr() const noexcept {
128
76.9k
         return SIMD_4x64(_mm256_srli_epi64(m_simd, SHIFT));
129
76.9k
      }
130
131
153k
      static SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 alignr8(const SIMD_4x64& a, const SIMD_4x64& b) {
132
153k
         return SIMD_4x64(_mm256_alignr_epi8(a.m_simd, b.m_simd, 8));
133
153k
      }
134
135
      // Argon2 specific operation
136
0
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y) {
137
0
         const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
138
0
         return SIMD_4x64(_mm256_add_epi64(m, m));
139
0
      }
140
141
      template <uint8_t CTRL>
142
0
      static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 permute_4x64(SIMD_4x64 x) {
143
0
         return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
144
0
      }
Unexecuted instantiation: Botan::SIMD_4x64 Botan::SIMD_4x64::permute_4x64<(unsigned char)57>(Botan::SIMD_4x64)
Unexecuted instantiation: Botan::SIMD_4x64 Botan::SIMD_4x64::permute_4x64<(unsigned char)78>(Botan::SIMD_4x64)
Unexecuted instantiation: Botan::SIMD_4x64 Botan::SIMD_4x64::permute_4x64<(unsigned char)147>(Botan::SIMD_4x64)
145
146
      // Argon2 specific
147
0
      static void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
148
0
         B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
149
0
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
150
0
         D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
151
0
      }
152
153
      // Argon2 specific
154
0
      static void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
155
0
         B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
156
0
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
157
0
         D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
158
0
      }
159
160
750k
      explicit BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64(__m256i x) : m_simd(x) {}
161
162
   private:
163
      __m256i m_simd;
164
};
165
166
}  // namespace Botan
167
168
#endif