/src/cryptopp/lea_simd.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // lea_simd.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // |
3 | | // This source file uses intrinsics and built-ins to gain access to |
4 | | // SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate |
5 | | // source file is needed because additional CXXFLAGS are required to enable |
6 | | // the appropriate instruction sets in some build configurations. |
7 | | |
8 | | #include "pch.h" |
9 | | #include "config.h" |
10 | | |
11 | | #include "lea.h" |
12 | | #include "misc.h" |
13 | | |
14 | | // Uncomment for benchmarking C++ against SSE or NEON. |
15 | | // Do so in both lea.cpp and lea_simd.cpp. |
16 | | // #undef CRYPTOPP_SSSE3_AVAILABLE |
17 | | // #undef CRYPTOPP_ARM_NEON_AVAILABLE |
18 | | |
19 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
20 | | # include "adv_simd.h" |
21 | | # include <pmmintrin.h> |
22 | | # include <tmmintrin.h> |
23 | | #endif |
24 | | |
25 | | #if defined(__XOP__) |
26 | | # if defined(CRYPTOPP_GCC_COMPATIBLE) |
27 | | # include <x86intrin.h> |
28 | | # endif |
29 | | # include <ammintrin.h> |
30 | | #endif // XOP |
31 | | |
32 | | #if (CRYPTOPP_ARM_NEON_HEADER) |
33 | | # include "adv_simd.h" |
34 | | # include <arm_neon.h> |
35 | | #endif |
36 | | |
37 | | #if (CRYPTOPP_ARM_ACLE_HEADER) |
38 | | # include <stdint.h> |
39 | | # include <arm_acle.h> |
40 | | #endif |
41 | | |
42 | | #if defined(_M_ARM64) |
43 | | # include "adv_simd.h" |
44 | | #endif |
45 | | |
46 | | // Do not port this to POWER architecture. Naively we hoped |
47 | | // for a 2x to 3x speedup. The result was a 5x slowdown. |
48 | | // The table below shows MiB/s and cpb. |
49 | | // |
50 | | // C++: |
51 | | // <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64 |
52 | | // <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48 |
53 | | // <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2 |
54 | | // |
55 | | // Power8: |
56 | | // <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7 |
57 | | // <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1 |
58 | | // <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0 |
59 | | |
60 | | #undef CRYPTOPP_POWER8_AVAILABLE |
61 | | #if defined(CRYPTOPP_POWER8_AVAILABLE) |
62 | | # include "adv_simd.h" |
63 | | # include "ppc_simd.h" |
64 | | #endif |
65 | | |
66 | | // Squash MS LNK4221 and libtool warnings |
67 | | extern const char LEA_SIMD_FNAME[] = __FILE__; |
68 | | |
69 | | ANONYMOUS_NAMESPACE_BEGIN |
70 | | |
71 | | using CryptoPP::word32; |
72 | | |
73 | | // *************************** ARM NEON ***************************// |
74 | | |
75 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
76 | | |
77 | | inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b) /* Returns the bitwise XOR of a and b. */ |
78 | | { |
79 | | return veorq_u32(a, b); |
80 | | } |
81 | | |
82 | | inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b) /* Returns a + b computed independently in each 32-bit lane. */ |
83 | | { |
84 | | return vaddq_u32(a, b); |
85 | | } |
86 | | |
87 | | inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b) /* Returns a - b computed independently in each 32-bit lane. */ |
88 | | { |
89 | | return vsubq_u32(a, b); |
90 | | } |
91 | | |
92 | | template <unsigned int R> /* R is the rotate amount in bits, fixed at compile time. */ |
93 | | inline uint32x4_t RotateLeft(const uint32x4_t& val) /* Rotates every 32-bit lane of val left by R bits. */ |
94 | | { |
95 | | const uint32x4_t a(vshlq_n_u32(val, R)); /* lane << R */ |
96 | | const uint32x4_t b(vshrq_n_u32(val, 32 - R)); /* lane >> (32 - R): the bits that wrap around */ |
97 | | return vorrq_u32(a, b); /* merge the two halves of the rotate */ |
98 | | } |
99 | | |
100 | | template <unsigned int R> /* R is the rotate amount in bits, fixed at compile time. */ |
101 | | inline uint32x4_t RotateRight(const uint32x4_t& val) /* Rotates every 32-bit lane of val right by R bits. */ |
102 | | { |
103 | | const uint32x4_t a(vshlq_n_u32(val, 32 - R)); /* lane << (32 - R): the bits that wrap around */ |
104 | | const uint32x4_t b(vshrq_n_u32(val, R)); /* lane >> R */ |
105 | | return vorrq_u32(a, b); /* merge the two halves of the rotate */ |
106 | | } |
107 | | |
108 | | #if defined(__aarch32__) || defined(__aarch64__) |
109 | | template <> /* Specialization for R = 8: a whole-byte rotate, done with one byte table lookup instead of two shifts and an OR. */ |
110 | | inline uint32x4_t RotateLeft<8>(const uint32x4_t& val) |
111 | | { |
112 | | #if (CRYPTOPP_BIG_ENDIAN) |
113 | | const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 }; |
114 | | const uint8x16_t mask = vld1q_u8(maskb); |
115 | | #else |
116 | | const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; /* per 4-byte lane: out[0..3] = in[3], in[0], in[1], in[2] */ |
117 | | const uint8x16_t mask = vld1q_u8(maskb); |
118 | | #endif |
119 | | |
120 | | return vreinterpretq_u32_u8( /* view as bytes, permute with mask, view as 32-bit lanes again */ |
121 | | vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); |
122 | | } |
123 | | |
124 | | template <> /* Specialization for R = 8: a whole-byte rotate, done with one byte table lookup instead of two shifts and an OR. */ |
125 | | inline uint32x4_t RotateRight<8>(const uint32x4_t& val) |
126 | | { |
127 | | #if (CRYPTOPP_BIG_ENDIAN) |
128 | | const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 }; |
129 | | const uint8x16_t mask = vld1q_u8(maskb); |
130 | | #else |
131 | | const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 }; /* per 4-byte lane: out[0..3] = in[1], in[2], in[3], in[0]; every source byte must appear exactly once */ |
132 | | const uint8x16_t mask = vld1q_u8(maskb); |
133 | | #endif |
134 | | |
135 | | return vreinterpretq_u32_u8( /* view as bytes, permute with mask, view as 32-bit lanes again */ |
136 | | vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); |
137 | | } |
138 | | #endif |
139 | | |
140 | | uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b) /* Interleaves the two low 32-bit lanes of a and b: { a0, b0, a1, b1 }. */ |
141 | | { |
142 | | uint32x2_t a1 = vget_low_u32(a); /* { a0, a1 } */ |
143 | | uint32x2_t b1 = vget_low_u32(b); /* { b0, b1 } */ |
144 | | uint32x2x2_t result = vzip_u32(a1, b1); /* val[0] = { a0, b0 }, val[1] = { a1, b1 } */ |
145 | | return vcombine_u32(result.val[0], result.val[1]); |
146 | | } |
147 | | |
148 | | uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b) /* Interleaves the two high 32-bit lanes of a and b: { a2, b2, a3, b3 }. */ |
149 | | { |
150 | | uint32x2_t a1 = vget_high_u32(a); /* { a2, a3 } */ |
151 | | uint32x2_t b1 = vget_high_u32(b); /* { b2, b3 } */ |
152 | | uint32x2x2_t result = vzip_u32(a1, b1); /* val[0] = { a2, b2 }, val[1] = { a3, b3 } */ |
153 | | return vcombine_u32(result.val[0], result.val[1]); |
154 | | } |
155 | | |
156 | | uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b) /* Returns the low 64 bits of a followed by the low 64 bits of b: { a0, a1, b0, b1 }. */ |
157 | | { |
158 | | uint64x1_t a1 = vget_low_u64(vreinterpretq_u64_u32(a)); /* reinterpret intrinsic instead of a C-style vector cast */ |
159 | | uint64x1_t b1 = vget_low_u64(vreinterpretq_u64_u32(b)); |
160 | | return vreinterpretq_u32_u64(vcombine_u64(a1, b1)); |
161 | | } |
162 | | |
163 | | uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b) /* Returns the high 64 bits of a followed by the high 64 bits of b: { a2, a3, b2, b3 }. */ |
164 | | { |
165 | | uint64x1_t a1 = vget_high_u64(vreinterpretq_u64_u32(a)); /* reinterpret intrinsic instead of a C-style vector cast */ |
166 | | uint64x1_t b1 = vget_high_u64(vreinterpretq_u64_u32(b)); |
167 | | return vreinterpretq_u32_u64(vcombine_u64(a1, b1)); |
168 | | } |
169 | | |
170 | | template <unsigned int IDX> /* IDX is the compile-time index into rkey. */ |
171 | | inline uint32x4_t LoadKey(const word32 rkey[]) /* Returns a vector with rkey[IDX] copied into all four 32-bit lanes. */ |
172 | | { |
173 | | return vdupq_n_u32(rkey[IDX]); |
174 | | } |
175 | | |
176 | | template <unsigned int IDX> /* Primary four-vector template: fallback for an IDX that has no explicit specialization. */ |
177 | | inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) |
178 | | { |
179 | | // Should not be instantiated; fires the assert if it ever is |
180 | | CRYPTOPP_ASSERT(0); |
181 | | |
182 | | CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); |
183 | | CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); |
184 | | return vmovq_n_u32(0); /* all-zero vector as a placeholder result */ |
185 | | } |
186 | | |
187 | | template <> /* IDX = 0: gathers lane 0 of each input, giving { a0, b0, c0, d0 }. */ |
188 | | inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) |
189 | | { |
190 | | const uint32x4_t r1 = UnpackLow32(a, b); /* { a0, b0, a1, b1 } */ |
191 | | const uint32x4_t r2 = UnpackLow32(c, d); /* { c0, d0, c1, d1 } */ |
192 | | return UnpackLow64(r1, r2); /* low halves of r1 and r2 */ |
193 | | } |
194 | | |
195 | | template <> /* IDX = 1: gathers lane 1 of each input, giving { a1, b1, c1, d1 }. */ |
196 | | inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) |
197 | | { |
198 | | const uint32x4_t r1 = UnpackLow32(a, b); /* { a0, b0, a1, b1 } */ |
199 | | const uint32x4_t r2 = UnpackLow32(c, d); /* { c0, d0, c1, d1 } */ |
200 | | return UnpackHigh64(r1, r2); /* high halves of r1 and r2 */ |
201 | | } |
202 | | |
203 | | template <> /* IDX = 2: gathers lane 2 of each input, giving { a2, b2, c2, d2 }. */ |
204 | | inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) |
205 | | { |
206 | | const uint32x4_t r1 = UnpackHigh32(a, b); /* { a2, b2, a3, b3 } */ |
207 | | const uint32x4_t r2 = UnpackHigh32(c, d); /* { c2, d2, c3, d3 } */ |
208 | | return UnpackLow64(r1, r2); /* low halves of r1 and r2 */ |
209 | | } |
210 | | |
211 | | template <> /* IDX = 3: gathers lane 3 of each input, giving { a3, b3, c3, d3 }. */ |
212 | | inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) |
213 | | { |
214 | | const uint32x4_t r1 = UnpackHigh32(a, b); /* { a2, b2, a3, b3 } */ |
215 | | const uint32x4_t r2 = UnpackHigh32(c, d); /* { c2, d2, c3, d3 } */ |
216 | | return UnpackHigh64(r1, r2); /* high halves of r1 and r2 */ |
217 | | } |
218 | | |
219 | | template <unsigned int IDX> /* Primary single-vector template: fallback for an IDX that has no explicit specialization. */ |
220 | | inline uint32x4_t UnpackNEON(const uint32x4_t& v) |
221 | | { |
222 | | // Should not be instantiated; fires the assert if it ever is |
223 | | CRYPTOPP_ASSERT(0); |
224 | | |
225 | | CRYPTOPP_UNUSED(v); |
226 | | return vmovq_n_u32(0); /* all-zero vector as a placeholder result */ |
227 | | } |
228 | | |
229 | | template <> /* IDX = 0, single-vector form. */ |
230 | | inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v) |
231 | | { |
232 | | // Broadcast lane 0 of v to all four lanes |
233 | | return vdupq_n_u32(vgetq_lane_u32(v, 0)); |
234 | | } |
235 | | |
236 | | template <> /* IDX = 1, single-vector form. */ |
237 | | inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v) |
238 | | { |
239 | | // Broadcast lane 1 of v to all four lanes |
240 | | return vdupq_n_u32(vgetq_lane_u32(v, 1)); |
241 | | } |
242 | | |
243 | | template <> /* IDX = 2, single-vector form. */ |
244 | | inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v) |
245 | | { |
246 | | // Broadcast lane 2 of v to all four lanes |
247 | | return vdupq_n_u32(vgetq_lane_u32(v, 2)); |
248 | | } |
249 | | |
250 | | template <> /* IDX = 3, single-vector form. */ |
251 | | inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v) |
252 | | { |
253 | | // Broadcast lane 3 of v to all four lanes |
254 | | return vdupq_n_u32(vgetq_lane_u32(v, 3)); |
255 | | } |
256 | | |
257 | | template <unsigned int IDX> /* Four-vector repack: forwards unchanged to the four-vector UnpackNEON<IDX>. */ |
258 | | inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) |
259 | | { |
260 | | return UnpackNEON<IDX>(a, b, c, d); |
261 | | } |
262 | | |
263 | | template <unsigned int IDX> /* Single-vector repack: forwards unchanged to the single-vector UnpackNEON<IDX>. */ |
264 | | inline uint32x4_t RepackNEON(const uint32x4_t& v) |
265 | | { |
266 | | return UnpackNEON<IDX>(v); |
267 | | } |
268 | | |
269 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
270 | | |
271 | | // *************************** IA-32 ***************************// |
272 | | |
273 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
274 | | |
275 | | inline __m128i Xor(const __m128i& a, const __m128i& b) /* Returns the bitwise XOR of the two 128-bit values. */ |
276 | 2.08k | { |
277 | 2.08k | return _mm_xor_si128(a, b); |
278 | 2.08k | } |
279 | | |
280 | | inline __m128i Add(const __m128i& a, const __m128i& b) /* Returns a + b computed independently in each 32-bit lane. */ |
281 | 960 | { |
282 | 960 | return _mm_add_epi32(a, b); |
283 | 960 | } |
284 | | |
285 | | inline __m128i Sub(const __m128i& a, const __m128i& b) /* Returns a - b computed independently in each 32-bit lane. */ |
286 | 84 | { |
287 | 84 | return _mm_sub_epi32(a, b); |
288 | 84 | } |
289 | | |
290 | | template <unsigned int R> /* R is the rotate amount in bits, fixed at compile time. */ |
291 | | inline __m128i RotateLeft(const __m128i& val) /* Rotates every 32-bit lane of val left by R bits. */ |
292 | 376 | { |
293 | | #if defined(__XOP__) |
294 | | return _mm_roti_epi32(val, R); /* XOP builds: single rotate intrinsic */ |
295 | | #else |
296 | 376 | return _mm_or_si128( /* otherwise: (lane << R) | (lane >> (32 - R)) */ |
297 | 376 | _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); |
298 | 376 | #endif |
299 | 376 | } lea_simd.cpp:long long __vector(2) (anonymous namespace)::RotateLeft<9u>(long long __vector(2) const&) Line | Count | Source | 292 | 320 | { | 293 | | #if defined(__XOP__) | 294 | | return _mm_roti_epi32(val, R); | 295 | | #else | 296 | 320 | return _mm_or_si128( | 297 | 320 | _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); | 298 | 320 | #endif | 299 | 320 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RotateLeft<5u>(long long __vector(2) const&) Line | Count | Source | 292 | 28 | { | 293 | | #if defined(__XOP__) | 294 | | return _mm_roti_epi32(val, R); | 295 | | #else | 296 | 28 | return _mm_or_si128( | 297 | 28 | _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); | 298 | 28 | #endif | 299 | 28 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RotateLeft<3u>(long long __vector(2) const&) Line | Count | Source | 292 | 28 | { | 293 | | #if defined(__XOP__) | 294 | | return _mm_roti_epi32(val, R); | 295 | | #else | 296 | 28 | return _mm_or_si128( | 297 | 28 | _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); | 298 | 28 | #endif | 299 | 28 | } |
|
300 | | |
301 | | template <unsigned int R> /* R is the rotate amount in bits, fixed at compile time. */ |
302 | | inline __m128i RotateRight(const __m128i& val) /* Rotates every 32-bit lane of val right by R bits. */ |
303 | 668 | { |
304 | | #if defined(__XOP__) |
305 | | return _mm_roti_epi32(val, 32-R); /* XOP builds: rotate intrinsic called with 32-R in place of R */ |
306 | | #else |
307 | 668 | return _mm_or_si128( /* otherwise: (lane << (32 - R)) | (lane >> R) */ |
308 | 668 | _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); |
309 | 668 | #endif |
310 | 668 | } lea_simd.cpp:long long __vector(2) (anonymous namespace)::RotateRight<3u>(long long __vector(2) const&) Line | Count | Source | 303 | 320 | { | 304 | | #if defined(__XOP__) | 305 | | return _mm_roti_epi32(val, 32-R); | 306 | | #else | 307 | 320 | return _mm_or_si128( | 308 | 320 | _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); | 309 | 320 | #endif | 310 | 320 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RotateRight<5u>(long long __vector(2) const&) Line | Count | Source | 303 | 320 | { | 304 | | #if defined(__XOP__) | 305 | | return _mm_roti_epi32(val, 32-R); | 306 | | #else | 307 | 320 | return _mm_or_si128( | 308 | 320 | _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); | 309 | 320 | #endif | 310 | 320 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RotateRight<9u>(long long __vector(2) const&) Line | Count | Source | 303 | 28 | { | 304 | | #if defined(__XOP__) | 305 | | return _mm_roti_epi32(val, 32-R); | 306 | | #else | 307 | 28 | return _mm_or_si128( | 308 | 28 | _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); | 309 | 28 | #endif | 310 | 28 | } |
|
311 | | |
312 | | // Specialization for R = 8. Faster than two shifts and an OR: a whole-byte rotate is a single byte shuffle. |
313 | | template <> |
314 | | inline __m128i RotateLeft<8>(const __m128i& val) |
315 | 0 | { |
316 | 0 | #if defined(__XOP__) |
317 | 0 | return _mm_roti_epi32(val, 8); |
318 | 0 | #else |
319 | 0 | const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); /* arguments run from byte 15 down to byte 0; per lane out[0..3] = in[3], in[0], in[1], in[2] */ |
320 | 0 | return _mm_shuffle_epi8(val, mask); |
321 | 0 | #endif |
322 | 0 | } |
323 | | |
324 | | // Specialization for R = 8. Faster than two shifts and an OR: a whole-byte rotate is a single byte shuffle. |
325 | | template <> |
326 | | inline __m128i RotateRight<8>(const __m128i& val) |
327 | 0 | { |
328 | 0 | #if defined(__XOP__) |
329 | 0 | return _mm_roti_epi32(val, 32-8); |
330 | 0 | #else |
331 | 0 | const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); /* arguments run from byte 15 down to byte 0; per lane out[0..3] = in[1], in[2], in[3], in[0] */ |
332 | 0 | return _mm_shuffle_epi8(val, mask); |
333 | 0 | #endif |
334 | 0 | } |
335 | | |
336 | | template <unsigned int IDX> /* IDX is the compile-time index into rkey. */ |
337 | | inline __m128i LoadKey(const word32 rkey[]) /* Returns a vector with the bits of rkey[IDX] copied into all four 32-bit lanes. */ |
338 | 2.08k | { |
339 | 2.08k | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); /* copy the raw key bits into a float (no numeric conversion) so the float broadcast load can be used */ |
340 | 2.08k | return _mm_castps_si128(_mm_load_ps1(&rk)); /* broadcast to four lanes, then reinterpret as an integer vector */ |
341 | 2.08k | } lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<4u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<5u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<2u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<3u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<0u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<1u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<10u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<11u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<8u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<9u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<6u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<7u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<16u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<17u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<14u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<15u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<12u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<13u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<22u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<23u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<20u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<21u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<18u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<19u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<28u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<29u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<26u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<27u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<24u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<25u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<34u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<35u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<32u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<33u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<30u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<31u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<40u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<41u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<38u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<39u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<36u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<37u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<46u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<47u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<44u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<45u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<42u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<43u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<52u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<53u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<50u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<51u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<48u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<49u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<58u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<59u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<56u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<57u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<54u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<55u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<64u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<65u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<62u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<63u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<60u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<61u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<70u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<71u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<68u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<69u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<66u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<67u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<76u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<77u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<74u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<75u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<72u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<73u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<82u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<83u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<80u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<81u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<78u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<79u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<88u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<89u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<86u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<87u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<84u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<85u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<94u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<95u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<92u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<93u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<90u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<91u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<100u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<101u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<98u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<99u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<96u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<97u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<106u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<107u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<104u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<105u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<102u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<103u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<112u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<113u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<110u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<111u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<108u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<109u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<118u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<119u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<116u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<117u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<114u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<115u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<124u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<125u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<122u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<123u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<120u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<121u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<130u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<131u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<128u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<129u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<126u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<127u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<136u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<137u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<134u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<135u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<132u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<133u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<142u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<143u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<140u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<141u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<138u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<139u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<148u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<149u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<146u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<147u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<144u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<145u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<154u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<155u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<152u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<153u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<150u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<151u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<160u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<161u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<158u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<159u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<156u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<157u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<166u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<167u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<164u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<165u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<162u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<163u>(unsigned int const*) Line | Count | Source | 338 | 11 | { | 339 | 11 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 11 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<172u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<173u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<170u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<171u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<168u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<169u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<178u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<179u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<176u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<177u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<174u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<175u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<184u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<185u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<182u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<183u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<180u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<181u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<190u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<191u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<188u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<189u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<186u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::LoadKey<187u>(unsigned int const*) Line | Count | Source | 338 | 10 | { | 339 | 10 | float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); | 340 | 10 | return _mm_castps_si128(_mm_load_ps1(&rk)); | 341 | 10 | } |
|
342 | | |
343 | | template <unsigned int IDX> /* Primary template of the four-vector UnpackXMM; IDX 0..3 have explicit specializations in this file. */ |
344 | | inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) |
345 | | { |
346 | | // Fallback for an IDX that has no explicit specialization. Should not be instantiated |
347 | | CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); |
348 | | CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); |
349 | | CRYPTOPP_ASSERT(0); /* flags the unexpected instantiation; CRYPTOPP_ASSERT is defined outside this file */ |
350 | | return _mm_setzero_si128(); /* all-zero vector as a placeholder result */ |
351 | | } |
352 | | |
353 | | template <> |
354 | | inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) |
355 | 19 | { |
356 | | // Gathers 32-bit lane 0 of each input into one vector, lowest lane first: [a0, b0, c0, d0]. LEA is little-endian oriented, so there is no need for a separate shuffle. |
357 | 19 | const __m128i r1 = _mm_unpacklo_epi32(a, b); /* r1 = [a0, b0, a1, b1] */ |
358 | 19 | const __m128i r2 = _mm_unpacklo_epi32(c, d); /* r2 = [c0, d0, c1, d1] */ |
359 | 19 | return _mm_unpacklo_epi64(r1, r2); /* low 64-bit halves of r1 and r2 */ |
360 | 19 | } |
361 | | |
362 | | template <> |
363 | | inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) |
364 | 16 | { |
365 | | // Gathers 32-bit lane 1 of each input into one vector, lowest lane first: [a1, b1, c1, d1]. LEA is little-endian oriented, so there is no need for a separate shuffle. |
366 | 16 | const __m128i r1 = _mm_unpacklo_epi32(a, b); /* r1 = [a0, b0, a1, b1] */ |
367 | 16 | const __m128i r2 = _mm_unpacklo_epi32(c, d); /* r2 = [c0, d0, c1, d1] */ |
368 | 16 | return _mm_unpackhi_epi64(r1, r2); /* high 64-bit halves of r1 and r2 */ |
369 | 16 | } |
370 | | |
371 | | template <> |
372 | | inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) |
373 | 16 | { |
374 | | // Gathers 32-bit lane 2 of each input into one vector, lowest lane first: [a2, b2, c2, d2]. LEA is little-endian oriented, so there is no need for a separate shuffle. |
375 | 16 | const __m128i r1 = _mm_unpackhi_epi32(a, b); /* r1 = [a2, b2, a3, b3] */ |
376 | 16 | const __m128i r2 = _mm_unpackhi_epi32(c, d); /* r2 = [c2, d2, c3, d3] */ |
377 | 16 | return _mm_unpacklo_epi64(r1, r2); /* low 64-bit halves of r1 and r2 */ |
378 | 16 | } |
379 | | |
380 | | template <> |
381 | | inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) |
382 | 16 | { |
383 | | // Gathers 32-bit lane 3 of each input into one vector, lowest lane first: [a3, b3, c3, d3]. LEA is little-endian oriented, so there is no need for a separate shuffle. |
384 | 16 | const __m128i r1 = _mm_unpackhi_epi32(a, b); /* r1 = [a2, b2, a3, b3] */ |
385 | 16 | const __m128i r2 = _mm_unpackhi_epi32(c, d); /* r2 = [c2, d2, c3, d3] */ |
386 | 16 | return _mm_unpackhi_epi64(r1, r2); /* high 64-bit halves of r1 and r2 */ |
387 | 16 | } |
388 | | |
389 | | template <unsigned int IDX> /* Primary template of the single-vector UnpackXMM; IDX 0..3 have explicit specializations in this file. */ |
390 | | inline __m128i UnpackXMM(const __m128i& v) |
391 | | { |
392 | | // Fallback for an IDX that has no explicit specialization. Should not be instantiated |
393 | | CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0); |
394 | | return _mm_setzero_si128(); /* all-zero vector as a placeholder result */ |
395 | | } |
396 | | |
397 | | template <> |
398 | | inline __m128i UnpackXMM<0>(const __m128i& v) |
399 | 3 | { |
400 | | // Splat to all lanes: the shuffle mask selects bytes 0..3 of v (32-bit lane 0) for each of the four output lanes. _mm_set_epi8 takes its arguments from the highest byte down. |
401 | 3 | return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0)); |
402 | 3 | } |
403 | | |
404 | | template <> |
405 | | inline __m128i UnpackXMM<1>(const __m128i& v) |
406 | 3 | { |
407 | | // Splat to all lanes: the shuffle mask selects bytes 4..7 of v (32-bit lane 1) for each of the four output lanes. _mm_set_epi8 takes its arguments from the highest byte down. |
408 | 3 | return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4)); |
409 | 3 | } |
410 | | |
411 | | template <> |
412 | | inline __m128i UnpackXMM<2>(const __m128i& v) |
413 | 3 | { |
414 | | // Splat to all lanes: the shuffle mask selects bytes 8..11 of v (32-bit lane 2) for each of the four output lanes. _mm_set_epi8 takes its arguments from the highest byte down. |
415 | 3 | return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8)); |
416 | 3 | } |
417 | | |
418 | | template <> |
419 | | inline __m128i UnpackXMM<3>(const __m128i& v) |
420 | 3 | { |
421 | | // Splat to all lanes: the shuffle mask selects bytes 12..15 of v (32-bit lane 3) for each of the four output lanes. _mm_set_epi8 takes its arguments from the highest byte down. |
422 | 3 | return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12)); |
423 | 3 | } |
424 | | |
425 | | template <unsigned int IDX> /* Four-vector repack: forwards unchanged to the four-vector UnpackXMM<IDX>, so packing and unpacking share one lane transposition. */ |
426 | | inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) |
427 | 35 | { |
428 | 35 | return UnpackXMM<IDX>(a, b, c, d); |
429 | 35 | } lea_simd.cpp:long long __vector(2) (anonymous namespace)::RepackXMM<0u>(long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&) Line | Count | Source | 427 | 11 | { | 428 | 11 | return UnpackXMM<IDX>(a, b, c, d); | 429 | 11 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RepackXMM<1u>(long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&) Line | Count | Source | 427 | 8 | { | 428 | 8 | return UnpackXMM<IDX>(a, b, c, d); | 429 | 8 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RepackXMM<2u>(long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&) Line | Count | Source | 427 | 8 | { | 428 | 8 | return UnpackXMM<IDX>(a, b, c, d); | 429 | 8 | } |
lea_simd.cpp:long long __vector(2) (anonymous namespace)::RepackXMM<3u>(long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&, long long __vector(2) const&) Line | Count | Source | 427 | 8 | { | 428 | 8 | return UnpackXMM<IDX>(a, b, c, d); | 429 | 8 | } |
|
430 | | |
431 | | template <unsigned int IDX> /* Single-vector repack: forwards unchanged to the single-vector UnpackXMM<IDX>. */ |
432 | | inline __m128i RepackXMM(const __m128i& v) |
433 | | { |
434 | | return UnpackXMM<IDX>(v); |
435 | | } |
436 | | |
437 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
438 | | |
439 | | // *************************** Power8 ***************************// |
440 | | |
441 | | #if (CRYPTOPP_POWER8_AVAILABLE) |
442 | | |
443 | | using CryptoPP::uint8x16_p; |
444 | | using CryptoPP::uint32x4_p; |
445 | | using CryptoPP::uint64x2_p; |
446 | | |
447 | | inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b) /* Power8 wrapper: forwards both operands to VecXor, which is declared outside this file. */ |
448 | | { |
449 | | return VecXor(a, b); |
450 | | } |
451 | | |
452 | | inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b) /* Power8 wrapper: forwards both operands to VecAdd, which is declared outside this file. */ |
453 | | { |
454 | | return VecAdd(a, b); |
455 | | } |
456 | | |
457 | | inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b) /* Power8 wrapper: forwards both operands to VecSub, which is declared outside this file. */ |
458 | | { |
459 | | return VecSub(a, b); |
460 | | } |
461 | | |
462 | | template <unsigned int R> /* R is the rotate count, fixed at compile time. */ |
463 | | inline uint32x4_p RotateLeft(const uint32x4_p& val) /* Rotates every element of val left by R bits. */ |
464 | | { |
465 | | const uint32x4_p m = {R, R, R, R}; /* the same count for each of the four elements */ |
466 | | return vec_rl(val, m); /* vec_rl rotates each element left by the matching element of m */ |
467 | | } |
468 | | |
469 | | template <unsigned int R> /* R is the rotate count, fixed at compile time; NOTE(review): assumes 0 < R < 32 - confirm. */ |
470 | | inline uint32x4_p RotateRight(const uint32x4_p& val) /* Rotates every element of val right by R bits. */ |
471 | | { |
472 | | const uint32x4_p m = {32-R, 32-R, 32-R, 32-R}; /* a left rotate by 32-R equals a right rotate by R on 32-bit elements */ |
473 | | return vec_rl(val, m); /* only a left-rotate primitive is used here */ |
474 | | } |
475 | | |
476 | | template <unsigned int IDX> /* IDX is the compile-time index of the round-key word to load. */ |
477 | | inline uint32x4_p LoadKey(const word32 rkey[]) /* Returns a vector with rkey[IDX] copied into every element. */ |
478 | | { |
479 | | return vec_splats(rkey[IDX]); /* vec_splats broadcasts the scalar to all elements */ |
480 | | } |
481 | | |
482 | | template <unsigned int IDX> /* Primary template of the four-vector UnpackSIMD; IDX 0..3 have explicit specializations in this file. */ |
483 | | inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) |
484 | | { |
485 | | // Fallback for an IDX that has no explicit specialization. Should not be instantiated |
486 | | CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); |
487 | | CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); |
488 | | CRYPTOPP_ASSERT(0); |
489 | | return VecXor(a, a); /* a XOR a; presumably an all-zero placeholder. NOTE(review): a is marked unused above but is used here. */ |
490 | | } |
491 | | |
492 | | template <> /* Two-stage merge: 32-bit elements of a/b and c/d first, then the results as 64-bit elements. NOTE(review): which source lane this yields depends on vec_mergel element ordering for the target endianness - confirm. */ |
493 | | inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) |
494 | | { |
495 | | const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b); /* merged 32-bit pairs of a and b, reinterpreted as two 64-bit elements */ |
496 | | const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d); /* merged 32-bit pairs of c and d, reinterpreted as two 64-bit elements */ |
497 | | return (uint32x4_p)vec_mergel(r1, r2); /* one 64-bit element from each of r1 and r2 */ |
498 | | } |
499 | | |
500 | | template <> /* Two-stage merge: 32-bit elements of a/b and c/d with vec_mergel, then the results as 64-bit elements with vec_mergeh. NOTE(review): which source lane this yields depends on merge element ordering for the target endianness - confirm. */ |
501 | | inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) |
502 | | { |
503 | | const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b); /* merged 32-bit pairs of a and b, reinterpreted as two 64-bit elements */ |
504 | | const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d); /* merged 32-bit pairs of c and d, reinterpreted as two 64-bit elements */ |
505 | | return (uint32x4_p)vec_mergeh(r1, r2); /* one 64-bit element from each of r1 and r2 */ |
506 | | } |
507 | | |
508 | | template <> /* Two-stage merge: 32-bit elements of a/b and c/d with vec_mergeh, then the results as 64-bit elements with vec_mergel. NOTE(review): which source lane this yields depends on merge element ordering for the target endianness - confirm. */ |
509 | | inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) |
510 | | { |
511 | | const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b); /* merged 32-bit pairs of a and b, reinterpreted as two 64-bit elements */ |
512 | | const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d); /* merged 32-bit pairs of c and d, reinterpreted as two 64-bit elements */ |
513 | | return (uint32x4_p)vec_mergel(r1, r2); /* one 64-bit element from each of r1 and r2 */ |
514 | | } |
515 | | |
516 | | template <> /* Two-stage merge: 32-bit elements of a/b and c/d first, then the results as 64-bit elements, all with vec_mergeh. NOTE(review): which source lane this yields depends on vec_mergeh element ordering for the target endianness - confirm. */ |
517 | | inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) |
518 | | { |
519 | | const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b); /* merged 32-bit pairs of a and b, reinterpreted as two 64-bit elements */ |
520 | | const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d); /* merged 32-bit pairs of c and d, reinterpreted as two 64-bit elements */ |
521 | | return (uint32x4_p)vec_mergeh(r1, r2); /* one 64-bit element from each of r1 and r2 */ |
522 | | } |
523 | | |
524 | | template <unsigned int IDX> /* Primary template of the single-vector UnpackSIMD; IDX 0..3 have explicit specializations in this file. */ |
525 | | inline uint32x4_p UnpackSIMD(const uint32x4_p& v) |
526 | | { |
527 | | // Fallback for an IDX that has no explicit specialization. Should not be instantiated |
528 | | CRYPTOPP_ASSERT(0); |
529 | | return VecXor(v, v); /* v XOR v; presumably an all-zero placeholder */ |
530 | | } |
531 | | |
532 | | template <> |
533 | | inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v) |
534 | | { |
535 | | // Splat to all lanes: the permute mask repeats byte indices 3,2,1,0 four times, so every output word is built from the same four bytes of v. NOTE(review): which 32-bit lane those bytes form depends on VecPermute byte numbering - confirm. |
536 | | const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0}; |
537 | | return (uint32x4_p)VecPermute(v, v, m); /* both permute sources are v */ |
538 | | } |
539 | | |
540 | | template <> |
541 | | inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v) |
542 | | { |
543 | | // Splat to all lanes: the permute mask repeats byte indices 7,6,5,4 four times, so every output word is built from the same four bytes of v. NOTE(review): which 32-bit lane those bytes form depends on VecPermute byte numbering - confirm. |
544 | | const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4}; |
545 | | return (uint32x4_p)VecPermute(v, v, m); /* both permute sources are v */ |
546 | | } |
547 | | |
548 | | template <> |
549 | | inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v) |
550 | | { |
551 | | // Splat to all lanes: the permute mask repeats byte indices 11,10,9,8 four times, so every output word is built from the same four bytes of v. NOTE(review): which 32-bit lane those bytes form depends on VecPermute byte numbering - confirm. |
552 | | const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8}; |
553 | | return (uint32x4_p)VecPermute(v, v, m); /* both permute sources are v */ |
554 | | } |
555 | | |
556 | | template <> |
557 | | inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v) |
558 | | { |
559 | | // Splat to all lanes: the permute mask repeats byte indices 15,14,13,12 four times, so every output word is built from the same four bytes of v. NOTE(review): which 32-bit lane those bytes form depends on VecPermute byte numbering - confirm. |
560 | | const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12}; |
561 | | return (uint32x4_p)VecPermute(v, v, m); /* both permute sources are v */ |
562 | | } |
563 | | |
564 | | template <unsigned int IDX> /* Four-vector repack: forwards unchanged to the four-vector UnpackSIMD<IDX>. */ |
565 | | inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) |
566 | | { |
567 | | return UnpackSIMD<IDX>(a, b, c, d); |
568 | | } |
569 | | |
570 | | template <unsigned int IDX> /* Single-vector repack: forwards unchanged to the single-vector UnpackSIMD<IDX>. */ |
571 | | inline uint32x4_p RepackSIMD(const uint32x4_p& v) |
572 | | { |
573 | | return UnpackSIMD<IDX>(v); |
574 | | } |
575 | | |
576 | | #endif // CRYPTOPP_POWER8_AVAILABLE |
577 | | |
578 | | // *************************** LEA Encryption ***************************// |
579 | | |
580 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE) |
581 | | |
582 | | template <class W> /* W is the SIMD word type; Xor, Add, RotateLeft and RotateRight must accept it. */ |
583 | | inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds) /* Transforms temp[0..3] in place. Reads subkeys[0..143] always, [144..167] when rounds > 24 and [168..191] when rounds > 28. */ |
584 | 10 | { /* Fully unrolled. Every three statements (rotations right 3, right 5, left 9) consume six consecutive subkey words, and the temp[] slot written steps down by one per statement (3, 2, 1, 0, 3, ...) instead of moving values between slots. */ |
585 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys)))); |
586 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys)))); |
587 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys)))); |
588 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys)))); |
589 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys)))); |
590 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys)))); |
591 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys)))); |
592 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys)))); |
593 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys)))); |
594 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys)))); |
595 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys)))); |
596 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys)))); |
597 | | /* Next twelve statements: subkeys 24..47. */ |
598 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys)))); |
599 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys)))); |
600 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys)))); |
601 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys)))); |
602 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys)))); |
603 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys)))); |
604 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys)))); |
605 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys)))); |
606 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys)))); |
607 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys)))); |
608 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys)))); |
609 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys)))); |
610 | | /* Next twelve statements: subkeys 48..71. */ |
611 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys)))); |
612 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys)))); |
613 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys)))); |
614 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys)))); |
615 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys)))); |
616 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys)))); |
617 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys)))); |
618 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys)))); |
619 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys)))); |
620 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys)))); |
621 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys)))); |
622 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys)))); |
623 | | /* Next twelve statements: subkeys 72..95. */ |
624 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys)))); |
625 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys)))); |
626 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys)))); |
627 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys)))); |
628 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys)))); |
629 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys)))); |
630 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys)))); |
631 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys)))); |
632 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys)))); |
633 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys)))); |
634 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys)))); |
635 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys)))); |
636 | | /* Next twelve statements: subkeys 96..119. */ |
637 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys)))); |
638 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys)))); |
639 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys)))); |
640 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys)))); |
641 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys)))); |
642 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys)))); |
643 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys)))); |
644 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys)))); |
645 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys)))); |
646 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys)))); |
647 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys)))); |
648 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys)))); |
649 | | /* Next twelve statements: subkeys 120..143, the last ones used unconditionally. */ |
650 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys)))); |
651 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys)))); |
652 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys)))); |
653 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys)))); |
654 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys)))); |
655 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys)))); |
656 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys)))); |
657 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys)))); |
658 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys)))); |
659 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys)))); |
660 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys)))); |
661 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys)))); |
662 | | /* Additional twelve statements when rounds > 24: subkeys 144..167. */ |
663 | 10 | if(rounds > 24) |
664 | 10 | { |
665 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys)))); |
666 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys)))); |
667 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys)))); |
668 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys)))); |
669 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys)))); |
670 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys)))); |
671 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys)))); |
672 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys)))); |
673 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys)))); |
674 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys)))); |
675 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys)))); |
676 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys)))); |
677 | 10 | } |
678 | | /* Additional twelve statements when rounds > 28: subkeys 168..191. */ |
679 | 10 | if(rounds > 28) |
680 | 10 | { |
681 | 10 | temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys)))); |
682 | 10 | temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys)))); |
683 | 10 | temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys)))); |
684 | 10 | temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys)))); |
685 | 10 | temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys)))); |
686 | 10 | temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys)))); |
687 | 10 | temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys)))); |
688 | 10 | temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys)))); |
689 | 10 | temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys)))); |
690 | 10 | temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys)))); |
691 | 10 | temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys)))); |
692 | 10 | temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys)))); |
693 | 10 | } |
694 | 10 | } |
695 | | |
696 | | // *************************** LEA Decryption ***************************// |
697 | | |
698 | | template <class W> |
699 | | inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds) |
700 | 1 | { |
701 | 1 | if(rounds > 28) |
702 | 0 | { |
703 | 0 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys)); |
704 | 0 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys)); |
705 | 0 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys)); |
706 | 0 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys)); |
707 | 0 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys)); |
708 | 0 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys)); |
709 | 0 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys)); |
710 | 0 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys)); |
711 | 0 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys)); |
712 | 0 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys)); |
713 | 0 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys)); |
714 | 0 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys)); |
715 | 0 | } |
716 | | |
717 | 1 | if(rounds > 24) |
718 | 1 | { |
719 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys)); |
720 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys)); |
721 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys)); |
722 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys)); |
723 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys)); |
724 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys)); |
725 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys)); |
726 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys)); |
727 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys)); |
728 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys)); |
729 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys)); |
730 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys)); |
731 | 1 | } |
732 | | |
733 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys)); |
734 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys)); |
735 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys)); |
736 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys)); |
737 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys)); |
738 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys)); |
739 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys)); |
740 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys)); |
741 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys)); |
742 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys)); |
743 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys)); |
744 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys)); |
745 | | |
746 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys)); |
747 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys)); |
748 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys)); |
749 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys)); |
750 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys)); |
751 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys)); |
752 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys)); |
753 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys)); |
754 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys)); |
755 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys)); |
756 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys)); |
757 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys)); |
758 | | |
759 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys)); |
760 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys)); |
761 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys)); |
762 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys)); |
763 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys)); |
764 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys)); |
765 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys)); |
766 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys)); |
767 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys)); |
768 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys)); |
769 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys)); |
770 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys)); |
771 | | |
772 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys)); |
773 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys)); |
774 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys)); |
775 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys)); |
776 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys)); |
777 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys)); |
778 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys)); |
779 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys)); |
780 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys)); |
781 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys)); |
782 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys)); |
783 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys)); |
784 | | |
785 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys)); |
786 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys)); |
787 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys)); |
788 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys)); |
789 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys)); |
790 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys)); |
791 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys)); |
792 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys)); |
793 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys)); |
794 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys)); |
795 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys)); |
796 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys)); |
797 | | |
798 | 1 | temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys)); |
799 | 1 | temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys)); |
800 | 1 | temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys)); |
801 | 1 | temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys)); |
802 | 1 | temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys)); |
803 | 1 | temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys)); |
804 | 1 | temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys)); |
805 | 1 | temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys)); |
806 | 1 | temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys)); |
807 | 1 | temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys)); |
808 | 1 | temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys)); |
809 | 1 | temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys)); |
810 | 1 | } |
811 | | |
812 | | #endif // LEA Encryption and Decryption |
813 | | |
814 | | // *************************** ARM NEON ***************************// |
815 | | |
816 | | #if (CRYPTOPP_ARM_NEON_AVAILABLE) |
817 | | |
818 | | inline void LEA_Enc_Block(uint32x4_t &block0, |
819 | | const word32 *subkeys, unsigned int rounds) |
820 | | { |
821 | | uint32x4_t temp[4]; |
822 | | temp[0] = UnpackNEON<0>(block0); |
823 | | temp[1] = UnpackNEON<1>(block0); |
824 | | temp[2] = UnpackNEON<2>(block0); |
825 | | temp[3] = UnpackNEON<3>(block0); |
826 | | |
827 | | LEA_Encryption(temp, subkeys, rounds); |
828 | | |
829 | | block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); |
830 | | } |
831 | | |
832 | | inline void LEA_Dec_Block(uint32x4_t &block0, |
833 | | const word32 *subkeys, unsigned int rounds) |
834 | | { |
835 | | uint32x4_t temp[4]; |
836 | | temp[0] = UnpackNEON<0>(block0); |
837 | | temp[1] = UnpackNEON<1>(block0); |
838 | | temp[2] = UnpackNEON<2>(block0); |
839 | | temp[3] = UnpackNEON<3>(block0); |
840 | | |
841 | | LEA_Decryption(temp, subkeys, rounds); |
842 | | |
843 | | block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); |
844 | | } |
845 | | |
846 | | inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, |
847 | | uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds) |
848 | | { |
849 | | uint32x4_t temp[4]; |
850 | | temp[0] = UnpackNEON<0>(block0, block1, block2, block3); |
851 | | temp[1] = UnpackNEON<1>(block0, block1, block2, block3); |
852 | | temp[2] = UnpackNEON<2>(block0, block1, block2, block3); |
853 | | temp[3] = UnpackNEON<3>(block0, block1, block2, block3); |
854 | | |
855 | | LEA_Encryption(temp, subkeys, rounds); |
856 | | |
857 | | block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); |
858 | | block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]); |
859 | | block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]); |
860 | | block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]); |
861 | | } |
862 | | |
863 | | inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, |
864 | | uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds) |
865 | | { |
866 | | uint32x4_t temp[4]; |
867 | | temp[0] = UnpackNEON<0>(block0, block1, block2, block3); |
868 | | temp[1] = UnpackNEON<1>(block0, block1, block2, block3); |
869 | | temp[2] = UnpackNEON<2>(block0, block1, block2, block3); |
870 | | temp[3] = UnpackNEON<3>(block0, block1, block2, block3); |
871 | | |
872 | | LEA_Decryption(temp, subkeys, rounds); |
873 | | |
874 | | block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); |
875 | | block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]); |
876 | | block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]); |
877 | | block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]); |
878 | | } |
879 | | |
880 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
881 | | |
882 | | // *************************** IA-32 ***************************// |
883 | | |
884 | | #if (CRYPTOPP_SSSE3_AVAILABLE) |
885 | | |
886 | | inline void LEA_Enc_Block(__m128i &block0, |
887 | | const word32 *subkeys, unsigned int rounds) |
888 | 2 | { |
889 | 2 | __m128i temp[4]; |
890 | 2 | temp[0] = UnpackXMM<0>(block0); |
891 | 2 | temp[1] = UnpackXMM<1>(block0); |
892 | 2 | temp[2] = UnpackXMM<2>(block0); |
893 | 2 | temp[3] = UnpackXMM<3>(block0); |
894 | | |
895 | 2 | LEA_Encryption(temp, subkeys, rounds); |
896 | | |
897 | 2 | block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); |
898 | 2 | } |
899 | | |
900 | | inline void LEA_Dec_Block(__m128i &block0, |
901 | | const word32 *subkeys, unsigned int rounds) |
902 | 1 | { |
903 | 1 | __m128i temp[4]; |
904 | 1 | temp[0] = UnpackXMM<0>(block0); |
905 | 1 | temp[1] = UnpackXMM<1>(block0); |
906 | 1 | temp[2] = UnpackXMM<2>(block0); |
907 | 1 | temp[3] = UnpackXMM<3>(block0); |
908 | | |
909 | 1 | LEA_Decryption(temp, subkeys, rounds); |
910 | | |
911 | 1 | block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); |
912 | 1 | } |
913 | | |
914 | | inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1, |
915 | | __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds) |
916 | 8 | { |
917 | 8 | __m128i temp[4]; |
918 | 8 | temp[0] = UnpackXMM<0>(block0, block1, block2, block3); |
919 | 8 | temp[1] = UnpackXMM<1>(block0, block1, block2, block3); |
920 | 8 | temp[2] = UnpackXMM<2>(block0, block1, block2, block3); |
921 | 8 | temp[3] = UnpackXMM<3>(block0, block1, block2, block3); |
922 | | |
923 | 8 | LEA_Encryption(temp, subkeys, rounds); |
924 | | |
925 | 8 | block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); |
926 | 8 | block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]); |
927 | 8 | block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]); |
928 | 8 | block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]); |
929 | 8 | } |
930 | | |
931 | | inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1, |
932 | | __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds) |
933 | 0 | { |
934 | 0 | __m128i temp[4]; |
935 | 0 | temp[0] = UnpackXMM<0>(block0, block1, block2, block3); |
936 | 0 | temp[1] = UnpackXMM<1>(block0, block1, block2, block3); |
937 | 0 | temp[2] = UnpackXMM<2>(block0, block1, block2, block3); |
938 | 0 | temp[3] = UnpackXMM<3>(block0, block1, block2, block3); |
939 | |
|
940 | 0 | LEA_Decryption(temp, subkeys, rounds); |
941 | |
|
942 | 0 | block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); |
943 | 0 | block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]); |
944 | 0 | block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]); |
945 | 0 | block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]); |
946 | 0 | } |
947 | | |
948 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
949 | | |
950 | | // *************************** Power8 ***************************// |
951 | | |
952 | | #if (CRYPTOPP_POWER8_AVAILABLE) |
953 | | |
954 | | inline void LEA_Enc_Block(uint32x4_p &block0, |
955 | | const word32 *subkeys, unsigned int rounds) |
956 | | { |
957 | | uint32x4_p temp[4]; |
958 | | temp[0] = UnpackSIMD<0>(block0); |
959 | | temp[1] = UnpackSIMD<1>(block0); |
960 | | temp[2] = UnpackSIMD<2>(block0); |
961 | | temp[3] = UnpackSIMD<3>(block0); |
962 | | |
963 | | LEA_Encryption(temp, subkeys, rounds); |
964 | | |
965 | | block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); |
966 | | } |
967 | | |
968 | | inline void LEA_Dec_Block(uint32x4_p &block0, |
969 | | const word32 *subkeys, unsigned int rounds) |
970 | | { |
971 | | uint32x4_p temp[4]; |
972 | | temp[0] = UnpackSIMD<0>(block0); |
973 | | temp[1] = UnpackSIMD<1>(block0); |
974 | | temp[2] = UnpackSIMD<2>(block0); |
975 | | temp[3] = UnpackSIMD<3>(block0); |
976 | | |
977 | | LEA_Decryption(temp, subkeys, rounds); |
978 | | |
979 | | block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); |
980 | | } |
981 | | |
982 | | inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
983 | | uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds) |
984 | | { |
985 | | uint32x4_p temp[4]; |
986 | | temp[0] = UnpackSIMD<0>(block0, block1, block2, block3); |
987 | | temp[1] = UnpackSIMD<1>(block0, block1, block2, block3); |
988 | | temp[2] = UnpackSIMD<2>(block0, block1, block2, block3); |
989 | | temp[3] = UnpackSIMD<3>(block0, block1, block2, block3); |
990 | | |
991 | | LEA_Encryption(temp, subkeys, rounds); |
992 | | |
993 | | block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); |
994 | | block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]); |
995 | | block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]); |
996 | | block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]); |
997 | | } |
998 | | |
999 | | inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1, |
1000 | | uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds) |
1001 | | { |
1002 | | uint32x4_p temp[4]; |
1003 | | temp[0] = UnpackSIMD<0>(block0, block1, block2, block3); |
1004 | | temp[1] = UnpackSIMD<1>(block0, block1, block2, block3); |
1005 | | temp[2] = UnpackSIMD<2>(block0, block1, block2, block3); |
1006 | | temp[3] = UnpackSIMD<3>(block0, block1, block2, block3); |
1007 | | |
1008 | | LEA_Decryption(temp, subkeys, rounds); |
1009 | | |
1010 | | block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); |
1011 | | block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]); |
1012 | | block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]); |
1013 | | block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]); |
1014 | | } |
1015 | | |
1016 | | #endif // CRYPTOPP_POWER8_AVAILABLE |
1017 | | |
1018 | | ANONYMOUS_NAMESPACE_END |
1019 | | |
1020 | | // *************************** SIMD Templates ***************************// |
1021 | | |
1022 | | NAMESPACE_BEGIN(CryptoPP) |
1023 | | |
1024 | | #if defined(CRYPTOPP_SSSE3_AVAILABLE) |
1025 | | size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, |
1026 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
1027 | 4 | { |
1028 | 4 | return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks, |
1029 | 4 | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1030 | 4 | } |
1031 | | |
1032 | | size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, |
1033 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
1034 | 1 | { |
1035 | 1 | return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks, |
1036 | 1 | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1037 | 1 | } |
1038 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |
1039 | | |
1040 | | #if defined(CRYPTOPP_ARM_NEON_AVAILABLE) |
1041 | | size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, |
1042 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
1043 | | { |
1044 | | return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks, |
1045 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1046 | | } |
1047 | | |
1048 | | size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, |
1049 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
1050 | | { |
1051 | | return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks, |
1052 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1053 | | } |
1054 | | #endif // CRYPTOPP_ARM_NEON_AVAILABLE |
1055 | | |
1056 | | #if defined(CRYPTOPP_POWER8_AVAILABLE) |
1057 | | size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds, |
1058 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
1059 | | { |
1060 | | return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks, |
1061 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1062 | | } |
1063 | | |
1064 | | size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds, |
1065 | | const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) |
1066 | | { |
1067 | | return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks, |
1068 | | subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); |
1069 | | } |
1070 | | #endif // CRYPTOPP_POWER8_AVAILABLE |
1071 | | |
1072 | | NAMESPACE_END |