// adv_simd.h - written and placed in the public domain by Jeffrey Walton

/// \file adv_simd.h
/// \brief Template for AdvancedProcessBlocks and SIMD processing

// The SIMD-based implementations for ciphers that use SSE, NEON and Power7
// have a common pattern. Namely, they have a specialized implementation of
// AdvancedProcessBlocks which processes multiple blocks using hardware
// acceleration. After several implementations we noticed a lot of copy and
// paste occurring. adv_simd.h provides a template to avoid the copy and paste.
//
// There are 6 templates provided in this file. The number following the
// function name, 128, is the block size in bits. The name following the
// block size is the arrangement and acceleration. For example, 4x1_SSE means
// Intel SSE using two encrypt (or decrypt) functions: one that operates on
// 4 SIMD words, and one that operates on 1 SIMD word.
//
// * AdvancedProcessBlocks128_4x1_SSE
// * AdvancedProcessBlocks128_6x2_SSE
// * AdvancedProcessBlocks128_4x1_NEON
// * AdvancedProcessBlocks128_6x1_NEON
// * AdvancedProcessBlocks128_4x1_ALTIVEC
// * AdvancedProcessBlocks128_6x1_ALTIVEC
//
// If an arrangement ends in 2, like 6x2, then the template will handle the
// single block case by padding with 0's and using the two SIMD word
// function. This happens at most once when processing multiple blocks.
// The extra processing of a zero block is trivial and worth the tradeoff.
//
// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
// results in a failed link due to the const/non-const mismatch.
//
// In July 2020 the library stopped using the 64-bit block versions of
// AdvancedProcessBlocks. Testing showed unreliable results and failed
// self tests on occasion. Also see Issue 945 and
// https://github.com/weidai11/cryptopp/commit/dd7598e638bb.

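// A minimal usage sketch, not code from the library: a cipher's SIMD source
// file typically wraps one of these templates in a thin forwarder that
// supplies its single-block and multi-block workers. The SPECK-128 names
// below are illustrative assumptions, not exact library signatures.
//
//   size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64 *subKeys,
//       size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags)
//   {
//       return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block,
//           SPECK128_Enc_6_Blocks, subKeys, rounds, inBlocks,
//           xorBlocks, outBlocks, length, flags);
//   }
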
#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);

ANONYMOUS_NAMESPACE_END

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE) || (CRYPTOPP_ARM_ASIMD_AVAILABLE) || \
    defined(CRYPTOPP_DOXYGEN_PROCESSING)
NAMESPACE_BEGIN(CryptoPP)
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

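    // Increment of 1 in big-endian compatible with the ctr byte array.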
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. The vector type is
/// usually uint32x4_t or uint64x2_t. F1, F4, and W must use the same word and
/// vector type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

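    // Increment of 1 in big-endian compatible with the ctr byte array.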
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

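    // Increment of 1 in big-endian compatible with the ctr byte array.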
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
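        // Single block: pad with a zero block and reuse the two-block
        // function, as described in the file comments above.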
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
/// 224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_CONST const
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
/// 224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_UNCONST_CAST(T, x) (x)
#elif (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif
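
// A hedged sketch of the workaround at a call site (identifiers are
// illustrative, not exact library code): the subkey pointer is passed
// through MAYBE_UNCONST_CAST so the parameter's const-ness matches what
// SunCC expects.
//
//   return AdvancedProcessBlocks128_4x1_SSE(AES_Enc_Block, AES_Enc_4_Blocks,
//       MAYBE_UNCONST_CAST(word32*, subKeys), rounds, inBlocks,
//       xorBlocks, outBlocks, length, flags);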

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
/// 20670, _mm_loadu_si128 parameter has wrong type</A>
# define M128_CAST(x) ((__m128i *)(void *)(x))
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
/// 20670, _mm_loadu_si128 parameter has wrong type</A>
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#else
# ifndef M128_CAST
#  define M128_CAST(x) ((__m128i *)(void *)(x))
# endif
# ifndef CONST_M128_CAST
#  define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
# endif
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
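        // Single block: pad with a zero block and reuse the two-block
        // function.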
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
/// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ************************** Altivec/Power 4 ************************** //

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
/// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

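    // s_one adds 1 to the low-order 32-bit word of the big-endian counter.
    // Which vector element holds that word depends on how VecLoadBE
    // arranges the lanes on each endianness, hence the two layouts below.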
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is still
                // the last counter byte in memory. The vector addition using a
                // 32-bit element generates a carry into inBlocks[14], and then
                // CTR_ModePolicy increments inBlocks[14] too.
                const_cast<byte*>(inBlocks)[15] += 6;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is still
                // the last counter byte in memory. The vector addition using a
                // 32-bit element generates a carry into inBlocks[14], and then
                // CTR_ModePolicy increments inBlocks[14] too.
                //
                // To find this bug we needed a test case with a ctr of 0xNN...FA.
                // The last octet is 0xFA and adding 6 creates the wrap to trigger
                // the issue. If the last octet was 0xFC then 4 would trigger it.
                // We dumb-lucked into the test with SPECK-128. The test case of
                // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
                uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // __ALTIVEC__

#endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES