Line | Count | Source (jump to first uncovered line) |
1 | | // salsa.cpp - originally written and placed in the public domain by Wei Dai |
2 | | |
3 | | // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code |
4 | | |
5 | | #include "pch.h" |
6 | | #include "config.h" |
7 | | |
8 | | #ifndef CRYPTOPP_GENERATE_X64_MASM |
9 | | |
10 | | #include "salsa.h" |
11 | | #include "argnames.h" |
12 | | #include "misc.h" |
13 | | #include "cpu.h" |
14 | | |
15 | | #if CRYPTOPP_MSC_VERSION |
16 | | # pragma warning(disable: 4702 4740) |
17 | | #endif |
18 | | |
// The assembly paths are switched off via CRYPTOPP_DISABLE_SALSA_ASM for Clang
// due to "Inline assembly operands don't work with .intel_syntax",
// https://llvm.org/bugs/show_bug.cgi?id=24232
21 | | #if defined(CRYPTOPP_DISABLE_SALSA_ASM) |
22 | | # undef CRYPTOPP_X86_ASM_AVAILABLE |
23 | | # undef CRYPTOPP_X32_ASM_AVAILABLE |
24 | | # undef CRYPTOPP_X64_ASM_AVAILABLE |
25 | | # undef CRYPTOPP_SSE2_ASM_AVAILABLE |
26 | | # undef CRYPTOPP_SSSE3_ASM_AVAILABLE |
27 | | #endif |
28 | | |
29 | | ANONYMOUS_NAMESPACE_BEGIN |
30 | | |
// Can't use GetAlignmentOf<word32>() because of C++11 and constexpr.
// Can't use 'const unsigned int' because of MSVC 2013, hence the macro.
33 | | #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) |
34 | | # define ALIGN_SPEC 16 |
35 | | #else |
36 | | # define ALIGN_SPEC 4 |
37 | | #endif |
38 | | |
39 | | ANONYMOUS_NAMESPACE_END |
40 | | |
41 | | NAMESPACE_BEGIN(CryptoPP) |
42 | | |
43 | | #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING) |
44 | | void Salsa20_TestInstantiations() |
45 | | { |
46 | | Salsa20::Encryption x1; |
47 | | XSalsa20::Encryption x2; |
48 | | } |
49 | | #endif |
50 | | |
51 | | void Salsa20_Core(word32* data, unsigned int rounds) |
52 | 77.0k | { |
53 | 77.0k | CRYPTOPP_ASSERT(data != NULLPTR); |
54 | 77.0k | CRYPTOPP_ASSERT(rounds % 2 == 0); |
55 | | |
56 | 77.0k | CRYPTOPP_ALIGN_DATA(ALIGN_SPEC) word32 x[16]; |
57 | | |
58 | 1.30M | for (size_t i = 0; i < 16; ++i) |
59 | 1.23M | x[i] = data[i]; |
60 | | |
61 | | // Rounds must be even |
62 | 385k | for (size_t i = 0; i < rounds; i += 2) |
63 | 308k | { |
64 | 308k | x[ 4] ^= rotlConstant< 7>(x[ 0]+x[12]); |
65 | 308k | x[ 8] ^= rotlConstant< 9>(x[ 4]+x[ 0]); |
66 | 308k | x[12] ^= rotlConstant<13>(x[ 8]+x[ 4]); |
67 | 308k | x[ 0] ^= rotlConstant<18>(x[12]+x[ 8]); |
68 | | |
69 | 308k | x[ 9] ^= rotlConstant< 7>(x[ 5]+x[ 1]); |
70 | 308k | x[13] ^= rotlConstant< 9>(x[ 9]+x[ 5]); |
71 | 308k | x[ 1] ^= rotlConstant<13>(x[13]+x[ 9]); |
72 | 308k | x[ 5] ^= rotlConstant<18>(x[ 1]+x[13]); |
73 | | |
74 | 308k | x[14] ^= rotlConstant< 7>(x[10]+x[ 6]); |
75 | 308k | x[ 2] ^= rotlConstant< 9>(x[14]+x[10]); |
76 | 308k | x[ 6] ^= rotlConstant<13>(x[ 2]+x[14]); |
77 | 308k | x[10] ^= rotlConstant<18>(x[ 6]+x[ 2]); |
78 | | |
79 | 308k | x[ 3] ^= rotlConstant< 7>(x[15]+x[11]); |
80 | 308k | x[ 7] ^= rotlConstant< 9>(x[ 3]+x[15]); |
81 | 308k | x[11] ^= rotlConstant<13>(x[ 7]+x[ 3]); |
82 | 308k | x[15] ^= rotlConstant<18>(x[11]+x[ 7]); |
83 | | |
84 | 308k | x[ 1] ^= rotlConstant< 7>(x[ 0]+x[ 3]); |
85 | 308k | x[ 2] ^= rotlConstant< 9>(x[ 1]+x[ 0]); |
86 | 308k | x[ 3] ^= rotlConstant<13>(x[ 2]+x[ 1]); |
87 | 308k | x[ 0] ^= rotlConstant<18>(x[ 3]+x[ 2]); |
88 | | |
89 | 308k | x[ 6] ^= rotlConstant< 7>(x[ 5]+x[ 4]); |
90 | 308k | x[ 7] ^= rotlConstant< 9>(x[ 6]+x[ 5]); |
91 | 308k | x[ 4] ^= rotlConstant<13>(x[ 7]+x[ 6]); |
92 | 308k | x[ 5] ^= rotlConstant<18>(x[ 4]+x[ 7]); |
93 | | |
94 | 308k | x[11] ^= rotlConstant< 7>(x[10]+x[ 9]); |
95 | 308k | x[ 8] ^= rotlConstant< 9>(x[11]+x[10]); |
96 | 308k | x[ 9] ^= rotlConstant<13>(x[ 8]+x[11]); |
97 | 308k | x[10] ^= rotlConstant<18>(x[ 9]+x[ 8]); |
98 | | |
99 | 308k | x[12] ^= rotlConstant< 7>(x[15]+x[14]); |
100 | 308k | x[13] ^= rotlConstant< 9>(x[12]+x[15]); |
101 | 308k | x[14] ^= rotlConstant<13>(x[13]+x[12]); |
102 | 308k | x[15] ^= rotlConstant<18>(x[14]+x[13]); |
103 | 308k | } |
104 | | |
105 | | // OpenMP 4.0 released July 2013. |
106 | | #if _OPENMP >= 201307 |
107 | | #pragma omp simd |
108 | | for (size_t i = 0; i < 16; ++i) |
109 | | data[i] += x[i]; |
110 | | #else |
111 | 1.30M | for (size_t i = 0; i < 16; ++i) |
112 | 1.23M | data[i] += x[i]; |
113 | 77.0k | #endif |
114 | 77.0k | } |
115 | | |
116 | | std::string Salsa20_Policy::AlgorithmProvider() const |
117 | 0 | { |
118 | | #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_SALSA_ASM) |
119 | | if (HasSSE2()) |
120 | | return "SSE2"; |
121 | | #endif |
122 | 0 | return "C++"; |
123 | 0 | } |
124 | | |
125 | | void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length) |
126 | 0 | { |
127 | | // Use previous rounds as the default value |
128 | 0 | int rounds = params.GetIntValueWithDefault(Name::Rounds(), m_rounds); |
129 | 0 | if (rounds != 20 && rounds != 12 && rounds != 8) |
130 | 0 | throw InvalidRounds(Salsa20::StaticAlgorithmName(), rounds); |
131 | | |
132 | | // Latch a good value |
133 | 0 | m_rounds = rounds; |
134 | | |
135 | | // m_state is reordered for SSE2 |
136 | 0 | GetBlock<word32, LittleEndian> get1(key); |
137 | 0 | get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]); |
138 | 0 | GetBlock<word32, LittleEndian> get2(key + length - 16); |
139 | 0 | get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]); |
140 | | |
141 | | // "expand 16-byte k" or "expand 32-byte k" |
142 | 0 | m_state[0] = 0x61707865; |
143 | 0 | m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e; |
144 | 0 | m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32; |
145 | 0 | m_state[3] = 0x6b206574; |
146 | 0 | } |
147 | | |
148 | | void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length) |
149 | 0 | { |
150 | 0 | CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length); |
151 | 0 | CRYPTOPP_ASSERT(length==8); |
152 | |
|
153 | 0 | GetBlock<word32, LittleEndian> get(IV); |
154 | 0 | get(m_state[14])(m_state[11]); |
155 | 0 | m_state[8] = m_state[5] = 0; |
156 | 0 | } |
157 | | |
158 | | void Salsa20_Policy::SeekToIteration(lword iterationCount) |
159 | 0 | { |
160 | 0 | m_state[8] = (word32)iterationCount; |
161 | 0 | m_state[5] = (word32)SafeRightShift<32>(iterationCount); |
162 | 0 | } |
163 | | |
164 | | #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) |
165 | | unsigned int Salsa20_Policy::GetAlignment() const |
166 | 0 | { |
167 | | #if CRYPTOPP_SSE2_ASM_AVAILABLE |
168 | | if (HasSSE2()) |
169 | | return 16; |
170 | | else |
171 | | #endif |
172 | 0 | return GetAlignmentOf<word32>(); |
173 | 0 | } |
174 | | |
175 | | unsigned int Salsa20_Policy::GetOptimalBlockSize() const |
176 | 0 | { |
177 | | #if CRYPTOPP_SSE2_ASM_AVAILABLE |
178 | | if (HasSSE2()) |
179 | | return 4*BYTES_PER_ITERATION; |
180 | | else |
181 | | #endif |
182 | 0 | return BYTES_PER_ITERATION; |
183 | 0 | } |
184 | | #endif |
185 | | |
186 | | #ifdef CRYPTOPP_X64_MASM_AVAILABLE |
// Stand-alone keystream routine with C linkage, used when
// CRYPTOPP_X64_MASM_AVAILABLE is defined. Its body is the MASM procedure of
// the same name that this file emits when preprocessed with
// CRYPTOPP_GENERATE_X64_MASM (see the note at the top of the file).
// Arguments, in order: output pointer, input pointer, number of iterations,
// round count, pointer to the cipher state words.
extern "C" {
void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
}
190 | | #endif |
191 | | |
192 | | #if CRYPTOPP_MSC_VERSION |
193 | | # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code |
194 | | #endif |
195 | | |
196 | | void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) |
197 | 0 | { |
198 | 0 | #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM |
199 | |
|
200 | | #ifdef CRYPTOPP_X64_MASM_AVAILABLE |
201 | | Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data()); |
202 | | return; |
203 | | #endif |
204 | |
|
205 | | #if CRYPTOPP_SSE2_ASM_AVAILABLE |
206 | | #ifdef CRYPTOPP_GENERATE_X64_MASM |
207 | | ALIGN 8 |
208 | | Salsa20_OperateKeystream PROC FRAME |
209 | | mov r10, [rsp + 5*8] ; state |
210 | | alloc_stack(10*16 + 32*16 + 8) |
211 | | save_xmm128 xmm6, 0200h |
212 | | save_xmm128 xmm7, 0210h |
213 | | save_xmm128 xmm8, 0220h |
214 | | save_xmm128 xmm9, 0230h |
215 | | save_xmm128 xmm10, 0240h |
216 | | save_xmm128 xmm11, 0250h |
217 | | save_xmm128 xmm12, 0260h |
218 | | save_xmm128 xmm13, 0270h |
219 | | save_xmm128 xmm14, 0280h |
220 | | save_xmm128 xmm15, 0290h |
221 | | .endprolog |
222 | | |
223 | | #define REG_output rcx |
224 | | #define REG_input rdx |
225 | | #define REG_iterationCount r8 |
226 | | #define REG_state r10 |
227 | | #define REG_rounds e9d |
228 | | #define REG_roundsLeft eax |
229 | | #define REG_temp32 r11d |
230 | | #define REG_temp r11 |
231 | | #define SSE2_WORKSPACE rsp |
232 | | #else |
233 | | if (HasSSE2()) |
234 | | { |
235 | | #if CRYPTOPP_BOOL_X64 |
236 | | #define REG_output %1 |
237 | | #define REG_input %0 |
238 | | #define REG_iterationCount %2 |
239 | | #define REG_state %4 /* constant */ |
240 | | #define REG_rounds %3 /* constant */ |
241 | | #define REG_roundsLeft eax |
242 | | #define REG_temp32 edx |
243 | | #define REG_temp rdx |
244 | | #define SSE2_WORKSPACE %5 /* constant */ |
245 | | |
246 | | CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32]; |
247 | | #else |
248 | | #define REG_output edi |
249 | | #define REG_input eax |
250 | | #define REG_iterationCount ecx |
251 | | #define REG_state esi |
252 | | #define REG_rounds edx |
253 | | #define REG_roundsLeft ebx |
254 | | #define REG_temp32 ebp |
255 | | #define REG_temp ebp |
256 | | #define SSE2_WORKSPACE esp + WORD_SZ |
257 | | #endif |
258 | | |
259 | | #ifdef __GNUC__ |
260 | | __asm__ __volatile__ |
261 | | ( |
262 | | INTEL_NOPREFIX |
263 | | AS_PUSH_IF86( bx) |
264 | | #else |
265 | | void *s = m_state.data(); |
266 | | word32 r = m_rounds; |
267 | | |
268 | | AS2( mov REG_iterationCount, iterationCount) |
269 | | AS2( mov REG_input, input) |
270 | | AS2( mov REG_output, output) |
271 | | AS2( mov REG_state, s) |
272 | | AS2( mov REG_rounds, r) |
273 | | #endif |
274 | | #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM |
275 | | |
276 | | AS_PUSH_IF86( bp) |
277 | | AS2( cmp REG_iterationCount, 4) |
278 | | ASJ( jl, 5, f) |
279 | | |
280 | | #if CRYPTOPP_BOOL_X86 |
281 | | AS2( mov ebx, esp) |
282 | | AS2( and esp, -16) |
283 | | AS2( sub esp, 32*16) |
284 | | AS1( push ebx) |
285 | | #endif |
286 | | |
287 | | #define SSE2_EXPAND_S(i, j) \ |
288 | | ASS( pshufd xmm4, xmm##i, j, j, j, j) \ |
289 | | AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4) |
290 | | |
291 | | AS2( movdqa xmm0, [REG_state + 0*16]) |
292 | | AS2( movdqa xmm1, [REG_state + 1*16]) |
293 | | AS2( movdqa xmm2, [REG_state + 2*16]) |
294 | | AS2( movdqa xmm3, [REG_state + 3*16]) |
295 | | SSE2_EXPAND_S(0, 0) |
296 | | SSE2_EXPAND_S(0, 1) |
297 | | SSE2_EXPAND_S(0, 2) |
298 | | SSE2_EXPAND_S(0, 3) |
299 | | SSE2_EXPAND_S(1, 0) |
300 | | SSE2_EXPAND_S(1, 2) |
301 | | SSE2_EXPAND_S(1, 3) |
302 | | SSE2_EXPAND_S(2, 1) |
303 | | SSE2_EXPAND_S(2, 2) |
304 | | SSE2_EXPAND_S(2, 3) |
305 | | SSE2_EXPAND_S(3, 0) |
306 | | SSE2_EXPAND_S(3, 1) |
307 | | SSE2_EXPAND_S(3, 2) |
308 | | SSE2_EXPAND_S(3, 3) |
309 | | |
310 | | #define SSE2_EXPAND_S85(i) \ |
311 | | AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \ |
312 | | AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \ |
313 | | AS2( add REG_roundsLeft, 1) \ |
314 | | AS2( adc REG_temp32, 0) |
315 | | |
316 | | ASL(1) |
317 | | AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4]) |
318 | | AS2( mov REG_temp32, dword ptr [REG_state + 5*4]) |
319 | | SSE2_EXPAND_S85(0) |
320 | | SSE2_EXPAND_S85(1) |
321 | | SSE2_EXPAND_S85(2) |
322 | | SSE2_EXPAND_S85(3) |
323 | | AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft) |
324 | | AS2( mov dword ptr [REG_state + 5*4], REG_temp32) |
325 | | |
326 | | #ifdef __XOP__ |
327 | | #define SSE2_QUARTER_ROUND(a, b, d, i) \ |
328 | | AS2( movdqa xmm4, xmm##d) \ |
329 | | AS2( paddd xmm4, xmm##a) \ |
330 | | AS3( vprotd xmm4, xmm4, i) \ |
331 | | AS2( pxor xmm##b, xmm4) |
332 | | #else |
333 | | #define SSE2_QUARTER_ROUND(a, b, d, i) \ |
334 | | AS2( movdqa xmm4, xmm##d) \ |
335 | | AS2( paddd xmm4, xmm##a) \ |
336 | | AS2( movdqa xmm5, xmm4) \ |
337 | | AS2( pslld xmm4, i) \ |
338 | | AS2( psrld xmm5, 32-i) \ |
339 | | AS2( pxor xmm##b, xmm4) \ |
340 | | AS2( pxor xmm##b, xmm5) |
341 | | #endif |
342 | | |
343 | | #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */ |
344 | | #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */ |
345 | | #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */ |
346 | | |
347 | | #ifdef __XOP__ |
348 | | #define L04(A,B,C,D,a,b,c,d,i) |
349 | | #define L05(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 7) |
350 | | #define L06(A,B,C,D,a,b,c,d,i) |
351 | | #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256]) |
352 | | #define L08(A,B,C,D,a,b,c,d,i) |
353 | | #else |
354 | | #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) |
355 | | #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7) |
356 | | #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7) |
357 | | #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256]) |
358 | | #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */ |
359 | | #endif |
360 | | |
361 | | #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A) |
362 | | #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) |
363 | | #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */ |
364 | | |
365 | | #ifdef __XOP__ |
366 | | #define L12(A,B,C,D,a,b,c,d,i) |
367 | | #define L13(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 9) |
368 | | #define L14(A,B,C,D,a,b,c,d,i) |
369 | | #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256]) |
370 | | #define L16(A,B,C,D,a,b,c,d,i) |
371 | | #else |
372 | | #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) |
373 | | #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9) |
374 | | #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9) |
375 | | #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256]) |
376 | | #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */ |
377 | | #endif |
378 | | |
379 | | #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A) |
380 | | #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) |
381 | | #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */ |
382 | | |
383 | | #ifdef __XOP__ |
384 | | #define L20(A,B,C,D,a,b,c,d,i) |
385 | | #define L21(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 13) |
386 | | #define L22(A,B,C,D,a,b,c,d,i) |
387 | | #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) |
388 | | #define L24(A,B,C,D,a,b,c,d,i) |
389 | | #else |
390 | | #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A) |
391 | | #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13) |
392 | | #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13) |
393 | | #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) |
394 | | #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */ |
395 | | #endif |
396 | | |
397 | | #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A) |
398 | | #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */ |
399 | | |
400 | | #ifdef __XOP__ |
401 | | #define L27(A,B,C,D,a,b,c,d,i) |
402 | | #define L28(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 18) |
403 | | #define L29(A,B,C,D,a,b,c,d,i) |
404 | | #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */ |
405 | | #define L31(A,B,C,D,a,b,c,d,i) |
406 | | #else |
407 | | #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A) |
408 | | #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18) |
409 | | #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18) |
410 | | #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */ |
411 | | #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */ |
412 | | #endif |
413 | | |
414 | | #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A) |
415 | | |
416 | | #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \ |
417 | | L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \ |
418 | | L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \ |
419 | | L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \ |
420 | | L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \ |
421 | | L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \ |
422 | | L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \ |
423 | | L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \ |
424 | | L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \ |
425 | | L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \ |
426 | | L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \ |
427 | | L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \ |
428 | | L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \ |
429 | | L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \ |
430 | | L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \ |
431 | | L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \ |
432 | | L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \ |
433 | | L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \ |
434 | | L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \ |
435 | | L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \ |
436 | | L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \ |
437 | | L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \ |
438 | | L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \ |
439 | | L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \ |
440 | | L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \ |
441 | | L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \ |
442 | | L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \ |
443 | | L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \ |
444 | | L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \ |
445 | | L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \ |
446 | | L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \ |
447 | | L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \ |
448 | | L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) |
449 | | |
450 | | #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \ |
451 | | L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \ |
452 | | L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \ |
453 | | L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \ |
454 | | L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \ |
455 | | L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \ |
456 | | L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \ |
457 | | L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \ |
458 | | L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \ |
459 | | L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \ |
460 | | L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \ |
461 | | L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \ |
462 | | L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \ |
463 | | L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \ |
464 | | L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \ |
465 | | L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \ |
466 | | L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \ |
467 | | L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \ |
468 | | L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \ |
469 | | L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \ |
470 | | L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \ |
471 | | L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \ |
472 | | L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \ |
473 | | L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \ |
474 | | L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \ |
475 | | L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \ |
476 | | L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \ |
477 | | L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \ |
478 | | L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \ |
479 | | L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \ |
480 | | L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \ |
481 | | L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \ |
482 | | L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i) |
483 | | |
484 | | #if CRYPTOPP_BOOL_X64 |
485 | | SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) |
486 | | #else |
487 | | SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15) |
488 | | SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13) |
489 | | #endif |
490 | | AS2( mov REG_roundsLeft, REG_rounds) |
491 | | ASJ( jmp, 2, f) |
492 | | |
493 | | ASL(SSE2_Salsa_Output) |
494 | | AS2( movdqa xmm0, xmm4) |
495 | | AS2( punpckldq xmm4, xmm5) |
496 | | AS2( movdqa xmm1, xmm6) |
497 | | AS2( punpckldq xmm6, xmm7) |
498 | | AS2( movdqa xmm2, xmm4) |
499 | | AS2( punpcklqdq xmm4, xmm6) // e |
500 | | AS2( punpckhqdq xmm2, xmm6) // f |
501 | | AS2( punpckhdq xmm0, xmm5) |
502 | | AS2( punpckhdq xmm1, xmm7) |
503 | | AS2( movdqa xmm6, xmm0) |
504 | | AS2( punpcklqdq xmm0, xmm1) // g |
505 | | AS2( punpckhqdq xmm6, xmm1) // h |
506 | | AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1) |
507 | | AS1( ret) |
508 | | |
509 | | ASL(6) |
510 | | #if CRYPTOPP_BOOL_X64 |
511 | | SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) |
512 | | ASL(2) |
513 | | SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6) |
514 | | #else |
515 | | SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15) |
516 | | SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13) |
517 | | ASL(2) |
518 | | SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6) |
519 | | SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4) |
520 | | #endif |
521 | | AS2( sub REG_roundsLeft, 2) |
522 | | ASJ( jnz, 6, b) |
523 | | |
524 | | #define SSE2_OUTPUT_4(a, b, c, d) \ |
525 | | AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\ |
526 | | AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\ |
527 | | AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\ |
528 | | AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\ |
529 | | AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\ |
530 | | AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\ |
531 | | AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\ |
532 | | AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\ |
533 | | ASC( call, SSE2_Salsa_Output) |
534 | | |
535 | | SSE2_OUTPUT_4(0, 13, 10, 7) |
536 | | SSE2_OUTPUT_4(4, 1, 14, 11) |
537 | | SSE2_OUTPUT_4(8, 5, 2, 15) |
538 | | SSE2_OUTPUT_4(12, 9, 6, 3) |
539 | | AS2( test REG_input, REG_input) |
540 | | ASJ( jz, 9, f) |
541 | | AS2( add REG_input, 12*16) |
542 | | ASL(9) |
543 | | AS2( add REG_output, 12*16) |
544 | | AS2( sub REG_iterationCount, 4) |
545 | | AS2( cmp REG_iterationCount, 4) |
546 | | ASJ( jge, 1, b) |
547 | | AS_POP_IF86( sp) |
548 | | |
549 | | ASL(5) |
550 | | AS2( sub REG_iterationCount, 1) |
551 | | ASJ( jl, 4, f) |
552 | | AS2( movdqa xmm0, [REG_state + 0*16]) |
553 | | AS2( movdqa xmm1, [REG_state + 1*16]) |
554 | | AS2( movdqa xmm2, [REG_state + 2*16]) |
555 | | AS2( movdqa xmm3, [REG_state + 3*16]) |
556 | | AS2( mov REG_roundsLeft, REG_rounds) |
557 | | |
558 | | ASL(0) |
559 | | SSE2_QUARTER_ROUND(0, 1, 3, 7) |
560 | | SSE2_QUARTER_ROUND(1, 2, 0, 9) |
561 | | SSE2_QUARTER_ROUND(2, 3, 1, 13) |
562 | | SSE2_QUARTER_ROUND(3, 0, 2, 18) |
563 | | ASS( pshufd xmm1, xmm1, 2, 1, 0, 3) |
564 | | ASS( pshufd xmm2, xmm2, 1, 0, 3, 2) |
565 | | ASS( pshufd xmm3, xmm3, 0, 3, 2, 1) |
566 | | SSE2_QUARTER_ROUND(0, 3, 1, 7) |
567 | | SSE2_QUARTER_ROUND(3, 2, 0, 9) |
568 | | SSE2_QUARTER_ROUND(2, 1, 3, 13) |
569 | | SSE2_QUARTER_ROUND(1, 0, 2, 18) |
570 | | ASS( pshufd xmm1, xmm1, 0, 3, 2, 1) |
571 | | ASS( pshufd xmm2, xmm2, 1, 0, 3, 2) |
572 | | ASS( pshufd xmm3, xmm3, 2, 1, 0, 3) |
573 | | AS2( sub REG_roundsLeft, 2) |
574 | | ASJ( jnz, 0, b) |
575 | | |
576 | | AS2( paddd xmm0, [REG_state + 0*16]) |
577 | | AS2( paddd xmm1, [REG_state + 1*16]) |
578 | | AS2( paddd xmm2, [REG_state + 2*16]) |
579 | | AS2( paddd xmm3, [REG_state + 3*16]) |
580 | | |
581 | | AS2( add dword ptr [REG_state + 8*4], 1) |
582 | | AS2( adc dword ptr [REG_state + 5*4], 0) |
583 | | |
584 | | AS2( pcmpeqb xmm6, xmm6) // all ones |
585 | | AS2( psrlq xmm6, 32) // lo32 mask |
586 | | ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask |
587 | | AS2( movdqa xmm4, xmm0) |
588 | | AS2( movdqa xmm5, xmm3) |
589 | | AS2( pand xmm0, xmm7) |
590 | | AS2( pand xmm4, xmm6) |
591 | | AS2( pand xmm3, xmm6) |
592 | | AS2( pand xmm5, xmm7) |
593 | | AS2( por xmm4, xmm5) // 0,13,2,15 |
594 | | AS2( movdqa xmm5, xmm1) |
595 | | AS2( pand xmm1, xmm7) |
596 | | AS2( pand xmm5, xmm6) |
597 | | AS2( por xmm0, xmm5) // 4,1,6,3 |
598 | | AS2( pand xmm6, xmm2) |
599 | | AS2( pand xmm2, xmm7) |
600 | | AS2( por xmm1, xmm6) // 8,5,10,7 |
601 | | AS2( por xmm2, xmm3) // 12,9,14,11 |
602 | | |
603 | | AS2( movdqa xmm5, xmm4) |
604 | | AS2( movdqa xmm6, xmm0) |
605 | | AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7 |
606 | | AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11 |
607 | | AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15 |
608 | | AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3 |
609 | | |
610 | | // output keystream |
611 | | AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4) |
612 | | ASJ( jmp, 5, b) |
613 | | ASL(4) |
614 | | |
615 | | AS_POP_IF86( bp) |
616 | | #ifdef __GNUC__ |
617 | | AS_POP_IF86( bx) |
618 | | ATT_PREFIX |
619 | | #if CRYPTOPP_BOOL_X64 |
620 | | : "+r" (input), "+r" (output), "+r" (iterationCount) |
621 | | : "r" (m_rounds), "r" (m_state.begin()), "r" (workspace) |
622 | | : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" |
623 | | #else |
624 | | : "+a" (input), "+D" (output), "+c" (iterationCount) |
625 | | : "d" (m_rounds), "S" (m_state.begin()) |
626 | | : "memory", "cc" |
627 | | #endif |
628 | | ); |
629 | | #endif |
630 | | #ifdef CRYPTOPP_GENERATE_X64_MASM |
631 | | movdqa xmm6, [rsp + 0200h] |
632 | | movdqa xmm7, [rsp + 0210h] |
633 | | movdqa xmm8, [rsp + 0220h] |
634 | | movdqa xmm9, [rsp + 0230h] |
635 | | movdqa xmm10, [rsp + 0240h] |
636 | | movdqa xmm11, [rsp + 0250h] |
637 | | movdqa xmm12, [rsp + 0260h] |
638 | | movdqa xmm13, [rsp + 0270h] |
639 | | movdqa xmm14, [rsp + 0280h] |
640 | | movdqa xmm15, [rsp + 0290h] |
641 | | add rsp, 10*16 + 32*16 + 8 |
642 | | ret |
643 | | Salsa20_OperateKeystream ENDP |
644 | | #else |
645 | | } |
646 | | else |
647 | | #endif |
648 | | #endif |
649 | 0 | #ifndef CRYPTOPP_GENERATE_X64_MASM |
650 | 0 | { |
651 | 0 | word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; |
652 | |
|
653 | 0 | while (iterationCount--) |
654 | 0 | { |
655 | 0 | x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; |
656 | 0 | x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7]; |
657 | 0 | x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11]; |
658 | 0 | x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15]; |
659 | |
|
660 | 0 | for (int i=m_rounds; i>0; i-=2) |
661 | 0 | { |
662 | 0 | #define QUARTER_ROUND(a, b, c, d) \ |
663 | 0 | b = b ^ rotlConstant<7>(a + d); \ |
664 | 0 | c = c ^ rotlConstant<9>(b + a); \ |
665 | 0 | d = d ^ rotlConstant<13>(c + b); \ |
666 | 0 | a = a ^ rotlConstant<18>(d + c); |
667 | |
|
668 | 0 | QUARTER_ROUND(x0, x4, x8, x12) |
669 | 0 | QUARTER_ROUND(x1, x5, x9, x13) |
670 | 0 | QUARTER_ROUND(x2, x6, x10, x14) |
671 | 0 | QUARTER_ROUND(x3, x7, x11, x15) |
672 | |
|
673 | 0 | QUARTER_ROUND(x0, x13, x10, x7) |
674 | 0 | QUARTER_ROUND(x1, x14, x11, x4) |
675 | 0 | QUARTER_ROUND(x2, x15, x8, x5) |
676 | 0 | QUARTER_ROUND(x3, x12, x9, x6) |
677 | 0 | } |
678 | |
|
679 | 0 | #ifndef CRYPTOPP_DOXYGEN_PROCESSING |
680 | 0 | #define SALSA_OUTPUT(x) {\ |
681 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ |
682 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ |
683 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ |
684 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ |
685 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ |
686 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ |
687 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ |
688 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ |
689 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ |
690 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ |
691 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ |
692 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ |
693 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ |
694 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ |
695 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ |
696 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} |
697 | |
|
698 | 0 | CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); |
699 | 0 | #undef SALSA_OUTPUT |
700 | 0 | #endif |
701 | |
|
702 | 0 | if (++m_state[8] == 0) |
703 | 0 | ++m_state[5]; |
704 | 0 | } |
705 | 0 | } |
706 | 0 | } // see comment above if an internal compiler error occurs here |
707 | | |
708 | | void XSalsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length) |
709 | 0 | { |
710 | 0 | m_rounds = params.GetIntValueWithDefault(Name::Rounds(), m_rounds); |
711 | 0 | if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20)) |
712 | 0 | throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds); |
713 | | |
714 | 0 | GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length); |
715 | 0 | if (length == 16) |
716 | 0 | std::memcpy(m_key.begin()+4, m_key.begin(), 16); |
717 | | |
718 | | // "expand 32-byte k" |
719 | 0 | m_state[0] = 0x61707865; |
720 | 0 | m_state[1] = 0x3320646e; |
721 | 0 | m_state[2] = 0x79622d32; |
722 | 0 | m_state[3] = 0x6b206574; |
723 | 0 | } |
724 | | |
725 | | void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length) |
726 | 0 | { |
727 | 0 | CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length); |
728 | 0 | CRYPTOPP_ASSERT(length==24); |
729 | |
|
730 | 0 | word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; |
731 | |
|
732 | 0 | GetBlock<word32, LittleEndian> get(IV); |
733 | 0 | get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]); |
734 | |
|
735 | 0 | x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3]; |
736 | 0 | x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7]; |
737 | 0 | x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; |
738 | |
|
739 | 0 | for (int i=m_rounds; i>0; i-=2) |
740 | 0 | { |
741 | 0 | QUARTER_ROUND(x0, x4, x8, x12) |
742 | 0 | QUARTER_ROUND(x1, x5, x9, x13) |
743 | 0 | QUARTER_ROUND(x2, x6, x10, x14) |
744 | 0 | QUARTER_ROUND(x3, x7, x11, x15) |
745 | |
|
746 | 0 | QUARTER_ROUND(x0, x13, x10, x7) |
747 | 0 | QUARTER_ROUND(x1, x14, x11, x4) |
748 | 0 | QUARTER_ROUND(x2, x15, x8, x5) |
749 | 0 | QUARTER_ROUND(x3, x12, x9, x6) |
750 | 0 | } |
751 | |
|
752 | 0 | m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3; |
753 | 0 | m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5; |
754 | 0 | m_state[8] = m_state[5] = 0; |
755 | 0 | } |
756 | | |
757 | | NAMESPACE_END |
758 | | |
759 | | #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM |