/src/SymCrypt/lib/ghash.c
Line | Count | Source
1 | | // |
2 | | // GHASH.c |
3 | | // |
4 | | // Implementation of the NIST SP800-38D GHASH function which is the |
5 | | // core authentication function for the GCM and GMAC modes. |
6 | | // |
7 | | // This implementation was done by Niels Ferguson for the RSA32.lib library in 2008, |
8 | | // and adapted to the SymCrypt library in 2009. |
9 | | // |
10 | | // Copyright (c) Microsoft Corporation. Licensed under the MIT license. |
11 | | // |
12 | | |
13 | | #include "precomp.h" |
14 | | #include "ghash_definitions.h" |
15 | | |
16 | | ////////////////////////////////////////////////////////////////////////////// |
17 | | // Platform-independent code |
18 | | // |
19 | | |
20 | | // |
21 | | // GHashExpandKeyC |
22 | | // Generic GHash key expansion routine, works on all platforms. |
23 | | // This function computes a table of H, H*x, H*x^2, H*x^3, ..., H*x^127
24 | | // |
25 | | VOID |
26 | | SYMCRYPT_CALL |
27 | | SymCryptGHashExpandKeyC( |
28 | | _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey, |
29 | | _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH ) |
30 | 0 | { |
31 | 0 | UINT64 H0, H1, t; |
32 | 0 | UINT32 i; |
33 | | |
34 | | // |
35 | | // (H1, H0) form a 128-bit integer, H1 is the upper part, H0 the lower part. |
36 | | // Convert pH[] to (H1, H0) using MSByte first convention. |
37 | | // |
38 | 0 | H1 = SYMCRYPT_LOAD_MSBFIRST64( &pH[0] ); |
39 | 0 | H0 = SYMCRYPT_LOAD_MSBFIRST64( &pH[8] ); |
40 | |
41 | 0 | for( i=0; i<SYMCRYPT_GF128_FIELD_SIZE; i++ ) |
42 | 0 | { |
43 | 0 | expandedKey[i].ull[0] = H0; |
44 | 0 | expandedKey[i].ull[1] = H1; |
45 | | // |
46 | | // Multiply (H1,H0) by x in the GF(2^128) field using the field encoding from SP800-38D |
47 | | // |
48 | 0 | t = UINT64_NEG(H0 & 1) & ((UINT64)GF128_FIELD_R_BYTE << (8 * ( sizeof( UINT64 ) - 1 )) ) ; |
49 | 0 | H0 = (H0 >> 1) | (H1 << 63); |
50 | 0 | H1 = (H1 >> 1) ^ t; |
51 | 0 | } |
52 | 0 | } |
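//
// Illustration of what the table holds: after the loop above, expandedKey[i] = H * x^i
// for i = 0, 1, ..., 127. For any field element V = v_0 + v_1*x + ... + v_127*x^127,
//
//      V * H  =  XOR over all i with v_i = 1 of expandedKey[i]
//
// which is exactly the masked-XOR accumulation that the AppendData routines below
// perform, one bit of the (state XOR data) block at a time.
//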
53 | | |
54 | | |
55 | | // |
56 | | // GHashAppendDataC |
57 | | // Generic GHash routine, works on all platforms. |
58 | | // |
59 | | VOID |
60 | | SYMCRYPT_CALL |
61 | | SymCryptGHashAppendDataC( |
62 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
63 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
64 | | _In_reads_( cbData ) PCBYTE pbData, |
65 | | SIZE_T cbData ) |
66 | 0 | { |
67 | 0 | UINT64 R0, R1; |
68 | 0 | UINT64 mask; |
69 | 0 | SYMCRYPT_ALIGN UINT32 state32[4]; |
70 | 0 | UINT32 t; |
71 | 0 | int i,j; |
72 | 0 | while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE ) |
73 | 0 | { |
74 | 0 | R0 = R1 = 0; |
75 | | |
76 | | // |
77 | | // We have two nested loops so that we can do most of our operations |
78 | | // on 32-bit words. 64-bit rotates/shifts can be really slow on a 32-bit CPU. |
79 | | // On AMD64 we use the XMM version which is much faster. |
80 | | // |
81 | 0 | state32[0] = (UINT32)pState->ull[0]; |
82 | 0 | state32[1] = (UINT32)(pState->ull[0] >> 32); |
83 | 0 | state32[2] = (UINT32)pState->ull[1]; |
84 | 0 | state32[3] = (UINT32)(pState->ull[1] >> 32); |
85 | 0 | for( i=0; i<4; i++ ) |
86 | 0 | { |
87 | 0 | t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ state32[3-i]; |
88 | 0 | for( j=31; j>=0; j-- ) |
89 | 0 | { |
90 | 0 | mask = (UINT64)( -(INT64)(t & 1 )); |
91 | 0 | R0 ^= expandedKeyTable[32*i+j].ull[0] & mask; |
92 | 0 | R1 ^= expandedKeyTable[32*i+j].ull[1] & mask; |
93 | 0 | t >>= 1; |
94 | 0 | } |
95 | 0 | } |
96 | 0 | pState->ull[0] = R0; |
97 | 0 | pState->ull[1] = R1; |
98 | 0 | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
99 | 0 | cbData -= SYMCRYPT_GF128_BLOCK_SIZE; |
100 | 0 | } |
101 | |
102 | 0 | SymCryptWipeKnownSize( state32, sizeof( state32 ) ); |
103 | 0 | } |
104 | | |
105 | | |
106 | | VOID |
107 | | SYMCRYPT_CALL |
108 | | SymCryptGHashResult( |
109 | | _In_ PCSYMCRYPT_GF128_ELEMENT pState, |
110 | | _Out_writes_( SYMCRYPT_GF128_BLOCK_SIZE ) PBYTE pbResult ) |
111 | 0 | { |
112 | 0 | SYMCRYPT_STORE_MSBFIRST64( pbResult , pState->ull[1] ); |
113 | 0 | SYMCRYPT_STORE_MSBFIRST64( pbResult + 8, pState->ull[0] ); |
114 | 0 | } |
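//
// A minimal usage sketch (hypothetical helper, not part of the library) tying the three
// C routines above together: it computes GHASH_H over whole-block input, i.e. the
// SP800-38D recurrence Y_i = (Y_{i-1} xor X_i) * H with Y_0 = 0. cbData is assumed to
// be a multiple of SYMCRYPT_GF128_BLOCK_SIZE; partial blocks are the caller's problem,
// just as in GCM itself.
//
VOID
SYMCRYPT_CALL
GHashOneShotExample(
    _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE )     PCBYTE  pH,
    _In_reads_( cbData )                        PCBYTE  pbData,
                                                SIZE_T  cbData,
    _Out_writes_( SYMCRYPT_GF128_BLOCK_SIZE )   PBYTE   pbResult )
{
    SYMCRYPT_ALIGN SYMCRYPT_GF128_ELEMENT   table[SYMCRYPT_GF128_FIELD_SIZE];
    SYMCRYPT_GF128_ELEMENT                  state;

    state.ull[0] = 0;                                   // Y_0 = 0
    state.ull[1] = 0;

    SymCryptGHashExpandKeyC( table, pH );               // table[i] = H * x^i
    SymCryptGHashAppendDataC( table, &state, pbData, cbData );
    SymCryptGHashResult( &state, pbResult );            // MSByte-first encoding of the result

    SymCryptWipeKnownSize( table, sizeof( table ) );    // the table is key material
    SymCryptWipeKnownSize( &state, sizeof( state ) );
}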
115 | | |
116 | | //////////////////////////////////////////////////////////////////////////////////////////// |
117 | | // XMM code |
118 | | // |
119 | | |
120 | | VOID |
121 | | SYMCRYPT_CALL |
122 | | SymCryptGHashExpandKeyXmm( |
123 | | _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey, |
124 | | _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH ) |
125 | 0 | { |
126 | | // |
127 | | // We use the same layout for XMM code as we did for C code, so we can use the same key |
128 | | // expansion code. |
129 | | // Improvement: we can add an expansion routine that uses the XMM registers for speed. |
130 | | // |
131 | |
132 | 0 | SymCryptGHashExpandKeyC( expandedKey, pH ); |
133 | 0 | } |
134 | | |
135 | | #if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 |
136 | | // |
137 | | // The XMM-based GHash append data function, only on AMD64 & X86 |
138 | | // |
139 | | VOID |
140 | | SYMCRYPT_CALL |
141 | | SymCryptGHashAppendDataXmm( |
142 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
143 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
144 | | _In_reads_( cbData ) PCBYTE pbData, |
145 | | SIZE_T cbData ) |
146 | 0 | { |
147 | 0 | __m128i R; |
148 | 0 | __m128i cmpValue; |
149 | 0 | __m128i mask; |
150 | 0 | __m128i T; |
151 | 0 | __m128i tmp; |
152 | |
153 | 0 | PCSYMCRYPT_GF128_ELEMENT p; |
154 | 0 | PCSYMCRYPT_GF128_ELEMENT pLimit; |
155 | 0 | UINT32 t; |
156 | 0 | int i; |
157 | |
158 | 0 | cmpValue = _mm_setzero_si128(); // cmpValue = 0 |
159 | |
160 | 0 | while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE ) |
161 | 0 | { |
162 | 0 | R = _mm_setzero_si128(); |
163 | | |
164 | | // |
165 | | // The amd64 compiler can't optimize array indices in a loop where |
166 | | // you use _mm intrinics, |
167 | | // so we do all the pointer arithmetic for the compiler. |
168 | | // |
169 | 0 | p = &expandedKeyTable[0]; |
170 | 0 | pLimit = &expandedKeyTable[32]; |
171 | |
172 | 0 | for( i=0; i<4; i++ ) |
173 | 0 | { |
174 | | // |
175 | | // Set up our XMM register with 4 identical 32-bit integers so that |
176 | | // we can generate the mask from the individual bits of the 32-bit value. |
177 | | // Note the use of tmp; if we assign directly to the fields of T the |
178 | | // compiler no longer caches T in an XMM register, which is bad. |
179 | | // |
180 | | // There are XMM instructions where we can do the duplication in the XMM |
181 | | // registers, but they require SSE3 support, and this code only requires |
182 | | // SSE2. As the inner loop consumes most of the time, it isn't worth |
183 | | // using the SSE3 instructions. |
184 | | // |
185 | | // Note that accessing the state as an array of UINT32s depends on the |
186 | | // endianness of the CPU, but this is XMM code that only runs on |
187 | | // little endian machines. |
188 | | // |
189 | 0 | t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ pState->ul[3-i]; |
190 | 0 | tmp = _mm_set_epi32(t, t, t, t); |
191 | |
192 | 0 | T = tmp; |
193 | 0 | while( p < pLimit ) |
194 | 0 | { |
195 | | // |
196 | | // p and pLimit are always at indexes that are multiples of 4 from
197 | | // the start of the array. |
198 | | // We need to explain to prefast that this means that p <= pLimit - 4 |
199 | | // |
200 | 0 | SYMCRYPT_ASSERT( p <= pLimit - 4 ); |
201 | |
202 | 0 | mask = _mm_cmpgt_epi32( cmpValue, T ); |
203 | 0 | T = _mm_add_epi32( T, T ); |
204 | 0 | mask = _mm_and_si128( mask, p[0].m128i ); |
205 | 0 | R = _mm_xor_si128( R, mask ); |
206 | |
207 | 0 | mask = _mm_cmpgt_epi32( cmpValue, T ); |
208 | 0 | T = _mm_add_epi32( T, T ); |
209 | 0 | mask = _mm_and_si128( mask, p[1].m128i ); |
210 | 0 | R = _mm_xor_si128( R, mask ); |
211 | |
|
212 | 0 | mask = _mm_cmpgt_epi32( cmpValue, T ); |
213 | 0 | T = _mm_add_epi32( T, T ); |
214 | 0 | mask = _mm_and_si128( mask, p[2].m128i ); |
215 | 0 | R = _mm_xor_si128( R, mask ); |
216 | |
|
217 | 0 | mask = _mm_cmpgt_epi32( cmpValue, T ); |
218 | 0 | T = _mm_add_epi32( T, T ); |
219 | 0 | mask = _mm_and_si128( mask, p[3].m128i ); |
220 | 0 | R = _mm_xor_si128( R, mask ); |
221 | |
|
222 | 0 | p += 4; |
223 | 0 | } |
224 | 0 | pLimit += 32; |
225 | 0 | } |
226 | |
227 | 0 | pState->m128i = R; |
228 | 0 | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
229 | 0 | cbData -= SYMCRYPT_GF128_BLOCK_SIZE; |
230 | 0 | } |
231 | 0 | } |
232 | | #endif |
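//
// Scalar sketch (illustration only, hypothetical helper) of the constant-time selection
// used in SymCryptGHashAppendDataXmm above: every table entry is either fully XORed into
// the accumulator or skipped, via an all-ones/all-zeroes mask derived from one bit of t,
// so the sequence of memory accesses and instructions does not depend on the data.
// The sign-bit compare plus T = T + T in the XMM loop walks the bits of t from most to
// least significant, exactly like the k loop below.
//
VOID
SYMCRYPT_CALL
GHashBitByBitSelectExample(
    _In_reads_( 32 )    PCSYMCRYPT_GF128_ELEMENT    pTable,     // 32 consecutive table entries
                        UINT32                      t,
    _Inout_             PSYMCRYPT_GF128_ELEMENT     pAcc )
{
    int k;

    for( k = 0; k < 32; k++ )
    {
        UINT64 mask = (UINT64)0 - (UINT64)(t >> 31);    // all ones iff the top bit of t is set
        pAcc->ull[0] ^= pTable[k].ull[0] & mask;
        pAcc->ull[1] ^= pTable[k].ull[1] & mask;
        t <<= 1;                                        // corresponds to T = T + T above
    }
}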
233 | | |
234 | | #if SYMCRYPT_CPU_ARM | SYMCRYPT_CPU_ARM64 |
235 | | // |
236 | | // The NEON-based GHash append data function, only on ARM & ARM64 |
237 | | // |
238 | | VOID |
239 | | SYMCRYPT_CALL |
240 | | SymCryptGHashAppendDataNeon( |
241 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
242 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
243 | | _In_reads_( cbData ) PCBYTE pbData, |
244 | | SIZE_T cbData ) |
245 | | { |
246 | | // Room for improvement: replace the non-crypto NEON code below, which is based on a bit-by-bit lookup,
247 | | // with pmull on 8b elements - the 8x(8bx8b) -> 8x(16b) pmull is a NEON instruction since Armv7
248 | | // |
249 | | // When properly unrolled: |
250 | | // 1 (64bx64b -> 128b) pmull instruction and 1 eor instruction can be replaced by |
251 | | // 8 (8x(8bx8b) -> 8x(16b)) pmull instructions and 8 eor instructions |
252 | | // so each 128b of data could be processed by less than 64 instructions (using karatsuba) |
253 | | // rather than ~512 instructions (bit by bit) |
254 | | // |
255 | | // Not a priority; we expect that AES-GCM performance will be dominated by AES on these platforms
256 | | |
257 | | __n128 R; |
258 | | __n128 cmpValue; |
259 | | __n128 mask; |
260 | | __n128 T; |
261 | | |
262 | | PCSYMCRYPT_GF128_ELEMENT p; |
263 | | PCSYMCRYPT_GF128_ELEMENT pLimit; |
264 | | UINT32 t; |
265 | | int i; |
266 | | |
267 | | cmpValue = vdupq_n_u32(0); // cmpValue = 0 |
268 | | |
269 | | while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE ) |
270 | | { |
271 | | R = cmpValue; |
272 | | |
273 | | // |
274 | | // Do all the pointer arithmetic for the compiler. |
275 | | // |
276 | | p = &expandedKeyTable[0]; |
277 | | pLimit = &expandedKeyTable[32]; |
278 | | |
279 | | for( i=0; i<4; i++ ) |
280 | | { |
281 | | // |
282 | | // Set up our NEON register with 4 identical 32-bit integers so that
283 | | // we can generate the mask from the individual bits of the 32-bit value.
284 | | // Unlike the XMM version we can use vdupq_n_u32 directly, so no tmp
285 | | // variable is needed to keep T cached in a register.
286 | | // |
287 | | // Note that accessing the state as an array of UINT32s depends on the |
288 | | // endianness of the CPU, but Arm code is always expected to execute in |
289 | | // little endian mode. |
290 | | // |
291 | | t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ pState->ul[3-i]; |
292 | | T = vdupq_n_u32( t ); |
293 | | |
294 | | while( p < pLimit ) |
295 | | { |
296 | | // |
297 | | // p and pLimit are always at indexes that are multiples of 4 from
298 | | // the start of the array. |
299 | | // We need to explain to prefast that this means that p <= pLimit - 4 |
300 | | // |
301 | | SYMCRYPT_ASSERT( p <= pLimit - 4 ); |
302 | | |
303 | | mask = vcgtq_s32( cmpValue, T ); |
304 | | T = vaddq_u32( T, T ); |
305 | | mask = vandq_u32( mask, p[0].n128 ); |
306 | | R = veorq_u32( R, mask ); |
307 | | |
308 | | mask = vcgtq_s32( cmpValue, T ); |
309 | | T = vaddq_u32( T, T ); |
310 | | mask = vandq_u32( mask, p[1].n128 ); |
311 | | R = veorq_u32( R, mask ); |
312 | | |
313 | | mask = vcgtq_s32( cmpValue, T ); |
314 | | T = vaddq_u32( T, T ); |
315 | | mask = vandq_u32( mask, p[2].n128 ); |
316 | | R = veorq_u32( R, mask ); |
317 | | |
318 | | mask = vcgtq_s32( cmpValue, T ); |
319 | | T = vaddq_u32( T, T ); |
320 | | mask = vandq_u32( mask, p[3].n128 ); |
321 | | R = veorq_u32( R, mask ); |
322 | | |
323 | | p += 4; |
324 | | } |
325 | | pLimit += 32; |
326 | | } |
327 | | |
328 | | pState->n128 = R; |
329 | | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
330 | | cbData -= SYMCRYPT_GF128_BLOCK_SIZE; |
331 | | } |
332 | | } |
333 | | #endif |
334 | | |
335 | | |
336 | | ////////////////////////////////////////////////////////////////////////////////////// |
337 | | // Pclmulqdq implementation |
338 | | // |
339 | | |
340 | | /* |
341 | | GHASH GF(2^128) multiplication using PCLMULQDQ |
342 | | |
343 | | The GF(2^128) field used in GHASH is GF(2)[x]/p(x) where p(x) is the primitive polynomial |
344 | | x^128 + x^7 + x^2 + x + 1 |
345 | | |
346 | | Notation: We use the standard mathematical notation '+' for the addition in the field, |
347 | | which corresponds to a xor of the bits. |
348 | | |
349 | | Multiplication: |
350 | | Given two field elements A and B (represented as 128-bit values), |
351 | | we first compute the polynomial product |
352 | | (C,D) := A * B |
353 | | where C and D are also 128-bit values. |
354 | | |
355 | | The PCLMULQDQ instruction performs a 64 x 64 -> 128 bit carryless multiplication. |
356 | | To multiply 128-bit values we write A = (A1, A0) and B = (B1, B0) in two 64-bit halves. |
357 | | |
358 | | The schoolbook multiplication is computed by |
359 | | (C, D) = (A1 * B1)x^128 + (A1 * B0 + A0 * B1)x^64 + (A0 * B0) |
359 | | This requires four PCLMULQDQ instructions. The middle 128-bit result has to be shifted
361 | | left and right, and each half added to the upper and lower 128-bit result to get (C,D). |
362 | | |
363 | | Alternatively, the middle 128-bit intermediate result can be computed using Karatsuba:
364 | | (A1*B0 + A0*B1) = (A1 + A0) * (B1 + B0) + (A1*B1) + (A0*B0) |
365 | | This requires only one PCLMULQDQ instruction to multiply (A1 + A0) by (B1 + B0) |
366 | | as the other two products are already computed. |
367 | | Whether this is faster depends on the relative speed of shift/xor versus PCLMULQDQ.
368 | | |
369 | | Both multiplication algorithms produce three 128-bit intermediate results (R1, Rmid, R0), |
370 | | with the full result defined by R1 x^128 + Rmid x^64 + R0. |
371 | | If we do Multiply-Accumulate then we can accumulate the three 128-bit intermediate results |
372 | | directly. As there are no carries, there is no overflow, and the combining of the three |
373 | | intermediate results into a 256-bit result can be shared amongst all multiplications. |
374 | | |
375 | | |
376 | | Modulo reduction: |
377 | | We use << and >> to denote shifts on 128-bit values. |
378 | | The modulo reduction can now be done as follows: |
379 | | given a 256-bit value (C,D) representing C x^128 + D we compute |
380 | | (T1,T0) := C + C*x + C * x^2 + C * x^7 |
381 | | R := D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) |
382 | | |
383 | | (T1,T0) is just the value C x^128 reduced one step modulo p(x). The value T1 is at most 7 bits,
384 | | so in the next step the reduction, which computes the result R, is easy. The
385 | | expression T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) is just T1 * x^128 reduced modulo p(x). |
386 | | |
387 | | Let's first get rid of the polynomial arithmetic and write this completely using shifts on |
388 | | 128-bit values. |
389 | | |
390 | | T0 := C + (C << 1) + (C << 2) + (C << 7) |
391 | | T1 := (C >> 127) + (C >> 126) + (C >> 121) |
392 | | R := D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) |
393 | | |
394 | | We can optimize this by rewriting the equations |
395 | | |
396 | | T2 := T1 + C |
397 | | = C + (C>>127) + (C>>126) + (C>>121) |
398 | | R = D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) |
399 | | = D + C + (C << 1) + (C << 2) + (C << 7) + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) |
400 | | = D + T2 + (T2 << 1) + (T2 << 2) + (T2 << 7) |
401 | | |
402 | | Thus |
403 | | T2 = C + (C>>127) + (C>>126) + (C>>121) |
404 | | R = D + T2 + (T2 << 1) + (T2 << 2) + (T2 << 7) |
405 | | |
406 | | This gets the right result and uses only 6 shifts.
407 | | |
408 | | The SSE instruction set does not implement bit-shifts of 128-bit values. Instead, we will |
409 | | use bit-shifts of the 32-bit subvalues, and byte shifts (shifts by a multiple of 8 bits) |
410 | | on the full 128-bit values. |
411 | | We use the <<<< and >>>> operators to denote shifts on 32-bit subwords. |
412 | | |
413 | | We can now do the modulo reduction by |
414 | | |
415 | | t1 := (C >> 127) = (C >>>> 31) >> 96 |
416 | | t2 := (C >> 126) = (C >>>> 30) >> 96 |
417 | | t3 := (C >> 121) = (C >>>> 25) >> 96 |
418 | | T2 = C + t1 + t2 + t3 |
419 | | |
420 | | The left-shifts in the computation of R are a bit more involved, as we have to move bits from
421 | | one subword to the next |
422 | | |
423 | | u1 := (T2 << 1) = (T2 <<<< 1) + ((T2 >>>> 31) << 32) |
424 | | u2 := (T2 << 2) = (T2 <<<< 2) + ((T2 >>>> 30) << 32) |
425 | | u3 := (T2 << 7) = (T2 <<<< 7) + ((T2 >>>> 25) << 32) |
426 | | R = D + T2 + u1 + u2 + u3 |
427 | | |
428 | | We can eliminate some common subexpressions. For any k we have |
429 | | (T2 >>>> k) = ((C + r) >>>> k) |
430 | | where r is a 7-bit value. If k>7 then this is equal to (C >>>> k). This means that |
431 | | the value (T2 >>>> 31) is equal to (C >>>> 31) so we don't have to compute it again. |
432 | | |
433 | | So we can rewrite our formulas as |
434 | | t4 := (C >>>> 31) |
435 | | t5 := (C >>>> 30) |
436 | | t6 := (C >>>> 25) |
437 | | ts = t4 + t5 + t6 |
438 | | T2 = C + (ts >> 96) |
439 | | |
440 | | Note that ts = (C >>>> 31) + (C >>>> 30) + (C >>>> 25) |
441 | | which is equal to (T2 >>>> 31) + (T2 >>>> 30) + (T2 >>>> 25) |
442 | | |
443 | | R = D + T2 + u1 + u2 + u3 |
444 | | = D + T2 + (T2 <<<< 1) + (T2 <<<< 2) + (T2 <<<< 7) + (ts << 32) |
445 | | |
446 | | All together, we can do the modulo reduction using the following formulas |
447 | | |
448 | | ts := (C >>>> 31) + (C >>>> 30) + (C >>>> 25) |
449 | | T2 := C + (ts >> 96) |
450 | | R = D + T2 + (T2 <<<< 1) + (T2 <<<< 2) + (T2 <<<< 7) + (ts << 32) |
451 | | |
452 | | This uses a total of 16 operations (6 subword shifts, 2 byte shifts, and 8 additions).
453 | | |
454 | | Reversed bit order: |
455 | | There is one more complication. GHASH uses the bits in the reverse order from normal representation. |
456 | | The bits b_0, b_1, ..., b_127 represent the polynomial b_0 + b_1 * x + ... + b_127 * x^127. |
457 | | This means that the most significant bit in each byte is actually the least significant bit in the |
458 | | polynomial. |
459 | | |
460 | | SSE CPUs use the LSBFirst convention. This means that the bits b_0, b_1, ..., b_127 of the polynomial
461 | | end up at positions 7, 6, 5, ..., 1, 0, 15, 14, ..., 9, 8, 23, 22, ... of our XMM register. |
462 | | This is obviously not a useful representation to do arithmetic in. |
463 | | The first step is to BSWAP the value so that the bits appear in pure reverse order. |
464 | | That is at least algebraically useful. |
465 | | |
466 | | To compute the multiplication we use the fact that GF(2)[x] multiplication has no carries and |
467 | | thus no preference for bit order. After the BSWAP we don't have the values A and B, but rather |
468 | | rev(A) and rev(B) where rev() is a function that reverses the bit order. We can now compute |
469 | | |
470 | | rev(A) * rev(B) = rev( A*B ) >> 1 |
471 | | |
472 | | where the shift operator is on the 256-bit product. |
473 | | |
474 | | The modulo reduction remains the same, except that we change all the shifts to be the other direction. |
475 | | |
476 | | This gives us finally the outline of our multiplication: |
477 | | |
478 | | - Apply BSWAP to all values loaded from memory. |
479 | | A := BSWAP( Abytes ) |
480 | | B := BSWAP( Bbytes ) |
481 | | - Compute the 256-bit product, possibly using Karatsuba. |
482 | | (P1, P0) := A * B // 128x128 carryless multiplication |
483 | | - Shift the result left one bit. |
484 | | (Q1, Q0) := (P1, P0) << 1 |
485 | | which is computed as |
486 | | Q0 = (P0 <<<< 1) + ((P0 >>>> 31) << 32)
487 | | Q1 = (P1 <<<< 1) + ((P1 >>>> 31) << 32) + ((P0 >>>> 31) >> 96)
488 | | - Perform the modulo reduction, with reversed bit order |
489 | | ts := (Q0 <<<< 31) + (Q0 <<<< 30) + (Q0 <<<< 25) |
490 | | T2 := Q0 + (ts << 96) |
491 | | R = Q1 + T2 + (T2 >>>> 1) + (T2 >>>> 2) + (T2 >>>> 7) + (ts >> 32) |
492 | | |
493 | | Future work: |
494 | | It might be possible to construct a faster solution by merging the leftshift of (P1,P0) |
495 | | with the modulo reduction. |
496 | | |
497 | | */ |
498 | | |
499 | | #if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64 |
500 | | |
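//
// Illustrative sketch (hypothetical helper; not the CLMUL_*/MODREDUCE macros used below,
// which live in ghash_definitions.h): one GF(2^128) multiplication in the reversed-bit
// representation, transcribing the formulas from the comment above directly into
// SSE2 + PCLMULQDQ intrinsics. A and B are assumed to be already BSWAPped field elements.
//
__m128i
GF128MulExample( __m128i A, __m128i B )
{
    __m128i P0, P1, mid, Q0, Q1, ts, T2, R;

    // Schoolbook 128x128 -> 256 bit carryless product (P1, P0) = A * B
    P0  = _mm_clmulepi64_si128( A, B, 0x00 );                   // A0 * B0
    P1  = _mm_clmulepi64_si128( A, B, 0x11 );                   // A1 * B1
    mid = _mm_xor_si128( _mm_clmulepi64_si128( A, B, 0x01 ),    // A1 * B0
                         _mm_clmulepi64_si128( A, B, 0x10 ) );  // A0 * B1
    P0  = _mm_xor_si128( P0, _mm_slli_si128( mid, 8 ) );
    P1  = _mm_xor_si128( P1, _mm_srli_si128( mid, 8 ) );

    // (Q1, Q0) := (P1, P0) << 1
    Q0 = _mm_xor_si128( _mm_slli_epi32( P0, 1 ),
                        _mm_slli_si128( _mm_srli_epi32( P0, 31 ), 4 ) );
    Q1 = _mm_xor_si128( _mm_slli_epi32( P1, 1 ),
                        _mm_slli_si128( _mm_srli_epi32( P1, 31 ), 4 ) );
    Q1 = _mm_xor_si128( Q1, _mm_srli_si128( _mm_srli_epi32( P0, 31 ), 12 ) );

    // Modulo reduction with reversed bit order, as derived above:
    // ts := (Q0 <<<< 31) + (Q0 <<<< 30) + (Q0 <<<< 25)
    ts = _mm_xor_si128( _mm_xor_si128( _mm_slli_epi32( Q0, 31 ),
                                       _mm_slli_epi32( Q0, 30 ) ),
                        _mm_slli_epi32( Q0, 25 ) );
    // T2 := Q0 + (ts << 96)
    T2 = _mm_xor_si128( Q0, _mm_slli_si128( ts, 12 ) );
    // R  := Q1 + T2 + (T2 >>>> 1) + (T2 >>>> 2) + (T2 >>>> 7) + (ts >> 32)
    R = _mm_xor_si128( Q1, T2 );
    R = _mm_xor_si128( R, _mm_srli_epi32( T2, 1 ) );
    R = _mm_xor_si128( R, _mm_srli_epi32( T2, 2 ) );
    R = _mm_xor_si128( R, _mm_srli_epi32( T2, 7 ) );
    R = _mm_xor_si128( R, _mm_srli_si128( ts, 4 ) );

    return R;
}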
501 | | VOID |
502 | | SYMCRYPT_CALL |
503 | | SymCryptGHashExpandKeyPclmulqdq( |
504 | | _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey, |
505 | | _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH ) |
506 | 0 | { |
507 | 0 | int i; |
508 | 0 | __m128i H, Hx, H2, H2x; |
509 | 0 | __m128i t0, t1, t2, t3, t4, t5; |
510 | 0 | __m128i Hi_even, Hix_even, Hi_odd, Hix_odd; |
511 | 0 | __m128i BYTE_REVERSE_ORDER = _mm_set_epi8( |
512 | 0 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); |
513 | 0 | __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 ); |
514 | | |
515 | | // |
516 | | // Our expanded key consists of a list of N=SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS |
517 | | // powers of H. The first entry is H^N, the next H^(N-1), then H^(N-2), ... |
518 | | // |
519 | | // For each power we store two 128-bit values. The first is H^i (Hi) and the second |
520 | | // contains the two halves of H^i xorred with each other in the lower 64 bits (Hix). |
521 | | // |
522 | | // We keep all of the Hi entries together in the first half of the expanded key |
523 | | // table, and all of the Hix entries together in the second half of the table. |
524 | | // |
525 | | // This ordering allows for efficient vectorization with arbitrary vector width, as
526 | | // many multiplication constants can be loaded into wider vectors with the correct
527 | | // alignment. Not maintaining different layouts for different vector lengths does
528 | | // leave a small amount of performance on the table, but experimentally it makes
529 | | // <1% difference, and using a single layout reduces complexity significantly.
530 | | // |
531 | 0 | C_ASSERT( 2*SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS <= SYMCRYPT_GF128_FIELD_SIZE ); |
532 | |
533 | 0 | H = _mm_loadu_si128((__m128i *) pH ); |
534 | 0 | H = _mm_shuffle_epi8( H, BYTE_REVERSE_ORDER ); |
535 | 0 | Hx = _mm_xor_si128( H, _mm_srli_si128( H, 8 ) ); |
536 | |
537 | 0 | _mm_store_si128( &GHASH_H_POWER(expandedKey, 1), H ); |
538 | 0 | _mm_store_si128( &GHASH_Hx_POWER(expandedKey, 1), Hx ); |
539 | |
540 | 0 | CLMUL_X_3( H, Hx, H, Hx, t0, t1, t2 ); |
541 | 0 | CLMUL_3_POST( t0, t1, t2 ); |
542 | 0 | MODREDUCE( vMultiplicationConstant, t0, t1, t2, H2 ); |
543 | 0 | H2x = _mm_xor_si128( H2, _mm_srli_si128( H2, 8 ) ); |
544 | 0 | _mm_store_si128( &GHASH_H_POWER(expandedKey, 2), H2 ); |
545 | 0 | _mm_store_si128( &GHASH_Hx_POWER(expandedKey, 2), H2x ); |
546 | |
547 | 0 | Hi_even = H2; |
548 | 0 | Hix_even = H2x; |
549 | |
550 | 0 | for( i=2; i<SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS; i+=2 ) |
551 | 0 | { |
552 | 0 | CLMUL_X_3( H, Hx, Hi_even, Hix_even, t0, t1, t2 ); |
553 | 0 | CLMUL_3_POST( t0, t1, t2 ); |
554 | 0 | CLMUL_X_3( H2, H2x, Hi_even, Hix_even, t3, t4, t5 ); |
555 | 0 | CLMUL_3_POST( t3, t4, t5 ); |
556 | 0 | MODREDUCE( vMultiplicationConstant, t0, t1, t2, Hi_odd ); |
557 | 0 | MODREDUCE( vMultiplicationConstant, t3, t4, t5, Hi_even ); |
558 | 0 | Hix_odd = _mm_xor_si128( Hi_odd, _mm_srli_si128( Hi_odd, 8 ) ); |
559 | 0 | Hix_even = _mm_xor_si128( Hi_even, _mm_srli_si128( Hi_even, 8 ) ); |
560 | |
561 | 0 | _mm_store_si128( &GHASH_H_POWER(expandedKey, i + 1), Hi_odd ); |
562 | 0 | _mm_store_si128( &GHASH_H_POWER(expandedKey, i + 2), Hi_even ); |
563 | 0 | _mm_store_si128( &GHASH_Hx_POWER(expandedKey, i + 1), Hix_odd ); |
564 | 0 | _mm_store_si128( &GHASH_Hx_POWER(expandedKey, i + 2), Hix_even ); |
565 | 0 | } |
566 | 0 | } |
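//
// Sketch (illustration only, hypothetical helper; the actual CLMUL_ACC_3 / CLMUL_X_3
// macros are defined in ghash_definitions.h and may differ in detail) of how the Hix
// values stored above are meant to be used: a Karatsuba-style multiply-accumulate where
// the key operand's two halves are already xorred together, so three PCLMULQDQ
// instructions and one extra xor per data block produce the three 128-bit partial products.
//
VOID
GHashKaratsubaAccExample(
    __m128i     data,       // BSWAPped data block (already xorred with the state if it is the first block)
    __m128i     Hi,         // H^k
    __m128i     Hix,        // halves of H^k xorred together in the low 64 bits
    __m128i    *pa0,        // accumulates data.lo * Hi.lo
    __m128i    *pa1,        // accumulates data.hi * Hi.hi
    __m128i    *pa2 )       // accumulates (data.hi + data.lo) * (Hi.hi + Hi.lo)
{
    __m128i datax = _mm_xor_si128( data, _mm_srli_si128( data, 8 ) );

    *pa0 = _mm_xor_si128( *pa0, _mm_clmulepi64_si128( data,  Hi,  0x00 ) );
    *pa1 = _mm_xor_si128( *pa1, _mm_clmulepi64_si128( data,  Hi,  0x11 ) );
    *pa2 = _mm_xor_si128( *pa2, _mm_clmulepi64_si128( datax, Hix, 0x00 ) );

    // The true middle term (data.hi*Hi.lo + data.lo*Hi.hi) is recovered later as
    // a2 + a0 + a1, once, before the single modulo reduction.
}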
567 | | |
568 | | |
569 | | |
570 | | VOID |
571 | | SYMCRYPT_CALL |
572 | | SymCryptGHashAppendDataPclmulqdq( |
573 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
574 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
575 | | _In_reads_( cbData ) PCBYTE pbData, |
576 | | SIZE_T cbData ) |
577 | 0 | { |
578 | 0 | __m128i state; |
579 | 0 | __m128i data; |
580 | 0 | __m128i a0, a1, a2; |
581 | 0 | __m128i Hi, Hix; |
582 | 0 | SIZE_T i; |
583 | 0 | SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE; |
584 | 0 | SIZE_T todo; |
585 | | |
586 | | // |
587 | | // To do a BSWAP we need an __m128i value holding the byte indices in reverse order, to use as the shuffle mask.
588 | | // |
589 | |
590 | 0 | __m128i BYTE_REVERSE_ORDER = _mm_set_epi8( |
591 | 0 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); |
592 | 0 | __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 ); |
593 | |
|
594 | 0 | state = _mm_loadu_si128( (__m128i *) pState ); |
595 | |
596 | 0 | while( nBlocks > 0 ) |
597 | 0 | { |
598 | | // |
599 | | // We process the data in blocks of up to SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS blocks |
600 | | // |
601 | 0 | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ); |
602 | | |
603 | | // |
604 | | // The first block is xorred with the state before multiplying it with a power of H |
605 | | // |
606 | 0 | data = _mm_loadu_si128( (__m128i *) pbData ); |
607 | 0 | data = _mm_shuffle_epi8( data, BYTE_REVERSE_ORDER ); |
608 | 0 | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
609 | |
610 | 0 | state = _mm_xor_si128( state, data ); |
611 | 0 | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
612 | | |
613 | | // |
614 | | // Then we just do an inner product
615 | | // |
616 | 0 | for( i=1; i<todo; i++ ) |
617 | 0 | { |
618 | 0 | data = _mm_loadu_si128( (__m128i *) pbData ); |
619 | 0 | data = _mm_shuffle_epi8( data, BYTE_REVERSE_ORDER ); |
620 | 0 | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
621 | |
622 | 0 | Hi = _mm_load_si128( &GHASH_H_POWER(expandedKeyTable, todo - i) ); |
623 | 0 | Hix = _mm_load_si128( &GHASH_Hx_POWER(expandedKeyTable, todo - i) ); |
624 | 0 | CLMUL_ACC_3( data, Hi, Hix, a0, a1, a2 ); |
625 | 0 | } |
626 | |
627 | 0 | CLMUL_3_POST( a0, a1, a2 ); |
628 | 0 | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
629 | 0 | nBlocks -= todo; |
630 | 0 | } |
631 | |
632 | 0 | _mm_storeu_si128((__m128i *)pState, state ); |
633 | 0 | } |
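//
// Worked example of the batching above: with todo = 4 and incoming state Y, the loop
// accumulates the partial products of
//
//      (Y + X1)*H^4 + X2*H^3 + X3*H^2 + X4*H
//
// which equals the per-block GHASH recurrence applied four times,
//
//      ((((Y + X1)*H + X2)*H + X3)*H + X4)
//
// so all the products can be computed independently and summed before a single
// modulo reduction at the end of the batch.
//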
634 | | |
635 | | #endif // CPU_X86 || CPU_AMD64 |
636 | | |
637 | | #if SYMCRYPT_CPU_ARM64 |
638 | | |
639 | | VOID |
640 | | SYMCRYPT_CALL |
641 | | SymCryptGHashExpandKeyPmull( |
642 | | _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE ) PSYMCRYPT_GF128_ELEMENT expandedKey, |
643 | | _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH ) |
644 | | { |
645 | | int i; |
646 | | __n128 H, Hx, H2, H2x; |
647 | | __n128 t0, t1, t2, t3, t4, t5; |
648 | | __n128 Hi_even, Hix_even, Hi_odd, Hix_odd; |
649 | | const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000); |
650 | | // |
651 | | // Our expanded key consists of a list of N=SYMCRYPT_GHASH_PMULL_HPOWERS |
652 | | // powers of H. The first entry is H^N, the next H^(N-1), then H^(N-2), ... |
653 | | // |
654 | | // For each power we store two 128-bit values. The first is H^i (Hi) and the second |
655 | | // contains the two halves of H^i xorred with each other in the lower 64 bits (Hix). |
656 | | // |
657 | | // We keep all of the Hi entries together in the first half of the expanded key |
658 | | // table, and all of the Hix entries together in the second half of the table. |
659 | | // |
660 | | // This ordering allows for efficient vectorization with arbitrary vector width, as
661 | | // many multiplication constants can be loaded into wider vectors with the correct
662 | | // alignment. Not maintaining different layouts for different vector lengths does
663 | | // leave a small amount of performance on the table, but experimentally it makes
664 | | // <1% difference, and using a single layout reduces complexity significantly.
665 | | // |
666 | | C_ASSERT( 2*SYMCRYPT_GHASH_PMULL_HPOWERS <= SYMCRYPT_GF128_FIELD_SIZE ); |
667 | | |
668 | | H = *(__n128 *) pH; |
669 | | Hx = vrev64q_u8( H ); |
670 | | H = vextq_u8( Hx, Hx, 8 ); |
671 | | Hx = veorq_u8( H, Hx ); |
672 | | |
673 | | GHASH_H_POWER(expandedKey, 1) = H; |
674 | | GHASH_Hx_POWER(expandedKey, 1) = Hx; |
675 | | |
676 | | CLMUL_X_3( H, Hx, H, Hx, t0, t1, t2 ); |
677 | | CLMUL_3_POST( t0, t1, t2 ); |
678 | | MODREDUCE( vMultiplicationConstant, t0, t1, t2, H2 ); |
679 | | H2x = veorq_u8( H2, vextq_u8( H2, H2, 8 ) ); |
680 | | GHASH_H_POWER(expandedKey, 2) = H2; |
681 | | GHASH_Hx_POWER(expandedKey, 2) = H2x; |
682 | | |
683 | | Hi_even = H2; |
684 | | Hix_even = H2x; |
685 | | |
686 | | for( i=2; i<SYMCRYPT_GHASH_PMULL_HPOWERS; i+=2 ) |
687 | | { |
688 | | CLMUL_X_3( H, Hx, Hi_even, Hix_even, t0, t1, t2 ); |
689 | | CLMUL_3_POST( t0, t1, t2 ); |
690 | | CLMUL_X_3( H2, H2x, Hi_even, Hix_even, t3, t4, t5 ); |
691 | | CLMUL_3_POST( t3, t4, t5 ); |
692 | | MODREDUCE( vMultiplicationConstant, t0, t1, t2, Hi_odd ); |
693 | | MODREDUCE( vMultiplicationConstant, t3, t4, t5, Hi_even ); |
694 | | Hix_odd = veorq_u8( Hi_odd, vextq_u8( Hi_odd, Hi_odd, 8 ) ); |
695 | | Hix_even = veorq_u8( Hi_even, vextq_u8( Hi_even, Hi_even, 8 ) ); |
696 | | |
697 | | GHASH_H_POWER(expandedKey, i + 1) = Hi_odd; |
698 | | GHASH_H_POWER(expandedKey, i + 2) = Hi_even; |
699 | | GHASH_Hx_POWER(expandedKey, i + 1) = Hix_odd; |
700 | | GHASH_Hx_POWER(expandedKey, i + 2) = Hix_even; |
701 | | } |
702 | | } |
703 | | |
704 | | VOID |
705 | | SYMCRYPT_CALL |
706 | | SymCryptGHashAppendDataPmull( |
707 | | _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable, |
708 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
709 | | _In_reads_( cbData ) PCBYTE pbData, |
710 | | SIZE_T cbData ) |
711 | | { |
712 | | __n128 state; |
713 | | __n128 data, datax; |
714 | | __n128 a0, a1, a2; |
715 | | __n128 Hi, Hix; |
716 | | const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000); |
717 | | SIZE_T i; |
718 | | SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE; |
719 | | SIZE_T todo; |
720 | | |
721 | | state = *(__n128 *) pState; |
722 | | |
723 | | while( nBlocks > 0 ) |
724 | | { |
725 | | // |
726 | | // We process the data in blocks of up to SYMCRYPT_GHASH_PMULL_HPOWERS blocks |
727 | | // |
728 | | todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS ); |
729 | | |
730 | | // |
731 | | // The first block is xorred with the state before multiplying it with a power of H |
732 | | // |
733 | | data = *(__n128 *)pbData; |
734 | | REVERSE_BYTES( data, data ); |
735 | | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
736 | | |
737 | | state = veorq_u8( state, data ); |
738 | | CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 ); |
739 | | |
740 | | // |
741 | | // Then we just do an inner product
742 | | // |
743 | | for( i=1; i<todo; i++ ) |
744 | | { |
745 | | // we can avoid an EXT here by precomputing datax for CLMUL_ACCX_3 |
746 | | datax = vrev64q_u8( *(__n128 *)pbData ); |
747 | | data = vextq_u8( datax, datax, 8 ); |
748 | | datax = veorq_u8( data, datax ); |
749 | | pbData += SYMCRYPT_GF128_BLOCK_SIZE; |
750 | | |
751 | | Hi = GHASH_H_POWER(expandedKeyTable, todo - i); |
752 | | Hix = GHASH_Hx_POWER(expandedKeyTable, todo - i); |
753 | | CLMUL_ACCX_3( data, datax, Hi, Hix, a0, a1, a2 ); |
754 | | } |
755 | | |
756 | | CLMUL_3_POST( a0, a1, a2 ); |
757 | | MODREDUCE( vMultiplicationConstant, a0, a1, a2, state ); |
758 | | nBlocks -= todo; |
759 | | } |
760 | | |
761 | | *(__n128 *) pState = state; |
762 | | } |
763 | | |
764 | | #endif // CPU_ARM64 |
765 | | |
766 | | |
767 | | |
768 | | ////////////////////////////////////////////////////////////// |
769 | | // Stuff around the core algorithm implementation functions |
770 | | // |
771 | | |
772 | | |
773 | | VOID |
774 | | SYMCRYPT_CALL |
775 | | SymCryptGHashExpandKey( |
776 | | _Out_ PSYMCRYPT_GHASH_EXPANDED_KEY expandedKey, |
777 | | _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE ) PCBYTE pH ) |
778 | 0 | { |
779 | | #if SYMCRYPT_CPU_X86 |
780 | | PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
781 | | SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
782 | | |
783 | | // |
784 | | // Initialize offset into table space for 16-alignment. |
785 | | // |
786 | | expandedKey->tableOffset = (0 -((UINT_PTR) &expandedKey->tableSpace[0])) % sizeof(SYMCRYPT_GF128_ELEMENT); |
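    //
    // Worked example: sizeof( SYMCRYPT_GF128_ELEMENT ) is 16, so if &tableSpace[0]
    // happens to end in ...4 the expression yields (0 - 4) mod 16 = 12, and
    // tableSpace[12] is the first 16-aligned byte; if the buffer is already aligned
    // the offset is 0.
    //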
787 | | |
788 | | pExpandedKeyTable = (PSYMCRYPT_GF128_ELEMENT)&expandedKey->tableSpace[expandedKey->tableOffset]; |
789 | | |
790 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) ) |
791 | | { |
792 | | // |
793 | | // We can only use the PCLMULQDQ data representation if the SaveXmm never fails. |
794 | | // This is one of the CPU features required. |
795 | | // We check anyway... |
796 | | // |
797 | | if( SymCryptSaveXmm( &SaveData ) != SYMCRYPT_NO_ERROR ) |
798 | | { |
799 | | SymCryptFatal( 'pclm' ); |
800 | | } |
801 | | SymCryptGHashExpandKeyPclmulqdq( pExpandedKeyTable, pH ); |
802 | | SymCryptRestoreXmm( &SaveData ); |
803 | | } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR ) |
804 | | { |
805 | | SymCryptGHashExpandKeyXmm( pExpandedKeyTable, pH ); |
806 | | SymCryptRestoreXmm( &SaveData ); |
807 | | } else { |
808 | | SymCryptGHashExpandKeyC( pExpandedKeyTable, pH ); |
809 | | } |
810 | | |
811 | | #elif SYMCRYPT_CPU_AMD64 |
812 | 0 | PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
813 | 0 | pExpandedKeyTable = &expandedKey->table[0]; |
814 | |
815 | 0 | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) ) |
816 | 0 | { |
817 | 0 | SymCryptGHashExpandKeyPclmulqdq( pExpandedKeyTable, pH ); |
818 | 0 | } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) ) |
819 | 0 | { |
820 | 0 | SymCryptGHashExpandKeyXmm( pExpandedKeyTable, pH ); |
821 | 0 | } else { |
822 | 0 | SymCryptGHashExpandKeyC( pExpandedKeyTable, pH ); |
823 | 0 | } |
824 | |
825 | | #elif SYMCRYPT_CPU_ARM64 |
826 | | PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
827 | | pExpandedKeyTable = &expandedKey->table[0]; |
828 | | |
829 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_PMULL ) ) |
830 | | { |
831 | | SymCryptGHashExpandKeyPmull( pExpandedKeyTable, pH ); |
832 | | } else { |
833 | | SymCryptGHashExpandKeyC( pExpandedKeyTable, pH ); |
834 | | } |
835 | | |
836 | | #else |
837 | | SymCryptGHashExpandKeyC( &expandedKey->table[0], pH ); // Default expansion (does not need alignment) |
838 | | #endif |
839 | 0 | } |
840 | | |
841 | | VOID |
842 | | SYMCRYPT_CALL |
843 | | SymCryptGHashAppendData( |
844 | | _In_ PCSYMCRYPT_GHASH_EXPANDED_KEY expandedKey, |
845 | | _Inout_ PSYMCRYPT_GF128_ELEMENT pState, |
846 | | _In_reads_( cbData ) PCBYTE pbData, |
847 | | SIZE_T cbData ) |
848 | 0 | { |
849 | | #if SYMCRYPT_CPU_X86 |
850 | | PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
851 | | SYMCRYPT_EXTENDED_SAVE_DATA SaveData; |
852 | | |
853 | | pExpandedKeyTable = (PSYMCRYPT_GF128_ELEMENT)&expandedKey->tableSpace[expandedKey->tableOffset]; |
854 | | |
855 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) ) |
856 | | { |
857 | | if( SymCryptSaveXmm( &SaveData ) != SYMCRYPT_NO_ERROR ) |
858 | | { |
859 | | SymCryptFatal( 'pclm' ); |
860 | | } |
861 | | SymCryptGHashAppendDataPclmulqdq( pExpandedKeyTable, pState, pbData, cbData ); |
862 | | SymCryptRestoreXmm( &SaveData ); |
863 | | } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR ) |
864 | | { |
865 | | SymCryptGHashAppendDataXmm( pExpandedKeyTable, pState, pbData, cbData ); |
866 | | SymCryptRestoreXmm( &SaveData ); |
867 | | } else { |
868 | | SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData ); |
869 | | } |
870 | | |
871 | | #elif SYMCRYPT_CPU_AMD64 |
872 | 0 | PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
873 | |
874 | 0 | pExpandedKeyTable = &expandedKey->table[0]; |
875 | 0 | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) ) |
876 | 0 | { |
877 | 0 | SymCryptGHashAppendDataPclmulqdq( pExpandedKeyTable, pState, pbData, cbData ); |
878 | 0 | } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) ) |
879 | 0 | { |
880 | 0 | SymCryptGHashAppendDataXmm( pExpandedKeyTable, pState, pbData, cbData ); |
881 | 0 | } else { |
882 | 0 | SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData ); |
883 | 0 | } |
884 | | #elif SYMCRYPT_CPU_ARM |
885 | | PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
886 | | |
887 | | pExpandedKeyTable = &expandedKey->table[0]; |
888 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) ) |
889 | | { |
890 | | SymCryptGHashAppendDataNeon( pExpandedKeyTable, pState, pbData, cbData ); |
891 | | } else { |
892 | | SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData ); |
893 | | } |
894 | | #elif SYMCRYPT_CPU_ARM64 |
895 | | PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable; |
896 | | |
897 | | pExpandedKeyTable = &expandedKey->table[0]; |
898 | | if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_PMULL ) ) |
899 | | { |
900 | | SymCryptGHashAppendDataPmull( pExpandedKeyTable, pState, pbData, cbData ); |
901 | | } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) ) |
902 | | { |
903 | | SymCryptGHashAppendDataNeon( pExpandedKeyTable, pState, pbData, cbData ); |
904 | | } else { |
905 | | SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData ); |
906 | | } |
907 | | #else |
908 | | SymCryptGHashAppendDataC( &expandedKey->table[0], pState, pbData, cbData ); |
909 | | #endif |
910 | 0 | } |