Coverage Report

Created: 2024-11-21 07:03

/src/SymCrypt/lib/ghash.c
Line
Count
Source
1
//
2
// GHASH.c
3
//
4
// Implementation of the NIST SP800-38D GHASH function which is the
5
// core authentication function for the GCM and GMAC modes.
6
//
7
// This implementation was done by Niels Ferguson for the RSA32.lib library in 2008,
8
// and adapted to the SymCrypt library in 2009.
9
//
10
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
11
//
12
13
#include "precomp.h"
14
#include "ghash_definitions.h"
15
16
//////////////////////////////////////////////////////////////////////////////
17
// Platform-independent code
18
//
19
20
//
21
// GHashExpandKeyC
22
// Generic GHash key expansion routine, works on all platforms.
23
// This function computes a table of H, Hx, Hx^2, Hx^3, ..., Hx^127
24
//
25
VOID
26
SYMCRYPT_CALL
27
SymCryptGHashExpandKeyC(
28
    _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE )   PSYMCRYPT_GF128_ELEMENT expandedKey,
29
    _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE )    PCBYTE                  pH )
30
0
{
31
0
    UINT64 H0, H1, t;
32
0
    UINT32 i;
33
34
    //
35
    // (H1, H0) form a 128-bit integer, H1 is the upper part, H0 the lower part.
36
    // Convert pH[] to (H1, H0) using MSByte first convention.
37
    //
38
0
    H1 = SYMCRYPT_LOAD_MSBFIRST64( &pH[0] );
39
0
    H0 = SYMCRYPT_LOAD_MSBFIRST64( &pH[8] );
40
41
0
    for( i=0; i<SYMCRYPT_GF128_FIELD_SIZE; i++ )
42
0
    {
43
0
        expandedKey[i].ull[0] = H0;
44
0
        expandedKey[i].ull[1] = H1;
45
        //
46
        // Multiply (H1,H0) by x in the GF(2^128) field using the field encoding from SP800-38D
47
        //
48
0
        t =  UINT64_NEG(H0 & 1) & ((UINT64)GF128_FIELD_R_BYTE << (8 * ( sizeof( UINT64 ) - 1 )) ) ;
49
0
        H0 = (H0 >> 1) | (H1 << 63);
50
0
        H1 = (H1 >> 1) ^ t;
51
0
    }
52
0
}
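The multiply-by-x step in the loop above can be read in isolation. The sketch below is a minimal standalone version, not part of the library; it assumes the SP800-38D field encoding in which GF128_FIELD_R_BYTE is 0xe1, with (H1,H0) holding one field element in two 64-bit halves.

#include <stdint.h>

// Multiply the GF(2^128) element (H1,H0) by x, reducing modulo the GHASH polynomial.
static void gf128_mul_x( uint64_t *pH1, uint64_t *pH0 )
{
    uint64_t H1 = *pH1;
    uint64_t H0 = *pH0;
    // 0xe1 is the assumed value of GF128_FIELD_R_BYTE (the SP800-38D R constant);
    // the mask is all-ones exactly when the bit shifted out of H0 is set.
    uint64_t t = (0 - (H0 & 1)) & ((uint64_t)0xe1 << 56);
    *pH0 = (H0 >> 1) | (H1 << 63);
    *pH1 = (H1 >> 1) ^ t;
}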
53
54
55
//
56
// GHashAppendDataC
57
// Generic GHash routine, works on all platforms.
58
//
59
VOID
60
SYMCRYPT_CALL
61
SymCryptGHashAppendDataC(
62
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE )  PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
63
    _Inout_                                  PSYMCRYPT_GF128_ELEMENT     pState,
64
    _In_reads_( cbData )                     PCBYTE                      pbData,
65
                                             SIZE_T                      cbData )
66
0
{
67
0
    UINT64 R0, R1;
68
0
    UINT64 mask;
69
0
    SYMCRYPT_ALIGN UINT32 state32[4];
70
0
    UINT32 t;
71
0
    int i,j;
72
0
    while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE )
73
0
    {
74
0
        R0 = R1 = 0;
75
76
        //
77
        // We have two nested loops so that we can do most of our operations
78
        // on 32-bit words. 64-bit rotates/shifts can be really slow on a 32-bit CPU.
79
        // On AMD64 we use the XMM version which is much faster.
80
        //
81
0
        state32[0] = (UINT32)pState->ull[0];
82
0
        state32[1] = (UINT32)(pState->ull[0] >> 32);
83
0
        state32[2] = (UINT32)pState->ull[1];
84
0
        state32[3] = (UINT32)(pState->ull[1] >> 32);
85
0
        for( i=0; i<4; i++ )
86
0
        {
87
0
            t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ state32[3-i];
88
0
            for( j=31; j>=0; j-- )
89
0
            {
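                //
                // -(t & 1) yields an all-ones mask when the current bit of t is set,
                // so the table entry is xorred in without a data-dependent branch.
                //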
90
0
                mask = (UINT64)( -(INT64)(t & 1 ));
91
0
                R0 ^= expandedKeyTable[32*i+j].ull[0] & mask;
92
0
                R1 ^= expandedKeyTable[32*i+j].ull[1] & mask;
93
0
                t >>= 1;
94
0
            }
95
0
        }
96
0
        pState->ull[0] = R0;
97
0
        pState->ull[1] = R1;
98
0
        pbData += SYMCRYPT_GF128_BLOCK_SIZE;
99
0
        cbData -= SYMCRYPT_GF128_BLOCK_SIZE;
100
0
    }
101
102
0
    SymCryptWipeKnownSize( state32, sizeof( state32 ) );
103
0
}
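For reference, a hypothetical driver for the generic C path looks roughly as follows. This is a sketch only; the buffer names H, pbMsg and cbMsg are assumptions, and in the library these routines are normally reached through SymCryptGHashExpandKey / SymCryptGHashAppendData further below.

SYMCRYPT_GF128_ELEMENT expandedKey[SYMCRYPT_GF128_FIELD_SIZE];
SYMCRYPT_GF128_ELEMENT state = { 0 };                   // GHASH state starts at zero
BYTE tag[SYMCRYPT_GF128_BLOCK_SIZE];

SymCryptGHashExpandKeyC( expandedKey, H );              // H: 16-byte hash subkey
SymCryptGHashAppendDataC( expandedKey, &state, pbMsg, cbMsg );  // cbMsg a multiple of 16
SymCryptGHashResult( &state, tag );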
104
105
106
VOID
107
SYMCRYPT_CALL
108
SymCryptGHashResult(
109
    _In_                                        PCSYMCRYPT_GF128_ELEMENT    pState,
110
    _Out_writes_( SYMCRYPT_GF128_BLOCK_SIZE )   PBYTE                       pbResult )
111
0
{
112
0
    SYMCRYPT_STORE_MSBFIRST64( pbResult    , pState->ull[1] );
113
0
    SYMCRYPT_STORE_MSBFIRST64( pbResult + 8, pState->ull[0] );
114
0
}
115
116
////////////////////////////////////////////////////////////////////////////////////////////
117
// XMM code
118
//
119
120
VOID
121
SYMCRYPT_CALL
122
SymCryptGHashExpandKeyXmm(
123
    _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE )   PSYMCRYPT_GF128_ELEMENT expandedKey,
124
    _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE )    PCBYTE                  pH )
125
0
{
126
    //
127
    // We use the same layout for XMM code as we did for C code, so we can use the same key
128
    // expansion code.
129
    // Improvement: we can add an expansion routine that uses the XMM registers for speed.
130
    //
131
132
0
    SymCryptGHashExpandKeyC( expandedKey, pH );
133
0
}
134
135
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
136
//
137
// The XMM-based GHash append data function, only on AMD64 & X86
138
//
139
VOID
140
SYMCRYPT_CALL
141
SymCryptGHashAppendDataXmm(
142
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
143
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
144
    _In_reads_( cbData )                    PCBYTE                      pbData,
145
                                            SIZE_T                      cbData )
146
0
{
147
0
    __m128i R;
148
0
    __m128i cmpValue;
149
0
    __m128i mask;
150
0
    __m128i T;
151
0
    __m128i tmp;
152
153
0
    PCSYMCRYPT_GF128_ELEMENT   p;
154
0
    PCSYMCRYPT_GF128_ELEMENT   pLimit;
155
0
    UINT32 t;
156
0
    int i;
157
158
0
    cmpValue = _mm_setzero_si128();             // cmpValue = 0
159
160
0
    while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE )
161
0
    {
162
0
        R = _mm_setzero_si128();
163
164
        //
165
        // The amd64 compiler can't optimize array indices in a loop where
166
        // you use _mm intrinsics,
167
        // so we do all the pointer arithmetic for the compiler.
168
        //
169
0
        p = &expandedKeyTable[0];
170
0
        pLimit = &expandedKeyTable[32];
171
172
0
        for( i=0; i<4; i++ )
173
0
        {
174
            //
175
            // Set up our XMM register with 4 identical 32-bit integers so that
176
            // we can generate the mask from the individual bits of the 32-bit value.
177
            // Note the use of tmp; if we assign directly to the fields of T the
178
            // compiler no longer caches T in an XMM register, which is bad.
179
            //
180
            // There are XMM instructions where we can do the duplication in the XMM
181
            // registers, but they require SSE3 support, and this code only requires
182
            // SSE2. As the inner loop consumes most of the time, it isn't worth
183
            // using the SSE3 instructions.
184
            //
185
            // Note that accessing the state as an array of UINT32s depends on the
186
            // endianness of the CPU, but this is XMM code that only runs on
187
            // little endian machines.
188
            //
189
0
            t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ pState->ul[3-i];
190
0
            tmp = _mm_set_epi32(t, t, t, t);
191
192
0
            T = tmp;
193
0
            while( p < pLimit )
194
0
            {
195
                //
196
                // p and pLimit are always at indexes that are multiples of 4 from
197
                // the start of the array.
198
                // We need to explain to prefast that this means that p <= pLimit - 4
199
                //
200
0
                SYMCRYPT_ASSERT( p <= pLimit - 4 );
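                //
                // In each step the sign bit of every 32-bit lane of T selects whether the
                // corresponding table entry is xorred into R: _mm_cmpgt_epi32( 0, T ) is
                // all-ones in a lane when that lane of T is negative, and T = T + T then
                // moves the next lower bit of t into the sign position.
                //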
201
202
0
                mask = _mm_cmpgt_epi32( cmpValue, T );
203
0
                T = _mm_add_epi32( T, T );
204
0
                mask = _mm_and_si128( mask, p[0].m128i );
205
0
                R = _mm_xor_si128( R, mask );
206
207
0
                mask = _mm_cmpgt_epi32( cmpValue, T );
208
0
                T = _mm_add_epi32( T, T );
209
0
                mask = _mm_and_si128( mask, p[1].m128i );
210
0
                R = _mm_xor_si128( R, mask );
211
212
0
                mask = _mm_cmpgt_epi32( cmpValue, T );
213
0
                T = _mm_add_epi32( T, T );
214
0
                mask = _mm_and_si128( mask, p[2].m128i );
215
0
                R = _mm_xor_si128( R, mask );
216
217
0
                mask = _mm_cmpgt_epi32( cmpValue, T );
218
0
                T = _mm_add_epi32( T, T );
219
0
                mask = _mm_and_si128( mask, p[3].m128i );
220
0
                R = _mm_xor_si128( R, mask );
221
222
0
                p += 4;
223
0
            }
224
0
            pLimit += 32;
225
0
        }
226
227
0
        pState->m128i = R;
228
0
        pbData += SYMCRYPT_GF128_BLOCK_SIZE;
229
0
        cbData -= SYMCRYPT_GF128_BLOCK_SIZE;
230
0
    }
231
0
}
232
#endif
233
234
#if SYMCRYPT_CPU_ARM | SYMCRYPT_CPU_ARM64
235
//
236
// The NEON-based GHash append data function, only on ARM & ARM64
237
//
238
VOID
239
SYMCRYPT_CALL
240
SymCryptGHashAppendDataNeon(
241
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE )     PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
242
    _Inout_                                     PSYMCRYPT_GF128_ELEMENT     pState,
243
    _In_reads_( cbData )                        PCBYTE                      pbData,
244
                                                SIZE_T                      cbData )
245
{
246
    // Room for improvement: replace non-crypto NEON code below, based on a bit by bit lookup with
247
    // pmull on 8b elements - the 8x(8bx8b) -> 8x(16b) pmull has been a NEON instruction since Armv7
248
    //
249
    // When properly unrolled:
250
    // 1 (64bx64b -> 128b) pmull instruction and 1 eor instruction can be replaced by
251
    // 8 (8x(8bx8b) -> 8x(16b)) pmull instructions and 8 eor instructions
252
    // so each 128b of data could be processed by fewer than 64 instructions (using Karatsuba)
253
    // rather than ~512 instructions (bit by bit)
254
    //
255
    // Not a priority, expect that AES-GCM performance will be dominated by AES on these platforms
256
257
    __n128 R;
258
    __n128 cmpValue;
259
    __n128 mask;
260
    __n128 T;
261
262
    PCSYMCRYPT_GF128_ELEMENT   p;
263
    PCSYMCRYPT_GF128_ELEMENT   pLimit;
264
    UINT32 t;
265
    int i;
266
267
    cmpValue = vdupq_n_u32(0);             // cmpValue = 0
268
269
    while( cbData >= SYMCRYPT_GF128_BLOCK_SIZE )
270
    {
271
        R = cmpValue;
272
273
        //
274
        // Do all the pointer arithmetic for the compiler.
275
        //
276
        p = &expandedKeyTable[0];
277
        pLimit = &expandedKeyTable[32];
278
279
        for( i=0; i<4; i++ )
280
        {
281
            //
282
            // Set up our NEON register with 4 identical 32-bit integers so that
283
            // we can generate the mask from the individual bits of the 32-bit value.
284
            // Unlike the XMM path there is no need for a tmp variable here;
285
            // vdupq_n_u32 broadcasts t directly into a NEON register.
286
            //
287
            // Note that accessing the state as an array of UINT32s depends on the
288
            // endianness of the CPU, but Arm code is always expected to execute in
289
            // little endian mode.
290
            //
291
            t = SYMCRYPT_LOAD_MSBFIRST32( &pbData[4*i] ) ^ pState->ul[3-i];
292
            T = vdupq_n_u32( t );
293
294
            while( p < pLimit )
295
            {
296
                //
297
                // p and pLimit are always at indexes that are multiples of 4 from
298
                // the start of the array.
299
                // We need to explain to prefast that this means that p <= pLimit - 4
300
                //
301
                SYMCRYPT_ASSERT( p <= pLimit - 4 );
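                //
                // As in the XMM path: vcgtq_s32( 0, T ) yields all-ones lanes where the
                // sign bit of T is set, selecting which table entries are xorred into R,
                // and T = T + T moves the next lower bit of t into the sign position.
                //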
302
303
                mask = vcgtq_s32( cmpValue, T );
304
                T = vaddq_u32( T, T );
305
                mask = vandq_u32( mask, p[0].n128 );
306
                R = veorq_u32( R, mask );
307
308
                mask = vcgtq_s32( cmpValue, T );
309
                T = vaddq_u32( T, T );
310
                mask = vandq_u32( mask, p[1].n128 );
311
                R = veorq_u32( R, mask );
312
313
                mask = vcgtq_s32( cmpValue, T );
314
                T = vaddq_u32( T, T );
315
                mask = vandq_u32( mask, p[2].n128 );
316
                R = veorq_u32( R, mask );
317
318
                mask = vcgtq_s32( cmpValue, T );
319
                T = vaddq_u32( T, T );
320
                mask = vandq_u32( mask, p[3].n128 );
321
                R = veorq_u32( R, mask );
322
323
                p += 4;
324
            }
325
            pLimit += 32;
326
        }
327
328
        pState->n128 = R;
329
        pbData += SYMCRYPT_GF128_BLOCK_SIZE;
330
        cbData -= SYMCRYPT_GF128_BLOCK_SIZE;
331
    }
332
}
333
#endif
334
335
336
//////////////////////////////////////////////////////////////////////////////////////
337
// Pclmulqdq implementation
338
//
339
340
/*
341
GHASH GF(2^128) multiplication using PCLMULQDQ
342
343
The GF(2^128) field used in GHASH is GF(2)[x]/p(x) where p(x) is the primitive polynomial
344
    x^128 + x^7 + x^2 + x + 1
345
346
Notation: We use the standard mathematical notation '+' for the addition in the field,
347
which corresponds to a xor of the bits.
348
349
Multiplication:
350
Given two field elements A and B (represented as 128-bit values),
351
we first compute the polynomial product
352
    (C,D) := A * B
353
where C and D are also 128-bit values.
354
355
The PCLMULQDQ instruction performs a 64 x 64 -> 128 bit carryless multiplication.
356
To multiply 128-bit values we write A = (A1, A0) and B = (B1, B0) in two 64-bit halves.
357
358
The schoolbook multiplication is computed by
359
    (C, D) = (A1 * B1)x^128 + (A1 * B0 + A0 * B1)x^64 + (A0 * B0)
360
This requires four PCLMULQDQ instructions. The middle 128-bit result has to be shifted
361
left and right, and each half added to the upper and lower 128-bit result to get (C,D).
362
363
Alternatively, the middle 128-bit intermediate result can be computed using Karatsuba:
364
    (A1*B0 + A0*B1) = (A1 + A0) * (B1 + B0) + (A1*B1) + (A0*B0)
365
This requires only one PCLMULQDQ instruction to multiply (A1 + A0) by (B1 + B0)
366
as the other two products are already computed.
367
Whether this is faster depends on the relative speed of shift/xor versus PCLMULQDQ.
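As an illustration (a sketch only, not the macros this file actually uses, which are
presumably defined in ghash_definitions.h), the schoolbook variant maps to intrinsics
roughly as follows, with A and B held in __m128i registers:

    __m128i lo  = _mm_clmulepi64_si128( A, B, 0x00 );                   // A0 * B0
    __m128i hi  = _mm_clmulepi64_si128( A, B, 0x11 );                   // A1 * B1
    __m128i mid = _mm_xor_si128( _mm_clmulepi64_si128( A, B, 0x01 ),    // A1 * B0
                                 _mm_clmulepi64_si128( A, B, 0x10 ) );  // A0 * B1

Here (hi, mid, lo) are the three 128-bit intermediate results (R1, Rmid, R0) mentioned
below; for Karatsuba the two cross multiplications are replaced by a single carryless
multiplication of (A1 + A0) by (B1 + B0), followed by xoring in lo and hi.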
368
369
Both multiplication algorithms produce three 128-bit intermediate results (R1, Rmid, R0),
370
with the full result defined by R1 x^128 + Rmid x^64 + R0.
371
If we do Multiply-Accumulate then we can accumulate the three 128-bit intermediate results
372
directly. As there are no carries, there is no overflow, and the combining of the three
373
intermediate results into a 256-bit result can be shared amongst all multiplications.
374
375
376
Modulo reduction:
377
We use << and >> to denote shifts on 128-bit values.
378
The modulo reduction can now be done as follows:
379
given a 256-bit value (C,D) representing C x^128 + D we compute
380
    (T1,T0) := C + C*x + C * x^2 + C * x^7
381
    R := D + T0 + T1 + (T1 << 1) + (T1 << 2) + (T1 << 7)
382
383
(T1,T0) is just the value C x^128 reduced one step modulo p(x). The value T1 is at most 7 bits,
384
so in the next step the reduction, which computes the result R, is easy. The
385
expression T1 + (T1 << 1) + (T1 << 2) + (T1 << 7) is just T1 * x^128 reduced modulo p(x).
386
387
Let's first get rid of the polynomial arithmetic and write this completely using shifts on
388
128-bit values.
389
390
T0 := C + (C << 1) + (C << 2) + (C << 7)
391
T1 := (C >> 127) + (C >> 126) + (C >> 121)
392
R := D + T0 + T1  + (T1 << 1) + (T1 << 2) + (T1 << 7)
393
394
We can optimize this by rewriting the equations
395
396
T2 := T1 + C
397
    = C + (C>>127) + (C>>126) + (C>>121)
398
R   = D + T0 + T1  + (T1 << 1) + (T1 << 2) + (T1 << 7)
399
    = D + C + (C << 1) + (C << 2) + (C << 7) + T1  + (T1 << 1) + (T1 << 2) + (T1 << 7)
400
    = D + T2 + (T2 << 1) + (T2 << 2) + (T2 << 7)
401
402
Thus
403
T2  = C + (C>>127) + (C>>126) + (C>>121)
404
R   = D + T2 + (T2 << 1) + (T2 << 2) + (T2 << 7)
405
406
This gets the right result and uses only 6 shifts.
407
408
The SSE instruction set does not implement bit-shifts of 128-bit values. Instead, we will
409
use bit-shifts of the 32-bit subvalues, and byte shifts (shifts by a multiple of 8 bits)
410
on the full 128-bit values.
411
We use the <<<< and >>>> operators to denote shifts on 32-bit subwords.
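For example, if X consists of the four 32-bit words (x3, x2, x1, x0), then X <<<< 1 shifts
every word left by one bit independently (no bits cross a word boundary), whereas X << 1
would be a true 128-bit shift. The byte shifts used below move whole bytes across the full
value, e.g. ts >> 96 moves ts down by 12 bytes.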
412
413
We can now do the modulo reduction by
414
415
t1 := (C >> 127) = (C >>>> 31) >> 96
416
t2 := (C >> 126) = (C >>>> 30) >> 96
417
t3 := (C >> 121) = (C >>>> 25) >> 96
418
T2 = C + t1 + t2 + t3
419
420
The left-shifts in the computation of R are a bit more involved, as we have to move bits from
421
one subword to the next
422
423
u1 := (T2 << 1) = (T2 <<<< 1) + ((T2 >>>> 31) << 32)
424
u2 := (T2 << 2) = (T2 <<<< 2) + ((T2 >>>> 30) << 32)
425
u3 := (T2 << 7) = (T2 <<<< 7) + ((T2 >>>> 25) << 32)
426
R = D + T2 + u1 + u2 + u3
427
428
We can eliminate some common subexpressions. For any k we have
429
(T2 >>>> k) = ((C + r) >>>> k)
430
where r is a 7-bit value. If k>7 then this is equal to (C >>>> k). This means that
431
the value (T2 >>>> 31) is equal to (C >>>> 31) so we don't have to compute it again.
432
433
So we can rewrite our formulas as
434
t4 := (C >>>> 31)
435
t5 := (C >>>> 30)
436
t6 := (C >>>> 25)
437
ts = t4 + t5 + t6
438
T2 = C + (ts >> 96)
439
440
Note that ts = (C >>>> 31) + (C >>>> 30) + (C >>>> 25)
441
which is equal to (T2 >>>> 31) + (T2 >>>> 30) + (T2 >>>> 25)
442
443
R = D + T2 + u1 + u2 + u3
444
  = D + T2 + (T2 <<<< 1) + (T2 <<<< 2) + (T2 <<<< 7) + (ts << 32)
445
446
All together, we can do the modulo reduction using the following formulas
447
448
ts := (C >>>> 31) + (C >>>> 30) + (C >>>> 25)
449
T2 := C + (ts >> 96)
450
R = D + T2 + (T2 <<<< 1) + (T2 <<<< 2) + (T2 <<<< 7) + (ts << 32)
451
452
This uses a total of 16 operations (6 subword shifts, 2 byte shifts, and 8 additions).
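For illustration only (the code below performs the reduction through the MODREDUCE macro
instead), these formulas transcribe directly to SSE2 intrinsics, with C and D in __m128i
registers:

    __m128i ts = _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( C, 31 ),
                                               _mm_srli_epi32( C, 30 ) ),
                                _mm_srli_epi32( C, 25 ) );
    __m128i T2 = _mm_xor_si128( C, _mm_srli_si128( ts, 12 ) );     // ts >> 96 (12 bytes)
    __m128i R  = _mm_xor_si128( D, T2 );
    R = _mm_xor_si128( R, _mm_slli_epi32( T2, 1 ) );
    R = _mm_xor_si128( R, _mm_slli_epi32( T2, 2 ) );
    R = _mm_xor_si128( R, _mm_slli_epi32( T2, 7 ) );
    R = _mm_xor_si128( R, _mm_slli_si128( ts, 4 ) );               // ts << 32 (4 bytes)

This is exactly 6 subword shifts, 2 byte shifts, and 8 xors.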
453
454
Reversed bit order:
455
There is one more complication. GHASH uses the bits in the reverse order from normal representation.
456
The bits b_0, b_1, ..., b_127 represent the polynomial b_0 + b_1 * x + ... + b_127 * x^127.
457
This means that the most significant bit in each byte is actually the least significant bit in the
458
polynomial.
459
460
SSE CPUs use the LSBFirst convention. This means that the bits b_0, b_1, ..., b_127 of the polynomial
461
end up at positions 7, 6, 5, ..., 1, 0, 15, 14, ..., 9, 8, 23, 22, ... of our XMM register.
462
This is obviously not a useful representation to do arithmetic in.
463
The first step is to BSWAP the value so that the bits appear in pure reverse order.
464
That is at least algebraically useful.
465
466
To compute the multiplication we use the fact that GF(2)[x] multiplication has no carries and
467
thus no preference for bit order. After the BSWAP we don't have the values A and B, but rather
468
rev(A) and rev(B) where rev() is a function that reverses the bit order. We can now compute
469
470
  rev(A) * rev(B) = rev( A*B ) >> 1
471
472
where the shift operator is on the 256-bit product.
473
474
The modulo reduction remains the same, except that we change all the shifts to be the other direction.
475
476
This gives us finally the outline of our multiplication:
477
478
- Apply BSWAP to all values loaded from memory.
479
    A := BSWAP( Abytes )
480
    B := BSWAP( Bbytes )
481
- Compute the 256-bit product, possibly using Karatsuba.
482
    (P1, P0) := A * B   // 128x128 carryless multiplication
483
- Shift the result left one bit.
484
    (Q1, Q0) := (P1, P0) << 1
485
    which is computed as
486
        Q0 = (P0 <<<< 1) + (P0 >>>> 31) << 32
487
        Q1 = (P1 <<<< 1) + (P1 >>>> 31) << 32 + (P0 >>>> 31) >> 96
488
- Perform the modulo reduction, with reversed bit order
489
    ts := (Q0 <<<< 31) + (Q0 <<<< 30) + (Q0 <<<< 25)
490
    T2 := Q0 + (ts << 96)
491
    R = Q1 + T2 + (T2 >>>> 1) + (T2 >>>> 2) + (T2 >>>> 7) + (ts >> 32)
492
493
Future work:
494
It might be possible to construct a faster solution by merging the leftshift of (P1,P0)
495
with the modulo reduction.
496
497
*/
498
499
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
500
501
VOID
502
SYMCRYPT_CALL
503
SymCryptGHashExpandKeyPclmulqdq(
504
    _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE )   PSYMCRYPT_GF128_ELEMENT expandedKey,
505
    _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE )     PCBYTE                  pH )
506
0
{
507
0
    int i;
508
0
    __m128i H, Hx, H2, H2x;
509
0
    __m128i t0, t1, t2, t3, t4, t5;
510
0
    __m128i Hi_even, Hix_even, Hi_odd, Hix_odd;
511
0
    __m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
512
0
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
513
0
    __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
514
515
    //
516
    // Our expanded key consists of a list of N=SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS
517
    // powers of H. The first entry is H^N, the next H^(N-1), then H^(N-2), ...
518
    //
519
    // For each power we store two 128-bit values. The first is H^i (Hi) and the second
520
    // contains the two halves of H^i xorred with each other in the lower 64 bits (Hix).
521
    //
522
    // We keep all of the Hi entries together in the first half of the expanded key
523
    // table, and all of the Hix entries together in the second half of the table.
524
    //
525
    // This ordering allows for efficient vectorization with arbitrary vector width, as
526
    // many multiplication constants can be loaded into wider vectors with the correct
527
    // alignment. Not maintaining different layouts for different vector lengths does
528
    // leave a small amount of performance on the table, but experimentally it seems to
529
    // make <1% difference, and using a single layout reduces complexity significantly.
530
    //
531
0
    C_ASSERT( 2*SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS <= SYMCRYPT_GF128_FIELD_SIZE );
532
533
0
    H = _mm_loadu_si128((__m128i *) pH );
534
0
    H = _mm_shuffle_epi8( H, BYTE_REVERSE_ORDER );
535
0
    Hx = _mm_xor_si128( H, _mm_srli_si128( H, 8 ) );
536
537
0
    _mm_store_si128( &GHASH_H_POWER(expandedKey, 1), H );
538
0
    _mm_store_si128( &GHASH_Hx_POWER(expandedKey, 1), Hx );
539
540
0
    CLMUL_X_3( H, Hx, H, Hx, t0, t1, t2 );
541
0
    CLMUL_3_POST( t0, t1, t2 );
542
0
    MODREDUCE( vMultiplicationConstant, t0, t1, t2, H2 );
543
0
    H2x = _mm_xor_si128( H2, _mm_srli_si128( H2, 8 ) );
544
0
    _mm_store_si128( &GHASH_H_POWER(expandedKey, 2), H2 );
545
0
    _mm_store_si128( &GHASH_Hx_POWER(expandedKey, 2), H2x );
546
547
0
    Hi_even = H2;
548
0
    Hix_even = H2x;
549
550
0
    for( i=2; i<SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS; i+=2 )
551
0
    {
552
0
        CLMUL_X_3( H, Hx, Hi_even, Hix_even, t0, t1, t2 );
553
0
        CLMUL_3_POST( t0, t1, t2 );
554
0
        CLMUL_X_3( H2, H2x, Hi_even, Hix_even, t3, t4, t5 );
555
0
        CLMUL_3_POST( t3, t4, t5 );
556
0
        MODREDUCE( vMultiplicationConstant, t0, t1, t2, Hi_odd );
557
0
        MODREDUCE( vMultiplicationConstant, t3, t4, t5, Hi_even );
558
0
        Hix_odd  = _mm_xor_si128( Hi_odd, _mm_srli_si128( Hi_odd, 8 ) );
559
0
        Hix_even = _mm_xor_si128( Hi_even, _mm_srli_si128( Hi_even, 8 ) );
560
561
0
        _mm_store_si128( &GHASH_H_POWER(expandedKey, i + 1), Hi_odd );
562
0
        _mm_store_si128( &GHASH_H_POWER(expandedKey, i + 2), Hi_even );
563
0
        _mm_store_si128( &GHASH_Hx_POWER(expandedKey, i + 1), Hix_odd );
564
0
        _mm_store_si128( &GHASH_Hx_POWER(expandedKey, i + 2), Hix_even );
565
0
    }
566
0
}
567
568
569
570
VOID
571
SYMCRYPT_CALL
572
SymCryptGHashAppendDataPclmulqdq(
573
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
574
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
575
    _In_reads_( cbData )                    PCBYTE                      pbData,
576
                                            SIZE_T                      cbData )
577
0
{
578
0
    __m128i state;
579
0
    __m128i data;
580
0
    __m128i a0, a1, a2;
581
0
    __m128i Hi, Hix;
582
0
    SIZE_T i;
583
0
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
584
0
    SIZE_T todo;
585
586
    //
587
    // To do a BSWAP we need an __m128i value with the bytes in reverse order.
588
    //
589
590
0
    __m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
591
0
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
592
0
    __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
593
594
0
    state = _mm_loadu_si128( (__m128i *) pState );
595
596
0
    while( nBlocks > 0 )
597
0
    {
598
        //
599
        // We process the data in blocks of up to SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS blocks
600
        //
601
0
        todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
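        //
        // Doing 'todo' blocks in one pass is correct because the sequential GHASH steps
        //   S <- (S + X_1) * H; S <- (S + X_2) * H; ... ; S <- (S + X_todo) * H
        // expand to
        //   S' = (S + X_1) * H^todo + X_2 * H^(todo-1) + ... + X_todo * H,
        // so each block is multiplied by the matching power of H, the three 128-bit
        // intermediate results are accumulated, and a single modulo reduction is done
        // per batch.
        //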
602
603
        //
604
        // The first block is xorred with the state before multiplying it with a power of H
605
        //
606
0
        data = _mm_loadu_si128( (__m128i *) pbData );
607
0
        data = _mm_shuffle_epi8( data, BYTE_REVERSE_ORDER );
608
0
        pbData += SYMCRYPT_GF128_BLOCK_SIZE;
609
610
0
        state = _mm_xor_si128( state, data );
611
0
        CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
612
613
        //
614
        // Then we just do an inner product
615
        //
616
0
        for( i=1; i<todo; i++ )
617
0
        {
618
0
            data = _mm_loadu_si128( (__m128i *) pbData );
619
0
            data = _mm_shuffle_epi8( data, BYTE_REVERSE_ORDER );
620
0
            pbData += SYMCRYPT_GF128_BLOCK_SIZE;
621
622
0
            Hi  = _mm_load_si128( &GHASH_H_POWER(expandedKeyTable, todo - i) );
623
0
            Hix = _mm_load_si128( &GHASH_Hx_POWER(expandedKeyTable, todo - i) );
624
0
            CLMUL_ACC_3( data, Hi, Hix, a0, a1, a2 );
625
0
        }
626
627
0
        CLMUL_3_POST( a0, a1, a2 );
628
0
        MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
629
0
        nBlocks -= todo;
630
0
    }
631
632
0
    _mm_storeu_si128((__m128i *)pState, state );
633
0
}
634
635
#endif  // CPU_X86 || CPU_AMD64
636
637
#if SYMCRYPT_CPU_ARM64
638
639
VOID
640
SYMCRYPT_CALL
641
SymCryptGHashExpandKeyPmull(
642
    _Out_writes_( SYMCRYPT_GF128_FIELD_SIZE )   PSYMCRYPT_GF128_ELEMENT expandedKey,
643
    _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE )    PCBYTE                  pH )
644
{
645
    int i;
646
    __n128 H, Hx, H2, H2x;
647
    __n128 t0, t1, t2, t3, t4, t5;
648
    __n128 Hi_even, Hix_even, Hi_odd, Hix_odd;
649
    const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
650
    //
651
    // Our expanded key consists of a list of N=SYMCRYPT_GHASH_PMULL_HPOWERS
652
    // powers of H. The first entry is H^N, the next H^(N-1), then H^(N-2), ...
653
    //
654
    // For each power we store two 128-bit values. The first is H^i (Hi) and the second
655
    // contains the two halves of H^i xorred with each other in the lower 64 bits (Hix).
656
    //
657
    // We keep all of the Hi entries together in the first half of the expanded key
658
    // table, and all of the Hix entries together in the second half of the table.
659
    //
660
    // This ordering allows for efficient vectorization with arbitrary vector width, as
661
    // many multiplication constants can be loaded into wider vectors with the correct
662
    // alignment. Not maintaining different layouts for different vector lengths does
663
    // leave a small amount of performance on the table, but experimentally it seems to
664
    // make <1% difference, and using a single layout reduces complexity significantly.
665
    //
666
    C_ASSERT( 2*SYMCRYPT_GHASH_PMULL_HPOWERS <= SYMCRYPT_GF128_FIELD_SIZE );
667
668
    H = *(__n128 *) pH;
669
    Hx = vrev64q_u8( H );
670
    H = vextq_u8( Hx, Hx, 8 );
671
    Hx = veorq_u8( H, Hx );
672
673
    GHASH_H_POWER(expandedKey, 1) = H;
674
    GHASH_Hx_POWER(expandedKey, 1) = Hx;
675
676
    CLMUL_X_3( H, Hx, H, Hx, t0, t1, t2 );
677
    CLMUL_3_POST( t0, t1, t2 );
678
    MODREDUCE( vMultiplicationConstant, t0, t1, t2, H2 );
679
    H2x = veorq_u8( H2, vextq_u8( H2, H2, 8 ) );
680
    GHASH_H_POWER(expandedKey, 2) = H2;
681
    GHASH_Hx_POWER(expandedKey, 2) = H2x;
682
683
    Hi_even = H2;
684
    Hix_even = H2x;
685
686
    for( i=2; i<SYMCRYPT_GHASH_PMULL_HPOWERS; i+=2 )
687
    {
688
        CLMUL_X_3( H, Hx, Hi_even, Hix_even, t0, t1, t2 );
689
        CLMUL_3_POST( t0, t1, t2 );
690
        CLMUL_X_3( H2, H2x, Hi_even, Hix_even, t3, t4, t5 );
691
        CLMUL_3_POST( t3, t4, t5 );
692
        MODREDUCE( vMultiplicationConstant, t0, t1, t2, Hi_odd );
693
        MODREDUCE( vMultiplicationConstant, t3, t4, t5, Hi_even );
694
        Hix_odd = veorq_u8( Hi_odd, vextq_u8( Hi_odd, Hi_odd, 8 ) );
695
        Hix_even = veorq_u8( Hi_even, vextq_u8( Hi_even, Hi_even, 8 ) );
696
697
        GHASH_H_POWER(expandedKey, i + 1) = Hi_odd;
698
        GHASH_H_POWER(expandedKey, i + 2) = Hi_even;
699
        GHASH_Hx_POWER(expandedKey, i + 1) = Hix_odd;
700
        GHASH_Hx_POWER(expandedKey, i + 2) = Hix_even;
701
    }
702
}
703
704
VOID
705
SYMCRYPT_CALL
706
SymCryptGHashAppendDataPmull(
707
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
708
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
709
    _In_reads_( cbData )                    PCBYTE                      pbData,
710
                                            SIZE_T                      cbData )
711
{
712
    __n128 state;
713
    __n128 data, datax;
714
    __n128 a0, a1, a2;
715
    __n128 Hi, Hix;
716
    const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
717
    SIZE_T i;
718
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
719
    SIZE_T todo;
720
721
    state = *(__n128 *) pState;
722
723
    while( nBlocks > 0 )
724
    {
725
        //
726
        // We process the data in blocks of up to SYMCRYPT_GHASH_PMULL_HPOWERS blocks
727
        //
728
        todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
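        //
        // As in the Pclmulqdq path: each block is multiplied by the matching power of H,
        // the intermediate results are accumulated, and one modulo reduction is done per batch.
        //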
729
730
        //
731
        // The first block is xorred with the state before multiplying it with a power of H
732
        //
733
        data = *(__n128 *)pbData;
734
        REVERSE_BYTES( data, data );
735
        pbData += SYMCRYPT_GF128_BLOCK_SIZE;
736
737
        state = veorq_u8( state, data );
738
        CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
739
740
        //
741
        // Then we just do an inner product
742
        //
743
        for( i=1; i<todo; i++ )
744
        {
745
            // we can avoid an EXT here by precomputing datax for CLMUL_ACCX_3
746
            datax = vrev64q_u8( *(__n128 *)pbData );
747
            data = vextq_u8( datax, datax, 8 );
748
            datax = veorq_u8( data, datax );
749
            pbData += SYMCRYPT_GF128_BLOCK_SIZE;
750
751
            Hi  = GHASH_H_POWER(expandedKeyTable, todo - i);
752
            Hix = GHASH_Hx_POWER(expandedKeyTable, todo - i);
753
            CLMUL_ACCX_3( data, datax, Hi, Hix, a0, a1, a2 );
754
        }
755
756
        CLMUL_3_POST( a0, a1, a2 );
757
        MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
758
        nBlocks -= todo;
759
    }
760
761
    *(__n128 *) pState = state;
762
}
763
764
#endif  // CPU_ARM64
765
766
767
768
//////////////////////////////////////////////////////////////
769
// Stuff around the core algorithm implementation functions
770
//
771
772
773
VOID
774
SYMCRYPT_CALL
775
SymCryptGHashExpandKey(
776
    _Out_                                       PSYMCRYPT_GHASH_EXPANDED_KEY    expandedKey,
777
    _In_reads_( SYMCRYPT_GF128_BLOCK_SIZE )     PCBYTE                          pH )
778
0
{
779
#if  SYMCRYPT_CPU_X86
780
    PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
781
    SYMCRYPT_EXTENDED_SAVE_DATA  SaveData;
782
783
    //
784
    // Initialize offset into table space for 16-byte alignment.
785
    //
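    // tableOffset is (-base) mod sizeof(SYMCRYPT_GF128_ELEMENT), i.e. the number of
    // bytes needed to round the start of tableSpace up to the next 16-byte boundary,
    // so that XMM accesses to the table are aligned even when the key structure
    // itself is not 16-aligned.
    //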
786
    expandedKey->tableOffset = (0 -((UINT_PTR) &expandedKey->tableSpace[0])) % sizeof(SYMCRYPT_GF128_ELEMENT);
787
788
    pExpandedKeyTable = (PSYMCRYPT_GF128_ELEMENT)&expandedKey->tableSpace[expandedKey->tableOffset];
789
790
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
791
    {
792
        //
793
        // We can only use the PCLMULQDQ data representation if the SaveXmm call never fails.
794
        // This is one of the CPU features required.
795
        // We check anyway...
796
        //
797
        if( SymCryptSaveXmm( &SaveData ) != SYMCRYPT_NO_ERROR )
798
        {
799
            SymCryptFatal( 'pclm' );
800
        }
801
        SymCryptGHashExpandKeyPclmulqdq( pExpandedKeyTable, pH );
802
        SymCryptRestoreXmm( &SaveData );
803
    } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
804
    {
805
        SymCryptGHashExpandKeyXmm( pExpandedKeyTable, pH );
806
        SymCryptRestoreXmm( &SaveData );
807
    } else {
808
        SymCryptGHashExpandKeyC( pExpandedKeyTable, pH );
809
    }
810
811
#elif SYMCRYPT_CPU_AMD64
812
0
    PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
813
0
    pExpandedKeyTable = &expandedKey->table[0];
814
815
0
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
816
0
    {
817
0
        SymCryptGHashExpandKeyPclmulqdq( pExpandedKeyTable, pH );
818
0
    } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) )
819
0
    {
820
0
        SymCryptGHashExpandKeyXmm( pExpandedKeyTable, pH );
821
0
    } else {
822
0
        SymCryptGHashExpandKeyC( pExpandedKeyTable, pH );
823
0
    }
824
825
#elif SYMCRYPT_CPU_ARM64
826
    PSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
827
    pExpandedKeyTable = &expandedKey->table[0];
828
829
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_PMULL ) )
830
    {
831
        SymCryptGHashExpandKeyPmull( pExpandedKeyTable, pH );
832
    } else {
833
        SymCryptGHashExpandKeyC( pExpandedKeyTable, pH );
834
    }
835
836
#else
837
    SymCryptGHashExpandKeyC( &expandedKey->table[0], pH );      // Default expansion (does not need alignment)
838
#endif
839
0
}
840
841
VOID
842
SYMCRYPT_CALL
843
SymCryptGHashAppendData(
844
    _In_                              PCSYMCRYPT_GHASH_EXPANDED_KEY   expandedKey,
845
    _Inout_                           PSYMCRYPT_GF128_ELEMENT         pState,
846
    _In_reads_( cbData )              PCBYTE                          pbData,
847
                                      SIZE_T                          cbData )
848
0
{
849
#if SYMCRYPT_CPU_X86
850
    PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
851
    SYMCRYPT_EXTENDED_SAVE_DATA  SaveData;
852
853
    pExpandedKeyTable = (PSYMCRYPT_GF128_ELEMENT)&expandedKey->tableSpace[expandedKey->tableOffset];
854
855
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
856
    {
857
        if( SymCryptSaveXmm( &SaveData ) != SYMCRYPT_NO_ERROR )
858
        {
859
            SymCryptFatal( 'pclm' );
860
        }
861
        SymCryptGHashAppendDataPclmulqdq( pExpandedKeyTable, pState, pbData, cbData );
862
        SymCryptRestoreXmm( &SaveData );
863
    } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) && SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
864
    {
865
        SymCryptGHashAppendDataXmm( pExpandedKeyTable, pState, pbData, cbData );
866
        SymCryptRestoreXmm( &SaveData );
867
    } else {
868
        SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
869
    }
870
871
#elif SYMCRYPT_CPU_AMD64
872
0
    PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
873
874
0
    pExpandedKeyTable = &expandedKey->table[0];
875
0
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_PCLMULQDQ_CODE ) )
876
0
    {
877
0
        SymCryptGHashAppendDataPclmulqdq( pExpandedKeyTable, pState, pbData, cbData );
878
0
    } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_SSE2 ) )
879
0
    {
880
0
        SymCryptGHashAppendDataXmm( pExpandedKeyTable, pState, pbData, cbData );
881
0
    } else {
882
0
        SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
883
0
    }
884
#elif SYMCRYPT_CPU_ARM
885
    PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
886
887
    pExpandedKeyTable = &expandedKey->table[0];
888
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) )
889
    {
890
        SymCryptGHashAppendDataNeon( pExpandedKeyTable, pState, pbData, cbData );
891
    } else {
892
        SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
893
    }
894
#elif SYMCRYPT_CPU_ARM64
895
    PCSYMCRYPT_GF128_ELEMENT pExpandedKeyTable;
896
897
    pExpandedKeyTable = &expandedKey->table[0];
898
    if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_PMULL ) )
899
    {
900
        SymCryptGHashAppendDataPmull( pExpandedKeyTable, pState, pbData, cbData );
901
    } else if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON ) )
902
    {
903
        SymCryptGHashAppendDataNeon( pExpandedKeyTable, pState, pbData, cbData );
904
    } else {
905
        SymCryptGHashAppendDataC( pExpandedKeyTable, pState, pbData, cbData );
906
    }
907
#else
908
    SymCryptGHashAppendDataC( &expandedKey->table[0], pState, pbData, cbData );
909
#endif
910
0
}