Coverage Report

Created: 2024-11-21 07:03

/src/SymCrypt/lib/aes-xmm.c
//
// aes-xmm.c   code for AES implementation
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// All XMM code for AES operations
// Requires compiler support for ssse3, aesni and pclmulqdq
//

#include "precomp.h"

#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64

#include "xtsaes_definitions.h"
#include "ghash_definitions.h"

VOID
SYMCRYPT_CALL
SymCryptAes4SboxXmm( _In_reads_(4) PCBYTE pIn, _Out_writes_(4) PBYTE pOut )
{
    __m128i x;
    x = _mm_set1_epi32( *(int *) pIn );

    x = _mm_aeskeygenassist_si128( x, 0 );

    // Could use _mm_storeu_si32( pOut, x ) but it is missing from some headers and _mm_store_ss will be as fast
    _mm_store_ss( (float *) pOut, _mm_castsi128_ps(x) );
}

VOID
SYMCRYPT_CALL
SymCryptAesCreateDecryptionRoundKeyXmm(
    _In_reads_(16)      PCBYTE  pEncryptionRoundKey,
    _Out_writes_(16)    PBYTE   pDecryptionRoundKey )
{
    //
    // On x86 our key structure is only 4-aligned (the best we can do) so we use unaligned load/stores.
    // On Amd64 our round keys are aligned, but recent CPUs have fast unaligned load/store if the address is
    // actually aligned properly.
    //
    _mm_storeu_si128( (__m128i *) pDecryptionRoundKey, _mm_aesimc_si128( _mm_loadu_si128( (__m128i *)pEncryptionRoundKey ) ) );
}
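
// Editor's note (illustrative, not from the original source): AESIMC computes the
// InvMixColumns of a round key, which is the transform needed to turn encryption
// round keys into decryption round keys for AES's "equivalent inverse cipher":
// middle round keys go through InvMixColumns, while the first and last round keys
// are reused as-is in reverse order. A sketch under that assumption, with
// hypothetical flat enc[]/dec[] key buffers (compiled out):
#if 0
{
    UINT32 i;
    memcpy( &dec[16*nRounds], &enc[0], 16 );    // round 0 key becomes the last decryption key
    memcpy( &dec[0], &enc[16*nRounds], 16 );    // last round key becomes the first decryption key
    for( i = 1; i < nRounds; i++ )
    {
        SymCryptAesCreateDecryptionRoundKeyXmm( &enc[16*i], &dec[16*(nRounds-i)] );
    }
}
#endif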

//
// The latency of the AES instructions increased to 8 cycles in Ivy Bridge,
// and went back to 7 in Haswell.
// We use 8-parallel code to expose the maximum parallelism to the CPU.
// On x86 it will introduce some register spilling, but the loads/stores
// should be able to hide behind the AES instruction latencies.
// Silvermont x86 CPUs have AES-NI with latency = 8 and throughput = 5, so there
// the CPU parallelism is low.
// For things like BitLocker that is fine, but other uses, such as GCM & AES_CTR_DRBG,
// use odd sizes.
// We try to do 5-8 blocks in 8-parallel code, 2-4 blocks in 4-parallel code, and
// 1 block in 1-parallel code.
// This is a compromise; the big cores can do 8-parallel in about the time of a 4-parallel,
// but Silvermont cannot, and would pay a big price on small requests if we only used 8-parallel.
// Doing only 8-parallel and then 1-parallel would penalize the big cores a lot.
//
// We used to have 7-parallel code, but common request sizes are not multiples of 7
// blocks, so we ended up doing a lot of extra work. This is especially expensive on
// Silvermont where the extra work isn't hidden in the latencies.
// A sketch of the resulting block dispatch is shown below.
//
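// Illustrative sketch (editor's addition, compiled out): the 8/4/1 block dispatch
// described above, with the real AES_ENCRYPT_* invocations elided.
#if 0
{
    while( nBlocks >= 8 )           // full 8-parallel groups
    {
        /* AES_ENCRYPT_8( ... ); */
        nBlocks -= 8;
    }
    if( nBlocks >= 5 )              // 5-7 leftover blocks: still 8-parallel;
    {                               // unused lanes are computed but never stored
        /* AES_ENCRYPT_8( ... ); */
    }
    else if( nBlocks >= 2 )         // 2-4 leftover blocks: 4-parallel
    {
        /* AES_ENCRYPT_4( ... ); */
    }
    else if( nBlocks == 1 )         // a single block
    {
        /* AES_ENCRYPT_1( ... ); */
    }
}
#endif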
#define AES_ENCRYPT_1( pExpandedKey, c0 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = &pExpandedKey->RoundKey[0]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    c0 = _mm_xor_si128( c0, roundkey ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesenc_si128( c0, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
};


// Perform AES encryption without the first round key and with a specified last round key
//
// For algorithms whose performance is dominated by a chain of dependent AES rounds (e.g. CBC encryption, CCM, CMAC)
// we can gain a reasonable performance uplift by computing (last round key ^ next plaintext block ^ first round key)
// off the critical path and using this computed value in place of the last round key in the AESENCLAST instruction;
// a worked example follows the macro below.
#define AES_ENCRYPT_1_CHAIN( pExpandedKey, cipherState, mergedLastRoundKey ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = &pExpandedKey->RoundKey[1]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    cipherState = _mm_aesenc_si128( cipherState, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        cipherState = _mm_aesenc_si128( cipherState, roundkey ); \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        cipherState = _mm_aesenc_si128( cipherState, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    cipherState = _mm_aesenclast_si128( cipherState, mergedLastRoundKey ); \
};
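
// Worked example (editor's addition, illustrative only): AESENCLAST computes
// ShiftRows(SubBytes(state)) ^ roundkey. In CBC encryption the next block's
// round-0 state is C_i ^ P_{i+1} ^ K_0. Passing the merged value
// (K_last ^ P_{i+1} ^ K_0) as mergedLastRoundKey therefore yields
//
//      ShiftRows(SubBytes(state)) ^ K_last ^ P_{i+1} ^ K_0
//    = C_i ^ P_{i+1} ^ K_0
//
// directly, so the XORs with the plaintext and the first round key are off the
// dependency chain; the caller recovers the ciphertext C_i by XORing the result
// with the same (P_{i+1} ^ K_0) value it prepared in advance.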

#define AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = &pExpandedKey->RoundKey[0]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
};

#define AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = &pExpandedKey->RoundKey[0]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    c4 = _mm_xor_si128( c4, roundkey ); \
    c5 = _mm_xor_si128( c5, roundkey ); \
    c6 = _mm_xor_si128( c6, roundkey ); \
    c7 = _mm_xor_si128( c7, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
        c4 = _mm_aesenc_si128( c4, roundkey ); \
        c5 = _mm_aesenc_si128( c5, roundkey ); \
        c6 = _mm_aesenc_si128( c6, roundkey ); \
        c7 = _mm_aesenc_si128( c7, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
    c4 = _mm_aesenclast_si128( c4, roundkey ); \
    c5 = _mm_aesenclast_si128( c5, roundkey ); \
    c6 = _mm_aesenclast_si128( c6, roundkey ); \
    c7 = _mm_aesenclast_si128( c7, roundkey ); \
};

#define AES_DECRYPT_1( pExpandedKey, c0 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = pExpandedKey->lastEncRoundKey; \
    keyLimit = pExpandedKey->lastDecRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    c0 = _mm_xor_si128( c0, roundkey ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesdec_si128( c0, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesdeclast_si128( c0, roundkey ); \
};

#define AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = pExpandedKey->lastEncRoundKey; \
    keyLimit = pExpandedKey->lastDecRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
        c1 = _mm_aesdec_si128( c1, roundkey ); \
        c2 = _mm_aesdec_si128( c2, roundkey ); \
        c3 = _mm_aesdec_si128( c3, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesdeclast_si128( c0, roundkey ); \
    c1 = _mm_aesdeclast_si128( c1, roundkey ); \
    c2 = _mm_aesdeclast_si128( c2, roundkey ); \
    c3 = _mm_aesdeclast_si128( c3, roundkey ); \
};

#define AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
\
    keyPtr = pExpandedKey->lastEncRoundKey; \
    keyLimit = pExpandedKey->lastDecRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
\
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    c4 = _mm_xor_si128( c4, roundkey ); \
    c5 = _mm_xor_si128( c5, roundkey ); \
    c6 = _mm_xor_si128( c6, roundkey ); \
    c7 = _mm_xor_si128( c7, roundkey ); \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
        c1 = _mm_aesdec_si128( c1, roundkey ); \
        c2 = _mm_aesdec_si128( c2, roundkey ); \
        c3 = _mm_aesdec_si128( c3, roundkey ); \
        c4 = _mm_aesdec_si128( c4, roundkey ); \
        c5 = _mm_aesdec_si128( c5, roundkey ); \
        c6 = _mm_aesdec_si128( c6, roundkey ); \
        c7 = _mm_aesdec_si128( c7, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesdeclast_si128( c0, roundkey ); \
    c1 = _mm_aesdeclast_si128( c1, roundkey ); \
    c2 = _mm_aesdeclast_si128( c2, roundkey ); \
    c3 = _mm_aesdeclast_si128( c3, roundkey ); \
    c4 = _mm_aesdeclast_si128( c4, roundkey ); \
    c5 = _mm_aesdeclast_si128( c5, roundkey ); \
    c6 = _mm_aesdeclast_si128( c6, roundkey ); \
    c7 = _mm_aesdeclast_si128( c7, roundkey ); \
};


//
// The EncryptXmm code is tested through the CFB mode encryption, which has no further optimizations.
//
VOID
SYMCRYPT_CALL
SymCryptAesEncryptXmm(
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PCBYTE                      pbSrc,
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbDst )
{
    __m128i c;

    c = _mm_loadu_si128( ( __m128i * ) pbSrc );

    AES_ENCRYPT_1( pExpandedKey, c );

    _mm_storeu_si128( (__m128i *) pbDst, c );
}
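
// Usage sketch (editor's addition, compiled out): encrypting one block with the
// XMM code path, assuming the caller has already verified AES-NI support and
// expanded the key with SymCryptAesExpandKey.
#if 0
{
    SYMCRYPT_AES_EXPANDED_KEY key;
    BYTE keyBytes[16] = { 0 };
    BYTE plaintext[SYMCRYPT_AES_BLOCK_SIZE] = { 0 };
    BYTE ciphertext[SYMCRYPT_AES_BLOCK_SIZE];

    if( SymCryptAesExpandKey( &key, keyBytes, sizeof(keyBytes) ) == SYMCRYPT_NO_ERROR )
    {
        SymCryptAesEncryptXmm( &key, plaintext, ciphertext );
    }
}
#endif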

//
// The DecryptXmm code is tested through the EcbDecrypt calls, which have no further optimizations.
//
VOID
SYMCRYPT_CALL
SymCryptAesDecryptXmm(
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PCBYTE                      pbSrc,
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbDst )
{
    __m128i c;

    c = _mm_loadu_si128( ( __m128i * ) pbSrc );

    AES_DECRYPT_1( pExpandedKey, c );

    _mm_storeu_si128( (__m128i *) pbDst, c );
}

// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
#pragma warning(push)
#pragma warning( disable: 6001 4701 )
#pragma runtime_checks( "u", off )
VOID
SYMCRYPT_CALL
SymCryptAesEcbEncryptXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i c0, c1, c2, c3, c4, c5, c6, c7;

    while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
    {
        c0 = _mm_loadu_si128( ( __m128i * ) (pbSrc +  0 ));
        c1 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ));
        c2 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ));
        c3 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ));
        c4 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ));
        c5 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ));
        c6 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ));
        c7 = _mm_loadu_si128( ( __m128i * ) (pbSrc +112 ));

        AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

        _mm_storeu_si128( (__m128i *) (pbDst +  0 ), c0 );
        _mm_storeu_si128( (__m128i *) (pbDst + 16 ), c1 );
        _mm_storeu_si128( (__m128i *) (pbDst + 32 ), c2 );
        _mm_storeu_si128( (__m128i *) (pbDst + 48 ), c3 );
        _mm_storeu_si128( (__m128i *) (pbDst + 64 ), c4 );
        _mm_storeu_si128( (__m128i *) (pbDst + 80 ), c5 );
        _mm_storeu_si128( (__m128i *) (pbDst + 96 ), c6 );
        _mm_storeu_si128( (__m128i *) (pbDst +112 ), c7 );

        pbSrc   += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        pbDst   += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        cbData  -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
    }

    if( cbData < 16 )
    {
        return;
    }

    c0 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ));
    if( cbData >= 32 )
    {
    c1 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ));
        if( cbData >= 48 )
        {
    c2 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ));
            if( cbData >= 64 )
            {
    c3 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ));
                if( cbData >= 80 )
                {
    c4 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ));
                    if( cbData >= 96 )
                    {
    c5 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ));
                        if( cbData >= 112 )
                        {
    c6 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ));
                        }
                    }
                }
            }
        }
    }

    if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
    {
        AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
    }
    else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
    {
        AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
    }
    else
    {
        AES_ENCRYPT_1( pExpandedKey, c0 );
    }

    _mm_storeu_si128( (__m128i *) (pbDst + 0  ), c0 );
    if( cbData >= 32 )
    {
    _mm_storeu_si128( (__m128i *) (pbDst + 16 ), c1 );
        if( cbData >= 48 )
        {
    _mm_storeu_si128( (__m128i *) (pbDst + 32 ), c2 );
            if( cbData >= 64 )
            {
    _mm_storeu_si128( (__m128i *) (pbDst + 48 ), c3 );
                if( cbData >= 80 )
                {
    _mm_storeu_si128( (__m128i *) (pbDst + 64 ), c4 );
                    if( cbData >= 96 )
                    {
    _mm_storeu_si128( (__m128i *) (pbDst + 80 ), c5 );
                        if( cbData >= 112 )
                        {
    _mm_storeu_si128( (__m128i *) (pbDst + 96 ), c6 );
                        }
                    }
                }
            }
        }
    }
}
#pragma runtime_checks( "u", restore )
#pragma warning( pop )



VOID
SYMCRYPT_CALL
SymCryptAesCbcEncryptXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i c = _mm_loadu_si128( (__m128i *) pbChainingValue );
    __m128i rk0 = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] );
    __m128i rkLast = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
    __m128i d;

    if (cbData < SYMCRYPT_AES_BLOCK_SIZE)
        return;

    // This algorithm is dominated by a chain of dependent AES rounds, so we want to avoid
    // XOR instructions on the critical path where possible.
    // We can compute (last round key ^ next plaintext block ^ first round key) off the critical
    // path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
    // the main loop.
    d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), rk0 );
    c = _mm_xor_si128( c, d );
    pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
    cbData -= SYMCRYPT_AES_BLOCK_SIZE;

    while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
    {
        d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), rk0 );
        AES_ENCRYPT_1_CHAIN( pExpandedKey, c, _mm_xor_si128( d, rkLast ) );
        _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c, d ) );

        pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
        pbDst += SYMCRYPT_AES_BLOCK_SIZE;
        cbData -= SYMCRYPT_AES_BLOCK_SIZE;
    }
    AES_ENCRYPT_1_CHAIN( pExpandedKey, c, rkLast );
    _mm_storeu_si128( (__m128i *) pbDst, c );
    _mm_storeu_si128( (__m128i *) pbChainingValue, c );
}

// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
#pragma warning(push)
#pragma warning( disable: 6001 4701 )
#pragma runtime_checks( "u", off )
VOID
SYMCRYPT_CALL
SymCryptAesCbcDecryptXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i chain;
    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
    __m128i d0, d1, d2, d3, d4, d5, d6, d7;

    if( cbData < SYMCRYPT_AES_BLOCK_SIZE )
    {
        return;
    }

    chain = _mm_loadu_si128( (__m128i *) pbChainingValue );

    //
    // First we do all multiples of 8 blocks
    //

    while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
    {
        d0 = c0 = _mm_loadu_si128( (__m128i *) (pbSrc + 0 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d1 = c1 = _mm_loadu_si128( (__m128i *) (pbSrc + 1 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d2 = c2 = _mm_loadu_si128( (__m128i *) (pbSrc + 2 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d3 = c3 = _mm_loadu_si128( (__m128i *) (pbSrc + 3 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d4 = c4 = _mm_loadu_si128( (__m128i *) (pbSrc + 4 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d5 = c5 = _mm_loadu_si128( (__m128i *) (pbSrc + 5 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d6 = c6 = _mm_loadu_si128( (__m128i *) (pbSrc + 6 * SYMCRYPT_AES_BLOCK_SIZE ) );
        d7 = c7 = _mm_loadu_si128( (__m128i *) (pbSrc + 7 * SYMCRYPT_AES_BLOCK_SIZE ) );

        AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

        c0 = _mm_xor_si128( c0, chain );
        c1 = _mm_xor_si128( c1, d0 );
        c2 = _mm_xor_si128( c2, d1 );
        c3 = _mm_xor_si128( c3, d2 );
        c4 = _mm_xor_si128( c4, d3 );
        c5 = _mm_xor_si128( c5, d4 );
        c6 = _mm_xor_si128( c6, d5 );
        c7 = _mm_xor_si128( c7, d6 );
        chain = d7;

        _mm_storeu_si128( (__m128i *) (pbDst + 0 * SYMCRYPT_AES_BLOCK_SIZE ), c0 );
        _mm_storeu_si128( (__m128i *) (pbDst + 1 * SYMCRYPT_AES_BLOCK_SIZE ), c1 );
        _mm_storeu_si128( (__m128i *) (pbDst + 2 * SYMCRYPT_AES_BLOCK_SIZE ), c2 );
        _mm_storeu_si128( (__m128i *) (pbDst + 3 * SYMCRYPT_AES_BLOCK_SIZE ), c3 );
        _mm_storeu_si128( (__m128i *) (pbDst + 4 * SYMCRYPT_AES_BLOCK_SIZE ), c4 );
        _mm_storeu_si128( (__m128i *) (pbDst + 5 * SYMCRYPT_AES_BLOCK_SIZE ), c5 );
        _mm_storeu_si128( (__m128i *) (pbDst + 6 * SYMCRYPT_AES_BLOCK_SIZE ), c6 );
        _mm_storeu_si128( (__m128i *) (pbDst + 7 * SYMCRYPT_AES_BLOCK_SIZE ), c7 );

        pbSrc  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        pbDst  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
    }

    if( cbData >= 16 )
    {
        //
        // There is remaining work to be done
        //
        d0 = c0 = _mm_loadu_si128( (__m128i *) (pbSrc + 0 * SYMCRYPT_AES_BLOCK_SIZE ) );
        if( cbData >= 32 )
        {
        d1 = c1 = _mm_loadu_si128( (__m128i *) (pbSrc + 1 * SYMCRYPT_AES_BLOCK_SIZE ) );
            if( cbData >= 48 )
            {
        d2 = c2 = _mm_loadu_si128( (__m128i *) (pbSrc + 2 * SYMCRYPT_AES_BLOCK_SIZE ) );
                if( cbData >= 64 )
                {
        d3 = c3 = _mm_loadu_si128( (__m128i *) (pbSrc + 3 * SYMCRYPT_AES_BLOCK_SIZE ) );
                    if( cbData >= 80 )
                    {
        d4 = c4 = _mm_loadu_si128( (__m128i *) (pbSrc + 4 * SYMCRYPT_AES_BLOCK_SIZE ) );
                        if( cbData >= 96 )
                        {
        d5 = c5 = _mm_loadu_si128( (__m128i *) (pbSrc + 5 * SYMCRYPT_AES_BLOCK_SIZE ) );
                            if( cbData >= 112 )
                            {
        d6 = c6 = _mm_loadu_si128( (__m128i *) (pbSrc + 6 * SYMCRYPT_AES_BLOCK_SIZE ) );
                            }
                        }
                    }
                }
            }
        }

        //
        // Decrypt 1, 4, or 8 blocks in AES-CBC mode. This might decrypt uninitialized registers,
        // but those will not be used when we store the results.
        //
        if( cbData > 4 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
            c0 = _mm_xor_si128( c0, chain );
            c1 = _mm_xor_si128( c1, d0 );
            c2 = _mm_xor_si128( c2, d1 );
            c3 = _mm_xor_si128( c3, d2 );
            c4 = _mm_xor_si128( c4, d3 );
            c5 = _mm_xor_si128( c5, d4 );
            c6 = _mm_xor_si128( c6, d5 );
        }
        else if( cbData > SYMCRYPT_AES_BLOCK_SIZE )
        {
            AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 );
            c0 = _mm_xor_si128( c0, chain );
            c1 = _mm_xor_si128( c1, d0 );
            c2 = _mm_xor_si128( c2, d1 );
            c3 = _mm_xor_si128( c3, d2 );
        }
        else
        {
            AES_DECRYPT_1( pExpandedKey, c0 );
            c0 = _mm_xor_si128( c0, chain );
        }

        chain = _mm_loadu_si128( (__m128i *) (pbSrc + cbData - SYMCRYPT_AES_BLOCK_SIZE ) );
        _mm_storeu_si128( (__m128i *) (pbDst + 0 * SYMCRYPT_AES_BLOCK_SIZE ), c0 );
        if( cbData >= 32 )
        {
        _mm_storeu_si128( (__m128i *) (pbDst + 1 * SYMCRYPT_AES_BLOCK_SIZE ), c1 );
            if( cbData >= 48 )
            {
        _mm_storeu_si128( (__m128i *) (pbDst + 2 * SYMCRYPT_AES_BLOCK_SIZE ), c2 );
                if( cbData >= 64 )
                {
        _mm_storeu_si128( (__m128i *) (pbDst + 3 * SYMCRYPT_AES_BLOCK_SIZE ), c3 );
                    if( cbData >= 80 )
                    {
        _mm_storeu_si128( (__m128i *) (pbDst + 4 * SYMCRYPT_AES_BLOCK_SIZE ), c4 );
                        if( cbData >= 96 )
                        {
        _mm_storeu_si128( (__m128i *) (pbDst + 5 * SYMCRYPT_AES_BLOCK_SIZE ), c5 );
                            if( cbData >= 112 )
                            {
        _mm_storeu_si128( (__m128i *) (pbDst + 6 * SYMCRYPT_AES_BLOCK_SIZE ), c6 );
                            }
                        }
                    }
                }
            }
        }
    }

    _mm_storeu_si128( (__m128i *) pbChainingValue, chain );

    return;
}
#pragma runtime_checks( "u", restore )
#pragma warning( pop )

VOID
SYMCRYPT_CALL
SymCryptAesCbcMacXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
    _In_reads_( cbData )                        PCBYTE                      pbData,
                                                SIZE_T                      cbData )
{
    __m128i c = _mm_loadu_si128( (__m128i *) pbChainingValue );
    __m128i rk0 = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] );
    __m128i rkLast = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
    __m128i d, rk0AndLast;

    if (cbData < SYMCRYPT_AES_BLOCK_SIZE)
        return;

    // This algorithm is dominated by a chain of dependent AES rounds, so we want to avoid
    // XOR instructions on the critical path where possible.
    // We can compute (last round key ^ next plaintext block ^ first round key) off the critical
    // path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
    // the main loop.
    d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbData ), rk0 );
    c = _mm_xor_si128( c, d );
    pbData += SYMCRYPT_AES_BLOCK_SIZE;
    cbData -= SYMCRYPT_AES_BLOCK_SIZE;

    // As we don't compute ciphertext here, we only need to XOR rk0 and rkLast once;
    // see the note after this function
    rk0AndLast = _mm_xor_si128( rk0, rkLast );

    while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
    {
        d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbData ), rk0AndLast );
        AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d );

        pbData += SYMCRYPT_AES_BLOCK_SIZE;
        cbData -= SYMCRYPT_AES_BLOCK_SIZE;
    }
    AES_ENCRYPT_1_CHAIN( pExpandedKey, c, rkLast );
    _mm_storeu_si128( (__m128i *) pbChainingValue, c );
}
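
// Editor's note (illustrative): in CBC-MAC no ciphertext is output, so the merged
// value telescopes across blocks. With d = M_{i+1} ^ K_0 ^ K_last, the chained
// AESENCLAST produces MAC_i ^ M_{i+1} ^ K_0, which is exactly the round-0 state
// for the next block; K_0 ^ K_last is loop-invariant, hence the single
// rk0AndLast XOR computed before the loop above.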


#pragma warning(push)
#pragma warning( disable:4701 ) // "Use of uninitialized variable"
#pragma runtime_checks( "u", off )

#define SYMCRYPT_AesCtrMsbXxXmm     SymCryptAesCtrMsb64Xmm
#define MM_ADD_EPIXX               _mm_add_epi64
#define MM_SUB_EPIXX               _mm_sub_epi64

#include "aes-pattern.c"

#undef MM_SUB_EPIXX
#undef MM_ADD_EPIXX
#undef SYMCRYPT_AesCtrMsbXxXmm

#define SYMCRYPT_AesCtrMsbXxXmm     SymCryptAesCtrMsb32Xmm
#define MM_ADD_EPIXX               _mm_add_epi32
#define MM_SUB_EPIXX               _mm_sub_epi32

#include "aes-pattern.c"

#undef MM_SUB_EPIXX
#undef MM_ADD_EPIXX
#undef SYMCRYPT_AesCtrMsbXxXmm

#pragma runtime_checks( "u", restore )
#pragma warning(pop)
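
// Editor's note (illustrative): "aes-pattern.c" is a textual template; including it
// twice with different macro bindings instantiates two CTR variants from one body,
// presumably differing only in whether the counter arithmetic operates on 64-bit or
// 32-bit lanes. A minimal sketch of the same technique with hypothetical names
// (compiled out):
#if 0
/* In a hypothetical "incr-pattern.inc":

VOID TEMPLATE_NAME( __m128i* pCtr )
{
    *pCtr = TEMPLATE_ADD( *pCtr, _mm_set_epi32( 0, 0, 0, 1 ) );
}
*/
#define TEMPLATE_NAME   CtrIncrement64
#define TEMPLATE_ADD    _mm_add_epi64
#include "incr-pattern.inc"
#undef TEMPLATE_ADD
#undef TEMPLATE_NAME

#define TEMPLATE_NAME   CtrIncrement32
#define TEMPLATE_ADD    _mm_add_epi32
#include "incr-pattern.inc"
#undef TEMPLATE_ADD
#undef TEMPLATE_NAME
#endif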

/*
    if( cbData >= 16 )
    {
        if( cbData >= 32 )
        {
            if( cbData >= 48 )
            {
                if( cbData >= 64 )
                {
                    if( cbData >= 80 )
                    {
                        if( cbData >= 96 )
                        {
                            if( cbData >= 112 )
                            {
                            }
                        }
                    }
                }
            }
        }
    }
*/

VOID
SYMCRYPT_CALL
SymCryptXtsAesEncryptDataUnitXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )       PBYTE                       pbTweakBlock,
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 )  PBYTE                       pbScratch,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i t0;
    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
    __m128i roundkey, firstRoundKey, lastRoundKey;
    __m128i XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
    SYMCRYPT_GF128_ELEMENT* tweakBuffer = (SYMCRYPT_GF128_ELEMENT*) pbScratch;

    const BYTE (*keyPtr)[4][4];
    const BYTE (*keyLimit)[4][4] = pExpandedKey->lastEncRoundKey;
    UINT64 lastTweakLow, lastTweakHigh;
    int aesEncryptXtsLoop;

    SIZE_T cbDataMain;  // number of bytes to handle in the main loop
    SIZE_T cbDataTail;  // number of bytes to handle in the tail loop

    SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);

    // To simplify the logic and the handling of unusual sizes, we handle all
    // data that is not a multiple of 8 blocks in the tail loop
    cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
    // Additionally, so that the ciphertext stealing logic does not rely on
    // reading back from the destination buffer, when we have a non-zero
    // tail we ensure that we handle at least 1 whole block in the tail
    //
    // Note that our caller has ensured we have at least 1 whole block
    // to process; this is checked in debug builds
    // This means that cbDataTail is in [1,15] at this point iff there are
    // at least 8 whole blocks to process, so the below does not cause
    // cbDataTail or cbDataMain to exceed cbData
    cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
    cbDataMain = cbData - cbDataTail;

    SYMCRYPT_ASSERT(cbDataMain <= cbData);
    SYMCRYPT_ASSERT(cbDataTail <= cbData);
    SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);

    c0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
    XTS_MUL_ALPHA( c0, c1 );
    XTS_MUL_ALPHA( c1, c2 );
    XTS_MUL_ALPHA( c2, c3 );

    XTS_MUL_ALPHA4( c0, c4 );
    XTS_MUL_ALPHA ( c4, c5 );
    XTS_MUL_ALPHA ( c5, c6 );
    XTS_MUL_ALPHA ( c6, c7 );

    tweakBuffer[0].m128i = c0;
    tweakBuffer[1].m128i = c1;
    tweakBuffer[2].m128i = c2;
    tweakBuffer[3].m128i = c3;
    tweakBuffer[4].m128i = c4;
    tweakBuffer[5].m128i = c5;
    tweakBuffer[6].m128i = c6;
    tweakBuffer[7].m128i = c7;
    lastTweakLow  = tweakBuffer[7].ull[0];
    lastTweakHigh = tweakBuffer[7].ull[1];

    firstRoundKey = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] );
    lastRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );

    while( cbDataMain > 0 )
    {
        // At loop entry, tweakBuffer[0-7] holds the tweak values for the next 8 blocks
        c0 = _mm_xor_si128( tweakBuffer[0].m128i, firstRoundKey );
        c1 = _mm_xor_si128( tweakBuffer[1].m128i, firstRoundKey );
        c2 = _mm_xor_si128( tweakBuffer[2].m128i, firstRoundKey );
        c3 = _mm_xor_si128( tweakBuffer[3].m128i, firstRoundKey );
        c4 = _mm_xor_si128( tweakBuffer[4].m128i, firstRoundKey );
        c5 = _mm_xor_si128( tweakBuffer[5].m128i, firstRoundKey );
        c6 = _mm_xor_si128( tweakBuffer[6].m128i, firstRoundKey );
        c7 = _mm_xor_si128( tweakBuffer[7].m128i, firstRoundKey );

        c0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +   0) ) );
        c1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc +  16) ) );
        c2 = _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc +  32) ) );
        c3 = _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc +  48) ) );
        c4 = _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc +  64) ) );
        c5 = _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc +  80) ) );
        c6 = _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc +  96) ) );
        c7 = _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc + 112) ) );

        keyPtr = &pExpandedKey->RoundKey[1];

        // Do 8 full rounds (AES-128|AES-192|AES-256) with stitched XTS (performed in scalar registers)
        for( aesEncryptXtsLoop = 0; aesEncryptXtsLoop < 8; aesEncryptXtsLoop++ )
        {
            roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
            keyPtr ++;
            c0 = _mm_aesenc_si128( c0, roundkey );
            c1 = _mm_aesenc_si128( c1, roundkey );
            c2 = _mm_aesenc_si128( c2, roundkey );
            c3 = _mm_aesenc_si128( c3, roundkey );
            c4 = _mm_aesenc_si128( c4, roundkey );
            c5 = _mm_aesenc_si128( c5, roundkey );
            c6 = _mm_aesenc_si128( c6, roundkey );
            c7 = _mm_aesenc_si128( c7, roundkey );

            // Prepare tweakBuffer[8-15] with tweak^lastRoundKey
            tweakBuffer[ 8+aesEncryptXtsLoop ].m128i = _mm_xor_si128( tweakBuffer[ aesEncryptXtsLoop ].m128i, lastRoundKey );
            // Prepare tweakBuffer[0-7] with the tweaks for the next 8 blocks
            XTS_MUL_ALPHA_Scalar( lastTweakLow, lastTweakHigh );
            tweakBuffer[ aesEncryptXtsLoop ].ull[0] = lastTweakLow;
            tweakBuffer[ aesEncryptXtsLoop ].ull[1] = lastTweakHigh;
        }

        do
        {
            roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
            keyPtr ++;
            c0 = _mm_aesenc_si128( c0, roundkey );
            c1 = _mm_aesenc_si128( c1, roundkey );
            c2 = _mm_aesenc_si128( c2, roundkey );
            c3 = _mm_aesenc_si128( c3, roundkey );
            c4 = _mm_aesenc_si128( c4, roundkey );
            c5 = _mm_aesenc_si128( c5, roundkey );
            c6 = _mm_aesenc_si128( c6, roundkey );
            c7 = _mm_aesenc_si128( c7, roundkey );
        } while( keyPtr < keyLimit );

        _mm_storeu_si128( (__m128i *) (pbDst +   0), _mm_aesenclast_si128( c0, tweakBuffer[ 8].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  16), _mm_aesenclast_si128( c1, tweakBuffer[ 9].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  32), _mm_aesenclast_si128( c2, tweakBuffer[10].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  48), _mm_aesenclast_si128( c3, tweakBuffer[11].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  64), _mm_aesenclast_si128( c4, tweakBuffer[12].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  80), _mm_aesenclast_si128( c5, tweakBuffer[13].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  96), _mm_aesenclast_si128( c6, tweakBuffer[14].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_aesenclast_si128( c7, tweakBuffer[15].m128i ) );

        pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
    }

    if( cbDataTail == 0 )
    {
        return; // <-- expected case; early return here
    }

    // Rare case: the data unit length is not a multiple of 128 bytes; handle the tail one block at a time
    t0 = tweakBuffer[0].m128i;

    while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
    {
        c0 = _mm_xor_si128( _mm_loadu_si128( ( __m128i * ) pbSrc ), t0 );
        pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
        AES_ENCRYPT_1( pExpandedKey, c0 );
        _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
        pbDst += SYMCRYPT_AES_BLOCK_SIZE;
        XTS_MUL_ALPHA( t0, t0 );
        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
    }

    if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
    {
        // Ciphertext stealing encryption
        //
        //                      +--------------+
        //                      |              |
        //                      |              V
        // +-----------------+  |  +-----+-----------+
        // |      P_m-1      |  |  | P_m |++++CP+++++|
        // +-----------------+  |  +-----+-----------+
        //          |           |           |
        //       enc_m-1        |         enc_m
        //          |           |           |
        //          V           |           V
        // +-----+-----------+  |  +-----------------+
        // | C_m |++++CP+++++|--+  |      C_m-1      |
        // +-----+-----------+     +-----------------+
        //    |                   /
        //    +----------------  /  --+
        //                      /     |
        //                      |     V
        // +-----------------+  |  +-----+
        // |      C_m-1      |<-+  | C_m |
        // +-----------------+     +-----+

        // Encrypt the penultimate plaintext block into tweakBuffer[0]
        c0 = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), t0 );
        AES_ENCRYPT_1( pExpandedKey, c0 );
        tweakBuffer[0].m128i = _mm_xor_si128( c0, t0 );

        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;

        // Copy tweakBuffer[0] to tweakBuffer[1]
        tweakBuffer[1].m128i = tweakBuffer[0].m128i;
        // Copy the final plaintext bytes to the prefix of tweakBuffer[0] - we must read before writing to support in-place encryption
        memcpy( &tweakBuffer[0].ul[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
        // Copy the prefix of tweakBuffer[1] to the right place in the destination buffer
        memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tweakBuffer[1].ul[0], cbDataTail );

        // Do the final tweak update
        XTS_MUL_ALPHA( t0, t0 );

        // Load the updated tweakBuffer[0] into c0
        c0 = tweakBuffer[0].m128i;
    }
    else
    {
        // Just load the final plaintext block into c0
        c0 = _mm_loadu_si128( (__m128i*) pbSrc );
    }

    // Final full-block encryption
    c0 = _mm_xor_si128( c0, t0 );
    AES_ENCRYPT_1( pExpandedKey, c0 );
    _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
}
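
// Editor's note (illustrative): the tweak update multiplies by alpha = x in
// GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1, i.e. a 128-bit
// left shift by one with a conditional XOR of 0x87 into the low byte. A scalar
// sketch of what the XTS_MUL_ALPHA_Scalar macro is expected to compute on the
// (lo, hi) pair of UINT64 halves (compiled out):
#if 0
{
    UINT64 carry = hi >> 63;            // MSB of the 128-bit tweak
    hi = (hi << 1) | (lo >> 63);        // shift the whole 128-bit value left by 1
    lo = (lo << 1) ^ (carry * 0x87);    // reduce if the shift overflowed
}
#endif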

VOID
SYMCRYPT_CALL
SymCryptXtsAesDecryptDataUnitXmm(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )       PBYTE                       pbTweakBlock,
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 )  PBYTE                       pbScratch,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i t0;
    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
    __m128i roundkey, firstRoundKey, lastRoundKey;
    __m128i XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
    SYMCRYPT_GF128_ELEMENT* tweakBuffer = (SYMCRYPT_GF128_ELEMENT*) pbScratch;

    const BYTE (*keyPtr)[4][4];
    const BYTE (*keyLimit)[4][4] = pExpandedKey->lastDecRoundKey;
    UINT64 lastTweakLow, lastTweakHigh;
    int aesDecryptXtsLoop;

    SIZE_T cbDataMain;  // number of bytes to handle in the main loop
    SIZE_T cbDataTail;  // number of bytes to handle in the tail loop

    SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);

    // To simplify the logic and the handling of unusual sizes, we handle all
    // data that is not a multiple of 8 blocks in the tail loop
    cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
    // Additionally, so that the ciphertext stealing logic does not rely on
    // reading back from the destination buffer, when we have a non-zero
    // tail we ensure that we handle at least 1 whole block in the tail
    //
    // Note that our caller has ensured we have at least 1 whole block
    // to process; this is checked in debug builds
    // This means that cbDataTail is in [1,15] at this point iff there are
    // at least 8 whole blocks to process, so the below does not cause
    // cbDataTail or cbDataMain to exceed cbData
    cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
    cbDataMain = cbData - cbDataTail;

    SYMCRYPT_ASSERT(cbDataMain <= cbData);
    SYMCRYPT_ASSERT(cbDataTail <= cbData);
    SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);

    c0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
    XTS_MUL_ALPHA( c0, c1 );
    XTS_MUL_ALPHA( c1, c2 );
    XTS_MUL_ALPHA( c2, c3 );

    XTS_MUL_ALPHA4( c0, c4 );
    XTS_MUL_ALPHA ( c4, c5 );
    XTS_MUL_ALPHA ( c5, c6 );
    XTS_MUL_ALPHA ( c6, c7 );

    tweakBuffer[0].m128i = c0;
    tweakBuffer[1].m128i = c1;
    tweakBuffer[2].m128i = c2;
    tweakBuffer[3].m128i = c3;
    tweakBuffer[4].m128i = c4;
    tweakBuffer[5].m128i = c5;
    tweakBuffer[6].m128i = c6;
    tweakBuffer[7].m128i = c7;
    lastTweakLow  = tweakBuffer[7].ull[0];
    lastTweakHigh = tweakBuffer[7].ull[1];

    firstRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
    lastRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastDecRoundKey );

    while( cbDataMain > 0 )
    {
        // At loop entry, tweakBuffer[0-7] holds the tweak values for the next 8 blocks
        c0 = _mm_xor_si128( tweakBuffer[0].m128i, firstRoundKey );
        c1 = _mm_xor_si128( tweakBuffer[1].m128i, firstRoundKey );
        c2 = _mm_xor_si128( tweakBuffer[2].m128i, firstRoundKey );
        c3 = _mm_xor_si128( tweakBuffer[3].m128i, firstRoundKey );
        c4 = _mm_xor_si128( tweakBuffer[4].m128i, firstRoundKey );
        c5 = _mm_xor_si128( tweakBuffer[5].m128i, firstRoundKey );
        c6 = _mm_xor_si128( tweakBuffer[6].m128i, firstRoundKey );
        c7 = _mm_xor_si128( tweakBuffer[7].m128i, firstRoundKey );

        c0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +   0) ) );
        c1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc +  16) ) );
        c2 = _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc +  32) ) );
        c3 = _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc +  48) ) );
        c4 = _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc +  64) ) );
        c5 = _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc +  80) ) );
        c6 = _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc +  96) ) );
        c7 = _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc + 112) ) );

        keyPtr = pExpandedKey->lastEncRoundKey + 1;

        // Do 8 full rounds (AES-128|AES-192|AES-256) with stitched XTS (performed in scalar registers)
        for( aesDecryptXtsLoop = 0; aesDecryptXtsLoop < 8; aesDecryptXtsLoop++ )
        {
            roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
            keyPtr ++;
            c0 = _mm_aesdec_si128( c0, roundkey );
            c1 = _mm_aesdec_si128( c1, roundkey );
            c2 = _mm_aesdec_si128( c2, roundkey );
            c3 = _mm_aesdec_si128( c3, roundkey );
            c4 = _mm_aesdec_si128( c4, roundkey );
            c5 = _mm_aesdec_si128( c5, roundkey );
            c6 = _mm_aesdec_si128( c6, roundkey );
            c7 = _mm_aesdec_si128( c7, roundkey );

            // Prepare tweakBuffer[8-15] with tweak^lastRoundKey
            tweakBuffer[ 8+aesDecryptXtsLoop ].m128i = _mm_xor_si128( tweakBuffer[ aesDecryptXtsLoop ].m128i, lastRoundKey );
            // Prepare tweakBuffer[0-7] with the tweaks for the next 8 blocks
            XTS_MUL_ALPHA_Scalar( lastTweakLow, lastTweakHigh );
            tweakBuffer[ aesDecryptXtsLoop ].ull[0] = lastTweakLow;
            tweakBuffer[ aesDecryptXtsLoop ].ull[1] = lastTweakHigh;
        }

        do
        {
            roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
            keyPtr ++;
            c0 = _mm_aesdec_si128( c0, roundkey );
            c1 = _mm_aesdec_si128( c1, roundkey );
            c2 = _mm_aesdec_si128( c2, roundkey );
            c3 = _mm_aesdec_si128( c3, roundkey );
            c4 = _mm_aesdec_si128( c4, roundkey );
            c5 = _mm_aesdec_si128( c5, roundkey );
            c6 = _mm_aesdec_si128( c6, roundkey );
            c7 = _mm_aesdec_si128( c7, roundkey );
        } while( keyPtr < keyLimit );

        _mm_storeu_si128( (__m128i *) (pbDst +   0), _mm_aesdeclast_si128( c0, tweakBuffer[ 8].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  16), _mm_aesdeclast_si128( c1, tweakBuffer[ 9].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  32), _mm_aesdeclast_si128( c2, tweakBuffer[10].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  48), _mm_aesdeclast_si128( c3, tweakBuffer[11].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  64), _mm_aesdeclast_si128( c4, tweakBuffer[12].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  80), _mm_aesdeclast_si128( c5, tweakBuffer[13].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst +  96), _mm_aesdeclast_si128( c6, tweakBuffer[14].m128i ) );
        _mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_aesdeclast_si128( c7, tweakBuffer[15].m128i ) );

        pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
        cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
    }

    if( cbDataTail == 0 )
    {
        return; // <-- expected case; early return here
    }

    // Rare case: the data unit length is not a multiple of 128 bytes; handle the tail one block at a time
    t0 = tweakBuffer[0].m128i;

    while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
    {
        c0 = _mm_xor_si128( _mm_loadu_si128( ( __m128i * ) pbSrc ), t0 );
        pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
        AES_DECRYPT_1( pExpandedKey, c0 );
        _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
        pbDst += SYMCRYPT_AES_BLOCK_SIZE;
        c7 = t0;
        XTS_MUL_ALPHA( t0, t0 );
        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
    }

    if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
    {
        // Ciphertext stealing decryption
        //
        //                      +--------------+
        //                      |              |
        //                      |              V
        // +-----------------+  |  +-----+-----------+
        // |      C_m-1      |  |  | C_m |++++CP+++++|
        // +-----------------+  |  +-----+-----------+
        //          |           |           |
        //        dec_m         |        dec_m-1
        //          |           |           |
        //          V           |           V
        // +-----+-----------+  |  +-----------------+
        // | P_m |++++CP+++++|--+  |      P_m-1      |
        // +-----+-----------+     +-----------------+
        //    |                   /
        //    +----------------  /  --+
        //                      /     |
        //                      |     V
        // +-----------------+  |  +-----+
        // |      P_m-1      |<-+  | P_m |
        // +-----------------+     +-----+

        // Do the final tweak update into c1
        // The penultimate tweak is in t0, ready for the final decryption
        XTS_MUL_ALPHA( t0, c1 );

        // Decrypt the penultimate ciphertext block into tweakBuffer[0]
        c0 = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), c1 );
        AES_DECRYPT_1( pExpandedKey, c0 );
        tweakBuffer[0].m128i = _mm_xor_si128( c0, c1 );

        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;

        // Copy tweakBuffer[0] to tweakBuffer[1]
        tweakBuffer[1].m128i = tweakBuffer[0].m128i;
        // Copy the final ciphertext bytes to the prefix of tweakBuffer[0] - we must read before writing to support in-place decryption
        memcpy( &tweakBuffer[0].ul[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
        // Copy the prefix of tweakBuffer[1] to the right place in the destination buffer
        memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tweakBuffer[1].ul[0], cbDataTail );

        // Load the updated tweakBuffer[0] into c0
        c0 = tweakBuffer[0].m128i;
    }
    else
    {
        // Just load the final ciphertext block into c0
        c0 = _mm_loadu_si128( (__m128i*) pbSrc );
    }

    // Final full-block decryption
    c0 = _mm_xor_si128( c0, t0 );
    AES_DECRYPT_1( pExpandedKey, c0 );
    _mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
}

#define AES_FULLROUND_4_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesenc_si128( c0, roundkey ); \
    c1 = _mm_aesenc_si128( c1, roundkey ); \
    c2 = _mm_aesenc_si128( c2, roundkey ); \
    c3 = _mm_aesenc_si128( c3, roundkey ); \
\
    r0 = _mm_loadu_si128( (__m128i *) gHashPointer ); \
    r0 = _mm_shuffle_epi8( r0, byteReverseOrder ); \
    gHashPointer += 16; \
\
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \
    t0 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
    t1 = _mm_clmulepi64_si128( r0, t1, 0x11 ); \
\
    resl = _mm_xor_si128( resl, t0 ); \
    resh = _mm_xor_si128( resh, t1 ); \
\
    t0 = _mm_srli_si128( r0, 8 ); \
    r0 = _mm_xor_si128( r0, t0 ); \
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \
    t1 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
\
    resm = _mm_xor_si128( resm, t1 ); \
    todo --; \
};
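
// Editor's note (illustrative): the stitched GHASH above splits each 128x128-bit
// carryless multiply Karatsuba-style into three accumulators:
//
//      resl ^= lo(r0) * lo(H^i)                      (PCLMULQDQ selector 0x00)
//      resh ^= hi(r0) * hi(H^i)                      (PCLMULQDQ selector 0x11)
//      resm ^= (hi(r0) ^ lo(r0)) * GHASH_Hx_POWER    (middle term; Hx presumably
//                                                     stores hi(H^i) ^ lo(H^i))
//
// The accumulators are recombined and reduced modulo the GHASH polynomial once
// per batch, outside these macros.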

#define AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, gHashPointer, ghashRounds, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    __m128i t0, t1; \
    __m128i r0; \
    SIZE_T aesEncryptGhashLoop; \
\
    keyPtr = &pExpandedKey->RoundKey[0]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
\
    /* Do ghashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < ghashRounds; aesEncryptGhashLoop++ ) \
    { \
        AES_FULLROUND_4_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
    } \
\
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
    } while( keyPtr < keyLimit ); \
\
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
\
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
};
1282
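This macro stitches the two computations so the CPU's AES and carry-less multiply units run concurrently: each of the first ghashRounds AES rounds also folds one earlier ciphertext block into the (resl, resm, resh) accumulators, hiding PCLMULQDQ latency behind AESENC and vice versa. Functionally it matches running the two phases back to back, roughly as in this hedged sketch, where AesEncrypt4 and GhashFoldBlock are hypothetical stand-ins for the macro's two halves:

    // Unstitched reference shape, for illustration only.
    AesEncrypt4( pExpandedKey, &c0, &c1, &c2, &c3 );    // all AES rounds on 4 counter blocks
    for( SIZE_T i = 0; i < ghashRounds; i++ )
    {
        // fold one earlier ciphertext block into the accumulators using H^todo
        GhashFoldBlock( gHashPointer, gHashExpandedKeyTable, todo, &resl, &resm, &resh );
        gHashPointer += 16;
        todo--;
    }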
1283
0
#define AES_FULLROUND_8_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
1284
0
{ \
1285
0
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
1286
0
    keyPtr ++; \
1287
0
    c0 = _mm_aesenc_si128( c0, roundkey ); \
1288
0
    c1 = _mm_aesenc_si128( c1, roundkey ); \
1289
0
    c2 = _mm_aesenc_si128( c2, roundkey ); \
1290
0
    c3 = _mm_aesenc_si128( c3, roundkey ); \
1291
0
    c4 = _mm_aesenc_si128( c4, roundkey ); \
1292
0
    c5 = _mm_aesenc_si128( c5, roundkey ); \
1293
0
    c6 = _mm_aesenc_si128( c6, roundkey ); \
1294
0
    c7 = _mm_aesenc_si128( c7, roundkey ); \
1295
0
\
1296
0
    r0 = _mm_loadu_si128( (__m128i *) gHashPointer ); \
1297
0
    r0 = _mm_shuffle_epi8( r0, byteReverseOrder ); \
1298
0
    gHashPointer += 16; \
1299
0
\
1300
0
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \
1301
0
    t0 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
1302
0
    t1 = _mm_clmulepi64_si128( r0, t1, 0x11 ); \
1303
0
\
1304
0
    resl = _mm_xor_si128( resl, t0 ); \
1305
0
    resh = _mm_xor_si128( resh, t1 ); \
1306
0
\
1307
0
    t0 = _mm_srli_si128( r0, 8 ); \
1308
0
    r0 = _mm_xor_si128( r0, t0 ); \
1309
0
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \
1310
0
    t1 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
1311
0
\
1312
0
    resm = _mm_xor_si128( resm, t1 ); \
1313
0
    todo --; \
1314
0
};
1315
1316
0
#define AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, ghashRounds, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
1317
0
{ \
1318
0
    const BYTE (*keyPtr)[4][4]; \
1319
0
    const BYTE (*keyLimit)[4][4]; \
1320
0
    __m128i roundkey; \
1321
0
    __m128i t0, t1; \
1322
0
    __m128i r0; \
1323
0
    SIZE_T aesEncryptGhashLoop; \
1324
0
\
1325
0
    keyPtr = &pExpandedKey->RoundKey[0]; \
1326
0
    keyLimit = pExpandedKey->lastEncRoundKey; \
1327
0
\
1328
0
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
1329
0
    keyPtr ++; \
1330
0
    c0 = _mm_xor_si128( c0, roundkey ); \
1331
0
    c1 = _mm_xor_si128( c1, roundkey ); \
1332
0
    c2 = _mm_xor_si128( c2, roundkey ); \
1333
0
    c3 = _mm_xor_si128( c3, roundkey ); \
1334
0
    c4 = _mm_xor_si128( c4, roundkey ); \
1335
0
    c5 = _mm_xor_si128( c5, roundkey ); \
1336
0
    c6 = _mm_xor_si128( c6, roundkey ); \
1337
0
    c7 = _mm_xor_si128( c7, roundkey ); \
1338
0
\
1339
0
    /* Do ghashRounds full AES rounds (valid for AES-128|AES-192|AES-256) with one stitched GHASH block per round */ \
1340
0
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < ghashRounds; aesEncryptGhashLoop++ ) \
1341
0
    { \
1342
0
        AES_FULLROUND_8_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
1343
0
    } \
1344
0
\
1345
0
    do \
1346
0
    { \
1347
0
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
1348
0
        keyPtr ++; \
1349
0
        c0 = _mm_aesenc_si128( c0, roundkey ); \
1350
0
        c1 = _mm_aesenc_si128( c1, roundkey ); \
1351
0
        c2 = _mm_aesenc_si128( c2, roundkey ); \
1352
0
        c3 = _mm_aesenc_si128( c3, roundkey ); \
1353
0
        c4 = _mm_aesenc_si128( c4, roundkey ); \
1354
0
        c5 = _mm_aesenc_si128( c5, roundkey ); \
1355
0
        c6 = _mm_aesenc_si128( c6, roundkey ); \
1356
0
        c7 = _mm_aesenc_si128( c7, roundkey ); \
1357
0
    } while( keyPtr < keyLimit ); \
1358
0
\
1359
0
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
1360
0
\
1361
0
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
1362
0
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
1363
0
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
1364
0
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
1365
0
    c4 = _mm_aesenclast_si128( c4, roundkey ); \
1366
0
    c5 = _mm_aesenclast_si128( c5, roundkey ); \
1367
0
    c6 = _mm_aesenclast_si128( c6, roundkey ); \
1368
0
    c7 = _mm_aesenclast_si128( c7, roundkey ); \
1369
0
};
1370
1371
// This call is functionally identical to:
1372
// SymCryptAesCtrMsb64Xmm( pExpandedKey,
1373
//                         pbChainingValue,
1374
//                         pbSrc,
1375
//                         pbDst,
1376
//                         cbData );
1377
// SymCryptGHashAppendDataPclmulqdq(   expandedKeyTable,
1378
//                                     pState,
1379
//                                     pbDst,
1380
//                                     cbData );
1381
VOID
1382
SYMCRYPT_CALL
1383
SymCryptAesGcmEncryptStitchedXmm(
1384
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1385
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PBYTE                       pbChainingValue,
1386
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
1387
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
1388
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
1389
    _Out_writes_( cbData )                  PBYTE                       pbDst,
1390
                                            SIZE_T                      cbData )
1391
0
{
1392
0
    __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
1393
1394
0
    __m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
1395
0
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
1396
0
    __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
1397
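    // BYTE_REVERSE_ORDER reverses a block's 16 bytes with _mm_shuffle_epi8:
    // GHASH treats blocks as big-endian bit strings, while the PCLMULQDQ
    // multiply works on little-endian quadwords. The 0xc200000000000000
    // quadword is the reflected reduction constant for the GHASH polynomial
    // x^128 + x^7 + x^2 + x + 1, consumed by MODREDUCE (defined in
    // ghash_definitions.h, not shown here; a hedged note).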
1398
0
    __m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
1399
0
    __m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
1400
0
    __m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 );
1401
1402
0
    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
1403
0
    __m128i r0, r1;
1404
1405
0
    __m128i state;
1406
0
    __m128i a0, a1, a2;
1407
0
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1408
0
    SIZE_T todo;
1409
0
    PCBYTE pbGhashSrc = pbDst;
1410
1411
0
    SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1412
1413
0
    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1414
0
    state = _mm_loadu_si128( (__m128i *) pState );
1415
1416
0
    todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1417
0
    CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1418
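    // Hedged note on the line above, inferred from the unrolled GHASH
    // recurrence: with n = todo blocks X_1..X_n still to fold in,
    //
    //     Y_i = (Y_{i-1} ^ X_i) * H
    //
    // unrolls to
    //
    //     Y_n = Y_0 * H^n  ^  X_1 * H^n  ^  X_2 * H^(n-1)  ^ ... ^  X_n * H
    //
    // so the running state is multiplied by H^todo once here, and every
    // subsequent data block by a decreasing power of H as todo counts down.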
1419
    // Do 8 blocks of CTR, either for the tail (if total blocks < 8) or to encrypt the first 8 blocks
1420
0
    c0 = chain;
1421
0
    c1 = _mm_add_epi32( chain, chainIncrement1 );
1422
0
    c2 = _mm_add_epi32( chain, chainIncrement2 );
1423
0
    c3 = _mm_add_epi32( c1, chainIncrement2 );
1424
0
    c4 = _mm_add_epi32( c2, chainIncrement2 );
1425
0
    c5 = _mm_add_epi32( c3, chainIncrement2 );
1426
0
    c6 = _mm_add_epi32( c4, chainIncrement2 );
1427
0
    c7 = _mm_add_epi32( c5, chainIncrement2 );
1428
1429
0
    c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1430
0
    c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1431
0
    c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1432
0
    c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1433
0
    c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1434
0
    c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1435
0
    c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1436
0
    c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
1437
1438
0
    AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
1439
1440
0
    if( nBlocks >= 8 )
1441
0
    {
1442
        // Encrypt first 8 blocks - update chain
1443
0
        chain = _mm_add_epi32( chain, chainIncrement8 );
1444
1445
0
        _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) ) );
1446
0
        _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1447
0
        _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) );
1448
0
        _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) );
1449
0
        _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) );
1450
0
        _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) );
1451
0
        _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) );
1452
0
        _mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) );
1453
1454
0
        pbDst  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1455
0
        pbSrc  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1456
1457
0
        while( nBlocks >= 16 )
1458
0
        {
1459
            // In this loop we always have 8 blocks to encrypt, and the previous 8 blocks are already encrypted and ready for GHASH
1460
0
            c0 = chain;
1461
0
            c1 = _mm_add_epi32( chain, chainIncrement1 );
1462
0
            c2 = _mm_add_epi32( chain, chainIncrement2 );
1463
0
            c3 = _mm_add_epi32( c1, chainIncrement2 );
1464
0
            c4 = _mm_add_epi32( c2, chainIncrement2 );
1465
0
            c5 = _mm_add_epi32( c3, chainIncrement2 );
1466
0
            c6 = _mm_add_epi32( c4, chainIncrement2 );
1467
0
            c7 = _mm_add_epi32( c5, chainIncrement2 );
1468
0
            chain = _mm_add_epi32( c6, chainIncrement2 );
1469
1470
0
            c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1471
0
            c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1472
0
            c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1473
0
            c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1474
0
            c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1475
0
            c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1476
0
            c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1477
0
            c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
1478
1479
0
            AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1480
1481
0
            _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) ) );
1482
0
            _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1483
0
            _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) );
1484
0
            _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) );
1485
0
            _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) );
1486
0
            _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) );
1487
0
            _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) );
1488
0
            _mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) );
1489
1490
0
            pbDst  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1491
0
            pbSrc  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1492
0
            nBlocks -= 8;
1493
1494
0
            if( todo == 0 )
1495
0
            {
1496
0
                CLMUL_3_POST( a0, a1, a2 );
1497
0
                MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1498
1499
0
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1500
0
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1501
0
            }
1502
0
        }
1503
1504
        // We now have at least 8 blocks of encrypted data to GHASH and at most 7 blocks left to encrypt
1505
        // Do 8 blocks of GHASH in parallel with generating 0, 4, or 8 AES-CTR blocks for tail encryption
1506
0
        nBlocks -= 8;
1507
0
        if (nBlocks > 0)
1508
0
        {
1509
0
            c0 = chain;
1510
0
            c1 = _mm_add_epi32( chain, chainIncrement1 );
1511
0
            c2 = _mm_add_epi32( chain, chainIncrement2 );
1512
0
            c3 = _mm_add_epi32( c1, chainIncrement2 );
1513
0
            c4 = _mm_add_epi32( c2, chainIncrement2 );
1514
1515
0
            c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1516
0
            c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1517
0
            c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1518
0
            c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1519
1520
0
            if (nBlocks > 4)
1521
0
            {
1522
                // Do 8 blocks of AES-CTR for the tail in parallel with 8 blocks of GHASH
1523
0
                c5 = _mm_add_epi32( c4, chainIncrement1 );
1524
0
                c6 = _mm_add_epi32( c4, chainIncrement2 );
1525
1526
0
                c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1527
0
                c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1528
0
                c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1529
1530
0
                AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1531
0
            }
1532
0
            else
1533
0
            {
1534
                // Do 4 blocks of AES-CTR for the tail in parallel with 8 blocks of GHASH
1535
0
                AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1536
0
            }
1537
1538
0
            if( todo == 0)
1539
0
            {
1540
0
                CLMUL_3_POST( a0, a1, a2 );
1541
0
                MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1542
1543
0
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1544
0
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1545
0
            }
1546
0
        }
1547
0
        else
1548
0
        {
1549
            // Just do the final 8 blocks of GHASH
1550
0
            for( todo=8; todo>0; todo-- )
1551
0
            {
1552
0
                r0 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) (pbGhashSrc +  0) ), BYTE_REVERSE_ORDER );
1553
0
                pbGhashSrc += SYMCRYPT_AES_BLOCK_SIZE;
1554
1555
0
                CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1556
0
            }
1557
1558
0
            CLMUL_3_POST( a0, a1, a2 );
1559
0
            MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1560
0
        }
1561
0
    }
1562
1563
0
    if( nBlocks > 0 )
1564
0
    {
1565
        // Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results
1566
0
        while( nBlocks >= 2 )
1567
0
        {
1568
0
            chain = _mm_add_epi32( chain, chainIncrement2 );
1569
1570
0
            r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) );
1571
0
            r1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) );
1572
1573
0
            _mm_storeu_si128( (__m128i *) (pbDst +  0), r0 );
1574
0
            _mm_storeu_si128( (__m128i *) (pbDst + 16), r1 );
1575
1576
0
            r0 = _mm_shuffle_epi8( r0, BYTE_REVERSE_ORDER );
1577
0
            r1 = _mm_shuffle_epi8( r1, BYTE_REVERSE_ORDER );
1578
1579
0
            CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, todo - 0), GHASH_Hx_POWER(expandedKeyTable, todo - 0), a0, a1, a2 );
1580
0
            CLMUL_ACC_3( r1, GHASH_H_POWER(expandedKeyTable, todo - 1), GHASH_Hx_POWER(expandedKeyTable, todo - 1), a0, a1, a2 );
1581
1582
0
            pbDst   += 2*SYMCRYPT_AES_BLOCK_SIZE;
1583
0
            pbSrc   += 2*SYMCRYPT_AES_BLOCK_SIZE;
1584
0
            todo    -= 2;
1585
0
            nBlocks -= 2;
1586
0
            c0 = c2;
1587
0
            c1 = c3;
1588
0
            c2 = c4;
1589
0
            c3 = c5;
1590
0
            c4 = c6;
1591
0
        }
1592
1593
0
        if( nBlocks > 0 )
1594
0
        {
1595
0
            chain = _mm_add_epi32( chain, chainIncrement1 );
1596
1597
0
            r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) );
1598
1599
0
            _mm_storeu_si128( (__m128i *) (pbDst +  0), r0 );
1600
1601
0
            r0 = _mm_shuffle_epi8( r0, BYTE_REVERSE_ORDER );
1602
1603
0
            CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, 1), GHASH_Hx_POWER(expandedKeyTable, 1), a0, a1, a2 );
1604
0
        }
1605
1606
0
        CLMUL_3_POST( a0, a1, a2 );
1607
0
        MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1608
0
    }
1609
1610
0
    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1611
0
    _mm_storeu_si128( (__m128i *) pbChainingValue, chain );
1612
0
    _mm_storeu_si128( (__m128i *) pState, state );
1613
0
}
1614
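Stripped of the parallelism, the routine above computes plain CTR per 16-byte block — pbDst = pbSrc ^ E_K(counter), after which the counter is bumped. The byte shuffles exist only because the GCM counter is big-endian while _mm_add_epi32 adds within little-endian 32-bit lanes. A scalar one-block sketch, with AesEncryptBlock as a hypothetical stand-in for the AES_ENCRYPT_* macros:

    // One CTR block, hedged scalar equivalent. AesEncryptBlock is hypothetical.
    BYTE keystream[16];
    AesEncryptBlock( pExpandedKey, pbCounter, keystream );  // E_K( counter )
    for( int i = 0; i < 16; i++ )
    {
        pbDst[i] = pbSrc[i] ^ keystream[i];                 // keystream xor
    }
    // Increment the last 4 counter bytes big-endian, mod 2^32 -- the scalar
    // analogue of the _mm_add_epi32( chain, chainIncrement1 ) lane add above.
    for( int i = 15; i >= 12; i-- )
    {
        if( ++pbCounter[i] != 0 ) break;                    // propagate carry
    }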
1615
#pragma warning(push)
1616
#pragma warning( disable:4701 )
1617
#pragma runtime_checks( "u", off )
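// Hedged note on the pragmas above, inferred from the tail path below: when
// fewer than 8 blocks remain, the decrypt routine passes c7 to
// AES_GCM_ENCRYPT_8 without assigning it (only c0..c6 are set up), relying
// on the c7 keystream output never being consumed. That pattern trips
// warning C4701 ("potentially uninitialized local variable") and the
// matching uninitialized-use runtime check, so both are disabled here.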
1618
// This call is functionally identical to:
1619
// SymCryptGHashAppendDataPclmulqdq(   expandedKeyTable,
1620
//                                     pState,
1621
//                                     pbSrc,
1622
//                                     cbData );
1623
// SymCryptAesCtrMsb64Xmm( pExpandedKey,
1624
//                         pbChainingValue,
1625
//                         pbSrc,
1626
//                         pbDst,
1627
//                         cbData );
1628
VOID
1629
SYMCRYPT_CALL
1630
SymCryptAesGcmDecryptStitchedXmm(
1631
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1632
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PBYTE                       pbChainingValue,
1633
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
1634
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
1635
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
1636
    _Out_writes_( cbData )                  PBYTE                       pbDst,
1637
                                            SIZE_T                      cbData )
1638
0
{
1639
0
    __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
1640
1641
0
    __m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
1642
0
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
1643
0
    __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
1644
1645
0
    __m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
1646
0
    __m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
1647
1648
0
    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
1649
1650
0
    __m128i state;
1651
0
    __m128i a0, a1, a2;
1652
0
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1653
0
    SIZE_T todo = 0;
1654
0
    PCBYTE pbGhashSrc = pbSrc;
1655
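    // GCM authenticates the ciphertext, which on decryption is the input:
    // GHASH therefore reads from pbSrc here, whereas the encrypt routine
    // above sets pbGhashSrc = pbDst and hashes blocks only after encrypting.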
1656
0
    SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1657
1658
0
    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1659
0
    state = _mm_loadu_si128( (__m128i *) pState );
1660
1661
0
    todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1662
0
    CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1663
1664
0
    while( nBlocks >= 8 )
1665
0
    {
1666
        // In this loop we always have 8 blocks to decrypt and GHASH
1667
0
        c0 = chain;
1668
0
        c1 = _mm_add_epi32( chain, chainIncrement1 );
1669
0
        c2 = _mm_add_epi32( chain, chainIncrement2 );
1670
0
        c3 = _mm_add_epi32( c1, chainIncrement2 );
1671
0
        c4 = _mm_add_epi32( c2, chainIncrement2 );
1672
0
        c5 = _mm_add_epi32( c3, chainIncrement2 );
1673
0
        c6 = _mm_add_epi32( c4, chainIncrement2 );
1674
0
        c7 = _mm_add_epi32( c5, chainIncrement2 );
1675
0
        chain = _mm_add_epi32( c6, chainIncrement2 );
1676
1677
0
        c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1678
0
        c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1679
0
        c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1680
0
        c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1681
0
        c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1682
0
        c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1683
0
        c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1684
0
        c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
1685
1686
0
        AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1687
1688
0
        _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) ) );
1689
0
        _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1690
0
        _mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) );
1691
0
        _mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) );
1692
0
        _mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) );
1693
0
        _mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) );
1694
0
        _mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) );
1695
0
        _mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) );
1696
1697
0
        pbDst  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1698
0
        pbSrc  += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1699
0
        nBlocks -= 8;
1700
1701
0
        if ( todo == 0 )
1702
0
        {
1703
0
            CLMUL_3_POST( a0, a1, a2 );
1704
0
            MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1705
1706
0
            if ( nBlocks > 0 )
1707
0
            {
1708
0
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1709
0
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1710
0
            }
1711
0
        }
1712
0
    }
1713
1714
0
    if( nBlocks > 0 )
1715
0
    {
1716
        // We have 1-7 blocks to GHASH and decrypt
1717
        // Do the exact number of GHASH blocks we need in parallel with generating either 4 or 8 blocks of AES-CTR
1718
0
        c0 = chain;
1719
0
        c1 = _mm_add_epi32( chain, chainIncrement1 );
1720
0
        c2 = _mm_add_epi32( chain, chainIncrement2 );
1721
0
        c3 = _mm_add_epi32( c1, chainIncrement2 );
1722
0
        c4 = _mm_add_epi32( c2, chainIncrement2 );
1723
1724
0
        c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1725
0
        c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1726
0
        c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1727
0
        c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1728
1729
0
        if( nBlocks > 4 )
1730
0
        {
1731
0
            c5 = _mm_add_epi32( c4, chainIncrement1 );
1732
0
            c6 = _mm_add_epi32( c4, chainIncrement2 );
1733
1734
0
            c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1735
0
            c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1736
0
            c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1737
1738
0
            AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, nBlocks, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1739
0
        } else {
1740
0
            AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pbGhashSrc, nBlocks, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1741
0
        }
1742
1743
0
        CLMUL_3_POST( a0, a1, a2 );
1744
0
        MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1745
1746
        // Decrypt 1-7 blocks with pre-generated AES-CTR blocks
1747
0
        while( nBlocks >= 2 )
1748
0
        {
1749
0
            chain = _mm_add_epi32( chain, chainIncrement2 );
1750
1751
0
            _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) ) );
1752
0
            _mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1753
1754
0
            pbDst   += 2*SYMCRYPT_AES_BLOCK_SIZE;
1755
0
            pbSrc   += 2*SYMCRYPT_AES_BLOCK_SIZE;
1756
0
            nBlocks -= 2;
1757
0
            c0 = c2;
1758
0
            c1 = c3;
1759
0
            c2 = c4;
1760
0
            c3 = c5;
1761
0
            c4 = c6;
1762
0
        }
1763
1764
0
        if( nBlocks > 0 )
1765
0
        {
1766
0
            chain = _mm_add_epi32( chain, chainIncrement1 );
1767
1768
0
            _mm_storeu_si128( (__m128i *) (pbDst +  0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc +  0) ) ) );
1769
0
        }
1770
0
    }
1771
1772
0
    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1773
0
    _mm_storeu_si128( (__m128i *) pbChainingValue, chain );
1774
0
    _mm_storeu_si128( (__m128i *) pState, state );
1775
0
}
1776
#pragma runtime_checks( "u", restore )
1777
#pragma warning(pop)
1778
1779
#endif // CPU_X86 | CPU_AMD64