Coverage Report

Created: 2024-11-21 07:03

/src/SymCrypt/lib/aes-ymm.c
Source (line-number and execution-count columns omitted; every instrumented line in this listing has an execution count of 0)
//
// aes-ymm.c    code for AES implementation
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// All YMM code for AES operations
// Requires compiler support for aesni, pclmulqdq, avx2, vaes and vpclmulqdq
//

#include "precomp.h"

#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64

#include "xtsaes_definitions.h"
#include "ghash_definitions.h"

#define AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m256i roundkeys; \
\
    keyPtr = pExpandedKey->RoundKey; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    /* _mm256_broadcastsi128_si256 requires AVX2 */ \
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
\
    /* _mm256_xor_si256 requires AVX2 */ \
    c0 = _mm256_xor_si256( c0, roundkeys ); \
    c1 = _mm256_xor_si256( c1, roundkeys ); \
    c2 = _mm256_xor_si256( c2, roundkeys ); \
    c3 = _mm256_xor_si256( c3, roundkeys ); \
    c4 = _mm256_xor_si256( c4, roundkeys ); \
    c5 = _mm256_xor_si256( c5, roundkeys ); \
    c6 = _mm256_xor_si256( c6, roundkeys ); \
    c7 = _mm256_xor_si256( c7, roundkeys ); \
\
    do \
    { \
        roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
        keyPtr ++; \
        c0 = _mm256_aesenc_epi128( c0, roundkeys ); \
        c1 = _mm256_aesenc_epi128( c1, roundkeys ); \
        c2 = _mm256_aesenc_epi128( c2, roundkeys ); \
        c3 = _mm256_aesenc_epi128( c3, roundkeys ); \
        c4 = _mm256_aesenc_epi128( c4, roundkeys ); \
        c5 = _mm256_aesenc_epi128( c5, roundkeys ); \
        c6 = _mm256_aesenc_epi128( c6, roundkeys ); \
        c7 = _mm256_aesenc_epi128( c7, roundkeys ); \
    } while( keyPtr < keyLimit ); \
\
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
\
    c0 = _mm256_aesenclast_epi128( c0, roundkeys ); \
    c1 = _mm256_aesenclast_epi128( c1, roundkeys ); \
    c2 = _mm256_aesenclast_epi128( c2, roundkeys ); \
    c3 = _mm256_aesenclast_epi128( c3, roundkeys ); \
    c4 = _mm256_aesenclast_epi128( c4, roundkeys ); \
    c5 = _mm256_aesenclast_epi128( c5, roundkeys ); \
    c6 = _mm256_aesenclast_epi128( c6, roundkeys ); \
    c7 = _mm256_aesenclast_epi128( c7, roundkeys ); \
};
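
// Each __m256i argument to these macros carries two independent 128-bit AES
// blocks, so one invocation of AES_ENCRYPT_YMM_2048 (and of the matching
// AES_DECRYPT_YMM_2048 below) transforms 8 x 256 bits = 2048 bits, i.e. 16
// AES blocks, which is where the _2048 suffix comes from. The do/while loop
// over the round keys makes the same macro work for AES-128, AES-192 and
// AES-256 expanded keys.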

#define AES_DECRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m256i roundkeys; \
\
    keyPtr = pExpandedKey->lastEncRoundKey; \
    keyLimit = pExpandedKey->lastDecRoundKey; \
\
    /* _mm256_broadcastsi128_si256 requires AVX2 */ \
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
\
    /* _mm256_xor_si256 requires AVX2 */ \
    c0 = _mm256_xor_si256( c0, roundkeys ); \
    c1 = _mm256_xor_si256( c1, roundkeys ); \
    c2 = _mm256_xor_si256( c2, roundkeys ); \
    c3 = _mm256_xor_si256( c3, roundkeys ); \
    c4 = _mm256_xor_si256( c4, roundkeys ); \
    c5 = _mm256_xor_si256( c5, roundkeys ); \
    c6 = _mm256_xor_si256( c6, roundkeys ); \
    c7 = _mm256_xor_si256( c7, roundkeys ); \
\
    do \
    { \
        roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
        keyPtr ++; \
        c0 = _mm256_aesdec_epi128( c0, roundkeys ); \
        c1 = _mm256_aesdec_epi128( c1, roundkeys ); \
        c2 = _mm256_aesdec_epi128( c2, roundkeys ); \
        c3 = _mm256_aesdec_epi128( c3, roundkeys ); \
        c4 = _mm256_aesdec_epi128( c4, roundkeys ); \
        c5 = _mm256_aesdec_epi128( c5, roundkeys ); \
        c6 = _mm256_aesdec_epi128( c6, roundkeys ); \
        c7 = _mm256_aesdec_epi128( c7, roundkeys ); \
    } while( keyPtr < keyLimit ); \
\
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
\
    c0 = _mm256_aesdeclast_epi128( c0, roundkeys ); \
    c1 = _mm256_aesdeclast_epi128( c1, roundkeys ); \
    c2 = _mm256_aesdeclast_epi128( c2, roundkeys ); \
    c3 = _mm256_aesdeclast_epi128( c3, roundkeys ); \
    c4 = _mm256_aesdeclast_epi128( c4, roundkeys ); \
    c5 = _mm256_aesdeclast_epi128( c5, roundkeys ); \
    c6 = _mm256_aesdeclast_epi128( c6, roundkeys ); \
    c7 = _mm256_aesdeclast_epi128( c7, roundkeys ); \
};

VOID
SYMCRYPT_CALL
SymCryptXtsAesEncryptDataUnitYmm_2048(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbTweakBlock,
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 )  PBYTE                       pbScratch,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
    __m256i c0, c1, c2, c3, c4, c5, c6, c7;
    __m128i XTS_ALPHA_MASK;
    __m256i XTS_ALPHA_MULTIPLIER_Ymm;

    // Load tweaks into big T
    __m256i T0, T1, T2, T3, T4, T5, T6, T7;

    SIZE_T cbDataMain;  // number of bytes to handle in the main loop
    SIZE_T cbDataTail;  // number of bytes to handle in the tail loop

    // To simplify logic and unusual size processing, we handle all
    // data not a multiple of 16 blocks in the tail loop
    cbDataTail = cbData & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1);
    // Additionally, so that ciphertext stealing logic does not rely on
    // reading back from the destination buffer, when we have a non-zero
    // tail, we ensure that we handle at least 1 whole block in the tail
    cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (16*SYMCRYPT_AES_BLOCK_SIZE) : 0;
    cbDataMain = cbData - cbDataTail;
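    // For example, cbData == 532 (33 blocks + 4 bytes) gives
    // cbDataTail = 532 & 255 = 20, which already contains a whole block, so
    // cbDataMain = 512 (two 16-block batches). cbData == 260 gives
    // cbDataTail = 4, bumped by 256 to 260, so cbDataMain == 0 and the whole
    // data unit goes down the Xmm path below.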

    SYMCRYPT_ASSERT(cbDataMain <= cbData);
    SYMCRYPT_ASSERT(cbDataTail <= cbData);
    SYMCRYPT_ASSERT((cbDataMain & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);

    if( cbDataMain == 0 )
    {
        SymCryptXtsAesEncryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
        return;
    }

    t0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
    XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
    XTS_ALPHA_MULTIPLIER_Ymm = _mm256_set_epi64x( 0, 0x87, 0, 0x87 );

    // Do not stall.
    XTS_MUL_ALPHA4( t0, t4 );
    XTS_MUL_ALPHA ( t0, t1 );
    XTS_MUL_ALPHA ( t4, t5 );
    XTS_MUL_ALPHA ( t1, t2 );
    XTS_MUL_ALPHA ( t5, t6 );
    XTS_MUL_ALPHA ( t2, t3 );
    XTS_MUL_ALPHA ( t6, t7 );

    T0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), t1, 1 ); // AVX
    T1 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), t3, 1 );
    T2 = _mm256_insertf128_si256( _mm256_castsi128_si256( t4 ), t5, 1 );
    T3 = _mm256_insertf128_si256( _mm256_castsi128_si256( t6 ), t7, 1 );
    XTS_MUL_ALPHA8_YMM(T0, T4);
    XTS_MUL_ALPHA8_YMM(T1, T5);
    XTS_MUL_ALPHA8_YMM(T2, T6);
    XTS_MUL_ALPHA8_YMM(T3, T7);

    for(;;)
    {
        c0 = _mm256_xor_si256( T0, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +                           0 ) ) );
        c1 = _mm256_xor_si256( T1, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   2*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c2 = _mm256_xor_si256( T2, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   4*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c3 = _mm256_xor_si256( T3, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   6*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c4 = _mm256_xor_si256( T4, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   8*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c5 = _mm256_xor_si256( T5, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +  10*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c6 = _mm256_xor_si256( T6, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +  12*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c7 = _mm256_xor_si256( T7, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +  14*SYMCRYPT_AES_BLOCK_SIZE ) ) );

        pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;

        AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

        _mm256_storeu_si256( ( __m256i * ) ( pbDst +                          0 ), _mm256_xor_si256( c0, T0 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  2*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c1, T1 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  4*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c2, T2 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  6*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c3, T3 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  8*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c4, T4 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst + 10*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c5, T5 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst + 12*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c6, T6 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst + 14*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c7, T7 ) );

        pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;

        cbDataMain -= 16 * SYMCRYPT_AES_BLOCK_SIZE;
        if( cbDataMain < 16 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            break;
        }

        XTS_MUL_ALPHA16_YMM(T0, T0);
        XTS_MUL_ALPHA16_YMM(T1, T1);
        XTS_MUL_ALPHA16_YMM(T2, T2);
        XTS_MUL_ALPHA16_YMM(T3, T3);
        XTS_MUL_ALPHA16_YMM(T4, T4);
        XTS_MUL_ALPHA16_YMM(T5, T5);
        XTS_MUL_ALPHA16_YMM(T6, T6);
        XTS_MUL_ALPHA16_YMM(T7, T7);
    }

    // We won't do another 16-block set so we don't update the tweak blocks

    if( cbDataTail > 0 )
    {
        //
        // This is a rare case: the data unit length is not a multiple of 256 bytes.
        // We do this in the Xmm implementation.
        // Fix up the tweak block first
        //
        t7 = _mm256_extracti128_si256 ( T7, 1 /* Highest 128 bits */ ); // AVX2
        _mm256_zeroupper();
        XTS_MUL_ALPHA( t7, t0 );
        _mm_storeu_si128( (__m128i *) pbTweakBlock, t0 );

        SymCryptXtsAesEncryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
    }
    else {
        _mm256_zeroupper();
    }
}
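
//
// The XTS_MUL_ALPHA* macros used above multiply the 128-bit tweak by powers
// of alpha in GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1
// (hence the 0x87 constants). For reference, a minimal portable sketch of a
// single multiply-by-alpha tweak update; this is an illustrative standalone
// snippet, not part of aes-ymm.c, and the name XtsMulAlphaRef is made up:
//
#include <stdint.h>

static void
XtsMulAlphaRef( uint8_t tweak[16] )     // tweak in little-endian byte order, as in IEEE 1619
{
    uint8_t carry = 0;

    for( int i = 0; i < 16; i++ )
    {
        uint8_t nextCarry = (uint8_t)( tweak[i] >> 7 );     // bit shifted out of this byte
        tweak[i] = (uint8_t)( ( tweak[i] << 1 ) | carry );  // shift the whole tweak left by one bit
        carry = nextCarry;
    }

    if( carry )
    {
        tweak[0] ^= 0x87;   // reduce by x^128 + x^7 + x^2 + x + 1
    }
}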

VOID
SYMCRYPT_CALL
SymCryptXtsAesDecryptDataUnitYmm_2048(
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbTweakBlock,
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 )  PBYTE                       pbScratch,
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
    _Out_writes_( cbData )                      PBYTE                       pbDst,
                                                SIZE_T                      cbData )
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
    __m256i c0, c1, c2, c3, c4, c5, c6, c7;
    __m128i XTS_ALPHA_MASK;
    __m256i XTS_ALPHA_MULTIPLIER_Ymm;

    // Load tweaks into big T
    __m256i T0, T1, T2, T3, T4, T5, T6, T7;

    SIZE_T cbDataMain;  // number of bytes to handle in the main loop
    SIZE_T cbDataTail;  // number of bytes to handle in the tail loop

    // To simplify logic and unusual size processing, we handle all
    // data not a multiple of 16 blocks in the tail loop
    cbDataTail = cbData & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1);
    // Additionally, so that ciphertext stealing logic does not rely on
    // reading back from the destination buffer, when we have a non-zero
    // tail, we ensure that we handle at least 1 whole block in the tail
    cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (16*SYMCRYPT_AES_BLOCK_SIZE) : 0;
    cbDataMain = cbData - cbDataTail;

    SYMCRYPT_ASSERT(cbDataMain <= cbData);
    SYMCRYPT_ASSERT(cbDataTail <= cbData);
    SYMCRYPT_ASSERT((cbDataMain & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);

    if( cbDataMain == 0 )
    {
        SymCryptXtsAesDecryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
        return;
    }

    t0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
    XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
    XTS_ALPHA_MULTIPLIER_Ymm = _mm256_set_epi64x( 0, 0x87, 0, 0x87 );

    // Do not stall.
    XTS_MUL_ALPHA4( t0, t4 );
    XTS_MUL_ALPHA ( t0, t1 );
    XTS_MUL_ALPHA ( t4, t5 );
    XTS_MUL_ALPHA ( t1, t2 );
    XTS_MUL_ALPHA ( t5, t6 );
    XTS_MUL_ALPHA ( t2, t3 );
    XTS_MUL_ALPHA ( t6, t7 );

    T0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), t1, 1); // AVX
    T1 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), t3, 1);
    T2 = _mm256_insertf128_si256( _mm256_castsi128_si256( t4 ), t5, 1);
    T3 = _mm256_insertf128_si256( _mm256_castsi128_si256( t6 ), t7, 1);
    XTS_MUL_ALPHA8_YMM(T0, T4);
    XTS_MUL_ALPHA8_YMM(T1, T5);
    XTS_MUL_ALPHA8_YMM(T2, T6);
    XTS_MUL_ALPHA8_YMM(T3, T7);

    for(;;)
    {
        c0 = _mm256_xor_si256( T0, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +                           0 ) ) );
        c1 = _mm256_xor_si256( T1, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   2*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c2 = _mm256_xor_si256( T2, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   4*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c3 = _mm256_xor_si256( T3, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   6*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c4 = _mm256_xor_si256( T4, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +   8*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c5 = _mm256_xor_si256( T5, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +  10*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c6 = _mm256_xor_si256( T6, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +  12*SYMCRYPT_AES_BLOCK_SIZE ) ) );
        c7 = _mm256_xor_si256( T7, _mm256_loadu_si256( ( __m256i * ) ( pbSrc +  14*SYMCRYPT_AES_BLOCK_SIZE ) ) );

        pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;

        AES_DECRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

        _mm256_storeu_si256( ( __m256i * ) ( pbDst +                          0 ), _mm256_xor_si256( c0, T0 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  2*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c1, T1 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  4*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c2, T2 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  6*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c3, T3 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst +  8*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c4, T4 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst + 10*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c5, T5 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst + 12*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c6, T6 ) );
        _mm256_storeu_si256( ( __m256i * ) ( pbDst + 14*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c7, T7 ) );

        pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;

        cbDataMain -= 16 * SYMCRYPT_AES_BLOCK_SIZE;
        if( cbDataMain < 16 * SYMCRYPT_AES_BLOCK_SIZE )
        {
            break;
        }

        XTS_MUL_ALPHA16_YMM(T0, T0);
        XTS_MUL_ALPHA16_YMM(T1, T1);
        XTS_MUL_ALPHA16_YMM(T2, T2);
        XTS_MUL_ALPHA16_YMM(T3, T3);
        XTS_MUL_ALPHA16_YMM(T4, T4);
        XTS_MUL_ALPHA16_YMM(T5, T5);
        XTS_MUL_ALPHA16_YMM(T6, T6);
        XTS_MUL_ALPHA16_YMM(T7, T7);
    }

    // We won't do another 16-block set so we don't update the tweak blocks

    if( cbDataTail > 0 )
    {
        //
        // This is a rare case: the data unit length is not a multiple of 256 bytes.
        // We do this in the Xmm implementation.
        // Fix up the tweak block first
        //
        t7 = _mm256_extracti128_si256 ( T7, 1 /* Highest 128 bits */ ); // AVX2
        _mm256_zeroupper();
        XTS_MUL_ALPHA( t7, t0 );
        _mm_storeu_si128( (__m128i *) pbTweakBlock, t0 );

        SymCryptXtsAesDecryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
    }
    else {
        _mm256_zeroupper();
    }
}

#define AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
    c0 = _mm256_aesenc_epi128( c0, roundkeys ); \
    c1 = _mm256_aesenc_epi128( c1, roundkeys ); \
    c2 = _mm256_aesenc_epi128( c2, roundkeys ); \
    c3 = _mm256_aesenc_epi128( c3, roundkeys ); \
    c4 = _mm256_aesenc_epi128( c4, roundkeys ); \
    c5 = _mm256_aesenc_epi128( c5, roundkeys ); \
    c6 = _mm256_aesenc_epi128( c6, roundkeys ); \
    c7 = _mm256_aesenc_epi128( c7, roundkeys ); \
\
    r0 = _mm256_loadu_si256( (__m256i *) gHashPointer ); \
    r0 = _mm256_shuffle_epi8( r0, byteReverseOrder ); \
    gHashPointer += 32; \
\
    t1 = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \
    t0 = _mm256_clmulepi64_epi128( r0, t1, 0x00 ); \
    t1 = _mm256_clmulepi64_epi128( r0, t1, 0x11 ); \
\
    resl = _mm256_xor_si256( resl, t0 ); \
    resh = _mm256_xor_si256( resh, t1 ); \
\
    t0 = _mm256_srli_si256( r0, 8 ); \
    r0 = _mm256_xor_si256( r0, t0 ); \
    t1 = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \
    t1 = _mm256_clmulepi64_epi128( r0, t1, 0x00 ); \
\
    resm = _mm256_xor_si256( resm, t1 ); \
    todo -= 2; \
};
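
// One invocation of AES_FULLROUND_16_GHASH_2_Ymm performs a single AES round
// on all 16 blocks held in c0..c7 and, interleaved with it, folds the next
// two 16-byte GHASH input blocks (32 bytes at gHashPointer) into the
// accumulators: resl and resh collect the low and high 64x64 carry-less
// products, resm the middle (Karatsuba) term, against the H powers selected
// by todo, which counts down by 2 per call.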

#define AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m256i roundkeys; \
    __m256i t0, t1; \
    __m256i r0; \
    int aesEncryptGhashLoop; \
\
    keyPtr = pExpandedKey->RoundKey; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
\
    /* _mm256_broadcastsi128_si256 requires AVX2 */ \
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
\
    /* _mm256_xor_si256 requires AVX2 */ \
    c0 = _mm256_xor_si256( c0, roundkeys ); \
    c1 = _mm256_xor_si256( c1, roundkeys ); \
    c2 = _mm256_xor_si256( c2, roundkeys ); \
    c3 = _mm256_xor_si256( c3, roundkeys ); \
    c4 = _mm256_xor_si256( c4, roundkeys ); \
    c5 = _mm256_xor_si256( c5, roundkeys ); \
    c6 = _mm256_xor_si256( c6, roundkeys ); \
    c7 = _mm256_xor_si256( c7, roundkeys ); \
\
    /* Do 8(x2) full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < 4; aesEncryptGhashLoop++ ) \
    { \
        AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
        AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
    } \
\
    do \
    { \
        roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
        keyPtr ++; \
        c0 = _mm256_aesenc_epi128( c0, roundkeys ); \
        c1 = _mm256_aesenc_epi128( c1, roundkeys ); \
        c2 = _mm256_aesenc_epi128( c2, roundkeys ); \
        c3 = _mm256_aesenc_epi128( c3, roundkeys ); \
        c4 = _mm256_aesenc_epi128( c4, roundkeys ); \
        c5 = _mm256_aesenc_epi128( c5, roundkeys ); \
        c6 = _mm256_aesenc_epi128( c6, roundkeys ); \
        c7 = _mm256_aesenc_epi128( c7, roundkeys ); \
    } while( keyPtr < keyLimit ); \
\
    roundkeys =  _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
\
    c0 = _mm256_aesenclast_epi128( c0, roundkeys ); \
    c1 = _mm256_aesenclast_epi128( c1, roundkeys ); \
    c2 = _mm256_aesenclast_epi128( c2, roundkeys ); \
    c3 = _mm256_aesenclast_epi128( c3, roundkeys ); \
    c4 = _mm256_aesenclast_epi128( c4, roundkeys ); \
    c5 = _mm256_aesenclast_epi128( c5, roundkeys ); \
    c6 = _mm256_aesenclast_epi128( c6, roundkeys ); \
    c7 = _mm256_aesenclast_epi128( c7, roundkeys ); \
};
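
// AES_GCM_ENCRYPT_16_Ymm runs the initial AddRoundKey plus eight stitched
// full rounds (8 calls x 2 blocks = 16 blocks of GHASH input consumed), then
// finishes whatever full rounds remain for the key size in the plain do/while
// loop before the final AES round.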

VOID
SYMCRYPT_CALL
SymCryptAesGcmEncryptStitchedYmm_2048(
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PBYTE                       pbChainingValue,
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
    _Out_writes_( cbData )                  PBYTE                       pbDst,
                                            SIZE_T                      cbData )
{
    __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );

    __m128i BYTE_REVERSE_ORDER_xmm = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
    __m256i BYTE_REVERSE_ORDER = _mm256_set_epi64x( 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f );
    __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );

    __m256i chainIncrementUpper1  = _mm256_set_epi64x( 0,  1, 0,  0 );
    __m256i chainIncrement2  = _mm256_set_epi64x( 0,  2, 0,  2 );
    __m256i chainIncrement4  = _mm256_set_epi64x( 0,  4, 0,  4 );
    __m256i chainIncrement16 = _mm256_set_epi64x( 0, 16, 0, 16 );

    __m256i ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
    __m256i c0, c1, c2, c3, c4, c5, c6, c7;
    __m256i r0, r1, r2, r3, r4, r5, r6, r7;
    __m256i Hi, Hix;

    __m128i state;
    __m128i a0_xmm, a1_xmm, a2_xmm;
    __m256i a0, a1, a2;
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
    SIZE_T todo;
    PCBYTE pbGhashSrc = pbDst;
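    // GHASH is computed over the ciphertext, which encryption only produces
    // as it goes, so pbGhashSrc trails the CTR encryption by one 16-block
    // batch: the first batch is encrypted before the main loop, and the last
    // batch is hashed after it (r0..r7 below).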

    SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
    SYMCRYPT_ASSERT( nBlocks >= GCM_YMM_MINBLOCKS );

    todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );

    state = _mm_loadu_si128( (__m128i *) pState );
    ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX
    ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 );
    ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 );
    ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 );
    ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 );
    ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 );
    ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 );
    ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 );
    ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 );

    CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
    a0 = a1 = a2 = _mm256_setzero_si256();

    c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER );
    c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER );
    c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER );
    c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER );
    c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER );
    c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER );
    c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
    c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );

    ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
    ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
    ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
    ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
    ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
    ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
    ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
    ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );

    AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );

    _mm256_storeu_si256( (__m256i *) (pbDst +  0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc +  0) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) );
    _mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) );

    pbDst  += 16 * SYMCRYPT_AES_BLOCK_SIZE;
    pbSrc  += 16 * SYMCRYPT_AES_BLOCK_SIZE;

    while( nBlocks >= 2*GCM_YMM_MINBLOCKS )
    {
        c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER );
        c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER );
        c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER );
        c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER );
        c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER );
        c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER );
        c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
        c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );

        ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
        ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
        ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
        ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
        ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
        ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
        ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
        ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );

        AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );

        _mm256_storeu_si256( (__m256i *) (pbDst +  0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc +  0) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) );

        pbDst  += 16 * SYMCRYPT_AES_BLOCK_SIZE;
        pbSrc  += 16 * SYMCRYPT_AES_BLOCK_SIZE;
        nBlocks -= 16;

        if ( todo == 0 )
        {
            a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ ));
            a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ ));
            a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ ));

            a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ ));
            a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ ));
            a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ ));
            CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm );
            MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state );

            todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
            CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
            a0 = a1 = a2 = _mm256_setzero_si256();
        }
    }

    r0 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +  0) ), BYTE_REVERSE_ORDER );
    r1 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 32) ), BYTE_REVERSE_ORDER );
    r2 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 64) ), BYTE_REVERSE_ORDER );
    r3 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 96) ), BYTE_REVERSE_ORDER );
    r4 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +128) ), BYTE_REVERSE_ORDER );
    r5 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +160) ), BYTE_REVERSE_ORDER );
    r6 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +192) ), BYTE_REVERSE_ORDER );
    r7 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +224) ), BYTE_REVERSE_ORDER );

    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo - 0) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 0) );
    CLMUL_ACC_3_Ymm( r0, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo - 2) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 2) );
    CLMUL_ACC_3_Ymm( r1, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo - 4) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 4) );
    CLMUL_ACC_3_Ymm( r2, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo - 6) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 6) );
    CLMUL_ACC_3_Ymm( r3, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo - 8) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 8) );
    CLMUL_ACC_3_Ymm( r4, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo -10) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -10) );
    CLMUL_ACC_3_Ymm( r5, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo -12) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -12) );
    CLMUL_ACC_3_Ymm( r6, Hi, Hix, a0, a1, a2 );
    Hi  = _mm256_loadu_si256( (__m256i *)  &GHASH_H_POWER(expandedKeyTable, todo -14) );
    Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -14) );
    CLMUL_ACC_3_Ymm( r7, Hi, Hix, a0, a1, a2 );

    a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ ));
    a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ ));
    a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ ));

    a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ ));
    a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ ));
    a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ ));
    CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm );
    MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state );

    chain = _mm256_extracti128_si256 ( ctr0, 0 /* Lowest 128 bits */ );
    _mm256_zeroupper();

    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );
    _mm_storeu_si128((__m128i *) pbChainingValue, chain );
    _mm_storeu_si128((__m128i *) pState, state );

    cbData &= ( GCM_YMM_MINBLOCKS*SYMCRYPT_AES_BLOCK_SIZE ) - 1;
    SYMCRYPT_ASSERT( cbData == (nBlocks-16)*SYMCRYPT_AES_BLOCK_SIZE );
    if ( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
    {
        SymCryptAesGcmEncryptStitchedXmm( pExpandedKey, pbChainingValue, expandedKeyTable, pState, pbSrc, pbDst, cbData);
    }
}
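
//
// The stitched CLMUL code above implements the standard GHASH update: for
// each 16-byte ciphertext block X, state = (state ^ X) * H in GF(2^128),
// using precomputed powers of H (GHASH_H_POWER / GHASH_Hx_POWER) so that many
// blocks can be accumulated (a0/a1/a2) before a single reduction
// (CLMUL_3_POST + MODREDUCE). For reference, a minimal portable bit-by-bit
// sketch of the same per-block update, following NIST SP 800-38D; this is an
// illustrative standalone snippet, not part of aes-ymm.c, and the name
// GHashMulRef is made up:
//
#include <stdint.h>
#include <string.h>

static void
GHashMulRef( uint8_t state[16], const uint8_t H[16], const uint8_t X[16] )
{
    uint8_t V[16];              // running multiple of (state ^ X)
    uint8_t Z[16] = { 0 };      // accumulates the product

    for( int i = 0; i < 16; i++ )
    {
        state[i] ^= X[i];
    }
    memcpy( V, state, 16 );

    for( int i = 0; i < 128; i++ )
    {
        // If bit i of H is set (bits numbered MSB-first), add V into Z.
        if( ( H[i / 8] >> ( 7 - ( i % 8 ) ) ) & 1 )
        {
            for( int j = 0; j < 16; j++ )
            {
                Z[j] ^= V[j];
            }
        }

        // Shift V right by one bit as a 128-bit string; when a 1 bit falls
        // off the end, reduce by R = 0xE1 || 0^120 (SP 800-38D, Algorithm 1).
        uint8_t lsb = V[15] & 1;
        for( int j = 15; j > 0; j-- )
        {
            V[j] = (uint8_t)( ( V[j] >> 1 ) | ( V[j - 1] << 7 ) );
        }
        V[0] >>= 1;
        if( lsb )
        {
            V[0] ^= 0xE1;
        }
    }

    memcpy( state, Z, 16 );
}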

VOID
SYMCRYPT_CALL
SymCryptAesGcmDecryptStitchedYmm_2048(
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PBYTE                       pbChainingValue,
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
    _Out_writes_( cbData )                  PBYTE                       pbDst,
                                            SIZE_T                      cbData )
{
    __m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );

    __m128i BYTE_REVERSE_ORDER_xmm = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
    __m256i BYTE_REVERSE_ORDER = _mm256_set_epi64x( 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f );
    __m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );

    __m256i chainIncrementUpper1  = _mm256_set_epi64x( 0,  1, 0,  0 );
    __m256i chainIncrement2  = _mm256_set_epi64x( 0,  2, 0,  2 );
    __m256i chainIncrement4  = _mm256_set_epi64x( 0,  4, 0,  4 );
    __m256i chainIncrement16 = _mm256_set_epi64x( 0, 16, 0, 16 );

    __m256i ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
    __m256i c0, c1, c2, c3, c4, c5, c6, c7;

    __m128i state;
    __m128i a0_xmm, a1_xmm, a2_xmm;
    __m256i a0, a1, a2;
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
    SIZE_T todo;
    PCBYTE pbGhashSrc = pbSrc;
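    // Decryption hashes the ciphertext it reads, so the GHASH input is
    // available up front; the stitched loop below processes every 16-block
    // batch without the priming batch and trailing GHASH pass that the
    // encrypt path needs.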

    SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
    SYMCRYPT_ASSERT( nBlocks >= GCM_YMM_MINBLOCKS );

    todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );

    state = _mm_loadu_si128( (__m128i *) pState );
    ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX
    ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 );
    ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 );
    ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 );
    ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 );
    ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 );
    ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 );
    ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 );
    ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 );

    CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
    a0 = a1 = a2 = _mm256_setzero_si256();

    while( nBlocks >= GCM_YMM_MINBLOCKS )
    {
        c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER );
        c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER );
        c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER );
        c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER );
        c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER );
        c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER );
        c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
        c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );

        ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
        ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
        ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
        ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
        ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
        ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
        ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
        ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );

        AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );

        _mm256_storeu_si256( (__m256i *) (pbDst +  0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc +  0) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) );
        _mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) );

        pbDst  += 16 * SYMCRYPT_AES_BLOCK_SIZE;
        pbSrc  += 16 * SYMCRYPT_AES_BLOCK_SIZE;
        nBlocks -= 16;

        if ( todo == 0 )
        {
            a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ ));
            a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ ));
            a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ ));

            a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ ));
            a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ ));
            a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ ));
            CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm );
            MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state );

            if ( nBlocks > 0 )
            {
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
                a0 = a1 = a2 = _mm256_setzero_si256();
            }
        }
    }

    chain = _mm256_extracti128_si256 ( ctr0, 0 /* Lowest 128 bits */ );
    _mm256_zeroupper();

    chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );
    _mm_storeu_si128((__m128i *) pbChainingValue, chain );
    _mm_storeu_si128((__m128i *) pState, state );

    cbData &= ( GCM_YMM_MINBLOCKS*SYMCRYPT_AES_BLOCK_SIZE ) - 1;
    SYMCRYPT_ASSERT( cbData == nBlocks*SYMCRYPT_AES_BLOCK_SIZE );
    if ( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
    {
        SymCryptAesGcmDecryptStitchedXmm( pExpandedKey, pbChainingValue, expandedKeyTable, pState, pbSrc, pbDst, cbData);
    }
}

#endif // CPU_X86 | CPU_AMD64