Coverage Report

Created: 2025-12-31 06:58

/src/openssl/crypto/evp/enc_b64_avx2.c
Execution count for every instrumented line in this file: 0 (the file is entirely uncovered).
#include <openssl/evp.h>
#include "enc_b64_scalar.h"
#include "enc_b64_avx2.h"
#include "internal/cryptlib.h"
#include "crypto/evp.h"
#include "evp_local.h"

#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
#define STRINGIFY_IMPLEMENTATION_(a) #a
#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)

#ifdef __clang__
/*
 * clang does not have GCC's push_options/pop_options pragmas.
 * Note: "clang attribute push" cannot be used within a namespace in clang
 * until 8.0, so OPENSSL_TARGET_REGION and OPENSSL_UNTARGET_REGION must be
 * used outside of a namespace.
 */
#define OPENSSL_TARGET_REGION(T)                                       \
    _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \
        apply_to = function)))
#define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
#define OPENSSL_TARGET_REGION(T) \
    _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
#define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options")
#endif /* clang then gcc */

/* Default target region macros don't do anything. */
#ifndef OPENSSL_TARGET_REGION
#define OPENSSL_TARGET_REGION(T)
#define OPENSSL_UNTARGET_REGION
#endif

#define OPENSSL_TARGET_AVX2 \
    OPENSSL_TARGET_REGION("avx2")
#define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION

/*
 * Ensure the whole block below is compiled with AVX2 enabled on GCC and
 * Clang; compilers without these pragmas fall back to the empty macros above.
 */

#include <string.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

OPENSSL_TARGET_AVX2
static __m256i lookup_pshufb_std(__m256i input)
{
    __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
    const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);

    result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
    __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0,
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0);

    result = _mm256_shuffle_epi8(shift_LUT, result);
    return _mm256_add_epi8(result, input);
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static inline __m256i lookup_pshufb_srp(__m256i input)
{
    const __m256i zero = _mm256_setzero_si256();
    const __m256i hi = _mm256_set1_epi8((char)0x80);
    __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input),
        _mm256_cmpgt_epi8(input,
            _mm256_set1_epi8(63)));
    __m256i idx = _mm256_setzero_si256();

    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9)));
    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3),
        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4),
        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63)));

    /* Zero out invalid lanes via PSHUFB's high-bit mechanism */
    idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi));

    const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0);

    __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx);
    __m256i ascii = _mm256_add_epi8(shift, input);
    return ascii;
}
OPENSSL_UNTARGET_AVX2
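
/*
 * Illustrative scalar sketch (not part of the measured file; helper names are
 * the editor's): the two PSHUFB lookups above map a 6-bit value to an ASCII
 * character by adding a per-range offset.  The mappings they encode are
 * equivalent to the reference functions below, read directly off the
 * shift_LUT tables (standard alphabet, and the alternative alphabet selected
 * by EVP_ENCODE_CTX_USE_SRP_ALPHABET).
 */
static inline char b64_std_char_ref(unsigned int v)
{
    if (v < 26)
        return (char)('A' + v);
    if (v < 52)
        return (char)('a' + (v - 26));
    if (v < 62)
        return (char)('0' + (v - 52));
    return v == 62 ? '+' : '/';
}

static inline char b64_srp_char_ref(unsigned int v)
{
    if (v < 10)
        return (char)('0' + v);
    if (v < 36)
        return (char)('A' + (v - 10));
    if (v < 62)
        return (char)('a' + (v - 36));
    return v == 62 ? '.' : '/';
}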

OPENSSL_TARGET_AVX2
static inline __m256i shift_right_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_srli_si256(v, 1);
    case 2:
        return _mm256_srli_si256(v, 2);
    case 3:
        return _mm256_srli_si256(v, 3);
    case 4:
        return _mm256_srli_si256(v, 4);
    case 5:
        return _mm256_srli_si256(v, 5);
    case 6:
        return _mm256_srli_si256(v, 6);
    case 7:
        return _mm256_srli_si256(v, 7);
    case 8:
        return _mm256_srli_si256(v, 8);
    case 9:
        return _mm256_srli_si256(v, 9);
    case 10:
        return _mm256_srli_si256(v, 10);
    case 11:
        return _mm256_srli_si256(v, 11);
    case 12:
        return _mm256_srli_si256(v, 12);
    case 13:
        return _mm256_srli_si256(v, 13);
    case 14:
        return _mm256_srli_si256(v, 14);
    case 15:
        return _mm256_srli_si256(v, 15);
    default:
        return _mm256_setzero_si256();
    }
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static inline __m256i shift_left_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_slli_si256(v, 1);
    case 2:
        return _mm256_slli_si256(v, 2);
    case 3:
        return _mm256_slli_si256(v, 3);
    case 4:
        return _mm256_slli_si256(v, 4);
    case 5:
        return _mm256_slli_si256(v, 5);
    case 6:
        return _mm256_slli_si256(v, 6);
    case 7:
        return _mm256_slli_si256(v, 7);
    case 8:
        return _mm256_slli_si256(v, 8);
    case 9:
        return _mm256_slli_si256(v, 9);
    case 10:
        return _mm256_slli_si256(v, 10);
    case 11:
        return _mm256_slli_si256(v, 11);
    case 12:
        return _mm256_slli_si256(v, 12);
    case 13:
        return _mm256_slli_si256(v, 13);
    case 14:
        return _mm256_slli_si256(v, 14);
    case 15:
        return _mm256_slli_si256(v, 15);
    case 16:
        return _mm256_setzero_si256();
    default:
        return _mm256_setzero_si256();
    }
}
OPENSSL_UNTARGET_AVX2
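
/*
 * Illustrative scalar sketch (not part of the measured file; helper name is
 * the editor's): the switch-based helpers above exist because the byte-shift
 * intrinsics require an immediate count.  Each one shifts the two 16-byte
 * lanes of the register independently and fills the vacated bytes with
 * zeros; shift_right_zeros() behaves per lane like the model below, and
 * shift_left_zeros() is its mirror image.
 */
static inline void lane_shift_right_ref(unsigned char lane[16], int n)
{
    int j;

    for (j = 0; j < 16; j++)
        lane[j] = (unsigned char)(j + n < 16 ? lane[j + n] : 0);
}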

static const uint8_t shuffle_masks[16][16] = {
    { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 }
};

/**
 * Insert a line feed character into the 32-byte input at index K in [0,32).
 * Bytes at positions >= K shift right by one, so the final input byte is
 * displaced out of the vector; callers re-emit it separately.
 */
OPENSSL_TARGET_AVX2
static inline __m256i insert_line_feed32(__m256i input, int K)
{
    __m256i line_feed_vector = _mm256_set1_epi8('\n');
    __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    if (K >= 16) {
        __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]);
        __m256i mask = _mm256_set_m128i(maskhi, identity);
        __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80));
        __m256i shuffled = _mm256_shuffle_epi8(input, mask);
        __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);

        return result;
    }
    /* Shift input right by 1 byte */
    __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21),
        15);
    input = _mm256_blend_epi32(input, shift, 0xF0);
    __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]);
    __m256i mask = _mm256_set_m128i(identity, masklo);
    __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80));
    __m256i shuffled = _mm256_shuffle_epi8(input, mask);
    __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
    return result;
}
OPENSSL_UNTARGET_AVX2
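
/*
 * Illustrative scalar sketch (not part of the measured file; helper name is
 * the editor's) of what insert_line_feed32() computes: bytes before K are
 * kept, '\n' lands at K, the remaining bytes shift right by one, and the
 * last input byte is pushed out of the 32-byte vector.
 */
static inline void insert_line_feed32_ref(unsigned char out[32],
    const unsigned char in[32], int K)
{
    memcpy(out, in, (size_t)K);                     /* bytes before K unchanged */
    out[K] = '\n';                                  /* line feed at index K     */
    memcpy(out + K + 1, in + K, (size_t)(31 - K));  /* tail shifts; in[31] drops */
}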

OPENSSL_TARGET_AVX2
static inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride,
    int *wrap_cnt)
{
    const int until_nl = stride - *wrap_cnt;

    if (until_nl > 32) {
        _mm256_storeu_si256((__m256i *)out, v);

        *wrap_cnt += 32;
        return 32;
    }

    if (until_nl == 32) {
        _mm256_storeu_si256((__m256i *)out, v);

        out[32] = '\n';
        *wrap_cnt = 0;
        return 33;
    }

    const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31);
    const __m256i with_lf = insert_line_feed32(v, until_nl);
    _mm256_storeu_si256((__m256i *)out, with_lf);
    out[32] = last;

    *wrap_cnt = 32 - until_nl;
    return 33;
}
OPENSSL_UNTARGET_AVX2
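
/*
 * Illustrative scalar sketch (not part of the measured file; helper name is
 * the editor's): for line widths of at least 32 characters, ins_nl_gt32() is
 * intended to behave like the byte-wise loop below, i.e. emit the 32 encoded
 * characters and insert a '\n' once `stride` characters have been written
 * since the previous newline.
 */
static inline size_t ins_nl_gt32_ref(const unsigned char enc[32],
    unsigned char *out, int stride, int *wrap_cnt)
{
    size_t n = 0;
    int j;

    for (j = 0; j < 32; j++) {
        out[n++] = enc[j];
        if (++*wrap_cnt == stride) {
            out[n++] = '\n';
            *wrap_cnt = 0;
        }
    }
    return n;
}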

OPENSSL_TARGET_AVX2
static inline size_t insert_nl_gt16(const __m256i v0,
    uint8_t *output,
    int wrap_max, int *wrap_cnt)
{
    uint8_t *out = output;
    int wrap_rem = wrap_max - *wrap_cnt;
    _mm256_storeu_si256((__m256i *)(output), v0);

    if (wrap_rem > 32) {
        *wrap_cnt += 32;
        return 32;
    }

    __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF);

    __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF);

    __m256i blended_0L = v0;
    int surplus_0 = wrap_rem < 16 ? 1 : 0;
    if (surplus_0 == 1) {
        __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem),
            wrap_rem + surplus_0);
        __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0);
        __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane);
        __m256i shifted_1_L = shift_left_zeros(v0, 1);
        __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask);

        blended_0L = _mm256_blendv_epi8(v0, shifted, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_0L);
        wrap_rem += wrap_max;
    }

    int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0;
    int last_of_1L = _mm256_extract_epi8(v0, 31);

    if (surplus_1 == 1) {
        uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30);
        int wrap_rem_1 = wrap_rem - 16;
        __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1),
            wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L);
        __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_1L);

        output[wrap_rem + surplus_0] = '\n';
        output[31 + surplus_0] = (uint8_t)sec_last_of_1L;
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    if (surplus_0 == 1) {
        output[wrap_rem - wrap_max] = '\n';
        output[16] = _mm256_extract_epi8(v0, 15);
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem;

    int nl_at_end = 0;
    if (*wrap_cnt == wrap_max || *wrap_cnt == 0) {
        *wrap_cnt = 0;
        output[32 + surplus_0 + surplus_1] = '\n';
        nl_at_end = 1;
    }

    out += 32 + surplus_0 + surplus_1 + nl_at_end;
    size_t written = (size_t)(out - output);

    return written;
}
OPENSSL_UNTARGET_AVX2

/*
 * Handles the 32-character block in which three 12-character lines end: the
 * caller arranges for *wrap_cnt == 8 on entry, so newlines land at output
 * offsets 4, 17 and 30, and 4 characters remain pending afterwards.
 */
OPENSSL_TARGET_AVX2
static inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0,
    uint8_t *output,
    int dummy_stride,
    int *wrap_cnt)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        (char)0xFF,
        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF,
        12);
    __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask);

    _mm256_storeu_si256((__m256i *)(output + 0), shuffled);

    int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7);
    int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15);

    uint8_t *out = output;
    out[4] = '\n';
    memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out[16 + 1] = '\n';
    memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1));
    out[16 + 14] = '\n';
    memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2));

    out += 32 + 3;
    *wrap_cnt = 4;

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask)
{
    __m256i newline = _mm256_set1_epi8('\n');

    return _mm256_or_si256(_mm256_and_si256(mask, newline),
        _mm256_andnot_si256(mask, data));
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static inline size_t insert_nl_str4(const __m256i v0, uint8_t *output)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6,
        7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3,
        (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9);
    __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes);

    _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl);

    /* Handle cross-lane remainder logic */
    /* Without macros, _mm256_srli_si256 complains that the last arg must be an 8-bit immediate */
#define B_LANE 16 /* Bytes per lane */
#define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */
#define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */

    /* Bytes that were shifted out of lane 0 */
    __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L);

    /* Bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L),
                                               B_LANE - N_RET_1_L),
        B_LANE - 2);

    /* Isolate the bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P2 = _mm256_slli_si256(
        _mm256_srli_si256(v0,
            B_LANE - N_RET_2_L + N_RET_1_L),
        N_RET_1_L);

    __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2);

    int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0);
    int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2);

    uint8_t *out = output + 16;
    memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out += 3;
    *out++ = '\n';

    out = output + 32;
    memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext));
    out += 2;
    *out++ = '\n';
    out += 4;
    *out++ = '\n';

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static inline size_t insert_nl_str8(const __m256i v0, uint8_t *output)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF,
        8, 9, 10, 11, 12, 13, 14,
        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6,
        7, (char)0xFF, 8, 9, 10, 11, 12);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes);
    int8_t rem_1_L = _mm256_extract_epi8(v0, 15);
    int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15);
    uint8_t *out = output;

    memcpy(out + 16, &rem_1_L, sizeof(rem_1_L));
    memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1));
    memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2));

    output[8] = '\n';
    output[17] = '\n';
    output[26] = '\n';
    output[35] = '\n';

    out += 32 + 4;

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2
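
/*
 * Illustrative scalar sketch (not part of the measured file; helper name is
 * the editor's): because 32 is a multiple of both 4 and 8, the stride-4 and
 * stride-8 helpers above need no carry-over state between blocks.  Each
 * 32-character block simply gets a '\n' after every `stride` characters, as
 * in the model below (40 output bytes for stride 4, 36 for stride 8).
 */
static inline size_t insert_nl_fixed_ref(const unsigned char enc[32],
    unsigned char *out, int stride)
{
    size_t n = 0;
    int j;

    for (j = 0; j < 32; j++) {
        out[n++] = enc[j];
        if ((j + 1) % stride == 0)
            out[n++] = '\n';
    }
    return n;
}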

OPENSSL_TARGET_AVX2
int encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst,
    const unsigned char *src, int srclen, int ctx_length,
    int *final_wrap_cnt)
{
    const uint8_t *input = (const uint8_t *)src;
    uint8_t *out = (uint8_t *)dst;
    int i = 0;
    int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4;
    int wrap_cnt = 0;
    const int use_srp = (ctx != NULL
        && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0);
    const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
    int base = 0;

    /* Process 96 input bytes (8 groups of 12) per iteration. */
    for (; i + 100 <= srclen; i += 96) {
        /*
         * Each 16-byte load carries only 12 useful bytes; the last load reads
         * through input[i + 99], hence the i + 100 bound above.
         */
        const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
        const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
        const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
        const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
        const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
        const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
        const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
        const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
        __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
        const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
        const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
        const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
        const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
        __m256i vec0;
        __m256i vec1;
        __m256i vec2;
        __m256i vec3;

        if (use_srp) {
            vec0 = lookup_pshufb_srp(input0);
            vec1 = lookup_pshufb_srp(input1);
            vec2 = lookup_pshufb_srp(input2);
            vec3 = lookup_pshufb_srp(input3);

        } else {
            vec0 = lookup_pshufb_std(input0);
            vec1 = lookup_pshufb_std(input1);
            vec2 = lookup_pshufb_std(input2);
            vec3 = lookup_pshufb_std(input3);
        }

        if (stride == 0) {
            _mm256_storeu_si256((__m256i *)out, vec0);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec1);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec2);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec3);

            out += 32;
        } else if (stride == 64) {
            _mm256_storeu_si256((__m256i *)out, vec0);

            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec1);

            out += 32;
            *(out++) = '\n';

            _mm256_storeu_si256((__m256i *)out, vec2);
            out += 32;

            _mm256_storeu_si256((__m256i *)out, vec3);
            out += 32;

            *(out++) = '\n';
        } else if (stride == 4) {
            int out_idx = 0;

            out_idx += (int)insert_nl_str4(vec0, out + out_idx);
            out_idx += (int)insert_nl_str4(vec1, out + out_idx);
            out_idx += (int)insert_nl_str4(vec2, out + out_idx);
            out_idx += (int)insert_nl_str4(vec3, out + out_idx);

            out += out_idx;
        } else if (stride == 8) {

            out += insert_nl_str8(vec0, out);
            out += insert_nl_str8(vec1, out);
            out += insert_nl_str8(vec2, out);
            out += insert_nl_str8(vec3, out);

        } else if (stride == 12) {
            switch (base) {
            case 0:

                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
                break;
            case 1:
                out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt);
                break;
            default: /* base == 2 */
                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
                break;
            }

            if (++base == 3)
                base = 0;
        } else if (stride >= 32) {
            out += ins_nl_gt32(vec0, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec1, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec2, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec3, out, stride, &wrap_cnt);
        } else if (stride >= 16) {
            out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
        }
    }

    if (stride == 0) {
        for (; i + 28 <= srclen; i += 24) {
            /* lo = [xxxx|DDDC|CCBB|BAAA] */
            /* hi = [xxxx|HHHG|GGFF|FEEE] */
            const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i));
            const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3));
            /*
             * Bytes from groups A, B and C are needed in separate 32-bit lanes:
             * in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
             */
            __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
            const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
            const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
            const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
            const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
            const __m256i indices = _mm256_or_si256(t1, t3);
            _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices));

            out += 32;
        }
    }
    *final_wrap_cnt = wrap_cnt;

    if (stride >= 32 && wrap_cnt == stride) {
        wrap_cnt = 0;
        *out++ = '\n';
    }

    return (int)(out - (uint8_t *)dst) + evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt);
}
OPENSSL_UNTARGET_AVX2
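
/*
 * Illustrative scalar sketch (not part of the measured file; helper name is
 * the editor's): the shuffle/mulhi/mullo sequence in encode_base64_avx2()
 * splits every 3 input bytes into the four 6-bit values that the plain
 * byte-wise reference below produces, in output order.
 */
static inline void b64_split_3to4_ref(const unsigned char in[3], unsigned char idx[4])
{
    idx[0] = (unsigned char)(in[0] >> 2);
    idx[1] = (unsigned char)(((in[0] & 0x03) << 4) | (in[1] >> 4));
    idx[2] = (unsigned char)(((in[1] & 0x0f) << 2) | (in[2] >> 6));
    idx[3] = (unsigned char)(in[2] & 0x3f);
}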
#endif