Coverage Report

Created: 2026-04-12 07:08

/src/openssl/crypto/evp/enc_b64_avx2.c

Every executable line in this file recorded an execution count of 0 in this run; the file is entirely uncovered. The source follows.

#include <openssl/evp.h>
#include "enc_b64_scalar.h"
#include "enc_b64_avx2.h"
#include "internal/cryptlib.h"
#include "crypto/evp.h"
#include "evp_local.h"

#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
#if !defined(_M_ARM64EC)
#define STRINGIFY_IMPLEMENTATION_(a) #a
#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)

#ifdef __clang__
/*
 * clang does not have GCC's push/pop options pragmas.  clang's attribute
 * push cannot be used within a namespace before clang 8.0, so
 * OPENSSL_TARGET_REGION and OPENSSL_UNTARGET_REGION must be used outside
 * of a namespace.
 */
#define OPENSSL_TARGET_REGION(T)                                       \
    _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \
        apply_to = function)))
#define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
#define OPENSSL_TARGET_REGION(T) \
    _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
#define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options")
#endif /* clang then gcc */

/* Default target region macros don't do anything. */
#ifndef OPENSSL_TARGET_REGION
#define OPENSSL_TARGET_REGION(T)
#define OPENSSL_UNTARGET_REGION
#endif

#define OPENSSL_TARGET_AVX2 \
    OPENSSL_TARGET_REGION("avx2")
#define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION

/*
 * Ensure everything below is compiled with AVX2 enabled: GCC via the target
 * pragmas, clang via attribute push/pop.  MSVC defines neither macro and
 * compiles AVX2 intrinsics without a special flag.
 */

#include <string.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

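/*
 * Map 6-bit values (0..63) to the standard base64 alphabet without a table
 * load: classify each byte into its range (0..25 -> 'A'..'Z', 26..51 ->
 * 'a'..'z', 52..61 -> '0'..'9', 62 -> '+', 63 -> '/'), fetch the per-range
 * ASCII offset from shift_LUT with a single in-register PSHUFB, and add it.
 */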
OPENSSL_TARGET_AVX2
static __m256i lookup_pshufb_std(__m256i input)
{
    __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
    const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);

    result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
    __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0,
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0);

    result = _mm256_shuffle_epi8(shift_LUT, result);
    return _mm256_add_epi8(result, input);
}
OPENSSL_UNTARGET_AVX2

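/*
 * Same trick for the SRP base64 alphabet ("0-9A-Za-z./"): derive a range
 * index per byte, fetch the ASCII offset from shift_LUT via PSHUFB, and add
 * it to the input.  Out-of-range bytes get the PSHUFB high bit set, zeroing
 * their offset, so they pass through unshifted.
 */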
OPENSSL_TARGET_AVX2
static inline __m256i lookup_pshufb_srp(__m256i input)
{
    const __m256i zero = _mm256_setzero_si256();
    const __m256i hi = _mm256_set1_epi8((char)0x80);
    __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input),
        _mm256_cmpgt_epi8(input,
            _mm256_set1_epi8(63)));
    __m256i idx = _mm256_setzero_si256();

    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9)));
    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3),
        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4),
        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63)));

    /* Zero-out invalid lanes via PSHUFB's high-bit mechanism */
    idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi));

    const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0);

    __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx);
    __m256i ascii = _mm256_add_epi8(shift, input);
    return ascii;
}
OPENSSL_UNTARGET_AVX2

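/*
 * _mm256_srli_si256()/_mm256_slli_si256() take the byte count as an
 * immediate, so the two helpers below dispatch a runtime count through a
 * switch of immediate forms.  Both shift each 128-bit lane independently,
 * filling with zeros.
 */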
OPENSSL_TARGET_AVX2
static inline __m256i shift_right_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_srli_si256(v, 1);
    case 2:
        return _mm256_srli_si256(v, 2);
    case 3:
        return _mm256_srli_si256(v, 3);
    case 4:
        return _mm256_srli_si256(v, 4);
    case 5:
        return _mm256_srli_si256(v, 5);
    case 6:
        return _mm256_srli_si256(v, 6);
    case 7:
        return _mm256_srli_si256(v, 7);
    case 8:
        return _mm256_srli_si256(v, 8);
    case 9:
        return _mm256_srli_si256(v, 9);
    case 10:
        return _mm256_srli_si256(v, 10);
    case 11:
        return _mm256_srli_si256(v, 11);
    case 12:
        return _mm256_srli_si256(v, 12);
    case 13:
        return _mm256_srli_si256(v, 13);
    case 14:
        return _mm256_srli_si256(v, 14);
    case 15:
        return _mm256_srli_si256(v, 15);
    default:
        return _mm256_setzero_si256();
    }
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static inline __m256i shift_left_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_slli_si256(v, 1);
    case 2:
        return _mm256_slli_si256(v, 2);
    case 3:
        return _mm256_slli_si256(v, 3);
    case 4:
        return _mm256_slli_si256(v, 4);
    case 5:
        return _mm256_slli_si256(v, 5);
    case 6:
        return _mm256_slli_si256(v, 6);
    case 7:
        return _mm256_slli_si256(v, 7);
    case 8:
        return _mm256_slli_si256(v, 8);
    case 9:
        return _mm256_slli_si256(v, 9);
    case 10:
        return _mm256_slli_si256(v, 10);
    case 11:
        return _mm256_slli_si256(v, 11);
    case 12:
        return _mm256_slli_si256(v, 12);
    case 13:
        return _mm256_slli_si256(v, 13);
    case 14:
        return _mm256_slli_si256(v, 14);
    case 15:
        return _mm256_slli_si256(v, 15);
    case 16:
        return _mm256_setzero_si256();
    default:
        return _mm256_setzero_si256();
    }
}
OPENSSL_UNTARGET_AVX2

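/*
 * shuffle_masks[k] is a PSHUFB control mask that keeps bytes in order while
 * opening a zeroed hole at position k (PSHUFB emits 0 where the control
 * byte's high bit is set); insert_line_feed32() below blends '\n' into the
 * hole.
 */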
static const uint8_t shuffle_masks[16][16] = {
    { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 }
};

/**
 * Insert a line feed character in the 32-byte input at index K in [0,32);
 * the byte shifted out of position 31 must be preserved by the caller.
 */
OPENSSL_TARGET_AVX2
static inline __m256i insert_line_feed32(__m256i input, int K)
{
    __m256i line_feed_vector = _mm256_set1_epi8('\n');
    __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    if (K >= 16) {
        __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]);
        __m256i mask = _mm256_set_m128i(maskhi, identity);
        __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80));
        __m256i shuffled = _mm256_shuffle_epi8(input, mask);
        __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);

        return result;
    }
    /* Shift input right by 1 byte */
    __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21),
        15);
    input = _mm256_blend_epi32(input, shift, 0xF0);
    __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]);
    __m256i mask = _mm256_set_m128i(identity, masklo);
    __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80));
    __m256i shuffled = _mm256_shuffle_epi8(input, mask);
    __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
    return result;
}
OPENSSL_UNTARGET_AVX2

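/*
 * Store one 32-byte vector, inserting at most one '\n'; only valid when
 * stride >= 32, so a line break occurs at most once per vector.  Returns
 * the number of bytes written (32 or 33) and updates *wrap_cnt, the fill of
 * the current output line.
 */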
OPENSSL_TARGET_AVX2
static inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride,
    int *wrap_cnt)
{
    const int until_nl = stride - *wrap_cnt;

    if (until_nl > 32) {
        _mm256_storeu_si256((__m256i *)out, v);
        *wrap_cnt += 32;
        return 32;
    }

    if (until_nl == 32) {
        _mm256_storeu_si256((__m256i *)out, v);
        out[32] = '\n';
        *wrap_cnt = 0;
        return 33;
    }

    const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31);
    const __m256i with_lf = insert_line_feed32(v, until_nl);
    _mm256_storeu_si256((__m256i *)out, with_lf);
    out[32] = last;

    *wrap_cnt = 32 - until_nl;
    return 33;
}
OPENSSL_UNTARGET_AVX2

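/*
 * Store one 32-byte vector for wrap widths where several line breaks may
 * fall inside the vector: at most one per 16-byte lane plus one at the end.
 * Used for 16 <= stride < 32 and for two of every three vectors in the
 * stride-12 schedule.
 */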
OPENSSL_TARGET_AVX2
static inline size_t insert_nl_gt16(const __m256i v0,
    uint8_t *output,
    int wrap_max, int *wrap_cnt)
{
    uint8_t *out = output;
    int wrap_rem = wrap_max - *wrap_cnt;
    _mm256_storeu_si256((__m256i *)(output), v0);

    if (wrap_rem > 32) {
        *wrap_cnt += 32;
        return 32;
    }

    __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF);

    __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF);

    __m256i blended_0L = v0;
    int surplus_0 = wrap_rem < 16 ? 1 : 0;
    if (surplus_0 == 1) {
        __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem),
            wrap_rem + surplus_0);
        __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0);
        __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane);
        __m256i shifted_1_L = shift_left_zeros(v0, 1);
        __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask);

        blended_0L = _mm256_blendv_epi8(v0, shifted, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_0L);
        wrap_rem += wrap_max;
    }

    int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0;
    int last_of_1L = _mm256_extract_epi8(v0, 31);

    if (surplus_1 == 1) {
        uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30);
        int wrap_rem_1 = wrap_rem - 16;
        __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1),
            wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L);
        __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_1L);

        output[wrap_rem + surplus_0] = '\n';
        output[31 + surplus_0] = (uint8_t)sec_last_of_1L;
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    if (surplus_0 == 1) {
        output[wrap_rem - wrap_max] = '\n';
        output[16] = _mm256_extract_epi8(v0, 15);
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem;

    int nl_at_end = 0;
    if (*wrap_cnt == wrap_max || *wrap_cnt == 0) {
        *wrap_cnt = 0;
        output[32 + surplus_0 + surplus_1] = '\n';
        nl_at_end = 1;
    }

    out += 32 + surplus_0 + surplus_1 + nl_at_end;
    size_t written = (size_t)(out - output);

    return written;
}
OPENSSL_UNTARGET_AVX2

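/*
 * stride 12 only: the newline pattern repeats every three 32-byte vectors,
 * and this variant handles the phase that enters with eight characters
 * already on the current line (breaks after 4, then 12, then 12 more
 * characters, leaving 4).  dummy_stride keeps the signature uniform.
 */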
OPENSSL_TARGET_AVX2
static inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0,
    uint8_t *output,
    int dummy_stride,
    int *wrap_cnt)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        (char)0xFF,
        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF,
        12);
    __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask);

    _mm256_storeu_si256((__m256i *)(output + 0), shuffled);

    int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7);
    int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15);

    uint8_t *out = output;
    out[4] = '\n';
    memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out[16 + 1] = '\n';
    memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1));
    out[16 + 14] = '\n';
    memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2));

    out += 32 + 3;
    *wrap_cnt = 4;

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2

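/* Select '\n' where mask is set and the original data byte elsewhere. */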
OPENSSL_TARGET_AVX2
static inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask)
{
    __m256i newline = _mm256_set1_epi8('\n');

    return _mm256_or_si256(_mm256_and_si256(mask, newline),
        _mm256_andnot_si256(mask, data));
}
OPENSSL_UNTARGET_AVX2

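/*
 * stride 4: a '\n' follows every 4th character, so each 32-byte vector
 * expands to exactly 40 output bytes and no wrap state needs to be carried
 * between vectors.
 */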
OPENSSL_TARGET_AVX2
static inline size_t insert_nl_str4(const __m256i v0, uint8_t *output)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6,
        7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3,
        (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9);
    __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes);

    _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl);

    /* Handle cross-lane remainder logic */
    /* Without macros, _mm256_srli_si256 complains that the last arg must be an 8-bit immediate */
#define B_LANE 16 /* Bytes per lane */
#define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */
#define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */

    /* Bytes that were shifted out of lane 0 */
    __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L);

    /* Bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L),
                                               B_LANE - N_RET_1_L),
        B_LANE - 2);

    /* isolate the bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P2 = _mm256_slli_si256(
        _mm256_srli_si256(v0,
            B_LANE - N_RET_2_L + N_RET_1_L),
        N_RET_1_L);

    __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2);

    int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0);
    int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2);

    uint8_t *out = output + 16;
    memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out += 3;
    *out++ = '\n';

    out = output + 32;
    memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext));
    out += 2;
    *out++ = '\n';
    out += 4;
    *out++ = '\n';

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2

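/*
 * stride 8: a '\n' follows every 8th character; each 32-byte vector expands
 * to exactly 36 output bytes, again with no carried wrap state.
 */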
OPENSSL_TARGET_AVX2
static inline size_t insert_nl_str8(const __m256i v0, uint8_t *output)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF,
        8, 9, 10, 11, 12, 13, 14,
        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6,
        7, (char)0xFF, 8, 9, 10, 11, 12);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes);
    int8_t rem_1_L = _mm256_extract_epi8(v0, 15);
    int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15);
    uint8_t *out = output;

    memcpy(out + 16, &rem_1_L, sizeof(rem_1_L));
    memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1));
    memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2));

    output[8] = '\n';
    output[17] = '\n';
    output[26] = '\n';
    output[35] = '\n';

    out += 32 + 4;

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2

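/*
 * AVX2 bulk base64 encoder.  Encodes input in 96-byte blocks, four 32-byte
 * output vectors per block, dispatching on stride (base64 characters per
 * output line, 0 = no wrapping) for newline insertion.  The remaining tail
 * is delegated to the scalar evp_encodeblock_int(); *final_wrap_cnt carries
 * the fill of the current output line into that call.
 */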
OPENSSL_TARGET_AVX2
size_t encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst,
    const unsigned char *src, int srclen, int ctx_length,
    int *final_wrap_cnt)
{
    const uint8_t *input = (const uint8_t *)src;
    uint8_t *out = (uint8_t *)dst;
    int i = 0;
    int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4;
    int wrap_cnt = 0;
    const int use_srp = (ctx != NULL
        && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0);
    const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
    int base = 0;

    /*
     * Process 96 input bytes at a time; the final 16-byte load reads 4
     * bytes past the 96 consumed, hence the i + 100 bound.
     */
    for (; i + 100 <= srclen; i += 96) {
        _mm_prefetch((const char *)(input + i + 192), _MM_HINT_T0);
        /*
         * Interleave the four vectors: each one's load, shuffle, bit-split
         * and lookup are issued before the next starts, giving the
         * out-of-order engine independent work chains across execution
         * ports.
         */
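        /*
         * After the shuffle, each 32-bit lane holds one 3-byte group with
         * bytes duplicated so that AND + MULHI (0x0fc0fc00, 0x04000040)
         * extracts two of the four 6-bit indices and AND + MULLO
         * (0x003f03f0, 0x01000010) extracts the other two; OR merges them
         * into one index per output byte.
         */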
        const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
        const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);

        const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
        const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);

        const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
        const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);

        const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
        const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
        __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
        const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
        const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
        const __m256i input3 = _mm256_or_si256(t1_3, t3_3);

        __m256i vec0;
        __m256i vec1;
        __m256i vec2;
        __m256i vec3;

        if (use_srp) {
            vec0 = lookup_pshufb_srp(input0);
            vec1 = lookup_pshufb_srp(input1);
            vec2 = lookup_pshufb_srp(input2);
            vec3 = lookup_pshufb_srp(input3);
        } else {
            vec0 = lookup_pshufb_std(input0);
            vec1 = lookup_pshufb_std(input1);
            vec2 = lookup_pshufb_std(input2);
            vec3 = lookup_pshufb_std(input3);
        }

        if (stride == 0) {
            _mm256_storeu_si256((__m256i *)out, vec0);
            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec1);
            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec2);
            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec3);
            out += 32;
        } else if (stride == 64) {
            _mm256_storeu_si256((__m256i *)out, vec0);
            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec1);
            out += 32;
            *(out++) = '\n';
            _mm256_storeu_si256((__m256i *)out, vec2);
            out += 32;
            _mm256_storeu_si256((__m256i *)out, vec3);
            out += 32;
            *(out++) = '\n';
        } else if (stride == 4) {
            int out_idx = 0;

            out_idx += (int)insert_nl_str4(vec0, out + out_idx);
            out_idx += (int)insert_nl_str4(vec1, out + out_idx);
            out_idx += (int)insert_nl_str4(vec2, out + out_idx);
            out_idx += (int)insert_nl_str4(vec3, out + out_idx);

            out += out_idx;
        } else if (stride == 8) {
            out += insert_nl_str8(vec0, out);
            out += insert_nl_str8(vec1, out);
            out += insert_nl_str8(vec2, out);
            out += insert_nl_str8(vec3, out);
        } else if (stride == 12) {
            switch (base) {
            case 0:
                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
                break;
            case 1:
                out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt);
                break;
            default: /* base == 2 */
                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
                out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt);
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
                break;
            }

            if (++base == 3)
                base = 0;
        } else if (stride >= 32) {
            out += ins_nl_gt32(vec0, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec1, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec2, out, stride, &wrap_cnt);
            out += ins_nl_gt32(vec3, out, stride, &wrap_cnt);
        } else if (stride >= 16) {
            out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
            out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
        }
    }

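    /*
     * Remainder: without line wrapping we can keep vectorizing in 24-byte
     * steps; the wrapped cases leave their tail entirely to the scalar
     * encoder in the final return.
     */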
    if (stride == 0) {
        for (; i + 28 <= srclen; i += 24) {
            /* lo = [xxxx|DDDC|CCBB|BAAA] */
            /* hi = [xxxx|HHHG|GGFF|FEEE] */
            const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i));
            const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3));
            /*
             * bytes from groups A, B and C are needed in separate 32-bit lanes
             * in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
             */
            __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
            const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
            const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
            const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
            const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
            const __m256i indices = _mm256_or_si256(t1, t3);
            _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices));

            out += 32;
        }
    }

    if (stride >= 32 && wrap_cnt == stride) {
        wrap_cnt = 0;
        *out++ = '\n';
    }

    /*
     * Publish the line fill only after the possible newline reset above so
     * the scalar tail does not emit a duplicate '\n'.
     */
    *final_wrap_cnt = wrap_cnt;

    return (size_t)(out - (uint8_t *)dst) + evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt);
}
OPENSSL_UNTARGET_AVX2
#endif /* !defined(_M_ARM64EC) */
#endif /* x86_64 */