Coverage Report

Created: 2026-05-06 07:07

/src/openssl/crypto/evp/enc_b64_avx2.c
All executable lines below have an execution count of 0.
#include <openssl/evp.h>
#include "enc_b64_scalar.h"
#include "enc_b64_avx2.h"
#include "internal/cryptlib.h"
#include "crypto/evp.h"
#include "evp_local.h"

#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
#if !defined(_M_ARM64EC)
#if defined(HAVE_AVX2_INTRINSICS)
#define STRINGIFY_IMPLEMENTATION_(a) #a
#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)

#ifdef __clang__
/*
 * clang does not have GCC's push/pop options. "clang attribute push"
 * cannot be used within a namespace in clang until 8.0, so
 * OPENSSL_TARGET_REGION and OPENSSL_UNTARGET_REGION must be outside
 * of a namespace.
 */
#define OPENSSL_TARGET_REGION(T)                                       \
    _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \
        apply_to = function)))
#define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
#define OPENSSL_TARGET_REGION(T) \
    _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
#define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options")
#endif /* clang then gcc */

/* Default target region macros don't do anything. */
#ifndef OPENSSL_TARGET_REGION
#define OPENSSL_TARGET_REGION(T)
#define OPENSSL_UNTARGET_REGION
#endif

#define OPENSSL_TARGET_AVX2 \
    OPENSSL_TARGET_REGION("avx2")
#define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION

/*
 * Ensure this whole block is compiled with AVX2 enabled on GCC and Clang.
 * MSVC ignores these pragmas, so there the file must be built with AVX2
 * enabled globally.
 */

#include <string.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

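/*
 * As an illustration, on GCC the OPENSSL_TARGET_AVX2/OPENSSL_UNTARGET_AVX2
 * pair brackets a region like
 *     _Pragma("GCC push_options") _Pragma("GCC target(\"avx2\")")
 *         ... function definitions ...
 *     _Pragma("GCC pop_options")
 * so individual functions can be built for AVX2 without raising the
 * baseline for the whole translation unit.
 */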
OPENSSL_TARGET_AVX2
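/*
 * Row selection sketch for the standard alphabet (rows 14 and 15 of the
 * LUT below are unused):
 *     input  0..25 -> row 13 ('A')      -> 'A' + input
 *     input 26..51 -> row  0 ('a' - 26) -> 'a' + input - 26
 *     input 52..61 -> rows 1..10 ('0' - 52)
 *     input 62     -> row 11 ('+' - 62)
 *     input 63     -> row 12 ('/' - 63)
 */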
static __m256i lookup_pshufb_std(__m256i input)
{
    __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
    const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);

    result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
    __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0,
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0);

    result = _mm256_shuffle_epi8(shift_LUT, result);
    return _mm256_add_epi8(result, input);
}
OPENSSL_UNTARGET_AVX2

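/*
 * Same range-offset scheme, but for the SRP/crypt-style alphabet:
 *     input  0..9  -> row 0 ('0' - 0)
 *     input 10..35 -> row 1 ('A' - 10)
 *     input 36..61 -> row 2 ('a' - 36)
 *     input 62     -> row 3 ('.' - 62)
 *     input 63     -> row 4 ('/' - 63)
 */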
OPENSSL_TARGET_AVX2
static ossl_inline __m256i lookup_pshufb_srp(__m256i input)
{
    const __m256i zero = _mm256_setzero_si256();
    const __m256i hi = _mm256_set1_epi8((char)0x80);
    __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input),
        _mm256_cmpgt_epi8(input,
            _mm256_set1_epi8(63)));
    __m256i idx = _mm256_setzero_si256();

    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9)));
    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3),
        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62)));
    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4),
        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63)));

    /* Zero-out invalid lanes via PSHUFB's high-bit mechanism */
    idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi));

    const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0);

    __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx);
    __m256i ascii = _mm256_add_epi8(shift, input);
    return ascii;
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static ossl_inline __m256i shift_right_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_srli_si256(v, 1);
    case 2:
        return _mm256_srli_si256(v, 2);
    case 3:
        return _mm256_srli_si256(v, 3);
    case 4:
        return _mm256_srli_si256(v, 4);
    case 5:
        return _mm256_srli_si256(v, 5);
    case 6:
        return _mm256_srli_si256(v, 6);
    case 7:
        return _mm256_srli_si256(v, 7);
    case 8:
        return _mm256_srli_si256(v, 8);
    case 9:
        return _mm256_srli_si256(v, 9);
    case 10:
        return _mm256_srli_si256(v, 10);
    case 11:
        return _mm256_srli_si256(v, 11);
    case 12:
        return _mm256_srli_si256(v, 12);
    case 13:
        return _mm256_srli_si256(v, 13);
    case 14:
        return _mm256_srli_si256(v, 14);
    case 15:
        return _mm256_srli_si256(v, 15);
    default:
        return _mm256_setzero_si256();
    }
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static ossl_inline __m256i shift_left_zeros(__m256i v, int n)
{
    switch (n) {
    case 0:
        return v;
    case 1:
        return _mm256_slli_si256(v, 1);
    case 2:
        return _mm256_slli_si256(v, 2);
    case 3:
        return _mm256_slli_si256(v, 3);
    case 4:
        return _mm256_slli_si256(v, 4);
    case 5:
        return _mm256_slli_si256(v, 5);
    case 6:
        return _mm256_slli_si256(v, 6);
    case 7:
        return _mm256_slli_si256(v, 7);
    case 8:
        return _mm256_slli_si256(v, 8);
    case 9:
        return _mm256_slli_si256(v, 9);
    case 10:
        return _mm256_slli_si256(v, 10);
    case 11:
        return _mm256_slli_si256(v, 11);
    case 12:
        return _mm256_slli_si256(v, 12);
    case 13:
        return _mm256_slli_si256(v, 13);
    case 14:
        return _mm256_slli_si256(v, 14);
    case 15:
        return _mm256_slli_si256(v, 15);
    case 16:
        return _mm256_setzero_si256();
    default:
        return _mm256_setzero_si256();
    }
}
OPENSSL_UNTARGET_AVX2
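/*
 * _mm256_slli_si256/_mm256_srli_si256 take the byte count as a
 * compile-time immediate, hence the switches above rather than a
 * variable shift. Note that both intrinsics shift each 128-bit lane
 * independently rather than the full 256-bit vector; the
 * newline-insertion helpers below patch up the bytes that never cross
 * the lane boundary.
 */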

static const uint8_t shuffle_masks[16][16] = {
    { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 }
};
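/*
 * shuffle_masks[K] is the 16-byte identity permutation with 0x80 at
 * position K. PSHUFB writes zero for any control byte with the high bit
 * set, so shuffling with row K keeps bytes 0..K-1 in place, opens a
 * hole at K and moves the remaining bytes right by one; the hole is
 * then blended with '\n'.
 */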

/**
 * Insert a line feed character in the 32-byte input at index K in [0,32).
 */
OPENSSL_TARGET_AVX2
static ossl_inline __m256i insert_line_feed32(__m256i input, int K)
{
    __m256i line_feed_vector = _mm256_set1_epi8('\n');
    __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    if (K >= 16) {
        __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]);
        __m256i mask = _mm256_set_m128i(maskhi, identity);
        __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80));
        __m256i shuffled = _mm256_shuffle_epi8(input, mask);
        __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);

        return result;
    }
    /* Shift input right by 1 byte */
    __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21),
        15);
    input = _mm256_blend_epi32(input, shift, 0xF0);
    __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]);
    __m256i mask = _mm256_set_m128i(identity, masklo);
    __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80));
    __m256i shuffled = _mm256_shuffle_epi8(input, mask);
    __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
    return result;
}
OPENSSL_UNTARGET_AVX2
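/*
 * For example, insert_line_feed32(v, 3) produces
 *     v0 v1 v2 '\n' v3 ... v30
 * and drops v31; callers store the displaced last byte separately.
 */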

OPENSSL_TARGET_AVX2
static ossl_inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride,
    int *wrap_cnt)
{
    const int until_nl = stride - *wrap_cnt;

    if (until_nl > 32) {
        _mm256_storeu_si256((__m256i *)out, v);
        *wrap_cnt += 32;
        return 32;
    }

    if (until_nl == 32) {
        _mm256_storeu_si256((__m256i *)out, v);
        out[32] = '\n';
        *wrap_cnt = 0;
        return 33;
    }

    const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31);
    const __m256i with_lf = insert_line_feed32(v, until_nl);

    _mm256_storeu_si256((__m256i *)out, with_lf);
    out[32] = last;
    *wrap_cnt = 32 - until_nl;
    return 33;
}
OPENSSL_UNTARGET_AVX2
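/*
 * Worked example for ins_nl_gt32: with stride 48 and *wrap_cnt == 20,
 * until_nl is 28, so '\n' lands at offset 28, the displaced last
 * character spills into out[32], 33 bytes are written and *wrap_cnt
 * becomes 4.
 */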

OPENSSL_TARGET_AVX2
static ossl_inline size_t insert_nl_gt16(const __m256i v0,
    uint8_t *output,
    int wrap_max, int *wrap_cnt)
{
    uint8_t *out = output;
    int wrap_rem = wrap_max - *wrap_cnt;

    _mm256_storeu_si256((__m256i *)(output), v0);

    if (wrap_rem > 32) {
        *wrap_cnt += 32;
        return 32;
    }

    __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF);

    __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF);

    __m256i blended_0L = v0;
    int surplus_0 = wrap_rem < 16 ? 1 : 0;

    if (surplus_0 == 1) {
        __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem),
            wrap_rem + surplus_0);
        __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0);
        __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane);
        __m256i shifted_1_L = shift_left_zeros(v0, 1);
        __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask);

        blended_0L = _mm256_blendv_epi8(v0, shifted, mask);
        _mm256_storeu_si256((__m256i *)(output), blended_0L);
        wrap_rem += wrap_max;
    }

    int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0;
    int last_of_1L = _mm256_extract_epi8(v0, 31);

    if (surplus_1 == 1) {
        uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30);
        int wrap_rem_1 = wrap_rem - 16;
        __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1),
            wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1);
        __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L);
        __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask);

        _mm256_storeu_si256((__m256i *)(output), blended_1L);

        output[wrap_rem + surplus_0] = '\n';
        output[31 + surplus_0] = (uint8_t)sec_last_of_1L;
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    if (surplus_0 == 1) {
        output[wrap_rem - wrap_max] = '\n';
        output[16] = _mm256_extract_epi8(v0, 15);
        output[31 + surplus_0 + surplus_1] = last_of_1L;
    }

    *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem;

    int nl_at_end = 0;

    if (*wrap_cnt == wrap_max || *wrap_cnt == 0) {
        *wrap_cnt = 0;
        output[32 + surplus_0 + surplus_1] = '\n';
        nl_at_end = 1;
    }

    out += 32 + surplus_0 + surplus_1 + nl_at_end;
    size_t written = (size_t)(out - output);

    return written;
}
OPENSSL_UNTARGET_AVX2
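/*
 * In insert_nl_gt16 a 32-byte vector spans at most two wrap points:
 * surplus_0 flags one inside the low 16-byte lane and surplus_1 one
 * inside the high lane. Each wrap shifts the following bytes right by
 * one to open a hole for '\n', and the bytes pushed past a lane
 * boundary are patched back in with the scalar stores.
 */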
347
348
OPENSSL_TARGET_AVX2
349
static ossl_inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0,
350
    uint8_t *output,
351
    int dummy_stride,
352
    int *wrap_cnt)
353
0
{
354
0
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
355
0
        (char)0xFF,
356
0
        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF,
357
0
        12);
358
0
    __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask);
359
360
0
    _mm256_storeu_si256((__m256i *)(output + 0), shuffled);
361
362
0
    int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7);
363
0
    int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29);
364
0
    int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15);
365
366
0
    uint8_t *out = output;
367
0
    out[4] = '\n';
368
0
    memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext));
369
0
    out[16 + 1] = '\n';
370
0
    memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1));
371
0
    out[16 + 14] = '\n';
372
0
    memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2));
373
374
0
    out += 32 + 3;
375
0
    *wrap_cnt = 4;
376
377
0
    size_t written = (out - output);
378
0
    return written;
379
0
}
380
OPENSSL_UNTARGET_AVX2
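/*
 * With a 12-character line width, 96 output characters are exactly 8
 * lines, so the newline pattern repeats every three 32-character
 * vectors. The caller rotates `base` through 0..2 and uses this
 * fixed-pattern variant for the vector whose newlines fall at offsets
 * 4, 17 and 30.
 */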

OPENSSL_TARGET_AVX2
static ossl_inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask)
{
    __m256i newline = _mm256_set1_epi8('\n');

    return _mm256_or_si256(_mm256_and_si256(mask, newline),
        _mm256_andnot_si256(mask, data));
}
OPENSSL_UNTARGET_AVX2

OPENSSL_TARGET_AVX2
static ossl_inline size_t insert_nl_str4(const __m256i v0, uint8_t *output)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6,
        7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12,
        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3,
        (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9);
    __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
        0, 0);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes);

    _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl);

    /* Handle cross-lane remainder logic. */
    /* Without macros, _mm256_srli_si256 complains that the last arg must be an 8-bit immediate. */
#define B_LANE 16 /* Bytes per lane */
#define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */
#define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */

    /* Bytes that were shifted out of lane 0 */
    __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L);

    /* Bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L),
                                               B_LANE - N_RET_1_L),
        B_LANE - 2);

    /* Isolate the bytes that were shifted out of lane 1 */
    __m256i rem_2_L_P2 = _mm256_slli_si256(
        _mm256_srli_si256(v0,
            B_LANE - N_RET_2_L + N_RET_1_L),
        N_RET_1_L);

    __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2);

    int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0);
    int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2);

    uint8_t *out = output + 16;
    memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext));
    out += 3;
    *out++ = '\n';

    out = output + 32;
    memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext));
    out += 2;
    *out++ = '\n';
    out += 4;
    *out++ = '\n';

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2
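/*
 * insert_nl_str4: a 32-character vector becomes eight 4-character lines
 * (40 output bytes). The lane-internal newlines are blended in
 * vectorially; the bytes shuffled out of each 16-byte lane, plus the
 * remaining newlines, are patched in with the scalar writes.
 */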

OPENSSL_TARGET_AVX2
static ossl_inline size_t insert_nl_str8(const __m256i v0, uint8_t *output)
{
    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF,
        8, 9, 10, 11, 12, 13, 14,
        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6,
        7, (char)0xFF, 8, 9, 10, 11, 12);
    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
    _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes);
    int8_t rem_1_L = _mm256_extract_epi8(v0, 15);
    int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29);
    int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15);
    uint8_t *out = output;

    memcpy(out + 16, &rem_1_L, sizeof(rem_1_L));
    memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1));
    memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2));

    output[8] = '\n';
    output[17] = '\n';
    output[26] = '\n';
    output[35] = '\n';

    out += 32 + 4;

    size_t written = (out - output);
    return written;
}
OPENSSL_UNTARGET_AVX2
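/*
 * insert_nl_str8: 32 characters become four 8-character lines plus four
 * '\n's, 36 output bytes, with newlines at offsets 8, 17, 26 and 35.
 */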
OPENSSL_TARGET_AVX2
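/*
 * Bulk AVX2 base64 encoder: encodes as much of src as the vector paths
 * can handle, inserting '\n' every ctx_length / 3 * 4 output characters
 * when a wrapping context is supplied, and defers the remaining tail to
 * the scalar evp_encodeblock_int. Returns the total number of bytes
 * written to dst.
 */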
size_t encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst,
    const unsigned char *src, int srclen, int ctx_length,
    int *final_wrap_cnt)
{
    const uint8_t *input = (const uint8_t *)src;
    uint8_t *out = (uint8_t *)dst;
    int i = 0;
    int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4;
    int wrap_cnt = 0;
    const int use_srp = (ctx != NULL
        && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0);
    const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
    int base = 0;

    /*
     * Process 96 input bytes per iteration. The loop bound is i + 100
     * because the last 16-byte load starts at offset i + 84 and so
     * reads 4 bytes past the 96 consumed here.
     */
    for (; i + 100 <= srclen; i += 96) {
        _mm_prefetch((const char *)(input + i + 192), _MM_HINT_T0);
        /*
         * Each vector's load, shuffle, bit-split and lookup are
         * interleaved before the next vector starts, giving the
         * out-of-order engine independent dependency chains across
         * execution ports.
         */
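        /*
         * Bit-splitting (the usual SIMD base64 trick): after the byte
         * shuffle each 32-bit lane holds one 3-byte group. AND with
         * 0x0fc0fc00 isolates two of the four 6-bit fields, which
         * _mm256_mulhi_epu16 by 0x04000040 shifts right by 10 and 6;
         * AND with 0x003f03f0 isolates the other two, which
         * _mm256_mullo_epi16 by 0x01000010 shifts left by 4 and 8.
         * OR-ing the halves yields one 6-bit index per output byte.
         */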
        const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
        const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
        const __m256i input0 = _mm256_or_si256(t1_0, t3_0);

        const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
        const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);

        const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
        const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);

        const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
        const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
        __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
        const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
        const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
        const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
        const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
        const __m256i input3 = _mm256_or_si256(t1_3, t3_3);

        __m256i vec0;
        __m256i vec1;
        __m256i vec2;
        __m256i vec3;

        if (use_srp) {
            vec0 = lookup_pshufb_srp(input0);
            vec1 = lookup_pshufb_srp(input1);
            vec2 = lookup_pshufb_srp(input2);
            vec3 = lookup_pshufb_srp(input3);
        } else {
            vec0 = lookup_pshufb_std(input0);
            vec1 = lookup_pshufb_std(input1);
            vec2 = lookup_pshufb_std(input2);
            vec3 = lookup_pshufb_std(input3);
        }
558
0
        if (stride == 0) {
559
0
            _mm256_storeu_si256((__m256i *)out, vec0);
560
561
0
            out += 32;
562
0
            _mm256_storeu_si256((__m256i *)out, vec1);
563
564
0
            out += 32;
565
0
            _mm256_storeu_si256((__m256i *)out, vec2);
566
567
0
            out += 32;
568
0
            _mm256_storeu_si256((__m256i *)out, vec3);
569
570
0
            out += 32;
571
0
        } else if (stride == 64) {
572
0
            _mm256_storeu_si256((__m256i *)out, vec0);
573
574
0
            out += 32;
575
0
            _mm256_storeu_si256((__m256i *)out, vec1);
576
577
0
            out += 32;
578
0
            *(out++) = '\n';
579
580
0
            _mm256_storeu_si256((__m256i *)out, vec2);
581
0
            out += 32;
582
583
0
            _mm256_storeu_si256((__m256i *)out, vec3);
584
0
            out += 32;
585
586
0
            *(out++) = '\n';
587
0
        } else if (stride == 4) {
588
0
            int out_idx = 0;
589
590
0
            out_idx += (int)insert_nl_str4(vec0, out + out_idx);
591
0
            out_idx += (int)insert_nl_str4(vec1, out + out_idx);
592
0
            out_idx += (int)insert_nl_str4(vec2, out + out_idx);
593
0
            out_idx += (int)insert_nl_str4(vec3, out + out_idx);
594
595
0
            out += out_idx;
596
0
        } else if (stride == 8) {
597
598
0
            out += insert_nl_str8(vec0, out);
599
0
            out += insert_nl_str8(vec1, out);
600
0
            out += insert_nl_str8(vec2, out);
601
0
            out += insert_nl_str8(vec3, out);
602
603
0
        } else if (stride == 12) {
604
0
            switch (base) {
605
0
            case 0:
606
607
0
                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
608
0
                out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt);
609
0
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
610
0
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
611
0
                break;
612
0
            case 1:
613
0
                out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt);
614
0
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
615
0
                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
616
0
                out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt);
617
0
                break;
618
0
            default: /* base == 2 */
619
0
                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
620
0
                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
621
0
                out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt);
622
0
                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
623
0
                break;
624
0
            }
625
626
0
            if (++base == 3)
627
0
                base = 0;
628
0
        } else if (stride >= 32) {
629
0
            out += ins_nl_gt32(vec0, out, stride, &wrap_cnt);
630
0
            out += ins_nl_gt32(vec1, out, stride, &wrap_cnt);
631
0
            out += ins_nl_gt32(vec2, out, stride, &wrap_cnt);
632
0
            out += ins_nl_gt32(vec3, out, stride, &wrap_cnt);
633
0
        } else if (stride >= 16) {
634
0
            out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
635
0
            out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
636
0
            out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
637
0
            out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
638
0
        }
639
0
    }

    if (stride == 0) {
        /*
         * Tail loop: 24 input bytes per iteration; the bound is i + 28
         * because the second 16-byte load starts at offset i + 12 and
         * reads 4 bytes past the 24 consumed.
         */
        for (; i + 28 <= srclen; i += 24) {
            /* lo = [xxxx|DDDC|CCBB|BAAA] */
            /* hi = [xxxx|HHHG|GGFF|FEEE] */
            const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i));
            const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3));
            /*
             * The bytes of each 3-byte group are needed in separate
             * 32-bit lanes:
             * in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
             */
            __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
            const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
            const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
            const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
            const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
            const __m256i indices = _mm256_or_si256(t1, t3);
            _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices));
            out += 32;
        }
    }
    *final_wrap_cnt = wrap_cnt;

    if (stride >= 32 && wrap_cnt == stride) {
        wrap_cnt = 0;
        *out++ = '\n';
    }

    return (size_t)(out - (uint8_t *)dst) + evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt);
}
OPENSSL_UNTARGET_AVX2
#endif /* defined(HAVE_AVX2_INTRINSICS) */
#endif /* !defined(_M_ARM64EC) */
#endif /* x86_64 */